NVIDIA · miscco · May 28, 2025 · May 14, 2025 · May 15, 2025 · May 15, 2025
@@ -21,6 +21,7 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__cmath/pow2.h>
 #include <cuda/std/cstddef>
 
 #include <cuda/std/__cccl/prologue.h>
@@ -30,18 +31,29 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 template <_CUDA_VSTD::size_t _Alignment>
 struct aligned_size_t
 {
+  static_assert(::cuda::is_power_of_two(_Alignment), "alignment must be a power of two");
+
   static constexpr _CUDA_VSTD::size_t align = _Alignment;
   _CUDA_VSTD::size_t value;
 
-  _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr aligned_size_t(size_t __s)
+  _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr aligned_size_t(_CUDA_VSTD::size_t __s)
       : value(__s)
-  {}
-  _LIBCUDACXX_HIDE_FROM_ABI constexpr operator size_t() const
+  {
+    _CCCL_ASSERT(value % align == 0,
+                 "aligned_size_t must be constructed with a size that is a multiple of the alignment");
+  }
+  _LIBCUDACXX_HIDE_FROM_ABI constexpr operator _CUDA_VSTD::size_t() const
   {
     return value;
   }
 };
 
+template <class, class = void>
+inline constexpr _CUDA_VSTD::size_t __get_size_align_v = 1;
+
+template <class _Tp>
+inline constexpr _CUDA_VSTD::size_t __get_size_align_v<_Tp, _CUDA_VSTD::void_t<decltype(_Tp::align)>> = _Tp::align;
+
 _LIBCUDACXX_END_NAMESPACE_CUDA
 
 #include <cuda/std/__cccl/epilogue.h>

@@ -0,0 +1,79 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___MEMCPY_ASYNC_CHECK_PRECONDITIONS_H
+#define _CUDA___MEMCPY_ASYNC_CHECK_PRECONDITIONS_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__barrier/aligned_size.h>
+#include <cuda/std/__algorithm/max.h>
+#include <cuda/std/__cstddef/types.h>
+#include <cuda/std/cstdint>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+#ifndef _LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING
+#  define _LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT(_Cond, _Msg) _CCCL_ASSERT(_Cond, _Msg)
+#else // ^^^ _LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING ^^^ / vvv !_LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING vvv
+#  define _LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT(_Cond, _Msg) \
+    do                                                     \
+    {                                                      \
+      if (!(_Cond))                                        \
+      {                                                    \
+        return false;                                      \
+      }                                                    \
+    } while (false)
+#endif // _LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING
+
+// Check the memcpy_async preconditions, return value is intended for testing purposes exclusively
+template <class _Tp, class _Size>
+_LIBCUDACXX_HIDE_FROM_ABI bool __memcpy_async_check_pre(_Tp* __dst, const _Tp* __src, _Size __size)
+{
+  constexpr auto __align = _CUDA_VSTD::max(alignof(_Tp), __get_size_align_v<_Size>);
+
+  const auto __dst_val = reinterpret_cast<uintptr_t>(__dst);
+  const auto __src_val = reinterpret_cast<uintptr_t>(__src);
+
+  // check src and dst alignment
+  _LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT(
+    __dst_val % __align == 0, "destination pointer must be aligned to the specified alignment");
+  _LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT(
+    __src_val % __align == 0, "source pointer must be aligned to the specified alignment");
+
+  // check src and dst overlap
+  _LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT(
+    !((__dst_val <= __src_val && __src_val < __dst_val + __size)
+      || (__src_val <= __dst_val && __dst_val < __src_val + __size)),
+    "destination and source buffers must not overlap");
+  return true;
+}
+
+template <class _Size>
+_LIBCUDACXX_HIDE_FROM_ABI bool __memcpy_async_check_pre(void* __dst, const void* __src, _Size __size)
+{
+  return ::cuda::__memcpy_async_check_pre(reinterpret_cast<char*>(__dst), reinterpret_cast<const char*>(__src), __size);
+}
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA___MEMCPY_ASYNC_CHECK_PRECONDITIONS_H
@@ -29,10 +29,12 @@
 #  include <cuda/__barrier/barrier.h>
 #  include <cuda/__barrier/barrier_block_scope.h>
 #  include <cuda/__barrier/barrier_thread_scope.h>
+#  include <cuda/__memcpy_async/check_preconditions.h>
 #  include <cuda/__memcpy_async/memcpy_async_barrier.h>
 #  include <cuda/std/__atomic/scopes.h>
 #  include <cuda/std/__type_traits/void_t.h>
 #  include <cuda/std/cstddef>
+#  include <cuda/std/cstdint>
 
 #  include <cuda/std/__cccl/prologue.h>
 
@@ -104,13 +106,16 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   aligned_size_t<_Alignment> __size,
   barrier<_Sco, _CompF>& __barrier)
 {
+  static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)");
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return ::cuda::__memcpy_async_barrier(__group, __destination, __source, __size, __barrier);
 }
 
 template <class _Tp, typename _Size, thread_scope _Sco, typename _CompF>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment
 memcpy_async(_Tp* __destination, _Tp const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier)
 {
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return ::cuda::__memcpy_async_barrier(__single_thread_group{}, __destination, __source, __size, __barrier);
 }
 
@@ -122,6 +127,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   _CUDA_VSTD::size_t __size,
   barrier<_Sco, _CompF>& __barrier)
 {
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return ::cuda::__memcpy_async_barrier(__group, __destination, __source, __size, __barrier);
 }
 
@@ -133,6 +139,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   _CUDA_VSTD::size_t __size,
   barrier<_Sco, _CompF>& __barrier)
 {
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return ::cuda::__memcpy_async_barrier(
     __group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __barrier);
 }
@@ -145,6 +152,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   aligned_size_t<_Alignment> __size,
   barrier<_Sco, _CompF>& __barrier)
 {
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return ::cuda::__memcpy_async_barrier(
     __group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __barrier);
 }
@@ -153,6 +161,7 @@ template <typename _Size, thread_scope _Sco, typename _CompF>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment
 memcpy_async(void* __destination, void const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier)
 {
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return ::cuda::__memcpy_async_barrier(
     __single_thread_group{},
     reinterpret_cast<char*>(__destination),

@@ -22,6 +22,7 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__barrier/aligned_size.h>
 #include <cuda/__barrier/barrier.h>
 #include <cuda/__barrier/barrier_block_scope.h>
 #include <cuda/__barrier/barrier_thread_scope.h>
@@ -30,6 +31,7 @@
 #include <cuda/__memcpy_async/is_local_smem_barrier.h>
 #include <cuda/__memcpy_async/memcpy_completion.h>
 #include <cuda/__memcpy_async/try_get_barrier_handle.h>
+#include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__atomic/scopes.h>
 #include <cuda/std/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/cstddef>
@@ -39,36 +41,6 @@
 
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 
-/***********************************************************************
- * cuda::memcpy_async dispatch helper functions
- *
- * - __get_size_align struct to determine the alignment from a size type.
- ***********************************************************************/
-
-// The __get_size_align struct provides a way to query the guaranteed
-// "alignment" of a provided size. In this case, an n-byte aligned size means
-// that the size is a multiple of n.
-//
-// Use as follows:
-// static_assert(__get_size_align<size_t>::align == 1)
-// static_assert(__get_size_align<aligned_size_t<n>>::align == n)
-
-// Default impl: always returns 1.
-template <typename, typename = void>
-struct __get_size_align
-{
-  static constexpr int align = 1;
-};
-
-// aligned_size_t<n> overload: return n.
-template <typename T>
-struct __get_size_align<T, _CUDA_VSTD::void_t<decltype(T::align)>>
-{
-  static constexpr int align = T::align;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
 struct __single_thread_group
 {
   _LIBCUDACXX_HIDE_FROM_ABI void sync() const {}
@@ -99,8 +71,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment __memcpy_async_barrier(
       : _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group);
 
   // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t.
-  constexpr _CUDA_VSTD::size_t __size_align = __get_size_align<_Size>::align;
-  constexpr _CUDA_VSTD::size_t __align      = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp);
+  constexpr auto __align = _CUDA_VSTD::max(alignof(_Tp), __get_size_align_v<_Size>);
   // Cast to char pointers. We don't need the type for alignment anymore and
   // erasing the types reduces the number of instantiations of down-stream
   // functions.

@@ -29,6 +29,7 @@
 #    include <cuda/__barrier/async_contract_fulfillment.h>
 #    include <cuda/__barrier/barrier_block_scope.h>
 #    include <cuda/__barrier/barrier_native_handle.h>
+#    include <cuda/__memcpy_async/check_preconditions.h>
 #    include <cuda/__ptx/instructions/cp_async_bulk.h>
 #    include <cuda/__ptx/ptx_dot_variants.h>
 #    include <cuda/__ptx/ptx_helper_functions.h>
@@ -56,6 +57,9 @@ _CCCL_DEVICE inline async_contract_fulfillment memcpy_async_tx(
   static_assert(_CUDA_VSTD::is_trivially_copyable<_Tp>::value, "memcpy_async_tx requires a trivially copyable type");
 #    endif
   static_assert(16 <= _Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned.");
+  static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)");
+
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__dest, __src, __size), "memcpy_async_tx preconditions unmet");
 
   _CCCL_ASSERT(::__isShared(_CUDA_DEVICE::barrier_native_handle(__b)),
                "Barrier must be located in local shared memory.");

@@ -21,11 +21,15 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__barrier/aligned_size.h>
+#include <cuda/__memcpy_async/check_preconditions.h>
 #include <cuda/__memcpy_async/completion_mechanism.h>
 #include <cuda/__memcpy_async/memcpy_async_barrier.h>
 #include <cuda/atomic>
 #include <cuda/barrier>
+#include <cuda/std/__algorithm/max.h>
 #include <cuda/std/chrono>
+#include <cuda/std/cstdint>
 
 #include <cuda/std/__cccl/prologue.h>
 
@@ -499,8 +503,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment __memcpy_async_pipeline(
   _CUDA_VSTD::uint32_t __allowed_completions = _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group);
 
   // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t.
-  constexpr _CUDA_VSTD::size_t __size_align = __get_size_align<_Size>::align;
-  constexpr _CUDA_VSTD::size_t __align      = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp);
+  constexpr auto __align = _CUDA_VSTD::max(alignof(_Tp), __get_size_align_v<_Size>);
   // Cast to char pointers. We don't need the type for alignment anymore and
   // erasing the types reduces the number of instantiations of down-stream
   // functions.
@@ -518,35 +521,36 @@ template <typename _Group, class _Type, thread_scope _Scope>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   _Group const& __group, _Type* __destination, _Type const* __source, std::size_t __size, pipeline<_Scope>& __pipeline)
 {
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return ::cuda::__memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline);
 }
 
-template <typename _Group,
-          class _Type,
-          std::size_t _Alignment,
-          thread_scope _Scope,
-          std::size_t _Larger_alignment = (alignof(_Type) > _Alignment) ? alignof(_Type) : _Alignment>
+template <typename _Group, class _Type, std::size_t _Alignment, thread_scope _Scope>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   _Group const& __group,
   _Type* __destination,
   _Type const* __source,
   aligned_size_t<_Alignment> __size,
   pipeline<_Scope>& __pipeline)
 {
+  static_assert(_Alignment >= alignof(_Type), "alignment must be at least the alignof(T)");
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return ::cuda::__memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline);
 }
 
 template <class _Type, typename _Size, thread_scope _Scope>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment
 memcpy_async(_Type* __destination, _Type const* __source, _Size __size, pipeline<_Scope>& __pipeline)
 {
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return ::cuda::__memcpy_async_pipeline(__single_thread_group{}, __destination, __source, __size, __pipeline);
 }
 
 template <typename _Group, thread_scope _Scope>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   _Group const& __group, void* __destination, void const* __source, std::size_t __size, pipeline<_Scope>& __pipeline)
 {
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return ::cuda::__memcpy_async_pipeline(
     __group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __pipeline);
 }
@@ -559,6 +563,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
   aligned_size_t<_Alignment> __size,
   pipeline<_Scope>& __pipeline)
 {
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return ::cuda::__memcpy_async_pipeline(
     __group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __pipeline);
 }
@@ -567,6 +572,7 @@ template <typename _Size, thread_scope _Scope>
 _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment
 memcpy_async(void* __destination, void const* __source, _Size __size, pipeline<_Scope>& __pipeline)
 {
+  _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
   return ::cuda::__memcpy_async_pipeline(
     __single_thread_group{},
     reinterpret_cast<char*>(__destination),

@@ -22,8 +22,8 @@
 __host__ __device__ constexpr bool test()
 {
   using aligned_t = cuda::aligned_size_t<1>;
-  static_assert(!cuda::std::is_default_constructible<aligned_t>::value, "");
-  static_assert(aligned_t::align == 1, "");
+  static_assert(!cuda::std::is_default_constructible<aligned_t>::value);
+  static_assert(aligned_t::align == 1);
   {
     const aligned_t aligned{42};
     assert(aligned.value == 42);
@@ -33,11 +33,11 @@ __host__ __device__ constexpr bool test()
 }
 
 // test C++11 differently
-static_assert(cuda::aligned_size_t<42>{1337}.value == 1337, "");
+static_assert(cuda::aligned_size_t<32>{1024}.value == 1024);
 
 int main(int, char**)
 {
   test();
-  static_assert(test(), "");
+  static_assert(test());
   return 0;
 }