From 362387a67cb4f87f7aec1451cb9431296fe85efb Mon Sep 17 00:00:00 2001 From: David Bayer Date: Wed, 14 May 2025 19:56:31 +0200 Subject: [PATCH 1/5] Check `cuda::memcpy_async` preconditions --- .../include/cuda/__barrier/aligned_size.h | 12 +++++++--- .../cuda/__memcpy_async/memcpy_async.h | 9 +++++++ .../__memcpy_async/memcpy_async_barrier.h | 24 +++++++------------ .../cuda/__memcpy_async/memcpy_async_tx.h | 5 ++++ libcudacxx/include/cuda/pipeline | 17 ++++++++----- .../cuda/barrier/aligned_size_t.pass.cpp | 8 +++---- 6 files changed, 47 insertions(+), 28 deletions(-) diff --git a/libcudacxx/include/cuda/__barrier/aligned_size.h b/libcudacxx/include/cuda/__barrier/aligned_size.h index 9926023f05f..59ea50ff161 100644 --- a/libcudacxx/include/cuda/__barrier/aligned_size.h +++ b/libcudacxx/include/cuda/__barrier/aligned_size.h @@ -21,6 +21,7 @@ # pragma system_header #endif // no system header +#include #include #include @@ -30,13 +31,18 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA template <_CUDA_VSTD::size_t _Alignment> struct aligned_size_t { + static_assert(::cuda::is_power_of_two(_Alignment), "alignment must be a power of two"); + static constexpr _CUDA_VSTD::size_t align = _Alignment; _CUDA_VSTD::size_t value; - _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr aligned_size_t(size_t __s) + _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr aligned_size_t(_CUDA_VSTD::size_t __s) : value(__s) - {} - _LIBCUDACXX_HIDE_FROM_ABI constexpr operator size_t() const + { + _CCCL_ASSERT(value % align == 0, + "aligned_size_t must be constructed with a size that is a multiple of the alignment"); + } + _LIBCUDACXX_HIDE_FROM_ABI constexpr operator _CUDA_VSTD::size_t() const { return value; } diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h index ca58faefd88..a40d4abf77e 100644 --- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h @@ -33,6 +33,7 @@ # include # include # include +# include # include @@ -104,6 +105,10 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( aligned_size_t<_Alignment> __size, barrier<_Sco, _CompF>& __barrier) { + _CCCL_ASSERT(reinterpret_cast(__destination) % _Alignment == 0, + "destination pointer must be aligned to the specified alignment"); + _CCCL_ASSERT(reinterpret_cast(__source) % _Alignment == 0, + "source pointer must be aligned to the specified alignment"); return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier); } @@ -145,6 +150,10 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( aligned_size_t<_Alignment> __size, barrier<_Sco, _CompF>& __barrier) { + _CCCL_ASSERT(reinterpret_cast(__destination) % _Alignment == 0, + "destination pointer must be aligned to the specified alignment"); + _CCCL_ASSERT(reinterpret_cast(__source) % _Alignment == 0, + "source pointer must be aligned to the specified alignment"); return __memcpy_async_barrier( __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __barrier); } diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h index 67acc3fd87f..9e04b577f18 100644 --- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h @@ -42,30 +42,24 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA /*********************************************************************** * cuda::memcpy_async dispatch helper functions * - * - __get_size_align struct to determine the alignment from a size type. + * - __get_size_align_v get the alignment from a size type. ***********************************************************************/ -// The __get_size_align struct provides a way to query the guaranteed +// The __get_size_align provides a way to query the guaranteed // "alignment" of a provided size. In this case, an n-byte aligned size means // that the size is a multiple of n. // // Use as follows: -// static_assert(__get_size_align::align == 1) -// static_assert(__get_size_align>::align == n) +// static_assert(__get_size_align_v == 1) +// static_assert(__get_size_align_v> == n) // Default impl: always returns 1. -template -struct __get_size_align -{ - static constexpr int align = 1; -}; +template +inline constexpr _CUDA_VSTD::size_t __get_size_align_v = 1; // aligned_size_t overload: return n. -template -struct __get_size_align> -{ - static constexpr int align = T::align; -}; +template +inline constexpr _CUDA_VSTD::size_t __get_size_align_v<_Tp, _CUDA_VSTD::void_t> = _Tp::align; //////////////////////////////////////////////////////////////////////////////// @@ -99,7 +93,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment __memcpy_async_barrier( : _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group); // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t. - constexpr _CUDA_VSTD::size_t __size_align = __get_size_align<_Size>::align; + constexpr _CUDA_VSTD::size_t __size_align = __get_size_align_v<_Size>; constexpr _CUDA_VSTD::size_t __align = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp); // Cast to char pointers. We don't need the type for alignment anymore and // erasing the types reduces the number of instantiations of down-stream diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h index f3506dc6a37..f012156d211 100644 --- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h @@ -57,6 +57,11 @@ _CCCL_DEVICE inline async_contract_fulfillment memcpy_async_tx( # endif static_assert(16 <= _Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned."); + _CCCL_ASSERT(reinterpret_cast(__dest) % _Alignment == 0, + "destination pointer must be aligned to the specified alignment"); + _CCCL_ASSERT(reinterpret_cast(__src) % _Alignment == 0, + "source pointer must be aligned to the specified alignment"); + _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); _CCCL_ASSERT(__isShared(__dest), "dest must point to shared memory."); _CCCL_ASSERT(__isGlobal(__src), "src must point to global memory."); diff --git a/libcudacxx/include/cuda/pipeline b/libcudacxx/include/cuda/pipeline index ed2abe9f19f..f3966963427 100644 --- a/libcudacxx/include/cuda/pipeline +++ b/libcudacxx/include/cuda/pipeline @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -499,7 +500,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment __memcpy_async_pipeline( _CUDA_VSTD::uint32_t __allowed_completions = _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group); // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t. - constexpr _CUDA_VSTD::size_t __size_align = __get_size_align<_Size>::align; + constexpr _CUDA_VSTD::size_t __size_align = __get_size_align_v<_Size>; constexpr _CUDA_VSTD::size_t __align = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp); // Cast to char pointers. We don't need the type for alignment anymore and // erasing the types reduces the number of instantiations of down-stream @@ -521,11 +522,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline); } -template _Alignment) ? alignof(_Type) : _Alignment> +template _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( _Group const& __group, _Type* __destination, @@ -533,6 +530,10 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( aligned_size_t<_Alignment> __size, pipeline<_Scope>& __pipeline) { + _CCCL_ASSERT(reinterpret_cast(__destination) % _Alignment == 0, + "destination pointer must be aligned to the specified alignment"); + _CCCL_ASSERT(reinterpret_cast(__source) % _Alignment == 0, + "source pointer must be aligned to the specified alignment"); return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline); } @@ -559,6 +560,10 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( aligned_size_t<_Alignment> __size, pipeline<_Scope>& __pipeline) { + _CCCL_ASSERT(reinterpret_cast(__destination) % _Alignment == 0, + "destination pointer must be aligned to the specified alignment"); + _CCCL_ASSERT(reinterpret_cast(__source) % _Alignment == 0, + "source pointer must be aligned to the specified alignment"); return __memcpy_async_pipeline( __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __pipeline); } diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/aligned_size_t.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/aligned_size_t.pass.cpp index 24fb11bd7b6..25cae5e59b5 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/aligned_size_t.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/barrier/aligned_size_t.pass.cpp @@ -22,8 +22,8 @@ __host__ __device__ constexpr bool test() { using aligned_t = cuda::aligned_size_t<1>; - static_assert(!cuda::std::is_default_constructible::value, ""); - static_assert(aligned_t::align == 1, ""); + static_assert(!cuda::std::is_default_constructible::value); + static_assert(aligned_t::align == 1); { const aligned_t aligned{42}; assert(aligned.value == 42); @@ -33,11 +33,11 @@ __host__ __device__ constexpr bool test() } // test C++11 differently -static_assert(cuda::aligned_size_t<42>{1337}.value == 1337, ""); +static_assert(cuda::aligned_size_t<32>{1024}.value == 1024); int main(int, char**) { test(); - static_assert(test(), ""); + static_assert(test()); return 0; } From 123f07d95660274f33bc01b5f55c0d925a925c09 Mon Sep 17 00:00:00 2001 From: David Bayer Date: Thu, 15 May 2025 11:25:28 +0200 Subject: [PATCH 2/5] Improve pre condition checking --- .../include/cuda/__barrier/aligned_size.h | 6 + .../cuda/__memcpy_async/check_preconditions.h | 79 ++++++++++ .../cuda/__memcpy_async/memcpy_async.h | 16 +- .../__memcpy_async/memcpy_async_barrier.h | 29 +--- .../cuda/__memcpy_async/memcpy_async_tx.h | 7 +- libcudacxx/include/cuda/pipeline | 21 +-- .../cuda/memcpy_async/preconditions.pass.cpp | 148 ++++++++++++++++++ 7 files changed, 258 insertions(+), 48 deletions(-) create mode 100644 libcudacxx/include/cuda/__memcpy_async/check_preconditions.h create mode 100644 libcudacxx/test/libcudacxx/cuda/memcpy_async/preconditions.pass.cpp diff --git a/libcudacxx/include/cuda/__barrier/aligned_size.h b/libcudacxx/include/cuda/__barrier/aligned_size.h index 59ea50ff161..6aa5b4fa37c 100644 --- a/libcudacxx/include/cuda/__barrier/aligned_size.h +++ b/libcudacxx/include/cuda/__barrier/aligned_size.h @@ -48,6 +48,12 @@ struct aligned_size_t } }; +template +inline constexpr _CUDA_VSTD::size_t __get_size_align_v = 1; + +template +inline constexpr _CUDA_VSTD::size_t __get_size_align_v<_Tp, _CUDA_VSTD::void_t> = _Tp::align; + _LIBCUDACXX_END_NAMESPACE_CUDA #include diff --git a/libcudacxx/include/cuda/__memcpy_async/check_preconditions.h b/libcudacxx/include/cuda/__memcpy_async/check_preconditions.h new file mode 100644 index 00000000000..d415e8091ab --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/check_preconditions.h @@ -0,0 +1,79 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___MEMCPY_ASYNC_CHECK_PRECONDITIONS_H +#define _CUDA___MEMCPY_ASYNC_CHECK_PRECONDITIONS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +#ifndef _LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING +# define _LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT(_Cond, _Msg) _CCCL_ASSERT(_Cond, _Msg) +#else // ^^^ _LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING ^^^ / vvv !_LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING vvv +# define _LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT(_Cond, _Msg) \ + do \ + { \ + if (!(_Cond)) \ + { \ + return false; \ + } \ + } while (false) +#endif // _LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING + +// Check the memcpy_async preconditions, return value is intended for testing purposes exclusively +template +_LIBCUDACXX_HIDE_FROM_ABI bool __memcpy_async_check_pre(_Tp* __dst, const _Tp* __src, _Size __size) +{ + constexpr auto __align = _CUDA_VSTD::max(alignof(_Tp), __get_size_align_v<_Size>); + + const auto __dst_val = reinterpret_cast(__dst); + const auto __src_val = reinterpret_cast(__src); + + // check src and dst alignment + _LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT( + __dst_val % __align == 0, "destination pointer must be aligned to the specified alignment"); + _LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT( + __src_val % __align == 0, "source pointer must be aligned to the specified alignment"); + + // check src and dst overlap + _LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT( + !((__dst_val <= __src_val && __src_val < __dst_val + __size) + || (__src_val <= __dst_val && __dst_val < __src_val + __size)), + "destination and source buffers must not overlap"); + return true; +} + +template +_LIBCUDACXX_HIDE_FROM_ABI bool __memcpy_async_check_pre(void* __dst, const void* __src, _Size __size) +{ + return ::cuda::__memcpy_async_check_pre(reinterpret_cast(__dst), reinterpret_cast(__src), __size); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#include + +#endif // _CUDA___MEMCPY_ASYNC_CHECK_PRECONDITIONS_H diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h index a40d4abf77e..d32a315f80a 100644 --- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h @@ -29,6 +29,7 @@ # include # include # include +# include # include # include # include @@ -105,10 +106,8 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( aligned_size_t<_Alignment> __size, barrier<_Sco, _CompF>& __barrier) { - _CCCL_ASSERT(reinterpret_cast(__destination) % _Alignment == 0, - "destination pointer must be aligned to the specified alignment"); - _CCCL_ASSERT(reinterpret_cast(__source) % _Alignment == 0, - "source pointer must be aligned to the specified alignment"); + static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)"); + ::cuda::__memcpy_async_check_pre(__destination, __source, __size); return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier); } @@ -116,6 +115,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(_Tp* __destination, _Tp const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier) { + ::cuda::__memcpy_async_check_pre(__destination, __source, __size); return __memcpy_async_barrier(__single_thread_group{}, __destination, __source, __size, __barrier); } @@ -127,6 +127,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( _CUDA_VSTD::size_t __size, barrier<_Sco, _CompF>& __barrier) { + ::cuda::__memcpy_async_check_pre(__destination, __source, __size); return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier); } @@ -138,6 +139,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( _CUDA_VSTD::size_t __size, barrier<_Sco, _CompF>& __barrier) { + ::cuda::__memcpy_async_check_pre(__destination, __source, __size); return __memcpy_async_barrier( __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __barrier); } @@ -150,10 +152,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( aligned_size_t<_Alignment> __size, barrier<_Sco, _CompF>& __barrier) { - _CCCL_ASSERT(reinterpret_cast(__destination) % _Alignment == 0, - "destination pointer must be aligned to the specified alignment"); - _CCCL_ASSERT(reinterpret_cast(__source) % _Alignment == 0, - "source pointer must be aligned to the specified alignment"); + ::cuda::__memcpy_async_check_pre(__destination, __source, __size); return __memcpy_async_barrier( __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __barrier); } @@ -162,6 +161,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(void* __destination, void const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier) { + ::cuda::__memcpy_async_check_pre(__destination, __source, __size); return __memcpy_async_barrier( __single_thread_group{}, reinterpret_cast(__destination), diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h index 9e04b577f18..5cc7ac93761 100644 --- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h @@ -22,6 +22,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -30,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -39,30 +41,6 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA -/*********************************************************************** - * cuda::memcpy_async dispatch helper functions - * - * - __get_size_align_v get the alignment from a size type. - ***********************************************************************/ - -// The __get_size_align provides a way to query the guaranteed -// "alignment" of a provided size. In this case, an n-byte aligned size means -// that the size is a multiple of n. -// -// Use as follows: -// static_assert(__get_size_align_v == 1) -// static_assert(__get_size_align_v> == n) - -// Default impl: always returns 1. -template -inline constexpr _CUDA_VSTD::size_t __get_size_align_v = 1; - -// aligned_size_t overload: return n. -template -inline constexpr _CUDA_VSTD::size_t __get_size_align_v<_Tp, _CUDA_VSTD::void_t> = _Tp::align; - -//////////////////////////////////////////////////////////////////////////////// - struct __single_thread_group { _LIBCUDACXX_HIDE_FROM_ABI void sync() const {} @@ -93,8 +71,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment __memcpy_async_barrier( : _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group); // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t. - constexpr _CUDA_VSTD::size_t __size_align = __get_size_align_v<_Size>; - constexpr _CUDA_VSTD::size_t __align = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp); + constexpr auto __align = _CUDA_VSTD::max(alignof(_Tp), __get_size_align_v<_Size>); // Cast to char pointers. We don't need the type for alignment anymore and // erasing the types reduces the number of instantiations of down-stream // functions. diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h index f012156d211..f5e49be9fba 100644 --- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h @@ -29,6 +29,7 @@ # include # include # include +# include # include # include # include @@ -56,11 +57,9 @@ _CCCL_DEVICE inline async_contract_fulfillment memcpy_async_tx( static_assert(_CUDA_VSTD::is_trivially_copyable<_Tp>::value, "memcpy_async_tx requires a trivially copyable type"); # endif static_assert(16 <= _Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned."); + static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)"); - _CCCL_ASSERT(reinterpret_cast(__dest) % _Alignment == 0, - "destination pointer must be aligned to the specified alignment"); - _CCCL_ASSERT(reinterpret_cast(__src) % _Alignment == 0, - "source pointer must be aligned to the specified alignment"); + ::cuda::__memcpy_async_check_pre(__dest, __src, __size); _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); _CCCL_ASSERT(__isShared(__dest), "dest must point to shared memory."); diff --git a/libcudacxx/include/cuda/pipeline b/libcudacxx/include/cuda/pipeline index f3966963427..607ac4cfa84 100644 --- a/libcudacxx/include/cuda/pipeline +++ b/libcudacxx/include/cuda/pipeline @@ -21,10 +21,13 @@ # pragma system_header #endif // no system header +#include +#include #include #include #include #include +#include #include #include @@ -500,8 +503,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment __memcpy_async_pipeline( _CUDA_VSTD::uint32_t __allowed_completions = _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group); // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t. - constexpr _CUDA_VSTD::size_t __size_align = __get_size_align_v<_Size>; - constexpr _CUDA_VSTD::size_t __align = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp); + constexpr auto __align = _CUDA_VSTD::max(alignof(_Tp), __get_size_align_v<_Size>); // Cast to char pointers. We don't need the type for alignment anymore and // erasing the types reduces the number of instantiations of down-stream // functions. @@ -519,6 +521,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( _Group const& __group, _Type* __destination, _Type const* __source, std::size_t __size, pipeline<_Scope>& __pipeline) { + ::cuda::__memcpy_async_check_pre(__destination, __source, __size); return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline); } @@ -530,10 +533,8 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( aligned_size_t<_Alignment> __size, pipeline<_Scope>& __pipeline) { - _CCCL_ASSERT(reinterpret_cast(__destination) % _Alignment == 0, - "destination pointer must be aligned to the specified alignment"); - _CCCL_ASSERT(reinterpret_cast(__source) % _Alignment == 0, - "source pointer must be aligned to the specified alignment"); + static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)"); + ::cuda::__memcpy_async_check_pre(__destination, __source, __size); return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline); } @@ -541,6 +542,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(_Type* __destination, _Type const* __source, _Size __size, pipeline<_Scope>& __pipeline) { + ::cuda::__memcpy_async_check_pre(__destination, __source, __size); return __memcpy_async_pipeline(__single_thread_group{}, __destination, __source, __size, __pipeline); } @@ -548,6 +550,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( _Group const& __group, void* __destination, void const* __source, std::size_t __size, pipeline<_Scope>& __pipeline) { + ::cuda::__memcpy_async_check_pre(__destination, __source, __size); return __memcpy_async_pipeline( __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __pipeline); } @@ -560,10 +563,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( aligned_size_t<_Alignment> __size, pipeline<_Scope>& __pipeline) { - _CCCL_ASSERT(reinterpret_cast(__destination) % _Alignment == 0, - "destination pointer must be aligned to the specified alignment"); - _CCCL_ASSERT(reinterpret_cast(__source) % _Alignment == 0, - "source pointer must be aligned to the specified alignment"); + ::cuda::__memcpy_async_check_pre(__destination, __source, __size); return __memcpy_async_pipeline( __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __pipeline); } @@ -572,6 +572,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(void* __destination, void const* __source, _Size __size, pipeline<_Scope>& __pipeline) { + ::cuda::__memcpy_async_check_pre(__destination, __source, __size); return __memcpy_async_pipeline( __single_thread_group{}, reinterpret_cast(__destination), diff --git a/libcudacxx/test/libcudacxx/cuda/memcpy_async/preconditions.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memcpy_async/preconditions.pass.cpp new file mode 100644 index 00000000000..7d4ead758dd --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/memcpy_async/preconditions.pass.cpp @@ -0,0 +1,148 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#define _LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING + +#include +#include + +__host__ __device__ void test_typed() +{ + using T = int; + + constexpr cuda::std::size_t align_scale = 2; + constexpr cuda::std::size_t align = align_scale * alignof(T); + constexpr cuda::std::size_t n = 16; + constexpr cuda::std::size_t size = n * sizeof(T); + + alignas(align) T a[n * 2]{}; + alignas(align) const T b[n * 2]{}; + + const auto a_missaligned = reinterpret_cast(reinterpret_cast(a) + alignof(T) / 2); + const auto b_missaligned = reinterpret_cast(reinterpret_cast(b) + alignof(T) / 2); + + // 1. test ordinary size type + { + assert(cuda::__memcpy_async_check_pre(a, b, size)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size)); + assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size)); + } + + // 2. test overaligned cuda::aligned_size_t + { + cuda::aligned_size_t size_aligned(size); + assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); + } + + // 3. test cuda::aligned_size_t aligned to alignof(T) + { + cuda::aligned_size_t size_aligned(size); + assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); + } + + // 4. test underaligned cuda::aligned_size_t + { + cuda::aligned_size_t size_aligned(size); + assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); + } + + // 5. test overlap + { + assert(!cuda::__memcpy_async_check_pre(a, a, size)); + assert(!cuda::__memcpy_async_check_pre(a, a_missaligned, size)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, a, size)); + assert(cuda::__memcpy_async_check_pre(a, a + n, size)); + assert(cuda::__memcpy_async_check_pre(a + n, a, size)); + assert(!cuda::__memcpy_async_check_pre(a, a + n - 1, size)); + assert(!cuda::__memcpy_async_check_pre(a + n - 1, a, size)); + } +} + +__host__ __device__ void test_void() +{ + using T = int; + + constexpr cuda::std::size_t align_scale = 2; + constexpr cuda::std::size_t align = align_scale * alignof(T); + constexpr cuda::std::size_t n = 16; + constexpr cuda::std::size_t size = n * sizeof(T); + + alignas(align) T a_buff[n * 2]{}; + alignas(align) const T b_buff[n * 2]{}; + + void* a = a_buff; + const void* b = b_buff; + + const auto a_missaligned = reinterpret_cast(reinterpret_cast(a) + alignof(T) / 2); + const auto b_missaligned = reinterpret_cast(reinterpret_cast(b) + alignof(T) / 2); + + // 1. test ordinary size type + { + assert(cuda::__memcpy_async_check_pre(a, b, size)); + assert(cuda::__memcpy_async_check_pre(a_missaligned, b, size)); + assert(cuda::__memcpy_async_check_pre(a, b_missaligned, size)); + assert(cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size)); + } + + // 2. test overaligned cuda::aligned_size_t + { + cuda::aligned_size_t size_aligned(size); + assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); + } + + // 3. test cuda::aligned_size_t aligned to alignof(T) + { + cuda::aligned_size_t size_aligned(size); + assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); + } + + // 4. test underaligned cuda::aligned_size_t + { + cuda::aligned_size_t size_aligned(size); + assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); + assert(cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); + assert(cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); + assert(cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); + } + + // 5. test overlap + { + assert(!cuda::__memcpy_async_check_pre(a, a, size)); + assert(!cuda::__memcpy_async_check_pre(a, a_missaligned, size)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, a, size)); + assert(cuda::__memcpy_async_check_pre(a, (const void*) (a_buff + n), size)); + assert(cuda::__memcpy_async_check_pre((void*) (a_buff + n), a, size)); + assert(!cuda::__memcpy_async_check_pre(a, (const void*) (a_buff + n - 1), size)); + assert(!cuda::__memcpy_async_check_pre((void*) (a_buff + n - 1), a, size)); + } +} + +int main(int, char**) +{ + test_typed(); + test_void(); + return 0; +} From bfa7354d3e6efd468121dcbdc63fdac27331b8f5 Mon Sep 17 00:00:00 2001 From: David Bayer Date: Thu, 15 May 2025 12:46:55 +0200 Subject: [PATCH 3/5] fix invalid type name --- libcudacxx/include/cuda/pipeline | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/pipeline b/libcudacxx/include/cuda/pipeline index 607ac4cfa84..3089b6874bb 100644 --- a/libcudacxx/include/cuda/pipeline +++ b/libcudacxx/include/cuda/pipeline @@ -533,7 +533,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( aligned_size_t<_Alignment> __size, pipeline<_Scope>& __pipeline) { - static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)"); + static_assert(_Alignment >= alignof(_Type), "alignment must be at least the alignof(T)"); ::cuda::__memcpy_async_check_pre(__destination, __source, __size); return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline); } From d7bbc1067002efd9b92cbf5485f220cac887e074 Mon Sep 17 00:00:00 2001 From: David Bayer Date: Tue, 20 May 2025 10:18:02 +0200 Subject: [PATCH 4/5] wrap checks with `_CCCL_ASSERT` --- .../include/cuda/__memcpy_async/memcpy_async.h | 12 ++++++------ .../include/cuda/__memcpy_async/memcpy_async_tx.h | 2 +- libcudacxx/include/cuda/pipeline | 12 ++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h index d32a315f80a..1b9be7568cd 100644 --- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async.h @@ -107,7 +107,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( barrier<_Sco, _CompF>& __barrier) { static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)"); - ::cuda::__memcpy_async_check_pre(__destination, __source, __size); + _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet"); return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier); } @@ -115,7 +115,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(_Tp* __destination, _Tp const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier) { - ::cuda::__memcpy_async_check_pre(__destination, __source, __size); + _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet"); return __memcpy_async_barrier(__single_thread_group{}, __destination, __source, __size, __barrier); } @@ -127,7 +127,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( _CUDA_VSTD::size_t __size, barrier<_Sco, _CompF>& __barrier) { - ::cuda::__memcpy_async_check_pre(__destination, __source, __size); + _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet"); return __memcpy_async_barrier(__group, __destination, __source, __size, __barrier); } @@ -139,7 +139,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( _CUDA_VSTD::size_t __size, barrier<_Sco, _CompF>& __barrier) { - ::cuda::__memcpy_async_check_pre(__destination, __source, __size); + _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet"); return __memcpy_async_barrier( __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __barrier); } @@ -152,7 +152,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( aligned_size_t<_Alignment> __size, barrier<_Sco, _CompF>& __barrier) { - ::cuda::__memcpy_async_check_pre(__destination, __source, __size); + _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet"); return __memcpy_async_barrier( __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __barrier); } @@ -161,7 +161,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(void* __destination, void const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier) { - ::cuda::__memcpy_async_check_pre(__destination, __source, __size); + _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet"); return __memcpy_async_barrier( __single_thread_group{}, reinterpret_cast(__destination), diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h index f5e49be9fba..cde45efd20c 100644 --- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h @@ -59,7 +59,7 @@ _CCCL_DEVICE inline async_contract_fulfillment memcpy_async_tx( static_assert(16 <= _Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned."); static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)"); - ::cuda::__memcpy_async_check_pre(__dest, __src, __size); + _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__dest, __src, __size), "memcpy_async_tx preconditions unmet"); _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); _CCCL_ASSERT(__isShared(__dest), "dest must point to shared memory."); diff --git a/libcudacxx/include/cuda/pipeline b/libcudacxx/include/cuda/pipeline index 3089b6874bb..c78b93bd0d6 100644 --- a/libcudacxx/include/cuda/pipeline +++ b/libcudacxx/include/cuda/pipeline @@ -521,7 +521,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( _Group const& __group, _Type* __destination, _Type const* __source, std::size_t __size, pipeline<_Scope>& __pipeline) { - ::cuda::__memcpy_async_check_pre(__destination, __source, __size); + _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet"); return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline); } @@ -534,7 +534,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( pipeline<_Scope>& __pipeline) { static_assert(_Alignment >= alignof(_Type), "alignment must be at least the alignof(T)"); - ::cuda::__memcpy_async_check_pre(__destination, __source, __size); + _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet"); return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline); } @@ -542,7 +542,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(_Type* __destination, _Type const* __source, _Size __size, pipeline<_Scope>& __pipeline) { - ::cuda::__memcpy_async_check_pre(__destination, __source, __size); + _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet"); return __memcpy_async_pipeline(__single_thread_group{}, __destination, __source, __size, __pipeline); } @@ -550,7 +550,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( _Group const& __group, void* __destination, void const* __source, std::size_t __size, pipeline<_Scope>& __pipeline) { - ::cuda::__memcpy_async_check_pre(__destination, __source, __size); + _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet"); return __memcpy_async_pipeline( __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __pipeline); } @@ -563,7 +563,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async( aligned_size_t<_Alignment> __size, pipeline<_Scope>& __pipeline) { - ::cuda::__memcpy_async_check_pre(__destination, __source, __size); + _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet"); return __memcpy_async_pipeline( __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __pipeline); } @@ -572,7 +572,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(void* __destination, void const* __source, _Size __size, pipeline<_Scope>& __pipeline) { - ::cuda::__memcpy_async_check_pre(__destination, __source, __size); + _CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet"); return __memcpy_async_pipeline( __single_thread_group{}, reinterpret_cast(__destination), From 284945f581c3e06cb07b9b5b12136a08671bd62e Mon Sep 17 00:00:00 2001 From: David Bayer Date: Fri, 23 May 2025 09:21:30 +0200 Subject: [PATCH 5/5] avoid compiler bugs in test file --- .../cuda/memcpy_async/preconditions.pass.cpp | 223 +++++++++--------- 1 file changed, 109 insertions(+), 114 deletions(-) diff --git a/libcudacxx/test/libcudacxx/cuda/memcpy_async/preconditions.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memcpy_async/preconditions.pass.cpp index 7d4ead758dd..0f465f84514 100644 --- a/libcudacxx/test/libcudacxx/cuda/memcpy_async/preconditions.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memcpy_async/preconditions.pass.cpp @@ -13,7 +13,7 @@ #include #include -__host__ __device__ void test_typed() +__host__ __device__ void test() { using T = int; @@ -22,127 +22,122 @@ __host__ __device__ void test_typed() constexpr cuda::std::size_t n = 16; constexpr cuda::std::size_t size = n * sizeof(T); - alignas(align) T a[n * 2]{}; - alignas(align) const T b[n * 2]{}; - - const auto a_missaligned = reinterpret_cast(reinterpret_cast(a) + alignof(T) / 2); - const auto b_missaligned = reinterpret_cast(reinterpret_cast(b) + alignof(T) / 2); - - // 1. test ordinary size type - { - assert(cuda::__memcpy_async_check_pre(a, b, size)); - assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size)); - assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size)); - assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size)); - } - - // 2. test overaligned cuda::aligned_size_t - { - cuda::aligned_size_t size_aligned(size); - assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); - assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); - assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); - assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); - } - - // 3. test cuda::aligned_size_t aligned to alignof(T) - { - cuda::aligned_size_t size_aligned(size); - assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); - assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); - assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); - assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); - } - - // 4. test underaligned cuda::aligned_size_t - { - cuda::aligned_size_t size_aligned(size); - assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); - assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); - assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); - assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); - } - - // 5. test overlap - { - assert(!cuda::__memcpy_async_check_pre(a, a, size)); - assert(!cuda::__memcpy_async_check_pre(a, a_missaligned, size)); - assert(!cuda::__memcpy_async_check_pre(a_missaligned, a, size)); - assert(cuda::__memcpy_async_check_pre(a, a + n, size)); - assert(cuda::__memcpy_async_check_pre(a + n, a, size)); - assert(!cuda::__memcpy_async_check_pre(a, a + n - 1, size)); - assert(!cuda::__memcpy_async_check_pre(a + n - 1, a, size)); - } -} - -__host__ __device__ void test_void() -{ - using T = int; - - constexpr cuda::std::size_t align_scale = 2; - constexpr cuda::std::size_t align = align_scale * alignof(T); - constexpr cuda::std::size_t n = 16; - constexpr cuda::std::size_t size = n * sizeof(T); - - alignas(align) T a_buff[n * 2]{}; - alignas(align) const T b_buff[n * 2]{}; - - void* a = a_buff; - const void* b = b_buff; - - const auto a_missaligned = reinterpret_cast(reinterpret_cast(a) + alignof(T) / 2); - const auto b_missaligned = reinterpret_cast(reinterpret_cast(b) + alignof(T) / 2); - - // 1. test ordinary size type - { - assert(cuda::__memcpy_async_check_pre(a, b, size)); - assert(cuda::__memcpy_async_check_pre(a_missaligned, b, size)); - assert(cuda::__memcpy_async_check_pre(a, b_missaligned, size)); - assert(cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size)); - } - - // 2. test overaligned cuda::aligned_size_t - { - cuda::aligned_size_t size_aligned(size); - assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); - assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); - assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); - assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); - } - - // 3. test cuda::aligned_size_t aligned to alignof(T) - { - cuda::aligned_size_t size_aligned(size); - assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); - assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); - assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); - assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); - } - - // 4. test underaligned cuda::aligned_size_t + // test typed overloads { - cuda::aligned_size_t size_aligned(size); - assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); - assert(cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); - assert(cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); - assert(cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); + alignas(align) T a[n * 2]{}; + alignas(align) const T b[n * 2]{}; + + const auto a_missaligned = reinterpret_cast(reinterpret_cast(a) + alignof(T) / 2); + const auto b_missaligned = reinterpret_cast(reinterpret_cast(b) + alignof(T) / 2); + + // 1. test ordinary size type + { + assert(cuda::__memcpy_async_check_pre(a, b, size)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size)); + assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size)); + } + + // 2. test overaligned cuda::aligned_size_t + { + cuda::aligned_size_t size_aligned(size); + assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); + } + + // 3. test cuda::aligned_size_t aligned to alignof(T) + { + cuda::aligned_size_t size_aligned(size); + assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); + } + + // 4. test underaligned cuda::aligned_size_t + { + cuda::aligned_size_t size_aligned(size); + assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); + } + + // 5. test overlap + { + assert(!cuda::__memcpy_async_check_pre(a, a, size)); + assert(!cuda::__memcpy_async_check_pre(a, a_missaligned, size)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, a, size)); + assert(cuda::__memcpy_async_check_pre(a, a + n, size)); + assert(cuda::__memcpy_async_check_pre(a + n, a, size)); + assert(!cuda::__memcpy_async_check_pre(a, a + n - 1, size)); + assert(!cuda::__memcpy_async_check_pre(a + n - 1, a, size)); + } } - // 5. test overlap + // test void overloads { - assert(!cuda::__memcpy_async_check_pre(a, a, size)); - assert(!cuda::__memcpy_async_check_pre(a, a_missaligned, size)); - assert(!cuda::__memcpy_async_check_pre(a_missaligned, a, size)); - assert(cuda::__memcpy_async_check_pre(a, (const void*) (a_buff + n), size)); - assert(cuda::__memcpy_async_check_pre((void*) (a_buff + n), a, size)); - assert(!cuda::__memcpy_async_check_pre(a, (const void*) (a_buff + n - 1), size)); - assert(!cuda::__memcpy_async_check_pre((void*) (a_buff + n - 1), a, size)); + alignas(align) T a_buff[n * 2]{}; + alignas(align) const T b_buff[n * 2]{}; + + void* a = a_buff; + const void* b = b_buff; + + const auto a_missaligned = reinterpret_cast(reinterpret_cast(a) + alignof(T) / 2); + const auto b_missaligned = reinterpret_cast(reinterpret_cast(b) + alignof(T) / 2); + + // 1. test ordinary size type + { + assert(cuda::__memcpy_async_check_pre(a, b, size)); + assert(cuda::__memcpy_async_check_pre(a_missaligned, b, size)); + assert(cuda::__memcpy_async_check_pre(a, b_missaligned, size)); + assert(cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size)); + } + + // 2. test overaligned cuda::aligned_size_t + { + cuda::aligned_size_t size_aligned(size); + assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); + } + + // 3. test cuda::aligned_size_t aligned to alignof(T) + { + cuda::aligned_size_t size_aligned(size); + assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); + } + + // 4. test underaligned cuda::aligned_size_t + { + cuda::aligned_size_t size_aligned(size); + assert(cuda::__memcpy_async_check_pre(a, b, size_aligned)); + assert(cuda::__memcpy_async_check_pre(a_missaligned, b, size_aligned)); + assert(cuda::__memcpy_async_check_pre(a, b_missaligned, size_aligned)); + assert(cuda::__memcpy_async_check_pre(a_missaligned, b_missaligned, size_aligned)); + } + + // 5. test overlap + { + assert(!cuda::__memcpy_async_check_pre(a, a, size)); + assert(!cuda::__memcpy_async_check_pre(a, a_missaligned, size)); + assert(!cuda::__memcpy_async_check_pre(a_missaligned, a, size)); + assert(cuda::__memcpy_async_check_pre(a, (const void*) (a_buff + n), size)); + assert(cuda::__memcpy_async_check_pre((void*) (a_buff + n), a, size)); + assert(!cuda::__memcpy_async_check_pre(a, (const void*) (a_buff + n - 1), size)); + assert(!cuda::__memcpy_async_check_pre((void*) (a_buff + n - 1), a, size)); + } } } int main(int, char**) { - test_typed(); - test_void(); + test(); return 0; }