Skip to content

Check cuda::memcpy_async preconditions #4700

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
May 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions libcudacxx/include/cuda/__barrier/aligned_size.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# pragma system_header
#endif // no system header

#include <cuda/__cmath/pow2.h>
#include <cuda/std/cstddef>

#include <cuda/std/__cccl/prologue.h>
Expand All @@ -30,18 +31,29 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
template <_CUDA_VSTD::size_t _Alignment>
struct aligned_size_t
{
static_assert(::cuda::is_power_of_two(_Alignment), "alignment must be a power of two");

static constexpr _CUDA_VSTD::size_t align = _Alignment;
_CUDA_VSTD::size_t value;

_LIBCUDACXX_HIDE_FROM_ABI explicit constexpr aligned_size_t(size_t __s)
_LIBCUDACXX_HIDE_FROM_ABI explicit constexpr aligned_size_t(_CUDA_VSTD::size_t __s)
: value(__s)
{}
_LIBCUDACXX_HIDE_FROM_ABI constexpr operator size_t() const
{
_CCCL_ASSERT(value % align == 0,
"aligned_size_t must be constructed with a size that is a multiple of the alignment");
}
_LIBCUDACXX_HIDE_FROM_ABI constexpr operator _CUDA_VSTD::size_t() const
{
return value;
}
};

template <class, class = void>
inline constexpr _CUDA_VSTD::size_t __get_size_align_v = 1;

template <class _Tp>
inline constexpr _CUDA_VSTD::size_t __get_size_align_v<_Tp, _CUDA_VSTD::void_t<decltype(_Tp::align)>> = _Tp::align;

_LIBCUDACXX_END_NAMESPACE_CUDA

#include <cuda/std/__cccl/epilogue.h>
Expand Down
79 changes: 79 additions & 0 deletions libcudacxx/include/cuda/__memcpy_async/check_preconditions.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA___MEMCPY_ASYNC_CHECK_PRECONDITIONS_H
#define _CUDA___MEMCPY_ASYNC_CHECK_PRECONDITIONS_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/__barrier/aligned_size.h>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__cstddef/types.h>
#include <cuda/std/cstdint>

#include <cuda/std/__cccl/prologue.h>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

#ifndef _LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING
# define _LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT(_Cond, _Msg) _CCCL_ASSERT(_Cond, _Msg)
#else // ^^^ _LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING ^^^ / vvv !_LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING vvv
# define _LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT(_Cond, _Msg) \
do \
{ \
if (!(_Cond)) \
{ \
return false; \
} \
} while (false)
#endif // _LIBCUDACXX_MEMCPY_ASYNC_PRE_TESTING

// Check the memcpy_async preconditions, return value is intended for testing purposes exclusively
template <class _Tp, class _Size>
_LIBCUDACXX_HIDE_FROM_ABI bool __memcpy_async_check_pre(_Tp* __dst, const _Tp* __src, _Size __size)
{
constexpr auto __align = _CUDA_VSTD::max(alignof(_Tp), __get_size_align_v<_Size>);

const auto __dst_val = reinterpret_cast<uintptr_t>(__dst);
const auto __src_val = reinterpret_cast<uintptr_t>(__src);

// check src and dst alignment
_LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT(
__dst_val % __align == 0, "destination pointer must be aligned to the specified alignment");
_LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT(
__src_val % __align == 0, "source pointer must be aligned to the specified alignment");

// check src and dst overlap
_LIBCUDACXX_MEMCPY_ASYNC_PRE_ASSERT(
!((__dst_val <= __src_val && __src_val < __dst_val + __size)
|| (__src_val <= __dst_val && __dst_val < __src_val + __size)),
"destination and source buffers must not overlap");
return true;
}

template <class _Size>
_LIBCUDACXX_HIDE_FROM_ABI bool __memcpy_async_check_pre(void* __dst, const void* __src, _Size __size)
{
return ::cuda::__memcpy_async_check_pre(reinterpret_cast<char*>(__dst), reinterpret_cast<const char*>(__src), __size);
}

_LIBCUDACXX_END_NAMESPACE_CUDA

#include <cuda/std/__cccl/epilogue.h>

#endif // _CUDA___MEMCPY_ASYNC_CHECK_PRECONDITIONS_H
9 changes: 9 additions & 0 deletions libcudacxx/include/cuda/__memcpy_async/memcpy_async.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,12 @@
# include <cuda/__barrier/barrier.h>
# include <cuda/__barrier/barrier_block_scope.h>
# include <cuda/__barrier/barrier_thread_scope.h>
# include <cuda/__memcpy_async/check_preconditions.h>
# include <cuda/__memcpy_async/memcpy_async_barrier.h>
# include <cuda/std/__atomic/scopes.h>
# include <cuda/std/__type_traits/void_t.h>
# include <cuda/std/cstddef>
# include <cuda/std/cstdint>

# include <cuda/std/__cccl/prologue.h>

Expand Down Expand Up @@ -104,13 +106,16 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
aligned_size_t<_Alignment> __size,
barrier<_Sco, _CompF>& __barrier)
{
static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)");
_CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
return ::cuda::__memcpy_async_barrier(__group, __destination, __source, __size, __barrier);
}

template <class _Tp, typename _Size, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment
memcpy_async(_Tp* __destination, _Tp const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier)
{
_CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
return ::cuda::__memcpy_async_barrier(__single_thread_group{}, __destination, __source, __size, __barrier);
}

Expand All @@ -122,6 +127,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
_CUDA_VSTD::size_t __size,
barrier<_Sco, _CompF>& __barrier)
{
_CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
return ::cuda::__memcpy_async_barrier(__group, __destination, __source, __size, __barrier);
}

Expand All @@ -133,6 +139,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
_CUDA_VSTD::size_t __size,
barrier<_Sco, _CompF>& __barrier)
{
_CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
return ::cuda::__memcpy_async_barrier(
__group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __barrier);
}
Expand All @@ -145,6 +152,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
aligned_size_t<_Alignment> __size,
barrier<_Sco, _CompF>& __barrier)
{
_CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
return ::cuda::__memcpy_async_barrier(
__group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __barrier);
}
Expand All @@ -153,6 +161,7 @@ template <typename _Size, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment
memcpy_async(void* __destination, void const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier)
{
_CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
return ::cuda::__memcpy_async_barrier(
__single_thread_group{},
reinterpret_cast<char*>(__destination),
Expand Down
35 changes: 3 additions & 32 deletions libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
# pragma system_header
#endif // no system header

#include <cuda/__barrier/aligned_size.h>
#include <cuda/__barrier/barrier.h>
#include <cuda/__barrier/barrier_block_scope.h>
#include <cuda/__barrier/barrier_thread_scope.h>
Expand All @@ -30,6 +31,7 @@
#include <cuda/__memcpy_async/is_local_smem_barrier.h>
#include <cuda/__memcpy_async/memcpy_completion.h>
#include <cuda/__memcpy_async/try_get_barrier_handle.h>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__atomic/scopes.h>
#include <cuda/std/__type_traits/is_trivially_copyable.h>
#include <cuda/std/cstddef>
Expand All @@ -39,36 +41,6 @@

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

/***********************************************************************
* cuda::memcpy_async dispatch helper functions
*
* - __get_size_align struct to determine the alignment from a size type.
***********************************************************************/

// The __get_size_align struct provides a way to query the guaranteed
// "alignment" of a provided size. In this case, an n-byte aligned size means
// that the size is a multiple of n.
//
// Use as follows:
// static_assert(__get_size_align<size_t>::align == 1)
// static_assert(__get_size_align<aligned_size_t<n>>::align == n)

// Default impl: always returns 1.
template <typename, typename = void>
struct __get_size_align
{
static constexpr int align = 1;
};

// aligned_size_t<n> overload: return n.
template <typename T>
struct __get_size_align<T, _CUDA_VSTD::void_t<decltype(T::align)>>
{
static constexpr int align = T::align;
};

////////////////////////////////////////////////////////////////////////////////

struct __single_thread_group
{
_LIBCUDACXX_HIDE_FROM_ABI void sync() const {}
Expand Down Expand Up @@ -99,8 +71,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment __memcpy_async_barrier(
: _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group);

// Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t.
constexpr _CUDA_VSTD::size_t __size_align = __get_size_align<_Size>::align;
constexpr _CUDA_VSTD::size_t __align = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp);
constexpr auto __align = _CUDA_VSTD::max(alignof(_Tp), __get_size_align_v<_Size>);
// Cast to char pointers. We don't need the type for alignment anymore and
// erasing the types reduces the number of instantiations of down-stream
// functions.
Expand Down
4 changes: 4 additions & 0 deletions libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
# include <cuda/__barrier/async_contract_fulfillment.h>
# include <cuda/__barrier/barrier_block_scope.h>
# include <cuda/__barrier/barrier_native_handle.h>
# include <cuda/__memcpy_async/check_preconditions.h>
# include <cuda/__ptx/instructions/cp_async_bulk.h>
# include <cuda/__ptx/ptx_dot_variants.h>
# include <cuda/__ptx/ptx_helper_functions.h>
Expand Down Expand Up @@ -56,6 +57,9 @@ _CCCL_DEVICE inline async_contract_fulfillment memcpy_async_tx(
static_assert(_CUDA_VSTD::is_trivially_copyable<_Tp>::value, "memcpy_async_tx requires a trivially copyable type");
# endif
static_assert(16 <= _Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned.");
static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)");

_CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__dest, __src, __size), "memcpy_async_tx preconditions unmet");

_CCCL_ASSERT(::__isShared(_CUDA_DEVICE::barrier_native_handle(__b)),
"Barrier must be located in local shared memory.");
Expand Down
20 changes: 13 additions & 7 deletions libcudacxx/include/cuda/pipeline
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,15 @@
# pragma system_header
#endif // no system header

#include <cuda/__barrier/aligned_size.h>
#include <cuda/__memcpy_async/check_preconditions.h>
#include <cuda/__memcpy_async/completion_mechanism.h>
#include <cuda/__memcpy_async/memcpy_async_barrier.h>
#include <cuda/atomic>
#include <cuda/barrier>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/chrono>
#include <cuda/std/cstdint>

#include <cuda/std/__cccl/prologue.h>

Expand Down Expand Up @@ -499,8 +503,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment __memcpy_async_pipeline(
_CUDA_VSTD::uint32_t __allowed_completions = _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group);

// Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t.
constexpr _CUDA_VSTD::size_t __size_align = __get_size_align<_Size>::align;
constexpr _CUDA_VSTD::size_t __align = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp);
constexpr auto __align = _CUDA_VSTD::max(alignof(_Tp), __get_size_align_v<_Size>);
// Cast to char pointers. We don't need the type for alignment anymore and
// erasing the types reduces the number of instantiations of down-stream
// functions.
Expand All @@ -518,35 +521,36 @@ template <typename _Group, class _Type, thread_scope _Scope>
_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
_Group const& __group, _Type* __destination, _Type const* __source, std::size_t __size, pipeline<_Scope>& __pipeline)
{
_CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
return ::cuda::__memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline);
}

template <typename _Group,
class _Type,
std::size_t _Alignment,
thread_scope _Scope,
std::size_t _Larger_alignment = (alignof(_Type) > _Alignment) ? alignof(_Type) : _Alignment>
template <typename _Group, class _Type, std::size_t _Alignment, thread_scope _Scope>
_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
_Group const& __group,
_Type* __destination,
_Type const* __source,
aligned_size_t<_Alignment> __size,
pipeline<_Scope>& __pipeline)
{
static_assert(_Alignment >= alignof(_Type), "alignment must be at least the alignof(T)");
_CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
return ::cuda::__memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline);
}

template <class _Type, typename _Size, thread_scope _Scope>
_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment
memcpy_async(_Type* __destination, _Type const* __source, _Size __size, pipeline<_Scope>& __pipeline)
{
_CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
return ::cuda::__memcpy_async_pipeline(__single_thread_group{}, __destination, __source, __size, __pipeline);
}

template <typename _Group, thread_scope _Scope>
_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
_Group const& __group, void* __destination, void const* __source, std::size_t __size, pipeline<_Scope>& __pipeline)
{
_CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
return ::cuda::__memcpy_async_pipeline(
__group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __pipeline);
}
Expand All @@ -559,6 +563,7 @@ _LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment memcpy_async(
aligned_size_t<_Alignment> __size,
pipeline<_Scope>& __pipeline)
{
_CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
return ::cuda::__memcpy_async_pipeline(
__group, reinterpret_cast<char*>(__destination), reinterpret_cast<char const*>(__source), __size, __pipeline);
}
Expand All @@ -567,6 +572,7 @@ template <typename _Size, thread_scope _Scope>
_LIBCUDACXX_HIDE_FROM_ABI async_contract_fulfillment
memcpy_async(void* __destination, void const* __source, _Size __size, pipeline<_Scope>& __pipeline)
{
_CCCL_ASSERT(::cuda::__memcpy_async_check_pre(__destination, __source, __size), "memcpy_async preconditions unmet");
return ::cuda::__memcpy_async_pipeline(
__single_thread_group{},
reinterpret_cast<char*>(__destination),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
__host__ __device__ constexpr bool test()
{
using aligned_t = cuda::aligned_size_t<1>;
static_assert(!cuda::std::is_default_constructible<aligned_t>::value, "");
static_assert(aligned_t::align == 1, "");
static_assert(!cuda::std::is_default_constructible<aligned_t>::value);
static_assert(aligned_t::align == 1);
{
const aligned_t aligned{42};
assert(aligned.value == 42);
Expand All @@ -33,11 +33,11 @@ __host__ __device__ constexpr bool test()
}

// test C++11 differently
static_assert(cuda::aligned_size_t<42>{1337}.value == 1337, "");
static_assert(cuda::aligned_size_t<32>{1024}.value == 1024);

int main(int, char**)
{
test();
static_assert(test(), "");
static_assert(test());
return 0;
}
Loading
Loading