From bf70a4fc06a1d9789a6cb081e5543bc3bff70f32 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Mon, 21 Oct 2024 11:08:44 +0200 Subject: [PATCH] squeeze memory dispatch Co-authored-by: Marcel Koch --- .../cuda_hip/components/memory.nvidia.hpp.inc | 766 ++++++++---------- dev_tools/scripts/generate_cuda_memory_ptx.py | 54 +- 2 files changed, 380 insertions(+), 440 deletions(-) diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc b/common/cuda_hip/components/memory.nvidia.hpp.inc index 49c9ae7601c..a695904e82a 100644 --- a/common/cuda_hip/components/memory.nvidia.hpp.inc +++ b/common/cuda_hip/components/memory.nvidia.hpp.inc @@ -68,17 +68,15 @@ __device__ __forceinline__ void membar_acq_rel_local() __device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr) { int32 result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.s32 %0, [%1];" - : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) - : "memory"); + "ld.volatile.shared.s32 %0, [%1];" #else - asm volatile("ld.relaxed.cta.shared.s32 %0, [%1];" - : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) - : "memory"); + "ld.relaxed.cta.shared.s32 %0, [%1];" #endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); return result; } @@ -86,34 +84,30 @@ __device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr) __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result) { + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.s32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "r"(result) - : "memory"); + "st.volatile.shared.s32 [%0], %1;" #else - asm volatile("st.relaxed.cta.shared.s32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "r"(result) - : "memory"); + "st.relaxed.cta.shared.s32 [%0], %1;" #endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); } __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr) { int64 result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.s64 %0, [%1];" - : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) - : "memory"); + "ld.volatile.shared.s64 %0, [%1];" #else - asm volatile("ld.relaxed.cta.shared.s64 %0, [%1];" - : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) - : "memory"); + "ld.relaxed.cta.shared.s64 %0, [%1];" #endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); return result; } @@ -121,34 +115,30 @@ __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr) __device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result) { + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.s64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "l"(result) - : "memory"); + "st.volatile.shared.s64 [%0], %1;" #else - asm volatile("st.relaxed.cta.shared.s64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "l"(result) - : "memory"); + "st.relaxed.cta.shared.s64 [%0], %1;" #endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); } __device__ __forceinline__ float load_relaxed_shared(const float* ptr) { float result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.f32 %0, [%1];" - : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) - : "memory"); + "ld.volatile.shared.f32 %0, [%1];" #else - asm volatile("ld.relaxed.cta.shared.f32 %0, [%1];" - : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) - : "memory"); + "ld.relaxed.cta.shared.f32 %0, [%1];" #endif + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); return result; } @@ -156,36 +146,30 @@ __device__ __forceinline__ float load_relaxed_shared(const float* ptr) __device__ __forceinline__ void store_relaxed_shared(float* ptr, float result) { + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "f"(result) - : "memory"); + "st.volatile.shared.f32 [%0], %1;" #else - asm volatile("st.relaxed.cta.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "f"(result) - : "memory"); + "st.relaxed.cta.shared.f32 [%0], %1;" #endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "f"(result) + : "memory"); } __device__ __forceinline__ double load_relaxed_shared(const double* ptr) { double result; -#if __CUDA_ARCH__ < 700 asm volatile( +#if __CUDA_ARCH__ < 700 "ld.volatile.shared.f64 %0, [%1];" - : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) - : "memory"); #else - asm volatile( "ld.relaxed.cta.shared.f64 %0, [%1];" +#endif : "=d"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); -#endif return result; } @@ -193,34 +177,30 @@ __device__ __forceinline__ double load_relaxed_shared(const double* ptr) __device__ __forceinline__ void store_relaxed_shared(double* ptr, double result) { + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "d"(result) - : "memory"); + "st.volatile.shared.f64 [%0], %1;" #else - asm volatile("st.relaxed.cta.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "d"(result) - : "memory"); + "st.relaxed.cta.shared.f64 [%0], %1;" #endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "d"(result) + : "memory"); } __device__ __forceinline__ int32 load_acquire_shared(const int32* ptr) { int32 result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.s32 %0, [%1];" - : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) - : "memory"); + "ld.volatile.shared.s32 %0, [%1];" #else - asm volatile("ld.acquire.cta.shared.s32 %0, [%1];" - : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) - : "memory"); + "ld.acquire.cta.shared.s32 %0, [%1];" #endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); membar_acq_rel_shared(); return result; } @@ -229,34 +209,30 @@ __device__ __forceinline__ int32 load_acquire_shared(const int32* ptr) __device__ __forceinline__ void store_release_shared(int32* ptr, int32 result) { membar_acq_rel_shared(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.s32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "r"(result) - : "memory"); + "st.volatile.shared.s32 [%0], %1;" #else - asm volatile("st.release.cta.shared.s32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "r"(result) - : "memory"); + "st.release.cta.shared.s32 [%0], %1;" #endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); } __device__ __forceinline__ int64 load_acquire_shared(const int64* ptr) { int64 result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.s64 %0, [%1];" - : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) - : "memory"); + "ld.volatile.shared.s64 %0, [%1];" #else - asm volatile("ld.acquire.cta.shared.s64 %0, [%1];" - : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) - : "memory"); + "ld.acquire.cta.shared.s64 %0, [%1];" #endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); membar_acq_rel_shared(); return result; } @@ -265,34 +241,30 @@ __device__ __forceinline__ int64 load_acquire_shared(const int64* ptr) __device__ __forceinline__ void store_release_shared(int64* ptr, int64 result) { membar_acq_rel_shared(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.s64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "l"(result) - : "memory"); + "st.volatile.shared.s64 [%0], %1;" #else - asm volatile("st.release.cta.shared.s64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "l"(result) - : "memory"); + "st.release.cta.shared.s64 [%0], %1;" #endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); } __device__ __forceinline__ float load_acquire_shared(const float* ptr) { float result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.f32 %0, [%1];" - : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) - : "memory"); + "ld.volatile.shared.f32 %0, [%1];" #else - asm volatile("ld.acquire.cta.shared.f32 %0, [%1];" - : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) - : "memory"); + "ld.acquire.cta.shared.f32 %0, [%1];" #endif + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); membar_acq_rel_shared(); return result; } @@ -301,36 +273,30 @@ __device__ __forceinline__ float load_acquire_shared(const float* ptr) __device__ __forceinline__ void store_release_shared(float* ptr, float result) { membar_acq_rel_shared(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "f"(result) - : "memory"); + "st.volatile.shared.f32 [%0], %1;" #else - asm volatile("st.release.cta.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "f"(result) - : "memory"); + "st.release.cta.shared.f32 [%0], %1;" #endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "f"(result) + : "memory"); } __device__ __forceinline__ double load_acquire_shared(const double* ptr) { double result; -#if __CUDA_ARCH__ < 700 asm volatile( +#if __CUDA_ARCH__ < 700 "ld.volatile.shared.f64 %0, [%1];" - : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) - : "memory"); #else - asm volatile( "ld.acquire.cta.shared.f64 %0, [%1];" +#endif : "=d"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); -#endif membar_acq_rel_shared(); return result; } @@ -339,34 +305,30 @@ __device__ __forceinline__ double load_acquire_shared(const double* ptr) __device__ __forceinline__ void store_release_shared(double* ptr, double result) { membar_acq_rel_shared(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "d"(result) - : "memory"); + "st.volatile.shared.f64 [%0], %1;" #else - asm volatile("st.release.cta.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "d"(result) - : "memory"); + "st.release.cta.shared.f64 [%0], %1;" #endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "d"(result) + : "memory"); } __device__ __forceinline__ int32 load_relaxed_local(const int32* ptr) { int32 result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.s32 %0, [%1];" - : "=r"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.volatile.s32 %0, [%1];" #else - asm volatile("ld.relaxed.cta.s32 %0, [%1];" - : "=r"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.relaxed.cta.s32 %0, [%1];" #endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); return result; } @@ -374,30 +336,30 @@ __device__ __forceinline__ int32 load_relaxed_local(const int32* ptr) __device__ __forceinline__ void store_relaxed_local(int32* ptr, int32 result) { + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result) - : "memory"); + "st.volatile.s32 [%0], %1;" #else - asm volatile("st.relaxed.cta.s32 [%0], %1;" ::"l"(ptr), "r"(result) - : "memory"); + "st.relaxed.cta.s32 [%0], %1;" #endif + ::"l"(ptr), + "r"(result) + : "memory"); } __device__ __forceinline__ int64 load_relaxed_local(const int64* ptr) { int64 result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.s64 %0, [%1];" - : "=l"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.volatile.s64 %0, [%1];" #else - asm volatile("ld.relaxed.cta.s64 %0, [%1];" - : "=l"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.relaxed.cta.s64 %0, [%1];" #endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); return result; } @@ -405,30 +367,30 @@ __device__ __forceinline__ int64 load_relaxed_local(const int64* ptr) __device__ __forceinline__ void store_relaxed_local(int64* ptr, int64 result) { + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result) - : "memory"); + "st.volatile.s64 [%0], %1;" #else - asm volatile("st.relaxed.cta.s64 [%0], %1;" ::"l"(ptr), "l"(result) - : "memory"); + "st.relaxed.cta.s64 [%0], %1;" #endif + ::"l"(ptr), + "l"(result) + : "memory"); } __device__ __forceinline__ float load_relaxed_local(const float* ptr) { float result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.f32 %0, [%1];" - : "=f"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.volatile.f32 %0, [%1];" #else - asm volatile("ld.relaxed.cta.f32 %0, [%1];" - : "=f"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.relaxed.cta.f32 %0, [%1];" #endif + : "=f"(result) + : "l"(const_cast(ptr)) + : "memory"); return result; } @@ -436,30 +398,30 @@ __device__ __forceinline__ float load_relaxed_local(const float* ptr) __device__ __forceinline__ void store_relaxed_local(float* ptr, float result) { + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result) - : "memory"); + "st.volatile.f32 [%0], %1;" #else - asm volatile("st.relaxed.cta.f32 [%0], %1;" ::"l"(ptr), "f"(result) - : "memory"); + "st.relaxed.cta.f32 [%0], %1;" #endif + ::"l"(ptr), + "f"(result) + : "memory"); } __device__ __forceinline__ double load_relaxed_local(const double* ptr) { double result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.f64 %0, [%1];" - : "=d"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.volatile.f64 %0, [%1];" #else - asm volatile("ld.relaxed.cta.f64 %0, [%1];" - : "=d"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.relaxed.cta.f64 %0, [%1];" #endif + : "=d"(result) + : "l"(const_cast(ptr)) + : "memory"); return result; } @@ -467,30 +429,30 @@ __device__ __forceinline__ double load_relaxed_local(const double* ptr) __device__ __forceinline__ void store_relaxed_local(double* ptr, double result) { + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result) - : "memory"); + "st.volatile.f64 [%0], %1;" #else - asm volatile("st.relaxed.cta.f64 [%0], %1;" ::"l"(ptr), "d"(result) - : "memory"); + "st.relaxed.cta.f64 [%0], %1;" #endif + ::"l"(ptr), + "d"(result) + : "memory"); } __device__ __forceinline__ int32 load_acquire_local(const int32* ptr) { int32 result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.s32 %0, [%1];" - : "=r"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.volatile.s32 %0, [%1];" #else - asm volatile("ld.acquire.cta.s32 %0, [%1];" - : "=r"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.acquire.cta.s32 %0, [%1];" #endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); membar_acq_rel_local(); return result; } @@ -499,30 +461,30 @@ __device__ __forceinline__ int32 load_acquire_local(const int32* ptr) __device__ __forceinline__ void store_release_local(int32* ptr, int32 result) { membar_acq_rel_local(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result) - : "memory"); + "st.volatile.s32 [%0], %1;" #else - asm volatile("st.release.cta.s32 [%0], %1;" ::"l"(ptr), "r"(result) - : "memory"); + "st.release.cta.s32 [%0], %1;" #endif + ::"l"(ptr), + "r"(result) + : "memory"); } __device__ __forceinline__ int64 load_acquire_local(const int64* ptr) { int64 result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.s64 %0, [%1];" - : "=l"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.volatile.s64 %0, [%1];" #else - asm volatile("ld.acquire.cta.s64 %0, [%1];" - : "=l"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.acquire.cta.s64 %0, [%1];" #endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); membar_acq_rel_local(); return result; } @@ -531,30 +493,30 @@ __device__ __forceinline__ int64 load_acquire_local(const int64* ptr) __device__ __forceinline__ void store_release_local(int64* ptr, int64 result) { membar_acq_rel_local(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result) - : "memory"); + "st.volatile.s64 [%0], %1;" #else - asm volatile("st.release.cta.s64 [%0], %1;" ::"l"(ptr), "l"(result) - : "memory"); + "st.release.cta.s64 [%0], %1;" #endif + ::"l"(ptr), + "l"(result) + : "memory"); } __device__ __forceinline__ float load_acquire_local(const float* ptr) { float result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.f32 %0, [%1];" - : "=f"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.volatile.f32 %0, [%1];" #else - asm volatile("ld.acquire.cta.f32 %0, [%1];" - : "=f"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.acquire.cta.f32 %0, [%1];" #endif + : "=f"(result) + : "l"(const_cast(ptr)) + : "memory"); membar_acq_rel_local(); return result; } @@ -563,30 +525,30 @@ __device__ __forceinline__ float load_acquire_local(const float* ptr) __device__ __forceinline__ void store_release_local(float* ptr, float result) { membar_acq_rel_local(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result) - : "memory"); + "st.volatile.f32 [%0], %1;" #else - asm volatile("st.release.cta.f32 [%0], %1;" ::"l"(ptr), "f"(result) - : "memory"); + "st.release.cta.f32 [%0], %1;" #endif + ::"l"(ptr), + "f"(result) + : "memory"); } __device__ __forceinline__ double load_acquire_local(const double* ptr) { double result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.f64 %0, [%1];" - : "=d"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.volatile.f64 %0, [%1];" #else - asm volatile("ld.acquire.cta.f64 %0, [%1];" - : "=d"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.acquire.cta.f64 %0, [%1];" #endif + : "=d"(result) + : "l"(const_cast(ptr)) + : "memory"); membar_acq_rel_local(); return result; } @@ -595,30 +557,30 @@ __device__ __forceinline__ double load_acquire_local(const double* ptr) __device__ __forceinline__ void store_release_local(double* ptr, double result) { membar_acq_rel_local(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result) - : "memory"); + "st.volatile.f64 [%0], %1;" #else - asm volatile("st.release.cta.f64 [%0], %1;" ::"l"(ptr), "d"(result) - : "memory"); + "st.release.cta.f64 [%0], %1;" #endif + ::"l"(ptr), + "d"(result) + : "memory"); } __device__ __forceinline__ int32 load_relaxed(const int32* ptr) { int32 result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.s32 %0, [%1];" - : "=r"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.volatile.s32 %0, [%1];" #else - asm volatile("ld.relaxed.gpu.s32 %0, [%1];" - : "=r"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.relaxed.gpu.s32 %0, [%1];" #endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); return result; } @@ -626,30 +588,30 @@ __device__ __forceinline__ int32 load_relaxed(const int32* ptr) __device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) { + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result) - : "memory"); + "st.volatile.s32 [%0], %1;" #else - asm volatile("st.relaxed.gpu.s32 [%0], %1;" ::"l"(ptr), "r"(result) - : "memory"); + "st.relaxed.gpu.s32 [%0], %1;" #endif + ::"l"(ptr), + "r"(result) + : "memory"); } __device__ __forceinline__ int64 load_relaxed(const int64* ptr) { int64 result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.s64 %0, [%1];" - : "=l"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.volatile.s64 %0, [%1];" #else - asm volatile("ld.relaxed.gpu.s64 %0, [%1];" - : "=l"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.relaxed.gpu.s64 %0, [%1];" #endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); return result; } @@ -657,30 +619,30 @@ __device__ __forceinline__ int64 load_relaxed(const int64* ptr) __device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) { + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result) - : "memory"); + "st.volatile.s64 [%0], %1;" #else - asm volatile("st.relaxed.gpu.s64 [%0], %1;" ::"l"(ptr), "l"(result) - : "memory"); + "st.relaxed.gpu.s64 [%0], %1;" #endif + ::"l"(ptr), + "l"(result) + : "memory"); } __device__ __forceinline__ float load_relaxed(const float* ptr) { float result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.f32 %0, [%1];" - : "=f"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.volatile.f32 %0, [%1];" #else - asm volatile("ld.relaxed.gpu.f32 %0, [%1];" - : "=f"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.relaxed.gpu.f32 %0, [%1];" #endif + : "=f"(result) + : "l"(const_cast(ptr)) + : "memory"); return result; } @@ -688,30 +650,30 @@ __device__ __forceinline__ float load_relaxed(const float* ptr) __device__ __forceinline__ void store_relaxed(float* ptr, float result) { + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result) - : "memory"); + "st.volatile.f32 [%0], %1;" #else - asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result) - : "memory"); + "st.relaxed.gpu.f32 [%0], %1;" #endif + ::"l"(ptr), + "f"(result) + : "memory"); } __device__ __forceinline__ double load_relaxed(const double* ptr) { double result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.f64 %0, [%1];" - : "=d"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.volatile.f64 %0, [%1];" #else - asm volatile("ld.relaxed.gpu.f64 %0, [%1];" - : "=d"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.relaxed.gpu.f64 %0, [%1];" #endif + : "=d"(result) + : "l"(const_cast(ptr)) + : "memory"); return result; } @@ -719,30 +681,30 @@ __device__ __forceinline__ double load_relaxed(const double* ptr) __device__ __forceinline__ void store_relaxed(double* ptr, double result) { + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result) - : "memory"); + "st.volatile.f64 [%0], %1;" #else - asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result) - : "memory"); + "st.relaxed.gpu.f64 [%0], %1;" #endif + ::"l"(ptr), + "d"(result) + : "memory"); } __device__ __forceinline__ int32 load_acquire(const int32* ptr) { int32 result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.s32 %0, [%1];" - : "=r"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.volatile.s32 %0, [%1];" #else - asm volatile("ld.acquire.gpu.s32 %0, [%1];" - : "=r"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.acquire.gpu.s32 %0, [%1];" #endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); membar_acq_rel(); return result; } @@ -751,30 +713,30 @@ __device__ __forceinline__ int32 load_acquire(const int32* ptr) __device__ __forceinline__ void store_release(int32* ptr, int32 result) { membar_acq_rel(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result) - : "memory"); + "st.volatile.s32 [%0], %1;" #else - asm volatile("st.release.gpu.s32 [%0], %1;" ::"l"(ptr), "r"(result) - : "memory"); + "st.release.gpu.s32 [%0], %1;" #endif + ::"l"(ptr), + "r"(result) + : "memory"); } __device__ __forceinline__ int64 load_acquire(const int64* ptr) { int64 result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.s64 %0, [%1];" - : "=l"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.volatile.s64 %0, [%1];" #else - asm volatile("ld.acquire.gpu.s64 %0, [%1];" - : "=l"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.acquire.gpu.s64 %0, [%1];" #endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); membar_acq_rel(); return result; } @@ -783,30 +745,30 @@ __device__ __forceinline__ int64 load_acquire(const int64* ptr) __device__ __forceinline__ void store_release(int64* ptr, int64 result) { membar_acq_rel(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result) - : "memory"); + "st.volatile.s64 [%0], %1;" #else - asm volatile("st.release.gpu.s64 [%0], %1;" ::"l"(ptr), "l"(result) - : "memory"); + "st.release.gpu.s64 [%0], %1;" #endif + ::"l"(ptr), + "l"(result) + : "memory"); } __device__ __forceinline__ float load_acquire(const float* ptr) { float result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.f32 %0, [%1];" - : "=f"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.volatile.f32 %0, [%1];" #else - asm volatile("ld.acquire.gpu.f32 %0, [%1];" - : "=f"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.acquire.gpu.f32 %0, [%1];" #endif + : "=f"(result) + : "l"(const_cast(ptr)) + : "memory"); membar_acq_rel(); return result; } @@ -815,30 +777,30 @@ __device__ __forceinline__ float load_acquire(const float* ptr) __device__ __forceinline__ void store_release(float* ptr, float result) { membar_acq_rel(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result) - : "memory"); + "st.volatile.f32 [%0], %1;" #else - asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result) - : "memory"); + "st.release.gpu.f32 [%0], %1;" #endif + ::"l"(ptr), + "f"(result) + : "memory"); } __device__ __forceinline__ double load_acquire(const double* ptr) { double result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.f64 %0, [%1];" - : "=d"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.volatile.f64 %0, [%1];" #else - asm volatile("ld.acquire.gpu.f64 %0, [%1];" - : "=d"(result) - : "l"(const_cast(ptr)) - : "memory"); + "ld.acquire.gpu.f64 %0, [%1];" #endif + : "=d"(result) + : "l"(const_cast(ptr)) + : "memory"); membar_acq_rel(); return result; } @@ -847,13 +809,15 @@ __device__ __forceinline__ double load_acquire(const double* ptr) __device__ __forceinline__ void store_release(double* ptr, double result) { membar_acq_rel(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result) - : "memory"); + "st.volatile.f64 [%0], %1;" #else - asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result) - : "memory"); + "st.release.gpu.f64 [%0], %1;" #endif + ::"l"(ptr), + "d"(result) + : "memory"); } @@ -862,19 +826,16 @@ __device__ __forceinline__ thrust::complex load_relaxed_shared( { float real_result; float imag_result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.v2.f32 {%0, %1}, [%2];" - : "=f"(real_result), "=f"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr( - const_cast*>(ptr))) - : "memory"); -#else - asm volatile("ld.relaxed.cta.shared.v2.f32 {%0, %1}, [%2];" - : "=f"(real_result), "=f"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr( - const_cast*>(ptr))) - : "memory"); + "ld.volatile.shared.v2.f32 {%0, %1}, [%2];" +#else + "ld.relaxed.cta.shared.v2.f32 {%0, %1}, [%2];" #endif + : "=f"(real_result), "=f"(imag_result) + : "r"(convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))) + : "memory"); return thrust::complex{real_result, imag_result}; } @@ -884,17 +845,15 @@ __device__ __forceinline__ void store_relaxed_shared( { auto real_result = result.real(); auto imag_result = result.imag(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.v2.f32 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "f"(real_result), "f"(imag_result) - : "memory"); + "st.volatile.shared.v2.f32 [%0], {%1, %2};" #else - asm volatile("st.relaxed.cta.shared.v2.f32 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "f"(real_result), "f"(imag_result) - : "memory"); + "st.relaxed.cta.shared.v2.f32 [%0], {%1, %2};" #endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "f"(real_result), "f"(imag_result) + : "memory"); } @@ -903,19 +862,16 @@ __device__ __forceinline__ thrust::complex load_relaxed_shared( { double real_result; double imag_result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.v2.f64 {%0, %1}, [%2];" - : "=d"(real_result), "=d"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr( - const_cast*>(ptr))) - : "memory"); -#else - asm volatile("ld.relaxed.cta.shared.v2.f64 {%0, %1}, [%2];" - : "=d"(real_result), "=d"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr( - const_cast*>(ptr))) - : "memory"); + "ld.volatile.shared.v2.f64 {%0, %1}, [%2];" +#else + "ld.relaxed.cta.shared.v2.f64 {%0, %1}, [%2];" #endif + : "=d"(real_result), "=d"(imag_result) + : "r"(convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))) + : "memory"); return thrust::complex{real_result, imag_result}; } @@ -925,17 +881,15 @@ __device__ __forceinline__ void store_relaxed_shared( { auto real_result = result.real(); auto imag_result = result.imag(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.v2.f64 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "d"(real_result), "d"(imag_result) - : "memory"); + "st.volatile.shared.v2.f64 [%0], {%1, %2};" #else - asm volatile("st.relaxed.cta.shared.v2.f64 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), - "d"(real_result), "d"(imag_result) - : "memory"); + "st.relaxed.cta.shared.v2.f64 [%0], {%1, %2};" #endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "d"(real_result), "d"(imag_result) + : "memory"); } @@ -944,17 +898,15 @@ __device__ __forceinline__ thrust::complex load_relaxed_local( { float real_result; float imag_result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.v2.f32 {%0, %1}, [%2];" - : "=f"(real_result), "=f"(imag_result) - : "l"(const_cast*>(ptr)) - : "memory"); + "ld.volatile.v2.f32 {%0, %1}, [%2];" #else - asm volatile("ld.relaxed.cta.v2.f32 {%0, %1}, [%2];" - : "=f"(real_result), "=f"(imag_result) - : "l"(const_cast*>(ptr)) - : "memory"); + "ld.relaxed.cta.v2.f32 {%0, %1}, [%2];" #endif + : "=f"(real_result), "=f"(imag_result) + : "l"(const_cast*>(ptr)) + : "memory"); return thrust::complex{real_result, imag_result}; } @@ -964,15 +916,15 @@ __device__ __forceinline__ void store_relaxed_local( { auto real_result = result.real(); auto imag_result = result.imag(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(ptr), - "f"(real_result), "f"(imag_result) - : "memory"); + "st.volatile.v2.f32 [%0], {%1, %2};" #else - asm volatile("st.relaxed.cta.v2.f32 [%0], {%1, %2};" ::"l"(ptr), - "f"(real_result), "f"(imag_result) - : "memory"); + "st.relaxed.cta.v2.f32 [%0], {%1, %2};" #endif + ::"l"(ptr), + "f"(real_result), "f"(imag_result) + : "memory"); } @@ -981,17 +933,15 @@ __device__ __forceinline__ thrust::complex load_relaxed_local( { double real_result; double imag_result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.v2.f64 {%0, %1}, [%2];" - : "=d"(real_result), "=d"(imag_result) - : "l"(const_cast*>(ptr)) - : "memory"); + "ld.volatile.v2.f64 {%0, %1}, [%2];" #else - asm volatile("ld.relaxed.cta.v2.f64 {%0, %1}, [%2];" - : "=d"(real_result), "=d"(imag_result) - : "l"(const_cast*>(ptr)) - : "memory"); + "ld.relaxed.cta.v2.f64 {%0, %1}, [%2];" #endif + : "=d"(real_result), "=d"(imag_result) + : "l"(const_cast*>(ptr)) + : "memory"); return thrust::complex{real_result, imag_result}; } @@ -1001,15 +951,15 @@ __device__ __forceinline__ void store_relaxed_local( { auto real_result = result.real(); auto imag_result = result.imag(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(ptr), - "d"(real_result), "d"(imag_result) - : "memory"); + "st.volatile.v2.f64 [%0], {%1, %2};" #else - asm volatile("st.relaxed.cta.v2.f64 [%0], {%1, %2};" ::"l"(ptr), - "d"(real_result), "d"(imag_result) - : "memory"); + "st.relaxed.cta.v2.f64 [%0], {%1, %2};" #endif + ::"l"(ptr), + "d"(real_result), "d"(imag_result) + : "memory"); } @@ -1018,17 +968,15 @@ __device__ __forceinline__ thrust::complex load_relaxed( { float real_result; float imag_result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.v2.f32 {%0, %1}, [%2];" - : "=f"(real_result), "=f"(imag_result) - : "l"(const_cast*>(ptr)) - : "memory"); + "ld.volatile.v2.f32 {%0, %1}, [%2];" #else - asm volatile("ld.relaxed.gpu.v2.f32 {%0, %1}, [%2];" - : "=f"(real_result), "=f"(imag_result) - : "l"(const_cast*>(ptr)) - : "memory"); + "ld.relaxed.gpu.v2.f32 {%0, %1}, [%2];" #endif + : "=f"(real_result), "=f"(imag_result) + : "l"(const_cast*>(ptr)) + : "memory"); return thrust::complex{real_result, imag_result}; } @@ -1038,15 +986,15 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, { auto real_result = result.real(); auto imag_result = result.imag(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(ptr), - "f"(real_result), "f"(imag_result) - : "memory"); + "st.volatile.v2.f32 [%0], {%1, %2};" #else - asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"(ptr), - "f"(real_result), "f"(imag_result) - : "memory"); + "st.relaxed.gpu.v2.f32 [%0], {%1, %2};" #endif + ::"l"(ptr), + "f"(real_result), "f"(imag_result) + : "memory"); } @@ -1055,17 +1003,15 @@ __device__ __forceinline__ thrust::complex load_relaxed( { double real_result; double imag_result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.v2.f64 {%0, %1}, [%2];" - : "=d"(real_result), "=d"(imag_result) - : "l"(const_cast*>(ptr)) - : "memory"); + "ld.volatile.v2.f64 {%0, %1}, [%2];" #else - asm volatile("ld.relaxed.gpu.v2.f64 {%0, %1}, [%2];" - : "=d"(real_result), "=d"(imag_result) - : "l"(const_cast*>(ptr)) - : "memory"); + "ld.relaxed.gpu.v2.f64 {%0, %1}, [%2];" #endif + : "=d"(real_result), "=d"(imag_result) + : "l"(const_cast*>(ptr)) + : "memory"); return thrust::complex{real_result, imag_result}; } @@ -1075,13 +1021,13 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, { auto real_result = result.real(); auto imag_result = result.imag(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(ptr), - "d"(real_result), "d"(imag_result) - : "memory"); + "st.volatile.v2.f64 [%0], {%1, %2};" #else - asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"(ptr), - "d"(real_result), "d"(imag_result) - : "memory"); + "st.relaxed.gpu.v2.f64 [%0], {%1, %2};" #endif + ::"l"(ptr), + "d"(real_result), "d"(imag_result) + : "memory"); } diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index 9dec14d2394..49f99d4d96f 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -67,7 +67,7 @@ class type_desc: // for reasoning behind this implementation #if (!defined(__clang__) && __CUDACC_VER_MAJOR__ >= 11) return static_cast(__cvta_generic_to_shared(ptr)); -#elif (!defined(__clang__) && CUDACC_VER_MAJOR__ == 10 && \ +#elif (!defined(__clang__) && CUDACC_VER_MAJOR__ == 10 && \\ __CUDACC_VER_MINOR__ >= 2) return __nvvm_get_smem_pointer(ptr); #else @@ -123,17 +123,15 @@ class type_desc: __device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr) {{ {t.name} result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];" - : "={t.val_constraint}"(result) - : "{s.ptr_constraint}"({const_ptr_expr}) - : "memory"); + "ld.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];" #else - asm volatile("ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];" - : "={t.val_constraint}"(result) - : "{s.ptr_constraint}"({const_ptr_expr}) - : "memory"); + "ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];" #endif + : "={t.val_constraint}"(result) + : "{s.ptr_constraint}"({const_ptr_expr}) + : "memory"); {membar_expression} return result; }} @@ -142,15 +140,14 @@ class type_desc: __device__ __forceinline__ void store{o.fn_store_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} result) {{ {membar_expression} + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;" - :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result) - : "memory"); + "st.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;" #else - asm volatile("st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;" - :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result) - : "memory"); + "st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;" #endif + :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result) + : "memory"); }} """) @@ -167,17 +164,15 @@ class type_desc: {{ {t.name} real_result; {t.name} imag_result; + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];" - : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) - : "{s.ptr_constraint}"({const_ptr_expr}) - : "memory"); + "ld.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];" #else - asm volatile("ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];" - : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) - : "{s.ptr_constraint}"({const_ptr_expr}) - : "memory"); -#endif + "ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];" +#endif + : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) + : "{s.ptr_constraint}"({const_ptr_expr}) + : "memory"); return thrust::complex<{t.name}>{{real_result, imag_result}}; }} @@ -186,14 +181,13 @@ class type_desc: {{ auto real_result = result.real(); auto imag_result = result.imag(); + asm volatile( #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};" - :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) - : "memory"); + "st.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};" #else - asm volatile("st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};" - :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) - : "memory"); + "st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};" #endif + :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) + : "memory"); }} """)