From bf70a4fc06a1d9789a6cb081e5543bc3bff70f32 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 21 Oct 2024 11:08:44 +0200
Subject: [PATCH] squeeze memory dispatch

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 .../cuda_hip/components/memory.nvidia.hpp.inc | 766 ++++++++----------
 dev_tools/scripts/generate_cuda_memory_ptx.py |  54 +-
 2 files changed, 380 insertions(+), 440 deletions(-)

diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc b/common/cuda_hip/components/memory.nvidia.hpp.inc
index 49c9ae7601c..a695904e82a 100644
--- a/common/cuda_hip/components/memory.nvidia.hpp.inc
+++ b/common/cuda_hip/components/memory.nvidia.hpp.inc
@@ -68,17 +68,15 @@ __device__ __forceinline__ void membar_acq_rel_local()
 __device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr)
 {
     int32 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.shared.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
-                 : "memory");
+        "ld.volatile.shared.s32 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.cta.shared.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
-                 : "memory");
+        "ld.relaxed.cta.shared.s32 %0, [%1];"
 #endif
+        : "=r"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
+        : "memory");
 
     return result;
 }
@@ -86,34 +84,30 @@ __device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr)
 
 __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.s32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "r"(result)
-                 : "memory");
+        "st.volatile.shared.s32 [%0], %1;"
 #else
-    asm volatile("st.relaxed.cta.shared.s32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "r"(result)
-                 : "memory");
+        "st.relaxed.cta.shared.s32 [%0], %1;"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "r"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr)
 {
     int64 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.shared.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
-                 : "memory");
+        "ld.volatile.shared.s64 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.cta.shared.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
-                 : "memory");
+        "ld.relaxed.cta.shared.s64 %0, [%1];"
 #endif
+        : "=l"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
+        : "memory");
 
     return result;
 }
@@ -121,34 +115,30 @@ __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr)
 
 __device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.s64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "l"(result)
-                 : "memory");
+        "st.volatile.shared.s64 [%0], %1;"
 #else
-    asm volatile("st.relaxed.cta.shared.s64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "l"(result)
-                 : "memory");
+        "st.relaxed.cta.shared.s64 [%0], %1;"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "l"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ float load_relaxed_shared(const float* ptr)
 {
     float result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.shared.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr)))
-                 : "memory");
+        "ld.volatile.shared.f32 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.cta.shared.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr)))
-                 : "memory");
+        "ld.relaxed.cta.shared.f32 %0, [%1];"
 #endif
+        : "=f"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr)))
+        : "memory");
 
     return result;
 }
@@ -156,36 +146,30 @@ __device__ __forceinline__ float load_relaxed_shared(const float* ptr)
 
 __device__ __forceinline__ void store_relaxed_shared(float* ptr, float result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "f"(result)
-                 : "memory");
+        "st.volatile.shared.f32 [%0], %1;"
 #else
-    asm volatile("st.relaxed.cta.shared.f32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "f"(result)
-                 : "memory");
+        "st.relaxed.cta.shared.f32 [%0], %1;"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "f"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ double load_relaxed_shared(const double* ptr)
 {
     double result;
-#if __CUDA_ARCH__ < 700
     asm volatile(
+#if __CUDA_ARCH__ < 700
         "ld.volatile.shared.f64 %0, [%1];"
-        : "=d"(result)
-        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr)))
-        : "memory");
 #else
-    asm volatile(
         "ld.relaxed.cta.shared.f64 %0, [%1];"
+#endif
         : "=d"(result)
         : "r"(convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr)))
         : "memory");
-#endif
 
     return result;
 }
@@ -193,34 +177,30 @@ __device__ __forceinline__ double load_relaxed_shared(const double* ptr)
 
 __device__ __forceinline__ void store_relaxed_shared(double* ptr, double result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "d"(result)
-                 : "memory");
+        "st.volatile.shared.f64 [%0], %1;"
 #else
-    asm volatile("st.relaxed.cta.shared.f64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "d"(result)
-                 : "memory");
+        "st.relaxed.cta.shared.f64 [%0], %1;"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "d"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int32 load_acquire_shared(const int32* ptr)
 {
     int32 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.shared.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
-                 : "memory");
+        "ld.volatile.shared.s32 %0, [%1];"
 #else
-    asm volatile("ld.acquire.cta.shared.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
-                 : "memory");
+        "ld.acquire.cta.shared.s32 %0, [%1];"
 #endif
+        : "=r"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
+        : "memory");
     membar_acq_rel_shared();
     return result;
 }
@@ -229,34 +209,30 @@ __device__ __forceinline__ int32 load_acquire_shared(const int32* ptr)
 __device__ __forceinline__ void store_release_shared(int32* ptr, int32 result)
 {
     membar_acq_rel_shared();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.s32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "r"(result)
-                 : "memory");
+        "st.volatile.shared.s32 [%0], %1;"
 #else
-    asm volatile("st.release.cta.shared.s32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "r"(result)
-                 : "memory");
+        "st.release.cta.shared.s32 [%0], %1;"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "r"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int64 load_acquire_shared(const int64* ptr)
 {
     int64 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.shared.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
-                 : "memory");
+        "ld.volatile.shared.s64 %0, [%1];"
 #else
-    asm volatile("ld.acquire.cta.shared.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
-                 : "memory");
+        "ld.acquire.cta.shared.s64 %0, [%1];"
 #endif
+        : "=l"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
+        : "memory");
     membar_acq_rel_shared();
     return result;
 }
@@ -265,34 +241,30 @@ __device__ __forceinline__ int64 load_acquire_shared(const int64* ptr)
 __device__ __forceinline__ void store_release_shared(int64* ptr, int64 result)
 {
     membar_acq_rel_shared();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.s64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "l"(result)
-                 : "memory");
+        "st.volatile.shared.s64 [%0], %1;"
 #else
-    asm volatile("st.release.cta.shared.s64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "l"(result)
-                 : "memory");
+        "st.release.cta.shared.s64 [%0], %1;"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "l"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ float load_acquire_shared(const float* ptr)
 {
     float result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.shared.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr)))
-                 : "memory");
+        "ld.volatile.shared.f32 %0, [%1];"
 #else
-    asm volatile("ld.acquire.cta.shared.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr)))
-                 : "memory");
+        "ld.acquire.cta.shared.f32 %0, [%1];"
 #endif
+        : "=f"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr)))
+        : "memory");
     membar_acq_rel_shared();
     return result;
 }
@@ -301,36 +273,30 @@ __device__ __forceinline__ float load_acquire_shared(const float* ptr)
 __device__ __forceinline__ void store_release_shared(float* ptr, float result)
 {
     membar_acq_rel_shared();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "f"(result)
-                 : "memory");
+        "st.volatile.shared.f32 [%0], %1;"
 #else
-    asm volatile("st.release.cta.shared.f32 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "f"(result)
-                 : "memory");
+        "st.release.cta.shared.f32 [%0], %1;"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "f"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ double load_acquire_shared(const double* ptr)
 {
     double result;
-#if __CUDA_ARCH__ < 700
     asm volatile(
+#if __CUDA_ARCH__ < 700
         "ld.volatile.shared.f64 %0, [%1];"
-        : "=d"(result)
-        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr)))
-        : "memory");
 #else
-    asm volatile(
         "ld.acquire.cta.shared.f64 %0, [%1];"
+#endif
         : "=d"(result)
         : "r"(convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr)))
         : "memory");
-#endif
     membar_acq_rel_shared();
     return result;
 }
@@ -339,34 +305,30 @@ __device__ __forceinline__ double load_acquire_shared(const double* ptr)
 __device__ __forceinline__ void store_release_shared(double* ptr, double result)
 {
     membar_acq_rel_shared();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "d"(result)
-                 : "memory");
+        "st.volatile.shared.f64 [%0], %1;"
 #else
-    asm volatile("st.release.cta.shared.f64 [%0], %1;" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "d"(result)
-                 : "memory");
+        "st.release.cta.shared.f64 [%0], %1;"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "d"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int32 load_relaxed_local(const int32* ptr)
 {
     int32 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "l"(const_cast<int32*>(ptr))
-                 : "memory");
+        "ld.volatile.s32 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.cta.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "l"(const_cast<int32*>(ptr))
-                 : "memory");
+        "ld.relaxed.cta.s32 %0, [%1];"
 #endif
+        : "=r"(result)
+        : "l"(const_cast<int32*>(ptr))
+        : "memory");
 
     return result;
 }
@@ -374,30 +336,30 @@ __device__ __forceinline__ int32 load_relaxed_local(const int32* ptr)
 
 __device__ __forceinline__ void store_relaxed_local(int32* ptr, int32 result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result)
-                 : "memory");
+        "st.volatile.s32 [%0], %1;"
 #else
-    asm volatile("st.relaxed.cta.s32 [%0], %1;" ::"l"(ptr), "r"(result)
-                 : "memory");
+        "st.relaxed.cta.s32 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "r"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int64 load_relaxed_local(const int64* ptr)
 {
     int64 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "l"(const_cast<int64*>(ptr))
-                 : "memory");
+        "ld.volatile.s64 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.cta.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "l"(const_cast<int64*>(ptr))
-                 : "memory");
+        "ld.relaxed.cta.s64 %0, [%1];"
 #endif
+        : "=l"(result)
+        : "l"(const_cast<int64*>(ptr))
+        : "memory");
 
     return result;
 }
@@ -405,30 +367,30 @@ __device__ __forceinline__ int64 load_relaxed_local(const int64* ptr)
 
 __device__ __forceinline__ void store_relaxed_local(int64* ptr, int64 result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result)
-                 : "memory");
+        "st.volatile.s64 [%0], %1;"
 #else
-    asm volatile("st.relaxed.cta.s64 [%0], %1;" ::"l"(ptr), "l"(result)
-                 : "memory");
+        "st.relaxed.cta.s64 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "l"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ float load_relaxed_local(const float* ptr)
 {
     float result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "l"(const_cast<float*>(ptr))
-                 : "memory");
+        "ld.volatile.f32 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.cta.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "l"(const_cast<float*>(ptr))
-                 : "memory");
+        "ld.relaxed.cta.f32 %0, [%1];"
 #endif
+        : "=f"(result)
+        : "l"(const_cast<float*>(ptr))
+        : "memory");
 
     return result;
 }
@@ -436,30 +398,30 @@ __device__ __forceinline__ float load_relaxed_local(const float* ptr)
 
 __device__ __forceinline__ void store_relaxed_local(float* ptr, float result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result)
-                 : "memory");
+        "st.volatile.f32 [%0], %1;"
 #else
-    asm volatile("st.relaxed.cta.f32 [%0], %1;" ::"l"(ptr), "f"(result)
-                 : "memory");
+        "st.relaxed.cta.f32 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "f"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ double load_relaxed_local(const double* ptr)
 {
     double result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.f64 %0, [%1];"
-                 : "=d"(result)
-                 : "l"(const_cast<double*>(ptr))
-                 : "memory");
+        "ld.volatile.f64 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.cta.f64 %0, [%1];"
-                 : "=d"(result)
-                 : "l"(const_cast<double*>(ptr))
-                 : "memory");
+        "ld.relaxed.cta.f64 %0, [%1];"
 #endif
+        : "=d"(result)
+        : "l"(const_cast<double*>(ptr))
+        : "memory");
 
     return result;
 }
@@ -467,30 +429,30 @@ __device__ __forceinline__ double load_relaxed_local(const double* ptr)
 
 __device__ __forceinline__ void store_relaxed_local(double* ptr, double result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result)
-                 : "memory");
+        "st.volatile.f64 [%0], %1;"
 #else
-    asm volatile("st.relaxed.cta.f64 [%0], %1;" ::"l"(ptr), "d"(result)
-                 : "memory");
+        "st.relaxed.cta.f64 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "d"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int32 load_acquire_local(const int32* ptr)
 {
     int32 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "l"(const_cast<int32*>(ptr))
-                 : "memory");
+        "ld.volatile.s32 %0, [%1];"
 #else
-    asm volatile("ld.acquire.cta.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "l"(const_cast<int32*>(ptr))
-                 : "memory");
+        "ld.acquire.cta.s32 %0, [%1];"
 #endif
+        : "=r"(result)
+        : "l"(const_cast<int32*>(ptr))
+        : "memory");
     membar_acq_rel_local();
     return result;
 }
@@ -499,30 +461,30 @@ __device__ __forceinline__ int32 load_acquire_local(const int32* ptr)
 __device__ __forceinline__ void store_release_local(int32* ptr, int32 result)
 {
     membar_acq_rel_local();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result)
-                 : "memory");
+        "st.volatile.s32 [%0], %1;"
 #else
-    asm volatile("st.release.cta.s32 [%0], %1;" ::"l"(ptr), "r"(result)
-                 : "memory");
+        "st.release.cta.s32 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "r"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int64 load_acquire_local(const int64* ptr)
 {
     int64 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "l"(const_cast<int64*>(ptr))
-                 : "memory");
+        "ld.volatile.s64 %0, [%1];"
 #else
-    asm volatile("ld.acquire.cta.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "l"(const_cast<int64*>(ptr))
-                 : "memory");
+        "ld.acquire.cta.s64 %0, [%1];"
 #endif
+        : "=l"(result)
+        : "l"(const_cast<int64*>(ptr))
+        : "memory");
     membar_acq_rel_local();
     return result;
 }
@@ -531,30 +493,30 @@ __device__ __forceinline__ int64 load_acquire_local(const int64* ptr)
 __device__ __forceinline__ void store_release_local(int64* ptr, int64 result)
 {
     membar_acq_rel_local();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result)
-                 : "memory");
+        "st.volatile.s64 [%0], %1;"
 #else
-    asm volatile("st.release.cta.s64 [%0], %1;" ::"l"(ptr), "l"(result)
-                 : "memory");
+        "st.release.cta.s64 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "l"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ float load_acquire_local(const float* ptr)
 {
     float result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "l"(const_cast<float*>(ptr))
-                 : "memory");
+        "ld.volatile.f32 %0, [%1];"
 #else
-    asm volatile("ld.acquire.cta.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "l"(const_cast<float*>(ptr))
-                 : "memory");
+        "ld.acquire.cta.f32 %0, [%1];"
 #endif
+        : "=f"(result)
+        : "l"(const_cast<float*>(ptr))
+        : "memory");
     membar_acq_rel_local();
     return result;
 }
@@ -563,30 +525,30 @@ __device__ __forceinline__ float load_acquire_local(const float* ptr)
 __device__ __forceinline__ void store_release_local(float* ptr, float result)
 {
     membar_acq_rel_local();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result)
-                 : "memory");
+        "st.volatile.f32 [%0], %1;"
 #else
-    asm volatile("st.release.cta.f32 [%0], %1;" ::"l"(ptr), "f"(result)
-                 : "memory");
+        "st.release.cta.f32 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "f"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ double load_acquire_local(const double* ptr)
 {
     double result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.f64 %0, [%1];"
-                 : "=d"(result)
-                 : "l"(const_cast<double*>(ptr))
-                 : "memory");
+        "ld.volatile.f64 %0, [%1];"
 #else
-    asm volatile("ld.acquire.cta.f64 %0, [%1];"
-                 : "=d"(result)
-                 : "l"(const_cast<double*>(ptr))
-                 : "memory");
+        "ld.acquire.cta.f64 %0, [%1];"
 #endif
+        : "=d"(result)
+        : "l"(const_cast<double*>(ptr))
+        : "memory");
     membar_acq_rel_local();
     return result;
 }
@@ -595,30 +557,30 @@ __device__ __forceinline__ double load_acquire_local(const double* ptr)
 __device__ __forceinline__ void store_release_local(double* ptr, double result)
 {
     membar_acq_rel_local();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result)
-                 : "memory");
+        "st.volatile.f64 [%0], %1;"
 #else
-    asm volatile("st.release.cta.f64 [%0], %1;" ::"l"(ptr), "d"(result)
-                 : "memory");
+        "st.release.cta.f64 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "d"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int32 load_relaxed(const int32* ptr)
 {
     int32 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "l"(const_cast<int32*>(ptr))
-                 : "memory");
+        "ld.volatile.s32 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.gpu.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "l"(const_cast<int32*>(ptr))
-                 : "memory");
+        "ld.relaxed.gpu.s32 %0, [%1];"
 #endif
+        : "=r"(result)
+        : "l"(const_cast<int32*>(ptr))
+        : "memory");
 
     return result;
 }
@@ -626,30 +588,30 @@ __device__ __forceinline__ int32 load_relaxed(const int32* ptr)
 
 __device__ __forceinline__ void store_relaxed(int32* ptr, int32 result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result)
-                 : "memory");
+        "st.volatile.s32 [%0], %1;"
 #else
-    asm volatile("st.relaxed.gpu.s32 [%0], %1;" ::"l"(ptr), "r"(result)
-                 : "memory");
+        "st.relaxed.gpu.s32 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "r"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int64 load_relaxed(const int64* ptr)
 {
     int64 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "l"(const_cast<int64*>(ptr))
-                 : "memory");
+        "ld.volatile.s64 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.gpu.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "l"(const_cast<int64*>(ptr))
-                 : "memory");
+        "ld.relaxed.gpu.s64 %0, [%1];"
 #endif
+        : "=l"(result)
+        : "l"(const_cast<int64*>(ptr))
+        : "memory");
 
     return result;
 }
@@ -657,30 +619,30 @@ __device__ __forceinline__ int64 load_relaxed(const int64* ptr)
 
 __device__ __forceinline__ void store_relaxed(int64* ptr, int64 result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result)
-                 : "memory");
+        "st.volatile.s64 [%0], %1;"
 #else
-    asm volatile("st.relaxed.gpu.s64 [%0], %1;" ::"l"(ptr), "l"(result)
-                 : "memory");
+        "st.relaxed.gpu.s64 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "l"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ float load_relaxed(const float* ptr)
 {
     float result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "l"(const_cast<float*>(ptr))
-                 : "memory");
+        "ld.volatile.f32 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.gpu.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "l"(const_cast<float*>(ptr))
-                 : "memory");
+        "ld.relaxed.gpu.f32 %0, [%1];"
 #endif
+        : "=f"(result)
+        : "l"(const_cast<float*>(ptr))
+        : "memory");
 
     return result;
 }
@@ -688,30 +650,30 @@ __device__ __forceinline__ float load_relaxed(const float* ptr)
 
 __device__ __forceinline__ void store_relaxed(float* ptr, float result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result)
-                 : "memory");
+        "st.volatile.f32 [%0], %1;"
 #else
-    asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result)
-                 : "memory");
+        "st.relaxed.gpu.f32 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "f"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ double load_relaxed(const double* ptr)
 {
     double result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.f64 %0, [%1];"
-                 : "=d"(result)
-                 : "l"(const_cast<double*>(ptr))
-                 : "memory");
+        "ld.volatile.f64 %0, [%1];"
 #else
-    asm volatile("ld.relaxed.gpu.f64 %0, [%1];"
-                 : "=d"(result)
-                 : "l"(const_cast<double*>(ptr))
-                 : "memory");
+        "ld.relaxed.gpu.f64 %0, [%1];"
 #endif
+        : "=d"(result)
+        : "l"(const_cast<double*>(ptr))
+        : "memory");
 
     return result;
 }
@@ -719,30 +681,30 @@ __device__ __forceinline__ double load_relaxed(const double* ptr)
 
 __device__ __forceinline__ void store_relaxed(double* ptr, double result)
 {
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result)
-                 : "memory");
+        "st.volatile.f64 [%0], %1;"
 #else
-    asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result)
-                 : "memory");
+        "st.relaxed.gpu.f64 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "d"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int32 load_acquire(const int32* ptr)
 {
     int32 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "l"(const_cast<int32*>(ptr))
-                 : "memory");
+        "ld.volatile.s32 %0, [%1];"
 #else
-    asm volatile("ld.acquire.gpu.s32 %0, [%1];"
-                 : "=r"(result)
-                 : "l"(const_cast<int32*>(ptr))
-                 : "memory");
+        "ld.acquire.gpu.s32 %0, [%1];"
 #endif
+        : "=r"(result)
+        : "l"(const_cast<int32*>(ptr))
+        : "memory");
     membar_acq_rel();
     return result;
 }
@@ -751,30 +713,30 @@ __device__ __forceinline__ int32 load_acquire(const int32* ptr)
 __device__ __forceinline__ void store_release(int32* ptr, int32 result)
 {
     membar_acq_rel();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result)
-                 : "memory");
+        "st.volatile.s32 [%0], %1;"
 #else
-    asm volatile("st.release.gpu.s32 [%0], %1;" ::"l"(ptr), "r"(result)
-                 : "memory");
+        "st.release.gpu.s32 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "r"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ int64 load_acquire(const int64* ptr)
 {
     int64 result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "l"(const_cast<int64*>(ptr))
-                 : "memory");
+        "ld.volatile.s64 %0, [%1];"
 #else
-    asm volatile("ld.acquire.gpu.s64 %0, [%1];"
-                 : "=l"(result)
-                 : "l"(const_cast<int64*>(ptr))
-                 : "memory");
+        "ld.acquire.gpu.s64 %0, [%1];"
 #endif
+        : "=l"(result)
+        : "l"(const_cast<int64*>(ptr))
+        : "memory");
     membar_acq_rel();
     return result;
 }
@@ -783,30 +745,30 @@ __device__ __forceinline__ int64 load_acquire(const int64* ptr)
 __device__ __forceinline__ void store_release(int64* ptr, int64 result)
 {
     membar_acq_rel();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result)
-                 : "memory");
+        "st.volatile.s64 [%0], %1;"
 #else
-    asm volatile("st.release.gpu.s64 [%0], %1;" ::"l"(ptr), "l"(result)
-                 : "memory");
+        "st.release.gpu.s64 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "l"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ float load_acquire(const float* ptr)
 {
     float result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "l"(const_cast<float*>(ptr))
-                 : "memory");
+        "ld.volatile.f32 %0, [%1];"
 #else
-    asm volatile("ld.acquire.gpu.f32 %0, [%1];"
-                 : "=f"(result)
-                 : "l"(const_cast<float*>(ptr))
-                 : "memory");
+        "ld.acquire.gpu.f32 %0, [%1];"
 #endif
+        : "=f"(result)
+        : "l"(const_cast<float*>(ptr))
+        : "memory");
     membar_acq_rel();
     return result;
 }
@@ -815,30 +777,30 @@ __device__ __forceinline__ float load_acquire(const float* ptr)
 __device__ __forceinline__ void store_release(float* ptr, float result)
 {
     membar_acq_rel();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result)
-                 : "memory");
+        "st.volatile.f32 [%0], %1;"
 #else
-    asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result)
-                 : "memory");
+        "st.release.gpu.f32 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "f"(result)
+        : "memory");
 }
 
 
 __device__ __forceinline__ double load_acquire(const double* ptr)
 {
     double result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.f64 %0, [%1];"
-                 : "=d"(result)
-                 : "l"(const_cast<double*>(ptr))
-                 : "memory");
+        "ld.volatile.f64 %0, [%1];"
 #else
-    asm volatile("ld.acquire.gpu.f64 %0, [%1];"
-                 : "=d"(result)
-                 : "l"(const_cast<double*>(ptr))
-                 : "memory");
+        "ld.acquire.gpu.f64 %0, [%1];"
 #endif
+        : "=d"(result)
+        : "l"(const_cast<double*>(ptr))
+        : "memory");
     membar_acq_rel();
     return result;
 }
@@ -847,13 +809,15 @@ __device__ __forceinline__ double load_acquire(const double* ptr)
 __device__ __forceinline__ void store_release(double* ptr, double result)
 {
     membar_acq_rel();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result)
-                 : "memory");
+        "st.volatile.f64 [%0], %1;"
 #else
-    asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result)
-                 : "memory");
+        "st.release.gpu.f64 [%0], %1;"
 #endif
+        ::"l"(ptr),
+        "d"(result)
+        : "memory");
 }
 
 
@@ -862,19 +826,16 @@ __device__ __forceinline__ thrust::complex<float> load_relaxed_shared(
 {
     float real_result;
     float imag_result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.shared.v2.f32 {%0, %1}, [%2];"
-                 : "=f"(real_result), "=f"(imag_result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(
-                     const_cast<thrust::complex<float>*>(ptr)))
-                 : "memory");
-#else
-    asm volatile("ld.relaxed.cta.shared.v2.f32 {%0, %1}, [%2];"
-                 : "=f"(real_result), "=f"(imag_result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(
-                     const_cast<thrust::complex<float>*>(ptr)))
-                 : "memory");
+        "ld.volatile.shared.v2.f32 {%0, %1}, [%2];"
+#else
+        "ld.relaxed.cta.shared.v2.f32 {%0, %1}, [%2];"
 #endif
+        : "=f"(real_result), "=f"(imag_result)
+        : "r"(convert_generic_ptr_to_smem_ptr(
+            const_cast<thrust::complex<float>*>(ptr)))
+        : "memory");
     return thrust::complex<float>{real_result, imag_result};
 }
 
@@ -884,17 +845,15 @@ __device__ __forceinline__ void store_relaxed_shared(
 {
     auto real_result = result.real();
     auto imag_result = result.imag();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.v2.f32 [%0], {%1, %2};" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "f"(real_result), "f"(imag_result)
-                 : "memory");
+        "st.volatile.shared.v2.f32 [%0], {%1, %2};"
 #else
-    asm volatile("st.relaxed.cta.shared.v2.f32 [%0], {%1, %2};" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "f"(real_result), "f"(imag_result)
-                 : "memory");
+        "st.relaxed.cta.shared.v2.f32 [%0], {%1, %2};"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "f"(real_result), "f"(imag_result)
+        : "memory");
 }
 
 
@@ -903,19 +862,16 @@ __device__ __forceinline__ thrust::complex<double> load_relaxed_shared(
 {
     double real_result;
     double imag_result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.shared.v2.f64 {%0, %1}, [%2];"
-                 : "=d"(real_result), "=d"(imag_result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(
-                     const_cast<thrust::complex<double>*>(ptr)))
-                 : "memory");
-#else
-    asm volatile("ld.relaxed.cta.shared.v2.f64 {%0, %1}, [%2];"
-                 : "=d"(real_result), "=d"(imag_result)
-                 : "r"(convert_generic_ptr_to_smem_ptr(
-                     const_cast<thrust::complex<double>*>(ptr)))
-                 : "memory");
+        "ld.volatile.shared.v2.f64 {%0, %1}, [%2];"
+#else
+        "ld.relaxed.cta.shared.v2.f64 {%0, %1}, [%2];"
 #endif
+        : "=d"(real_result), "=d"(imag_result)
+        : "r"(convert_generic_ptr_to_smem_ptr(
+            const_cast<thrust::complex<double>*>(ptr)))
+        : "memory");
     return thrust::complex<double>{real_result, imag_result};
 }
 
@@ -925,17 +881,15 @@ __device__ __forceinline__ void store_relaxed_shared(
 {
     auto real_result = result.real();
     auto imag_result = result.imag();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.shared.v2.f64 [%0], {%1, %2};" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "d"(real_result), "d"(imag_result)
-                 : "memory");
+        "st.volatile.shared.v2.f64 [%0], {%1, %2};"
 #else
-    asm volatile("st.relaxed.cta.shared.v2.f64 [%0], {%1, %2};" ::"r"(
-                     convert_generic_ptr_to_smem_ptr(ptr)),
-                 "d"(real_result), "d"(imag_result)
-                 : "memory");
+        "st.relaxed.cta.shared.v2.f64 [%0], {%1, %2};"
 #endif
+        ::"r"(convert_generic_ptr_to_smem_ptr(ptr)),
+        "d"(real_result), "d"(imag_result)
+        : "memory");
 }
 
 
@@ -944,17 +898,15 @@ __device__ __forceinline__ thrust::complex<float> load_relaxed_local(
 {
     float real_result;
     float imag_result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.v2.f32 {%0, %1}, [%2];"
-                 : "=f"(real_result), "=f"(imag_result)
-                 : "l"(const_cast<thrust::complex<float>*>(ptr))
-                 : "memory");
+        "ld.volatile.v2.f32 {%0, %1}, [%2];"
 #else
-    asm volatile("ld.relaxed.cta.v2.f32 {%0, %1}, [%2];"
-                 : "=f"(real_result), "=f"(imag_result)
-                 : "l"(const_cast<thrust::complex<float>*>(ptr))
-                 : "memory");
+        "ld.relaxed.cta.v2.f32 {%0, %1}, [%2];"
 #endif
+        : "=f"(real_result), "=f"(imag_result)
+        : "l"(const_cast<thrust::complex<float>*>(ptr))
+        : "memory");
     return thrust::complex<float>{real_result, imag_result};
 }
 
@@ -964,15 +916,15 @@ __device__ __forceinline__ void store_relaxed_local(
 {
     auto real_result = result.real();
     auto imag_result = result.imag();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(ptr),
-                 "f"(real_result), "f"(imag_result)
-                 : "memory");
+        "st.volatile.v2.f32 [%0], {%1, %2};"
 #else
-    asm volatile("st.relaxed.cta.v2.f32 [%0], {%1, %2};" ::"l"(ptr),
-                 "f"(real_result), "f"(imag_result)
-                 : "memory");
+        "st.relaxed.cta.v2.f32 [%0], {%1, %2};"
 #endif
+        ::"l"(ptr),
+        "f"(real_result), "f"(imag_result)
+        : "memory");
 }
 
 
@@ -981,17 +933,15 @@ __device__ __forceinline__ thrust::complex<double> load_relaxed_local(
 {
     double real_result;
     double imag_result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.v2.f64 {%0, %1}, [%2];"
-                 : "=d"(real_result), "=d"(imag_result)
-                 : "l"(const_cast<thrust::complex<double>*>(ptr))
-                 : "memory");
+        "ld.volatile.v2.f64 {%0, %1}, [%2];"
 #else
-    asm volatile("ld.relaxed.cta.v2.f64 {%0, %1}, [%2];"
-                 : "=d"(real_result), "=d"(imag_result)
-                 : "l"(const_cast<thrust::complex<double>*>(ptr))
-                 : "memory");
+        "ld.relaxed.cta.v2.f64 {%0, %1}, [%2];"
 #endif
+        : "=d"(real_result), "=d"(imag_result)
+        : "l"(const_cast<thrust::complex<double>*>(ptr))
+        : "memory");
     return thrust::complex<double>{real_result, imag_result};
 }
 
@@ -1001,15 +951,15 @@ __device__ __forceinline__ void store_relaxed_local(
 {
     auto real_result = result.real();
     auto imag_result = result.imag();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(ptr),
-                 "d"(real_result), "d"(imag_result)
-                 : "memory");
+        "st.volatile.v2.f64 [%0], {%1, %2};"
 #else
-    asm volatile("st.relaxed.cta.v2.f64 [%0], {%1, %2};" ::"l"(ptr),
-                 "d"(real_result), "d"(imag_result)
-                 : "memory");
+        "st.relaxed.cta.v2.f64 [%0], {%1, %2};"
 #endif
+        ::"l"(ptr),
+        "d"(real_result), "d"(imag_result)
+        : "memory");
 }
 
 
@@ -1018,17 +968,15 @@ __device__ __forceinline__ thrust::complex<float> load_relaxed(
 {
     float real_result;
     float imag_result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.v2.f32 {%0, %1}, [%2];"
-                 : "=f"(real_result), "=f"(imag_result)
-                 : "l"(const_cast<thrust::complex<float>*>(ptr))
-                 : "memory");
+        "ld.volatile.v2.f32 {%0, %1}, [%2];"
 #else
-    asm volatile("ld.relaxed.gpu.v2.f32 {%0, %1}, [%2];"
-                 : "=f"(real_result), "=f"(imag_result)
-                 : "l"(const_cast<thrust::complex<float>*>(ptr))
-                 : "memory");
+        "ld.relaxed.gpu.v2.f32 {%0, %1}, [%2];"
 #endif
+        : "=f"(real_result), "=f"(imag_result)
+        : "l"(const_cast<thrust::complex<float>*>(ptr))
+        : "memory");
     return thrust::complex<float>{real_result, imag_result};
 }
 
@@ -1038,15 +986,15 @@ __device__ __forceinline__ void store_relaxed(thrust::complex<float>* ptr,
 {
     auto real_result = result.real();
     auto imag_result = result.imag();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(ptr),
-                 "f"(real_result), "f"(imag_result)
-                 : "memory");
+        "st.volatile.v2.f32 [%0], {%1, %2};"
 #else
-    asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"(ptr),
-                 "f"(real_result), "f"(imag_result)
-                 : "memory");
+        "st.relaxed.gpu.v2.f32 [%0], {%1, %2};"
 #endif
+        ::"l"(ptr),
+        "f"(real_result), "f"(imag_result)
+        : "memory");
 }
 
 
@@ -1055,17 +1003,15 @@ __device__ __forceinline__ thrust::complex<double> load_relaxed(
 {
     double real_result;
     double imag_result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile.v2.f64 {%0, %1}, [%2];"
-                 : "=d"(real_result), "=d"(imag_result)
-                 : "l"(const_cast<thrust::complex<double>*>(ptr))
-                 : "memory");
+        "ld.volatile.v2.f64 {%0, %1}, [%2];"
 #else
-    asm volatile("ld.relaxed.gpu.v2.f64 {%0, %1}, [%2];"
-                 : "=d"(real_result), "=d"(imag_result)
-                 : "l"(const_cast<thrust::complex<double>*>(ptr))
-                 : "memory");
+        "ld.relaxed.gpu.v2.f64 {%0, %1}, [%2];"
 #endif
+        : "=d"(real_result), "=d"(imag_result)
+        : "l"(const_cast<thrust::complex<double>*>(ptr))
+        : "memory");
     return thrust::complex<double>{real_result, imag_result};
 }
 
@@ -1075,13 +1021,13 @@ __device__ __forceinline__ void store_relaxed(thrust::complex<double>* ptr,
 {
     auto real_result = result.real();
     auto imag_result = result.imag();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(ptr),
-                 "d"(real_result), "d"(imag_result)
-                 : "memory");
+        "st.volatile.v2.f64 [%0], {%1, %2};"
 #else
-    asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"(ptr),
-                 "d"(real_result), "d"(imag_result)
-                 : "memory");
+        "st.relaxed.gpu.v2.f64 [%0], {%1, %2};"
 #endif
+        ::"l"(ptr),
+        "d"(real_result), "d"(imag_result)
+        : "memory");
 }
diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py
index 9dec14d2394..49f99d4d96f 100755
--- a/dev_tools/scripts/generate_cuda_memory_ptx.py
+++ b/dev_tools/scripts/generate_cuda_memory_ptx.py
@@ -67,7 +67,7 @@ class type_desc:
 // for reasoning behind this implementation
 #if (!defined(__clang__) && __CUDACC_VER_MAJOR__ >= 11)
     return static_cast<uint32>(__cvta_generic_to_shared(ptr));
-#elif (!defined(__clang__) && CUDACC_VER_MAJOR__ == 10 && \
+#elif (!defined(__clang__) && CUDACC_VER_MAJOR__ == 10 && \\
        __CUDACC_VER_MINOR__ >= 2)
     return __nvvm_get_smem_pointer(ptr);
 #else
@@ -123,17 +123,15 @@ class type_desc:
 __device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr)
 {{
     {t.name} result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];"
-                 : "={t.val_constraint}"(result)
-                 : "{s.ptr_constraint}"({const_ptr_expr})
-                 : "memory");
+        "ld.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];"
 #else
-    asm volatile("ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];"
-                 : "={t.val_constraint}"(result)
-                 : "{s.ptr_constraint}"({const_ptr_expr})
-                 : "memory");
+        "ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];"
 #endif
+        : "={t.val_constraint}"(result)
+        : "{s.ptr_constraint}"({const_ptr_expr})
+        : "memory");
     {membar_expression}
     return result;
 }}
@@ -142,15 +140,14 @@ class type_desc:
 __device__ __forceinline__ void store{o.fn_store_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} result)
 {{
     {membar_expression}
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;"
-                 :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result)
-                 : "memory");
+        "st.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;"
 #else
-    asm volatile("st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;"
-                 :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result)
-                 : "memory");
+        "st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;"
 #endif
+        :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result)
+        : "memory");
 }}
 """)
 
@@ -167,17 +164,15 @@ class type_desc:
 {{
     {t.name} real_result;
     {t.name} imag_result;
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("ld.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];"
-                 : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result)
-                 : "{s.ptr_constraint}"({const_ptr_expr})
-                 : "memory");
+        "ld.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];"
 #else
-    asm volatile("ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];"
-                 : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result)
-                 : "{s.ptr_constraint}"({const_ptr_expr})
-                 : "memory");
-#endif
+        "ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];"
+#endif                 
+        : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result)
+        : "{s.ptr_constraint}"({const_ptr_expr})
+        : "memory");
     return thrust::complex<{t.name}>{{real_result, imag_result}};
 }}
 
@@ -186,14 +181,13 @@ class type_desc:
 {{
     auto real_result = result.real();
     auto imag_result = result.imag();
+    asm volatile(
 #if __CUDA_ARCH__ < 700
-    asm volatile("st.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};"
-                 :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result)
-                 : "memory");
+        "st.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};"
 #else
-    asm volatile("st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};"
-                 :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result)
-                 : "memory");
+        "st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};"
 #endif
+        :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result)
+        : "memory");
 }}
 """)