From 9de37fada7ff2678a310071c1f5b9e998bd0a71a Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 5 Jan 2023 16:32:20 -0600
Subject: [PATCH 1/2] fix type mismatch and convert real valuetype to device
 (for __half)

---
 common/cuda_hip/matrix/fbcsr_kernels.hpp.inc    |  3 +--
 common/unified/multigrid/pgm_kernels.cpp        | 11 +++++++----
 cuda/factorization/par_ilut_filter_kernel.cu    |  8 ++++----
 cuda/factorization/par_ilut_select_common.cu    |  6 +++---
 cuda/factorization/par_ilut_select_kernel.cu    |  6 +++---
 cuda/solver/idr_kernels.cu                      |  2 +-
 cuda/solver/multigrid_kernels.cu                |  2 +-
 cuda/stop/residual_norm_kernels.cu              |  4 ++--
 .../par_ilut_filter_kernel.hip.cpp              |  8 +++++---
 .../par_ilut_select_common.hip.cpp              |  6 ++++--
 .../par_ilut_select_kernel.hip.cpp              |  6 +++---
 hip/solver/idr_kernels.hip.cpp                  |  2 +-
 hip/solver/multigrid_kernels.hip.cpp            | 10 +++++-----
 hip/stop/residual_norm_kernels.hip.cpp          | 17 ++++++++---------
 14 files changed, 48 insertions(+), 43 deletions(-)
diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
index 5b4dd1dc8da..e6c26ca15c5 100644
--- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
@@ -294,8 +294,7 @@ void fallback_transpose(const std::shared_ptr<const DefaultExecutor> exec,
         thrust::device_pointer_cast(out_row_idxs.get_data()),
         thrust::device_pointer_cast(out_col_idxs),
         thrust::device_pointer_cast(permutation.get_data())));
-    using tuple_type =
-        thrust::tuple<IndexType, IndexType, device_type<ValueType>>;
+    using tuple_type = thrust::tuple<IndexType, IndexType, IndexType>;
     thrust::sort(thrust::device, zip_it, zip_it + nnzb,
                  [] __device__(const tuple_type& a, const tuple_type& b) {
                      return thrust::tie(thrust::get<0>(a), thrust::get<1>(a)) <
diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp
index 373592392b3..4d67c2adce0 100644
--- a/common/unified/multigrid/pgm_kernels.cpp
+++ b/common/unified/multigrid/pgm_kernels.cpp
@@ -199,8 +199,9 @@ void find_strongest_neighbor(
         exec,
         [] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, auto weight_vals,
                       auto diag, auto agg, auto strongest_neighbor) {
-            auto max_weight_unagg = zero<ValueType>();
-            auto max_weight_agg = zero<ValueType>();
+            using value_type = device_type<ValueType>;
+            auto max_weight_unagg = zero<value_type>();
+            auto max_weight_agg = zero<value_type>();
             IndexType strongest_unagg = -1;
             IndexType strongest_agg = -1;
             if (agg[row] != -1) {
@@ -267,7 +268,8 @@ void assign_to_exist_agg(std::shared_ptr<const DefaultExecutor> exec,
                 if (agg_val[row] != -1) {
                     return;
                 }
-                ValueType max_weight_agg = zero<ValueType>();
+                using value_type = device_type<ValueType>;
+                value_type max_weight_agg = zero<value_type>();
                 IndexType strongest_agg = -1;
                 for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) {
                     auto col = col_idxs[idx];
@@ -304,7 +306,8 @@ void assign_to_exist_agg(std::shared_ptr<const DefaultExecutor> exec,
                 if (agg_val[row] != -1) {
                     return;
                 }
-                ValueType max_weight_agg = zero<ValueType>();
+                using value_type = device_type<ValueType>;
+                value_type max_weight_agg = zero<value_type>();
                 IndexType strongest_agg = -1;
                 for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) {
                     auto col = col_idxs[idx];
diff --git a/cuda/factorization/par_ilut_filter_kernel.cu b/cuda/factorization/par_ilut_filter_kernel.cu
index bee3692e7f0..142d07fd4d2 100644
--- a/cuda/factorization/par_ilut_filter_kernel.cu
+++ b/cuda/factorization/par_ilut_filter_kernel.cu
@@ -97,8 +97,8 @@ void threshold_filter(syn::value_list<int, subwarp_size>,
     if (num_blocks > 0) {
         kernel::threshold_filter_nnz<subwarp_size>
             <<<num_blocks, default_block_size>>>(
-                old_row_ptrs, as_cuda_type(old_vals), num_rows, threshold,
-                new_row_ptrs, lower);
+                old_row_ptrs, as_cuda_type(old_vals), num_rows,
+                as_cuda_type(threshold), new_row_ptrs, lower);
     }
 
     // build row pointers
@@ -126,8 +126,8 @@ void threshold_filter(syn::value_list<int, subwarp_size>,
         kernel::threshold_filter<subwarp_size>
             <<<num_blocks, default_block_size>>>(
                 old_row_ptrs, old_col_idxs, as_cuda_type(old_vals), num_rows,
-                threshold, new_row_ptrs, new_row_idxs, new_col_idxs,
-                as_cuda_type(new_vals), lower);
+                as_cuda_type(threshold), new_row_ptrs, new_row_idxs,
+                new_col_idxs, as_cuda_type(new_vals), lower);
     }
 }
 
diff --git a/cuda/factorization/par_ilut_select_common.cu b/cuda/factorization/par_ilut_select_common.cu
index 445d7879d02..91f322509d3 100644
--- a/cuda/factorization/par_ilut_select_common.cu
+++ b/cuda/factorization/par_ilut_select_common.cu
@@ -70,12 +70,12 @@ void sampleselect_count(std::shared_ptr<const DefaultExecutor> exec,
         static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
     // pick sample, build searchtree
     kernel::build_searchtree<<<1, bucket_count>>>(as_cuda_type(values), size,
-                                                  tree);
+                                                  as_cuda_type(tree));
     // determine bucket sizes
     if (num_blocks > 0) {
         kernel::count_buckets<<<num_blocks, default_block_size>>>(
-            as_cuda_type(values), size, tree, partial_counts, oracles,
-            items_per_thread);
+            as_cuda_type(values), size, as_cuda_type(tree), partial_counts,
+            oracles, items_per_thread);
     }
     // compute prefix sum and total sum over block-local values
     kernel::block_prefix_sum<<<bucket_count, default_block_size>>>(
diff --git a/cuda/factorization/par_ilut_select_kernel.cu b/cuda/factorization/par_ilut_select_kernel.cu
index f36fca4197a..1b7e9fc35fc 100644
--- a/cuda/factorization/par_ilut_select_kernel.cu
+++ b/cuda/factorization/par_ilut_select_kernel.cu
@@ -77,8 +77,8 @@ void sampleselect_filter(const ValueType* values, IndexType size,
         static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
     if (num_blocks > 0) {
         kernel::filter_bucket<<<num_blocks, default_block_size>>>(
-            as_cuda_type(values), size, bucket, oracles, partial_counts, out,
-            items_per_thread);
+            as_cuda_type(values), size, bucket, oracles, partial_counts,
+            as_cuda_type(out), items_per_thread);
     }
 }
 
@@ -172,7 +172,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
     // base case
     auto out_ptr = reinterpret_cast<AbsType*>(tmp1.get_data());
     kernel::basecase_select<<<1, kernel::basecase_block_size>>>(
-        tmp22, bucket.size, rank, out_ptr);
+        as_cuda_type(tmp22), bucket.size, rank, as_cuda_type(out_ptr));
     threshold = exec->copy_val_to_host(out_ptr);
 }
 
diff --git a/cuda/solver/idr_kernels.cu b/cuda/solver/idr_kernels.cu
index d1e6da182a5..5d86d5044ad 100644
--- a/cuda/solver/idr_kernels.cu
+++ b/cuda/solver/idr_kernels.cu
@@ -345,7 +345,7 @@ void compute_omega(
 {
     const auto grid_dim = ceildiv(nrhs, config::warp_size);
     compute_omega_kernel<<<grid_dim, config::warp_size>>>(
-        nrhs, kappa, as_cuda_type(tht->get_const_values()),
+        nrhs, as_cuda_type(kappa), as_cuda_type(tht->get_const_values()),
         as_cuda_type(residual_norm->get_const_values()),
         as_cuda_type(omega->get_values()),
         as_cuda_type(stop_status->get_const_data()));
diff --git a/cuda/solver/multigrid_kernels.cu b/cuda/solver/multigrid_kernels.cu
index b253a5c78de..7fef6bee2d1 100644
--- a/cuda/solver/multigrid_kernels.cu
+++ b/cuda/solver/multigrid_kernels.cu
@@ -139,7 +139,7 @@ void kcycle_check_stop(std::shared_ptr<const DefaultExecutor> exec,
     if (grid > 0) {
         kernel::kcycle_check_stop_kernel<<<grid, default_block_size>>>(
             nrhs, as_cuda_type(old_norm->get_const_values()),
-            as_cuda_type(new_norm->get_const_values()), rel_tol,
+            as_cuda_type(new_norm->get_const_values()), as_cuda_type(rel_tol),
             as_cuda_type(dis_stop.get_data()));
     }
     is_stop = exec->copy_val_to_host(dis_stop.get_const_data());
diff --git a/cuda/stop/residual_norm_kernels.cu b/cuda/stop/residual_norm_kernels.cu
index f24cd54b3a1..4dd794f4ea7 100644
--- a/cuda/stop/residual_norm_kernels.cu
+++ b/cuda/stop/residual_norm_kernels.cu
@@ -106,7 +106,7 @@ void residual_norm(std::shared_ptr<const CudaExecutor> exec,
 
     if (grid_size > 0) {
         residual_norm_kernel<<<grid_size, block_size>>>(
-            tau->get_size()[1], rel_residual_goal,
+            tau->get_size()[1], as_cuda_type(rel_residual_goal),
             as_cuda_type(tau->get_const_values()),
             as_cuda_type(orig_tau->get_const_values()), stoppingId,
             setFinalized, as_cuda_type(stop_status->get_data()),
@@ -185,7 +185,7 @@ void implicit_residual_norm(
 
     if (grid_size > 0) {
         implicit_residual_norm_kernel<<<grid_size, block_size>>>(
-            tau->get_size()[1], rel_residual_goal,
+            tau->get_size()[1], as_cuda_type(rel_residual_goal),
             as_cuda_type(tau->get_const_values()),
             as_cuda_type(orig_tau->get_const_values()), stoppingId,
             setFinalized, as_cuda_type(stop_status->get_data()),
diff --git a/hip/factorization/par_ilut_filter_kernel.hip.cpp b/hip/factorization/par_ilut_filter_kernel.hip.cpp
index 90b03035327..90de79fbd56 100644
--- a/hip/factorization/par_ilut_filter_kernel.hip.cpp
+++ b/hip/factorization/par_ilut_filter_kernel.hip.cpp
@@ -101,7 +101,8 @@ void threshold_filter(syn::value_list<int, subwarp_size>,
         hipLaunchKernelGGL(
             HIP_KERNEL_NAME(kernel::threshold_filter_nnz<subwarp_size>),
             num_blocks, default_block_size, 0, 0, old_row_ptrs,
-            as_hip_type(old_vals), num_rows, threshold, new_row_ptrs, lower);
+            as_hip_type(old_vals), num_rows, as_hip_type(threshold),
+            new_row_ptrs, lower);
     }
 
     // build row pointers
@@ -129,8 +130,9 @@ void threshold_filter(syn::value_list<int, subwarp_size>,
         hipLaunchKernelGGL(
             HIP_KERNEL_NAME(kernel::threshold_filter<subwarp_size>), num_blocks,
             default_block_size, 0, 0, old_row_ptrs, old_col_idxs,
-            as_hip_type(old_vals), num_rows, threshold, new_row_ptrs,
-            new_row_idxs, new_col_idxs, as_hip_type(new_vals), lower);
+            as_hip_type(old_vals), num_rows, as_hip_type(threshold),
+            new_row_ptrs, new_row_idxs, new_col_idxs, as_hip_type(new_vals),
+            lower);
     }
 }
 
diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp
index dff3ffed9ec..14a07dc3a57 100644
--- a/hip/factorization/par_ilut_select_common.hip.cpp
+++ b/hip/factorization/par_ilut_select_common.hip.cpp
@@ -76,12 +76,14 @@ void sampleselect_count(std::shared_ptr<const DefaultExecutor> exec,
         static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
     // pick sample, build searchtree
     hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::build_searchtree), 1,
-                       bucket_count, 0, 0, as_hip_type(values), size, tree);
+                       bucket_count, 0, 0, as_hip_type(values), size,
+                       as_hip_type(tree));
     // determine bucket sizes
     if (num_blocks > 0) {
         hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::count_buckets), num_blocks,
                            default_block_size, 0, 0, as_hip_type(values), size,
-                           tree, partial_counts, oracles, items_per_thread);
+                           as_hip_type(tree), partial_counts, oracles,
+                           items_per_thread);
     }
     // compute prefix sum and total sum over block-local values
     hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::block_prefix_sum), bucket_count,
diff --git a/hip/factorization/par_ilut_select_kernel.hip.cpp b/hip/factorization/par_ilut_select_kernel.hip.cpp
index 523059edb82..d9f61f5ceb9 100644
--- a/hip/factorization/par_ilut_select_kernel.hip.cpp
+++ b/hip/factorization/par_ilut_select_kernel.hip.cpp
@@ -81,7 +81,7 @@ void sampleselect_filter(const ValueType* values, IndexType size,
     if (num_blocks > 0) {
         hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::filter_bucket), num_blocks,
                            default_block_size, 0, 0, as_hip_type(values), size,
-                           bucket, oracles, partial_counts, out,
+                           bucket, oracles, partial_counts, as_hip_type(out),
                            items_per_thread);
     }
 }
@@ -176,8 +176,8 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
     // base case
     auto out_ptr = reinterpret_cast<AbsType*>(tmp1.get_data());
     hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::basecase_select), 1,
-                       kernel::basecase_block_size, 0, 0, tmp22, bucket.size,
-                       rank, out_ptr);
+                       kernel::basecase_block_size, 0, 0, as_hip_type(tmp22),
+                       bucket.size, rank, as_hip_type(out_ptr));
     threshold = exec->copy_val_to_host(out_ptr);
 }
 
diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp
index 7d6f177e4c4..34b783bd706 100644
--- a/hip/solver/idr_kernels.hip.cpp
+++ b/hip/solver/idr_kernels.hip.cpp
@@ -356,7 +356,7 @@ void compute_omega(
 {
     const auto grid_dim = ceildiv(nrhs, config::warp_size);
     hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_omega_kernel), grid_dim,
-                       config::warp_size, 0, 0, nrhs, kappa,
+                       config::warp_size, 0, 0, nrhs, as_hip_type(kappa),
                        as_hip_type(tht->get_const_values()),
                        as_hip_type(residual_norm->get_const_values()),
                        as_hip_type(omega->get_values()),
diff --git a/hip/solver/multigrid_kernels.hip.cpp b/hip/solver/multigrid_kernels.hip.cpp
index 60f54bfed6d..415500db966 100644
--- a/hip/solver/multigrid_kernels.hip.cpp
+++ b/hip/solver/multigrid_kernels.hip.cpp
@@ -142,11 +142,11 @@ void kcycle_check_stop(std::shared_ptr<const DefaultExecutor> exec,
     const auto nrhs = new_norm->get_size()[1];
     const auto grid = ceildiv(nrhs, default_block_size);
     if (grid > 0) {
-        hipLaunchKernelGGL(kernel::kcycle_check_stop_kernel, grid,
-                           default_block_size, 0, 0, nrhs,
-                           as_hip_type(old_norm->get_const_values()),
-                           as_hip_type(new_norm->get_const_values()), rel_tol,
-                           as_hip_type(dis_stop.get_data()));
+        hipLaunchKernelGGL(
+            kernel::kcycle_check_stop_kernel, grid, default_block_size, 0, 0,
+            nrhs, as_hip_type(old_norm->get_const_values()),
+            as_hip_type(new_norm->get_const_values()), as_hip_type(rel_tol),
+            as_hip_type(dis_stop.get_data()));
     }
     is_stop = exec->copy_val_to_host(dis_stop.get_const_data());
 }
diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp
index b8f1b522f5a..02f9b54a13e 100644
--- a/hip/stop/residual_norm_kernels.hip.cpp
+++ b/hip/stop/residual_norm_kernels.hip.cpp
@@ -110,7 +110,7 @@ void residual_norm(std::shared_ptr<const HipExecutor> exec,
 
     if (grid_size > 0) {
         hipLaunchKernelGGL((residual_norm_kernel), grid_size, block_size, 0, 0,
-                           tau->get_size()[1], rel_residual_goal,
+                           tau->get_size()[1], as_hip_type(rel_residual_goal),
                            as_hip_type(tau->get_const_values()),
                            as_hip_type(orig_tau->get_const_values()),
                            stoppingId, setFinalized,
@@ -143,13 +143,12 @@ constexpr int default_block_size = 512;
 
 template <typename ValueType>
 __global__
-    __launch_bounds__(default_block_size) void implicit_residual_norm_kernel(
-        size_type num_cols, remove_complex<ValueType> rel_residual_goal,
-        const ValueType* __restrict__ tau,
-        const remove_complex<ValueType>* __restrict__ orig_tau,
-        uint8 stoppingId, bool setFinalized,
-        stopping_status* __restrict__ stop_status,
-        bool* __restrict__ device_storage)
+__launch_bounds__(default_block_size) void implicit_residual_norm_kernel(
+    size_type num_cols, remove_complex<ValueType> rel_residual_goal,
+    const ValueType* __restrict__ tau,
+    const remove_complex<ValueType>* __restrict__ orig_tau, uint8 stoppingId,
+    bool setFinalized, stopping_status* __restrict__ stop_status,
+    bool* __restrict__ device_storage)
 {
     const auto tidx = thread::get_thread_id_flat();
     if (tidx < num_cols) {
@@ -192,7 +191,7 @@ void implicit_residual_norm(
     if (grid_size > 0) {
         hipLaunchKernelGGL(
             (implicit_residual_norm_kernel), grid_size, block_size, 0, 0,
-            tau->get_size()[1], rel_residual_goal,
+            tau->get_size()[1], as_hip_type(rel_residual_goal),
             as_hip_type(tau->get_const_values()),
             as_hip_type(orig_tau->get_const_values()), stoppingId, setFinalized,
             as_hip_type(stop_status->get_data()),

From d7b41c76e7b74821b94ee4a99786e60c453191e9 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 9 Feb 2023 12:07:53 +0100
Subject: [PATCH 2/2] delete some redundent as_device_type

Co-authored-by: Pratik Nayak <pratikvn@protonmail.com>
---
 common/unified/multigrid/pgm_kernels.cpp |  4 ++--
 cuda/matrix/csr_kernels.cu               | 12 ++++-------
 cuda/solver/cb_gmres_kernels.cu          |  7 +++---
 cuda/solver/idr_kernels.cu               | 19 ++++++++---------
 hip/matrix/csr_kernels.hip.cpp           | 10 ++++-----
 hip/solver/cb_gmres_kernels.hip.cpp      |  8 +++----
 hip/solver/idr_kernels.hip.cpp           | 27 ++++++++++++------------
 hip/stop/residual_norm_kernels.hip.cpp   | 13 ++++++------
 8 files changed, 45 insertions(+), 55 deletions(-)

diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp
index 4d67c2adce0..50678d83c9f 100644
--- a/common/unified/multigrid/pgm_kernels.cpp
+++ b/common/unified/multigrid/pgm_kernels.cpp
@@ -269,7 +269,7 @@ void assign_to_exist_agg(std::shared_ptr<const DefaultExecutor> exec,
                     return;
                 }
                 using value_type = device_type<ValueType>;
-                value_type max_weight_agg = zero<value_type>();
+                auto max_weight_agg = zero<value_type>();
                 IndexType strongest_agg = -1;
                 for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) {
                     auto col = col_idxs[idx];
@@ -307,7 +307,7 @@ void assign_to_exist_agg(std::shared_ptr<const DefaultExecutor> exec,
                     return;
                 }
                 using value_type = device_type<ValueType>;
-                value_type max_weight_agg = zero<value_type>();
+                auto max_weight_agg = zero<value_type>();
                 IndexType strongest_agg = -1;
                 for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) {
                     auto col = col_idxs[idx];
diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu
index 9a4cefaae76..18d4d93eb26 100644
--- a/cuda/matrix/csr_kernels.cu
+++ b/cuda/matrix/csr_kernels.cu
@@ -298,10 +298,8 @@ void load_balance_spmv(std::shared_ptr<const CudaExecutor> exec,
                     a->get_const_col_idxs(),
                     as_cuda_type(a->get_const_row_ptrs()),
                     as_cuda_type(a->get_const_srow()),
-                    as_cuda_type(b->get_const_values()),
-                    as_cuda_type(b->get_stride()),
-                    as_cuda_type(c->get_values()),
-                    as_cuda_type(c->get_stride()));
+                    as_cuda_type(b->get_const_values()), b->get_stride(),
+                    as_cuda_type(c->get_values()), c->get_stride());
             }
         } else {
             if (csr_grid.x > 0 && csr_grid.y > 0) {
@@ -311,10 +309,8 @@ void load_balance_spmv(std::shared_ptr<const CudaExecutor> exec,
                     a->get_const_col_idxs(),
                     as_cuda_type(a->get_const_row_ptrs()),
                     as_cuda_type(a->get_const_srow()),
-                    as_cuda_type(b->get_const_values()),
-                    as_cuda_type(b->get_stride()),
-                    as_cuda_type(c->get_values()),
-                    as_cuda_type(c->get_stride()));
+                    as_cuda_type(b->get_const_values()), b->get_stride(),
+                    as_cuda_type(c->get_values()), c->get_stride());
             }
         }
     }
diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu
index 5944446f8f0..2058ff084c8 100644
--- a/cuda/solver/cb_gmres_kernels.cu
+++ b/cuda/solver/cb_gmres_kernels.cu
@@ -348,7 +348,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const CudaExecutor> exec,
                 stride_arnoldi, as_cuda_type(hessenberg_iter->get_values()),
                 stride_hessenberg, iter + 1, acc::as_cuda_range(krylov_bases),
                 as_cuda_type(stop_status), as_cuda_type(reorth_status),
-                as_cuda_type(num_reorth->get_data()));
+                num_reorth->get_data());
         num_reorth_host = exec->copy_val_to_host(num_reorth->get_const_data());
     }
 
@@ -388,8 +388,7 @@ void givens_rotation(std::shared_ptr<const CudaExecutor> exec,
         givens_sin->get_stride(), as_cuda_type(givens_cos->get_values()),
         givens_cos->get_stride(), as_cuda_type(residual_norm->get_values()),
         as_cuda_type(residual_norm_collection->get_values()),
-        residual_norm_collection->get_stride(),
-        as_cuda_type(stop_status->get_const_data()));
+        residual_norm_collection->get_stride(), stop_status->get_const_data());
 }
 
 
@@ -412,7 +411,7 @@ void arnoldi(std::shared_ptr<const CudaExecutor> exec,
         static_cast<unsigned int>(
             ceildiv(final_iter_nums->get_num_elems(), default_block_size)),
         default_block_size>>>(as_cuda_type(final_iter_nums->get_data()),
-                              as_cuda_type(stop_status->get_const_data()),
+                              stop_status->get_const_data(),
                               final_iter_nums->get_num_elems());
     finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter,
                        buffer_iter, arnoldi_norm, iter,
diff --git a/cuda/solver/idr_kernels.cu b/cuda/solver/idr_kernels.cu
index 5d86d5044ad..e774821752c 100644
--- a/cuda/solver/idr_kernels.cu
+++ b/cuda/solver/idr_kernels.cu
@@ -129,7 +129,7 @@ void solve_lower_triangular(const size_type nrhs,
         subspace_dim, nrhs, as_cuda_type(m->get_const_values()),
         m->get_stride(), as_cuda_type(f->get_const_values()), f->get_stride(),
         as_cuda_type(c->get_values()), c->get_stride(),
-        as_cuda_type(stop_status->get_const_data()));
+        stop_status->get_const_data());
 }
 
 
@@ -161,7 +161,7 @@ void update_g_and_u(std::shared_ptr<const CudaExecutor> exec,
             multidot_kernel<<<grid_dim, block_dim>>>(
                 size, nrhs, as_cuda_type(p_i), as_cuda_type(g_k->get_values()),
                 g_k->get_stride(), as_cuda_type(alpha->get_values()),
-                as_cuda_type(stop_status->get_const_data()));
+                stop_status->get_const_data());
         } else {
             cublas::dot(exec->get_cublas_handle(), size, p_i, 1,
                         g_k->get_values(), g_k->get_stride(),
@@ -175,14 +175,14 @@ void update_g_and_u(std::shared_ptr<const CudaExecutor> exec,
                 as_cuda_type(g->get_const_values()), g->get_stride(),
                 as_cuda_type(g_k->get_values()), g_k->get_stride(),
                 as_cuda_type(u->get_values()), u->get_stride(),
-                as_cuda_type(stop_status->get_const_data()));
+                stop_status->get_const_data());
     }
     update_g_kernel<default_block_size>
         <<<ceildiv(size * g_k->get_stride(), default_block_size),
            default_block_size>>>(
             k, size, nrhs, as_cuda_type(g_k->get_const_values()),
             g_k->get_stride(), as_cuda_type(g->get_values()), g->get_stride(),
-            as_cuda_type(stop_status->get_const_data()));
+            stop_status->get_const_data());
 }
 
 
@@ -212,7 +212,7 @@ void update_m(std::shared_ptr<const CudaExecutor> exec, const size_type nrhs,
             multidot_kernel<<<grid_dim, block_dim>>>(
                 size, nrhs, as_cuda_type(p_i),
                 as_cuda_type(g_k->get_const_values()), g_k->get_stride(),
-                as_cuda_type(m_i), as_cuda_type(stop_status->get_const_data()));
+                as_cuda_type(m_i), stop_status->get_const_data());
         } else {
             cublas::dot(exec->get_cublas_handle(), size, p_i, 1,
                         g_k->get_const_values(), g_k->get_stride(), m_i);
@@ -242,7 +242,7 @@ void update_x_r_and_f(std::shared_ptr<const CudaExecutor> exec,
         as_cuda_type(f->get_values()), f->get_stride(),
         as_cuda_type(r->get_values()), r->get_stride(),
         as_cuda_type(x->get_values()), x->get_stride(),
-        as_cuda_type(stop_status->get_const_data()));
+        stop_status->get_const_data());
     components::fill_array(exec, f->get_values() + k * f->get_stride(), nrhs,
                            zero<ValueType>());
 }
@@ -286,7 +286,7 @@ void step_1(std::shared_ptr<const CudaExecutor> exec, const size_type nrhs,
         as_cuda_type(c->get_const_values()), c->get_stride(),
         as_cuda_type(g->get_const_values()), g->get_stride(),
         as_cuda_type(v->get_values()), v->get_stride(),
-        as_cuda_type(stop_status->get_const_data()));
+        stop_status->get_const_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL);
@@ -313,7 +313,7 @@ void step_2(std::shared_ptr<const CudaExecutor> exec, const size_type nrhs,
         preconditioned_vector->get_stride(),
         as_cuda_type(c->get_const_values()), c->get_stride(),
         as_cuda_type(u->get_values()), u->get_stride(),
-        as_cuda_type(stop_status->get_const_data()));
+        stop_status->get_const_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL);
@@ -347,8 +347,7 @@ void compute_omega(
     compute_omega_kernel<<<grid_dim, config::warp_size>>>(
         nrhs, as_cuda_type(kappa), as_cuda_type(tht->get_const_values()),
         as_cuda_type(residual_norm->get_const_values()),
-        as_cuda_type(omega->get_values()),
-        as_cuda_type(stop_status->get_const_data()));
+        as_cuda_type(omega->get_values()), stop_status->get_const_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp
index 97ad8393b9f..9be1bef3ade 100644
--- a/hip/matrix/csr_kernels.hip.cpp
+++ b/hip/matrix/csr_kernels.hip.cpp
@@ -303,9 +303,8 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
                 as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
                 as_hip_type(a->get_const_row_ptrs()),
                 as_hip_type(a->get_const_srow()),
-                as_hip_type(b->get_const_values()),
-                as_hip_type(b->get_stride()), as_hip_type(c->get_values()),
-                as_hip_type(c->get_stride()));
+                as_hip_type(b->get_const_values()), b->get_stride(),
+                as_hip_type(c->get_values()), c->get_stride());
         }
     } else if (a->get_strategy()->get_name() == "merge_path") {
         int items_per_thread =
@@ -397,9 +396,8 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
                 as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
                 as_hip_type(a->get_const_row_ptrs()),
                 as_hip_type(a->get_const_srow()),
-                as_hip_type(b->get_const_values()),
-                as_hip_type(b->get_stride()), as_hip_type(c->get_values()),
-                as_hip_type(c->get_stride()));
+                as_hip_type(b->get_const_values()), b->get_stride(),
+                as_hip_type(c->get_values()), c->get_stride());
         }
     } else if (a->get_strategy()->get_name() == "merge_path") {
         int items_per_thread =
diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp
index af4298be8dd..39c34f37ae9 100644
--- a/hip/solver/cb_gmres_kernels.hip.cpp
+++ b/hip/solver/cb_gmres_kernels.hip.cpp
@@ -358,7 +358,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const HipExecutor> exec,
             stride_arnoldi, as_hip_type(hessenberg_iter->get_values()),
             stride_hessenberg, iter + 1, acc::as_hip_range(krylov_bases),
             as_hip_type(stop_status), as_hip_type(reorth_status),
-            as_hip_type(num_reorth->get_data()));
+            num_reorth->get_data());
         num_reorth_host = exec->copy_val_to_host(num_reorth->get_const_data());
         // num_reorth_host := number of next_krylov vector to be
         // reorthogonalization
@@ -401,8 +401,7 @@ void givens_rotation(std::shared_ptr<const HipExecutor> exec,
         givens_sin->get_stride(), as_hip_type(givens_cos->get_values()),
         givens_cos->get_stride(), as_hip_type(residual_norm->get_values()),
         as_hip_type(residual_norm_collection->get_values()),
-        residual_norm_collection->get_stride(),
-        as_hip_type(stop_status->get_const_data()));
+        residual_norm_collection->get_stride(), stop_status->get_const_data());
 }
 
 
@@ -426,8 +425,7 @@ void arnoldi(std::shared_ptr<const HipExecutor> exec,
         static_cast<unsigned int>(
             ceildiv(final_iter_nums->get_num_elems(), default_block_size)),
         default_block_size, 0, 0, as_hip_type(final_iter_nums->get_data()),
-        as_hip_type(stop_status->get_const_data()),
-        final_iter_nums->get_num_elems());
+        stop_status->get_const_data(), final_iter_nums->get_num_elems());
     finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter,
                        buffer_iter, arnoldi_norm, iter,
                        stop_status->get_const_data(), reorth_status->get_data(),
diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp
index 34b783bd706..b9168aa55d9 100644
--- a/hip/solver/idr_kernels.hip.cpp
+++ b/hip/solver/idr_kernels.hip.cpp
@@ -135,7 +135,7 @@ void solve_lower_triangular(const size_type nrhs,
                        as_hip_type(m->get_const_values()), m->get_stride(),
                        as_hip_type(f->get_const_values()), f->get_stride(),
                        as_hip_type(c->get_values()), c->get_stride(),
-                       as_hip_type(stop_status->get_const_data()));
+                       stop_status->get_const_data());
 }
 
 
@@ -168,7 +168,7 @@ void update_g_and_u(std::shared_ptr<const HipExecutor> exec,
                 multidot_kernel, grid_dim, block_dim, 0, 0, size, nrhs,
                 as_hip_type(p_i), as_hip_type(g_k->get_values()),
                 g_k->get_stride(), as_hip_type(alpha->get_values()),
-                as_hip_type(stop_status->get_const_data()));
+                stop_status->get_const_data());
         } else {
             hipblas::dot(exec->get_hipblas_handle(), size, p_i, 1,
                          g_k->get_values(), g_k->get_stride(),
@@ -183,14 +183,14 @@ void update_g_and_u(std::shared_ptr<const HipExecutor> exec,
             as_hip_type(g->get_const_values()), g->get_stride(),
             as_hip_type(g_k->get_values()), g_k->get_stride(),
             as_hip_type(u->get_values()), u->get_stride(),
-            as_hip_type(stop_status->get_const_data()));
+            stop_status->get_const_data());
     }
     hipLaunchKernelGGL(update_g_kernel<default_block_size>,
                        ceildiv(size * g_k->get_stride(), default_block_size),
                        default_block_size, 0, 0, k, size, nrhs,
                        as_hip_type(g_k->get_const_values()), g_k->get_stride(),
                        as_hip_type(g->get_values()), g->get_stride(),
-                       as_hip_type(stop_status->get_const_data()));
+                       stop_status->get_const_data());
 }
 
 
@@ -221,7 +221,7 @@ void update_m(std::shared_ptr<const HipExecutor> exec, const size_type nrhs,
                                nrhs, as_hip_type(p_i),
                                as_hip_type(g_k->get_const_values()),
                                g_k->get_stride(), as_hip_type(m_i),
-                               as_hip_type(stop_status->get_const_data()));
+                               stop_status->get_const_data());
         } else {
             hipblas::dot(exec->get_hipblas_handle(), size, p_i, 1,
                          g_k->get_const_values(), g_k->get_stride(), m_i);
@@ -252,7 +252,7 @@ void update_x_r_and_f(std::shared_ptr<const HipExecutor> exec,
                        as_hip_type(f->get_values()), f->get_stride(),
                        as_hip_type(r->get_values()), r->get_stride(),
                        as_hip_type(x->get_values()), x->get_stride(),
-                       as_hip_type(stop_status->get_const_data()));
+                       stop_status->get_const_data());
     components::fill_array(exec, f->get_values() + k * f->get_stride(), nrhs,
                            zero<ValueType>());
 }
@@ -297,7 +297,7 @@ void step_1(std::shared_ptr<const HipExecutor> exec, const size_type nrhs,
         residual->get_stride(), as_hip_type(c->get_const_values()),
         c->get_stride(), as_hip_type(g->get_const_values()), g->get_stride(),
         as_hip_type(v->get_values()), v->get_stride(),
-        as_hip_type(stop_status->get_const_data()));
+        stop_status->get_const_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL);
@@ -324,7 +324,7 @@ void step_2(std::shared_ptr<const HipExecutor> exec, const size_type nrhs,
         as_hip_type(preconditioned_vector->get_const_values()),
         preconditioned_vector->get_stride(), as_hip_type(c->get_const_values()),
         c->get_stride(), as_hip_type(u->get_values()), u->get_stride(),
-        as_hip_type(stop_status->get_const_data()));
+        stop_status->get_const_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL);
@@ -355,12 +355,11 @@ void compute_omega(
     matrix::Dense<ValueType>* omega, const array<stopping_status>* stop_status)
 {
     const auto grid_dim = ceildiv(nrhs, config::warp_size);
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_omega_kernel), grid_dim,
-                       config::warp_size, 0, 0, nrhs, as_hip_type(kappa),
-                       as_hip_type(tht->get_const_values()),
-                       as_hip_type(residual_norm->get_const_values()),
-                       as_hip_type(omega->get_values()),
-                       as_hip_type(stop_status->get_const_data()));
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(compute_omega_kernel), grid_dim, config::warp_size, 0,
+        0, nrhs, as_hip_type(kappa), as_hip_type(tht->get_const_values()),
+        as_hip_type(residual_norm->get_const_values()),
+        as_hip_type(omega->get_values()), stop_status->get_const_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp
index 02f9b54a13e..f5f4fedba86 100644
--- a/hip/stop/residual_norm_kernels.hip.cpp
+++ b/hip/stop/residual_norm_kernels.hip.cpp
@@ -143,12 +143,13 @@ constexpr int default_block_size = 512;
 
 template <typename ValueType>
 __global__
-__launch_bounds__(default_block_size) void implicit_residual_norm_kernel(
-    size_type num_cols, remove_complex<ValueType> rel_residual_goal,
-    const ValueType* __restrict__ tau,
-    const remove_complex<ValueType>* __restrict__ orig_tau, uint8 stoppingId,
-    bool setFinalized, stopping_status* __restrict__ stop_status,
-    bool* __restrict__ device_storage)
+    __launch_bounds__(default_block_size) void implicit_residual_norm_kernel(
+        size_type num_cols, remove_complex<ValueType> rel_residual_goal,
+        const ValueType* __restrict__ tau,
+        const remove_complex<ValueType>* __restrict__ orig_tau,
+        uint8 stoppingId, bool setFinalized,
+        stopping_status* __restrict__ stop_status,
+        bool* __restrict__ device_storage)
 {
     const auto tidx = thread::get_thread_id_flat();
     if (tidx < num_cols) {