From 9de37fada7ff2678a310071c1f5b9e998bd0a71a Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 5 Jan 2023 16:32:20 -0600 Subject: [PATCH 1/2] fix type mismatch and convert real valuetype to device (for __half) --- common/cuda_hip/matrix/fbcsr_kernels.hpp.inc | 3 +-- common/unified/multigrid/pgm_kernels.cpp | 11 +++++++---- cuda/factorization/par_ilut_filter_kernel.cu | 8 ++++---- cuda/factorization/par_ilut_select_common.cu | 6 +++--- cuda/factorization/par_ilut_select_kernel.cu | 6 +++--- cuda/solver/idr_kernels.cu | 2 +- cuda/solver/multigrid_kernels.cu | 2 +- cuda/stop/residual_norm_kernels.cu | 4 ++-- .../par_ilut_filter_kernel.hip.cpp | 8 +++++--- .../par_ilut_select_common.hip.cpp | 6 ++++-- .../par_ilut_select_kernel.hip.cpp | 6 +++--- hip/solver/idr_kernels.hip.cpp | 2 +- hip/solver/multigrid_kernels.hip.cpp | 10 +++++----- hip/stop/residual_norm_kernels.hip.cpp | 17 ++++++++--------- 14 files changed, 48 insertions(+), 43 deletions(-) diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc index 5b4dd1dc8da..e6c26ca15c5 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc @@ -294,8 +294,7 @@ void fallback_transpose(const std::shared_ptr exec, thrust::device_pointer_cast(out_row_idxs.get_data()), thrust::device_pointer_cast(out_col_idxs), thrust::device_pointer_cast(permutation.get_data()))); - using tuple_type = - thrust::tuple>; + using tuple_type = thrust::tuple; thrust::sort(thrust::device, zip_it, zip_it + nnzb, [] __device__(const tuple_type& a, const tuple_type& b) { return thrust::tie(thrust::get<0>(a), thrust::get<1>(a)) < diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp index 373592392b3..4d67c2adce0 100644 --- a/common/unified/multigrid/pgm_kernels.cpp +++ b/common/unified/multigrid/pgm_kernels.cpp @@ -199,8 +199,9 @@ void find_strongest_neighbor( exec, [] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, auto weight_vals, auto diag, auto agg, auto strongest_neighbor) { - auto max_weight_unagg = zero(); - auto max_weight_agg = zero(); + using value_type = device_type; + auto max_weight_unagg = zero(); + auto max_weight_agg = zero(); IndexType strongest_unagg = -1; IndexType strongest_agg = -1; if (agg[row] != -1) { @@ -267,7 +268,8 @@ void assign_to_exist_agg(std::shared_ptr exec, if (agg_val[row] != -1) { return; } - ValueType max_weight_agg = zero(); + using value_type = device_type; + value_type max_weight_agg = zero(); IndexType strongest_agg = -1; for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) { auto col = col_idxs[idx]; @@ -304,7 +306,8 @@ void assign_to_exist_agg(std::shared_ptr exec, if (agg_val[row] != -1) { return; } - ValueType max_weight_agg = zero(); + using value_type = device_type; + value_type max_weight_agg = zero(); IndexType strongest_agg = -1; for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) { auto col = col_idxs[idx]; diff --git a/cuda/factorization/par_ilut_filter_kernel.cu b/cuda/factorization/par_ilut_filter_kernel.cu index bee3692e7f0..142d07fd4d2 100644 --- a/cuda/factorization/par_ilut_filter_kernel.cu +++ b/cuda/factorization/par_ilut_filter_kernel.cu @@ -97,8 +97,8 @@ void threshold_filter(syn::value_list, if (num_blocks > 0) { kernel::threshold_filter_nnz <<>>( - old_row_ptrs, as_cuda_type(old_vals), num_rows, threshold, - new_row_ptrs, lower); + old_row_ptrs, as_cuda_type(old_vals), num_rows, + as_cuda_type(threshold), new_row_ptrs, lower); } // build row pointers @@ -126,8 +126,8 @@ void threshold_filter(syn::value_list, kernel::threshold_filter <<>>( old_row_ptrs, old_col_idxs, as_cuda_type(old_vals), num_rows, - threshold, new_row_ptrs, new_row_idxs, new_col_idxs, - as_cuda_type(new_vals), lower); + as_cuda_type(threshold), new_row_ptrs, new_row_idxs, + new_col_idxs, as_cuda_type(new_vals), lower); } } diff --git a/cuda/factorization/par_ilut_select_common.cu b/cuda/factorization/par_ilut_select_common.cu index 445d7879d02..91f322509d3 100644 --- a/cuda/factorization/par_ilut_select_common.cu +++ b/cuda/factorization/par_ilut_select_common.cu @@ -70,12 +70,12 @@ void sampleselect_count(std::shared_ptr exec, static_cast(ceildiv(num_threads_total, default_block_size)); // pick sample, build searchtree kernel::build_searchtree<<<1, bucket_count>>>(as_cuda_type(values), size, - tree); + as_cuda_type(tree)); // determine bucket sizes if (num_blocks > 0) { kernel::count_buckets<<>>( - as_cuda_type(values), size, tree, partial_counts, oracles, - items_per_thread); + as_cuda_type(values), size, as_cuda_type(tree), partial_counts, + oracles, items_per_thread); } // compute prefix sum and total sum over block-local values kernel::block_prefix_sum<<>>( diff --git a/cuda/factorization/par_ilut_select_kernel.cu b/cuda/factorization/par_ilut_select_kernel.cu index f36fca4197a..1b7e9fc35fc 100644 --- a/cuda/factorization/par_ilut_select_kernel.cu +++ b/cuda/factorization/par_ilut_select_kernel.cu @@ -77,8 +77,8 @@ void sampleselect_filter(const ValueType* values, IndexType size, static_cast(ceildiv(num_threads_total, default_block_size)); if (num_blocks > 0) { kernel::filter_bucket<<>>( - as_cuda_type(values), size, bucket, oracles, partial_counts, out, - items_per_thread); + as_cuda_type(values), size, bucket, oracles, partial_counts, + as_cuda_type(out), items_per_thread); } } @@ -172,7 +172,7 @@ void threshold_select(std::shared_ptr exec, // base case auto out_ptr = reinterpret_cast(tmp1.get_data()); kernel::basecase_select<<<1, kernel::basecase_block_size>>>( - tmp22, bucket.size, rank, out_ptr); + as_cuda_type(tmp22), bucket.size, rank, as_cuda_type(out_ptr)); threshold = exec->copy_val_to_host(out_ptr); } diff --git a/cuda/solver/idr_kernels.cu b/cuda/solver/idr_kernels.cu index d1e6da182a5..5d86d5044ad 100644 --- a/cuda/solver/idr_kernels.cu +++ b/cuda/solver/idr_kernels.cu @@ -345,7 +345,7 @@ void compute_omega( { const auto grid_dim = ceildiv(nrhs, config::warp_size); compute_omega_kernel<<>>( - nrhs, kappa, as_cuda_type(tht->get_const_values()), + nrhs, as_cuda_type(kappa), as_cuda_type(tht->get_const_values()), as_cuda_type(residual_norm->get_const_values()), as_cuda_type(omega->get_values()), as_cuda_type(stop_status->get_const_data())); diff --git a/cuda/solver/multigrid_kernels.cu b/cuda/solver/multigrid_kernels.cu index b253a5c78de..7fef6bee2d1 100644 --- a/cuda/solver/multigrid_kernels.cu +++ b/cuda/solver/multigrid_kernels.cu @@ -139,7 +139,7 @@ void kcycle_check_stop(std::shared_ptr exec, if (grid > 0) { kernel::kcycle_check_stop_kernel<<>>( nrhs, as_cuda_type(old_norm->get_const_values()), - as_cuda_type(new_norm->get_const_values()), rel_tol, + as_cuda_type(new_norm->get_const_values()), as_cuda_type(rel_tol), as_cuda_type(dis_stop.get_data())); } is_stop = exec->copy_val_to_host(dis_stop.get_const_data()); diff --git a/cuda/stop/residual_norm_kernels.cu b/cuda/stop/residual_norm_kernels.cu index f24cd54b3a1..4dd794f4ea7 100644 --- a/cuda/stop/residual_norm_kernels.cu +++ b/cuda/stop/residual_norm_kernels.cu @@ -106,7 +106,7 @@ void residual_norm(std::shared_ptr exec, if (grid_size > 0) { residual_norm_kernel<<>>( - tau->get_size()[1], rel_residual_goal, + tau->get_size()[1], as_cuda_type(rel_residual_goal), as_cuda_type(tau->get_const_values()), as_cuda_type(orig_tau->get_const_values()), stoppingId, setFinalized, as_cuda_type(stop_status->get_data()), @@ -185,7 +185,7 @@ void implicit_residual_norm( if (grid_size > 0) { implicit_residual_norm_kernel<<>>( - tau->get_size()[1], rel_residual_goal, + tau->get_size()[1], as_cuda_type(rel_residual_goal), as_cuda_type(tau->get_const_values()), as_cuda_type(orig_tau->get_const_values()), stoppingId, setFinalized, as_cuda_type(stop_status->get_data()), diff --git a/hip/factorization/par_ilut_filter_kernel.hip.cpp b/hip/factorization/par_ilut_filter_kernel.hip.cpp index 90b03035327..90de79fbd56 100644 --- a/hip/factorization/par_ilut_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_filter_kernel.hip.cpp @@ -101,7 +101,8 @@ void threshold_filter(syn::value_list, hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel::threshold_filter_nnz), num_blocks, default_block_size, 0, 0, old_row_ptrs, - as_hip_type(old_vals), num_rows, threshold, new_row_ptrs, lower); + as_hip_type(old_vals), num_rows, as_hip_type(threshold), + new_row_ptrs, lower); } // build row pointers @@ -129,8 +130,9 @@ void threshold_filter(syn::value_list, hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel::threshold_filter), num_blocks, default_block_size, 0, 0, old_row_ptrs, old_col_idxs, - as_hip_type(old_vals), num_rows, threshold, new_row_ptrs, - new_row_idxs, new_col_idxs, as_hip_type(new_vals), lower); + as_hip_type(old_vals), num_rows, as_hip_type(threshold), + new_row_ptrs, new_row_idxs, new_col_idxs, as_hip_type(new_vals), + lower); } } diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp index dff3ffed9ec..14a07dc3a57 100644 --- a/hip/factorization/par_ilut_select_common.hip.cpp +++ b/hip/factorization/par_ilut_select_common.hip.cpp @@ -76,12 +76,14 @@ void sampleselect_count(std::shared_ptr exec, static_cast(ceildiv(num_threads_total, default_block_size)); // pick sample, build searchtree hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::build_searchtree), 1, - bucket_count, 0, 0, as_hip_type(values), size, tree); + bucket_count, 0, 0, as_hip_type(values), size, + as_hip_type(tree)); // determine bucket sizes if (num_blocks > 0) { hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::count_buckets), num_blocks, default_block_size, 0, 0, as_hip_type(values), size, - tree, partial_counts, oracles, items_per_thread); + as_hip_type(tree), partial_counts, oracles, + items_per_thread); } // compute prefix sum and total sum over block-local values hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::block_prefix_sum), bucket_count, diff --git a/hip/factorization/par_ilut_select_kernel.hip.cpp b/hip/factorization/par_ilut_select_kernel.hip.cpp index 523059edb82..d9f61f5ceb9 100644 --- a/hip/factorization/par_ilut_select_kernel.hip.cpp +++ b/hip/factorization/par_ilut_select_kernel.hip.cpp @@ -81,7 +81,7 @@ void sampleselect_filter(const ValueType* values, IndexType size, if (num_blocks > 0) { hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::filter_bucket), num_blocks, default_block_size, 0, 0, as_hip_type(values), size, - bucket, oracles, partial_counts, out, + bucket, oracles, partial_counts, as_hip_type(out), items_per_thread); } } @@ -176,8 +176,8 @@ void threshold_select(std::shared_ptr exec, // base case auto out_ptr = reinterpret_cast(tmp1.get_data()); hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::basecase_select), 1, - kernel::basecase_block_size, 0, 0, tmp22, bucket.size, - rank, out_ptr); + kernel::basecase_block_size, 0, 0, as_hip_type(tmp22), + bucket.size, rank, as_hip_type(out_ptr)); threshold = exec->copy_val_to_host(out_ptr); } diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp index 7d6f177e4c4..34b783bd706 100644 --- a/hip/solver/idr_kernels.hip.cpp +++ b/hip/solver/idr_kernels.hip.cpp @@ -356,7 +356,7 @@ void compute_omega( { const auto grid_dim = ceildiv(nrhs, config::warp_size); hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_omega_kernel), grid_dim, - config::warp_size, 0, 0, nrhs, kappa, + config::warp_size, 0, 0, nrhs, as_hip_type(kappa), as_hip_type(tht->get_const_values()), as_hip_type(residual_norm->get_const_values()), as_hip_type(omega->get_values()), diff --git a/hip/solver/multigrid_kernels.hip.cpp b/hip/solver/multigrid_kernels.hip.cpp index 60f54bfed6d..415500db966 100644 --- a/hip/solver/multigrid_kernels.hip.cpp +++ b/hip/solver/multigrid_kernels.hip.cpp @@ -142,11 +142,11 @@ void kcycle_check_stop(std::shared_ptr exec, const auto nrhs = new_norm->get_size()[1]; const auto grid = ceildiv(nrhs, default_block_size); if (grid > 0) { - hipLaunchKernelGGL(kernel::kcycle_check_stop_kernel, grid, - default_block_size, 0, 0, nrhs, - as_hip_type(old_norm->get_const_values()), - as_hip_type(new_norm->get_const_values()), rel_tol, - as_hip_type(dis_stop.get_data())); + hipLaunchKernelGGL( + kernel::kcycle_check_stop_kernel, grid, default_block_size, 0, 0, + nrhs, as_hip_type(old_norm->get_const_values()), + as_hip_type(new_norm->get_const_values()), as_hip_type(rel_tol), + as_hip_type(dis_stop.get_data())); } is_stop = exec->copy_val_to_host(dis_stop.get_const_data()); } diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp index b8f1b522f5a..02f9b54a13e 100644 --- a/hip/stop/residual_norm_kernels.hip.cpp +++ b/hip/stop/residual_norm_kernels.hip.cpp @@ -110,7 +110,7 @@ void residual_norm(std::shared_ptr exec, if (grid_size > 0) { hipLaunchKernelGGL((residual_norm_kernel), grid_size, block_size, 0, 0, - tau->get_size()[1], rel_residual_goal, + tau->get_size()[1], as_hip_type(rel_residual_goal), as_hip_type(tau->get_const_values()), as_hip_type(orig_tau->get_const_values()), stoppingId, setFinalized, @@ -143,13 +143,12 @@ constexpr int default_block_size = 512; template __global__ - __launch_bounds__(default_block_size) void implicit_residual_norm_kernel( - size_type num_cols, remove_complex rel_residual_goal, - const ValueType* __restrict__ tau, - const remove_complex* __restrict__ orig_tau, - uint8 stoppingId, bool setFinalized, - stopping_status* __restrict__ stop_status, - bool* __restrict__ device_storage) +__launch_bounds__(default_block_size) void implicit_residual_norm_kernel( + size_type num_cols, remove_complex rel_residual_goal, + const ValueType* __restrict__ tau, + const remove_complex* __restrict__ orig_tau, uint8 stoppingId, + bool setFinalized, stopping_status* __restrict__ stop_status, + bool* __restrict__ device_storage) { const auto tidx = thread::get_thread_id_flat(); if (tidx < num_cols) { @@ -192,7 +191,7 @@ void implicit_residual_norm( if (grid_size > 0) { hipLaunchKernelGGL( (implicit_residual_norm_kernel), grid_size, block_size, 0, 0, - tau->get_size()[1], rel_residual_goal, + tau->get_size()[1], as_hip_type(rel_residual_goal), as_hip_type(tau->get_const_values()), as_hip_type(orig_tau->get_const_values()), stoppingId, setFinalized, as_hip_type(stop_status->get_data()), From d7b41c76e7b74821b94ee4a99786e60c453191e9 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 9 Feb 2023 12:07:53 +0100 Subject: [PATCH 2/2] delete some redundent as_device_type Co-authored-by: Pratik Nayak --- common/unified/multigrid/pgm_kernels.cpp | 4 ++-- cuda/matrix/csr_kernels.cu | 12 ++++------- cuda/solver/cb_gmres_kernels.cu | 7 +++--- cuda/solver/idr_kernels.cu | 19 ++++++++--------- hip/matrix/csr_kernels.hip.cpp | 10 ++++----- hip/solver/cb_gmres_kernels.hip.cpp | 8 +++---- hip/solver/idr_kernels.hip.cpp | 27 ++++++++++++------------ hip/stop/residual_norm_kernels.hip.cpp | 13 ++++++------ 8 files changed, 45 insertions(+), 55 deletions(-) diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp index 4d67c2adce0..50678d83c9f 100644 --- a/common/unified/multigrid/pgm_kernels.cpp +++ b/common/unified/multigrid/pgm_kernels.cpp @@ -269,7 +269,7 @@ void assign_to_exist_agg(std::shared_ptr exec, return; } using value_type = device_type; - value_type max_weight_agg = zero(); + auto max_weight_agg = zero(); IndexType strongest_agg = -1; for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) { auto col = col_idxs[idx]; @@ -307,7 +307,7 @@ void assign_to_exist_agg(std::shared_ptr exec, return; } using value_type = device_type; - value_type max_weight_agg = zero(); + auto max_weight_agg = zero(); IndexType strongest_agg = -1; for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) { auto col = col_idxs[idx]; diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index 9a4cefaae76..18d4d93eb26 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -298,10 +298,8 @@ void load_balance_spmv(std::shared_ptr exec, a->get_const_col_idxs(), as_cuda_type(a->get_const_row_ptrs()), as_cuda_type(a->get_const_srow()), - as_cuda_type(b->get_const_values()), - as_cuda_type(b->get_stride()), - as_cuda_type(c->get_values()), - as_cuda_type(c->get_stride())); + as_cuda_type(b->get_const_values()), b->get_stride(), + as_cuda_type(c->get_values()), c->get_stride()); } } else { if (csr_grid.x > 0 && csr_grid.y > 0) { @@ -311,10 +309,8 @@ void load_balance_spmv(std::shared_ptr exec, a->get_const_col_idxs(), as_cuda_type(a->get_const_row_ptrs()), as_cuda_type(a->get_const_srow()), - as_cuda_type(b->get_const_values()), - as_cuda_type(b->get_stride()), - as_cuda_type(c->get_values()), - as_cuda_type(c->get_stride())); + as_cuda_type(b->get_const_values()), b->get_stride(), + as_cuda_type(c->get_values()), c->get_stride()); } } } diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu index 5944446f8f0..2058ff084c8 100644 --- a/cuda/solver/cb_gmres_kernels.cu +++ b/cuda/solver/cb_gmres_kernels.cu @@ -348,7 +348,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, stride_arnoldi, as_cuda_type(hessenberg_iter->get_values()), stride_hessenberg, iter + 1, acc::as_cuda_range(krylov_bases), as_cuda_type(stop_status), as_cuda_type(reorth_status), - as_cuda_type(num_reorth->get_data())); + num_reorth->get_data()); num_reorth_host = exec->copy_val_to_host(num_reorth->get_const_data()); } @@ -388,8 +388,7 @@ void givens_rotation(std::shared_ptr exec, givens_sin->get_stride(), as_cuda_type(givens_cos->get_values()), givens_cos->get_stride(), as_cuda_type(residual_norm->get_values()), as_cuda_type(residual_norm_collection->get_values()), - residual_norm_collection->get_stride(), - as_cuda_type(stop_status->get_const_data())); + residual_norm_collection->get_stride(), stop_status->get_const_data()); } @@ -412,7 +411,7 @@ void arnoldi(std::shared_ptr exec, static_cast( ceildiv(final_iter_nums->get_num_elems(), default_block_size)), default_block_size>>>(as_cuda_type(final_iter_nums->get_data()), - as_cuda_type(stop_status->get_const_data()), + stop_status->get_const_data(), final_iter_nums->get_num_elems()); finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter, buffer_iter, arnoldi_norm, iter, diff --git a/cuda/solver/idr_kernels.cu b/cuda/solver/idr_kernels.cu index 5d86d5044ad..e774821752c 100644 --- a/cuda/solver/idr_kernels.cu +++ b/cuda/solver/idr_kernels.cu @@ -129,7 +129,7 @@ void solve_lower_triangular(const size_type nrhs, subspace_dim, nrhs, as_cuda_type(m->get_const_values()), m->get_stride(), as_cuda_type(f->get_const_values()), f->get_stride(), as_cuda_type(c->get_values()), c->get_stride(), - as_cuda_type(stop_status->get_const_data())); + stop_status->get_const_data()); } @@ -161,7 +161,7 @@ void update_g_and_u(std::shared_ptr exec, multidot_kernel<<>>( size, nrhs, as_cuda_type(p_i), as_cuda_type(g_k->get_values()), g_k->get_stride(), as_cuda_type(alpha->get_values()), - as_cuda_type(stop_status->get_const_data())); + stop_status->get_const_data()); } else { cublas::dot(exec->get_cublas_handle(), size, p_i, 1, g_k->get_values(), g_k->get_stride(), @@ -175,14 +175,14 @@ void update_g_and_u(std::shared_ptr exec, as_cuda_type(g->get_const_values()), g->get_stride(), as_cuda_type(g_k->get_values()), g_k->get_stride(), as_cuda_type(u->get_values()), u->get_stride(), - as_cuda_type(stop_status->get_const_data())); + stop_status->get_const_data()); } update_g_kernel <<get_stride(), default_block_size), default_block_size>>>( k, size, nrhs, as_cuda_type(g_k->get_const_values()), g_k->get_stride(), as_cuda_type(g->get_values()), g->get_stride(), - as_cuda_type(stop_status->get_const_data())); + stop_status->get_const_data()); } @@ -212,7 +212,7 @@ void update_m(std::shared_ptr exec, const size_type nrhs, multidot_kernel<<>>( size, nrhs, as_cuda_type(p_i), as_cuda_type(g_k->get_const_values()), g_k->get_stride(), - as_cuda_type(m_i), as_cuda_type(stop_status->get_const_data())); + as_cuda_type(m_i), stop_status->get_const_data()); } else { cublas::dot(exec->get_cublas_handle(), size, p_i, 1, g_k->get_const_values(), g_k->get_stride(), m_i); @@ -242,7 +242,7 @@ void update_x_r_and_f(std::shared_ptr exec, as_cuda_type(f->get_values()), f->get_stride(), as_cuda_type(r->get_values()), r->get_stride(), as_cuda_type(x->get_values()), x->get_stride(), - as_cuda_type(stop_status->get_const_data())); + stop_status->get_const_data()); components::fill_array(exec, f->get_values() + k * f->get_stride(), nrhs, zero()); } @@ -286,7 +286,7 @@ void step_1(std::shared_ptr exec, const size_type nrhs, as_cuda_type(c->get_const_values()), c->get_stride(), as_cuda_type(g->get_const_values()), g->get_stride(), as_cuda_type(v->get_values()), v->get_stride(), - as_cuda_type(stop_status->get_const_data())); + stop_status->get_const_data()); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL); @@ -313,7 +313,7 @@ void step_2(std::shared_ptr exec, const size_type nrhs, preconditioned_vector->get_stride(), as_cuda_type(c->get_const_values()), c->get_stride(), as_cuda_type(u->get_values()), u->get_stride(), - as_cuda_type(stop_status->get_const_data())); + stop_status->get_const_data()); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL); @@ -347,8 +347,7 @@ void compute_omega( compute_omega_kernel<<>>( nrhs, as_cuda_type(kappa), as_cuda_type(tht->get_const_values()), as_cuda_type(residual_norm->get_const_values()), - as_cuda_type(omega->get_values()), - as_cuda_type(stop_status->get_const_data())); + as_cuda_type(omega->get_values()), stop_status->get_const_data()); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index 97ad8393b9f..9be1bef3ade 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -303,9 +303,8 @@ void spmv(std::shared_ptr exec, as_hip_type(a->get_const_values()), a->get_const_col_idxs(), as_hip_type(a->get_const_row_ptrs()), as_hip_type(a->get_const_srow()), - as_hip_type(b->get_const_values()), - as_hip_type(b->get_stride()), as_hip_type(c->get_values()), - as_hip_type(c->get_stride())); + as_hip_type(b->get_const_values()), b->get_stride(), + as_hip_type(c->get_values()), c->get_stride()); } } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = @@ -397,9 +396,8 @@ void advanced_spmv(std::shared_ptr exec, as_hip_type(a->get_const_values()), a->get_const_col_idxs(), as_hip_type(a->get_const_row_ptrs()), as_hip_type(a->get_const_srow()), - as_hip_type(b->get_const_values()), - as_hip_type(b->get_stride()), as_hip_type(c->get_values()), - as_hip_type(c->get_stride())); + as_hip_type(b->get_const_values()), b->get_stride(), + as_hip_type(c->get_values()), c->get_stride()); } } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp index af4298be8dd..39c34f37ae9 100644 --- a/hip/solver/cb_gmres_kernels.hip.cpp +++ b/hip/solver/cb_gmres_kernels.hip.cpp @@ -358,7 +358,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, stride_arnoldi, as_hip_type(hessenberg_iter->get_values()), stride_hessenberg, iter + 1, acc::as_hip_range(krylov_bases), as_hip_type(stop_status), as_hip_type(reorth_status), - as_hip_type(num_reorth->get_data())); + num_reorth->get_data()); num_reorth_host = exec->copy_val_to_host(num_reorth->get_const_data()); // num_reorth_host := number of next_krylov vector to be // reorthogonalization @@ -401,8 +401,7 @@ void givens_rotation(std::shared_ptr exec, givens_sin->get_stride(), as_hip_type(givens_cos->get_values()), givens_cos->get_stride(), as_hip_type(residual_norm->get_values()), as_hip_type(residual_norm_collection->get_values()), - residual_norm_collection->get_stride(), - as_hip_type(stop_status->get_const_data())); + residual_norm_collection->get_stride(), stop_status->get_const_data()); } @@ -426,8 +425,7 @@ void arnoldi(std::shared_ptr exec, static_cast( ceildiv(final_iter_nums->get_num_elems(), default_block_size)), default_block_size, 0, 0, as_hip_type(final_iter_nums->get_data()), - as_hip_type(stop_status->get_const_data()), - final_iter_nums->get_num_elems()); + stop_status->get_const_data(), final_iter_nums->get_num_elems()); finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter, buffer_iter, arnoldi_norm, iter, stop_status->get_const_data(), reorth_status->get_data(), diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp index 34b783bd706..b9168aa55d9 100644 --- a/hip/solver/idr_kernels.hip.cpp +++ b/hip/solver/idr_kernels.hip.cpp @@ -135,7 +135,7 @@ void solve_lower_triangular(const size_type nrhs, as_hip_type(m->get_const_values()), m->get_stride(), as_hip_type(f->get_const_values()), f->get_stride(), as_hip_type(c->get_values()), c->get_stride(), - as_hip_type(stop_status->get_const_data())); + stop_status->get_const_data()); } @@ -168,7 +168,7 @@ void update_g_and_u(std::shared_ptr exec, multidot_kernel, grid_dim, block_dim, 0, 0, size, nrhs, as_hip_type(p_i), as_hip_type(g_k->get_values()), g_k->get_stride(), as_hip_type(alpha->get_values()), - as_hip_type(stop_status->get_const_data())); + stop_status->get_const_data()); } else { hipblas::dot(exec->get_hipblas_handle(), size, p_i, 1, g_k->get_values(), g_k->get_stride(), @@ -183,14 +183,14 @@ void update_g_and_u(std::shared_ptr exec, as_hip_type(g->get_const_values()), g->get_stride(), as_hip_type(g_k->get_values()), g_k->get_stride(), as_hip_type(u->get_values()), u->get_stride(), - as_hip_type(stop_status->get_const_data())); + stop_status->get_const_data()); } hipLaunchKernelGGL(update_g_kernel, ceildiv(size * g_k->get_stride(), default_block_size), default_block_size, 0, 0, k, size, nrhs, as_hip_type(g_k->get_const_values()), g_k->get_stride(), as_hip_type(g->get_values()), g->get_stride(), - as_hip_type(stop_status->get_const_data())); + stop_status->get_const_data()); } @@ -221,7 +221,7 @@ void update_m(std::shared_ptr exec, const size_type nrhs, nrhs, as_hip_type(p_i), as_hip_type(g_k->get_const_values()), g_k->get_stride(), as_hip_type(m_i), - as_hip_type(stop_status->get_const_data())); + stop_status->get_const_data()); } else { hipblas::dot(exec->get_hipblas_handle(), size, p_i, 1, g_k->get_const_values(), g_k->get_stride(), m_i); @@ -252,7 +252,7 @@ void update_x_r_and_f(std::shared_ptr exec, as_hip_type(f->get_values()), f->get_stride(), as_hip_type(r->get_values()), r->get_stride(), as_hip_type(x->get_values()), x->get_stride(), - as_hip_type(stop_status->get_const_data())); + stop_status->get_const_data()); components::fill_array(exec, f->get_values() + k * f->get_stride(), nrhs, zero()); } @@ -297,7 +297,7 @@ void step_1(std::shared_ptr exec, const size_type nrhs, residual->get_stride(), as_hip_type(c->get_const_values()), c->get_stride(), as_hip_type(g->get_const_values()), g->get_stride(), as_hip_type(v->get_values()), v->get_stride(), - as_hip_type(stop_status->get_const_data())); + stop_status->get_const_data()); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL); @@ -324,7 +324,7 @@ void step_2(std::shared_ptr exec, const size_type nrhs, as_hip_type(preconditioned_vector->get_const_values()), preconditioned_vector->get_stride(), as_hip_type(c->get_const_values()), c->get_stride(), as_hip_type(u->get_values()), u->get_stride(), - as_hip_type(stop_status->get_const_data())); + stop_status->get_const_data()); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL); @@ -355,12 +355,11 @@ void compute_omega( matrix::Dense* omega, const array* stop_status) { const auto grid_dim = ceildiv(nrhs, config::warp_size); - hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_omega_kernel), grid_dim, - config::warp_size, 0, 0, nrhs, as_hip_type(kappa), - as_hip_type(tht->get_const_values()), - as_hip_type(residual_norm->get_const_values()), - as_hip_type(omega->get_values()), - as_hip_type(stop_status->get_const_data())); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(compute_omega_kernel), grid_dim, config::warp_size, 0, + 0, nrhs, as_hip_type(kappa), as_hip_type(tht->get_const_values()), + as_hip_type(residual_norm->get_const_values()), + as_hip_type(omega->get_values()), stop_status->get_const_data()); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp index 02f9b54a13e..f5f4fedba86 100644 --- a/hip/stop/residual_norm_kernels.hip.cpp +++ b/hip/stop/residual_norm_kernels.hip.cpp @@ -143,12 +143,13 @@ constexpr int default_block_size = 512; template __global__ -__launch_bounds__(default_block_size) void implicit_residual_norm_kernel( - size_type num_cols, remove_complex rel_residual_goal, - const ValueType* __restrict__ tau, - const remove_complex* __restrict__ orig_tau, uint8 stoppingId, - bool setFinalized, stopping_status* __restrict__ stop_status, - bool* __restrict__ device_storage) + __launch_bounds__(default_block_size) void implicit_residual_norm_kernel( + size_type num_cols, remove_complex rel_residual_goal, + const ValueType* __restrict__ tau, + const remove_complex* __restrict__ orig_tau, + uint8 stoppingId, bool setFinalized, + stopping_status* __restrict__ stop_status, + bool* __restrict__ device_storage) { const auto tidx = thread::get_thread_id_flat(); if (tidx < num_cols) {