Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -294,8 +294,7 @@ void fallback_transpose(const std::shared_ptr<const DefaultExecutor> exec,
thrust::device_pointer_cast(out_row_idxs.get_data()),
thrust::device_pointer_cast(out_col_idxs),
thrust::device_pointer_cast(permutation.get_data())));
using tuple_type =
thrust::tuple<IndexType, IndexType, device_type<ValueType>>;
using tuple_type = thrust::tuple<IndexType, IndexType, IndexType>;
thrust::sort(thrust::device, zip_it, zip_it + nnzb,
[] __device__(const tuple_type& a, const tuple_type& b) {
return thrust::tie(thrust::get<0>(a), thrust::get<1>(a)) <
Expand Down
11 changes: 7 additions & 4 deletions common/unified/multigrid/pgm_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,9 @@ void find_strongest_neighbor(
exec,
[] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, auto weight_vals,
auto diag, auto agg, auto strongest_neighbor) {
auto max_weight_unagg = zero<ValueType>();
auto max_weight_agg = zero<ValueType>();
using value_type = device_type<ValueType>;
auto max_weight_unagg = zero<value_type>();
auto max_weight_agg = zero<value_type>();
IndexType strongest_unagg = -1;
IndexType strongest_agg = -1;
if (agg[row] != -1) {
Expand Down Expand Up @@ -267,7 +268,8 @@ void assign_to_exist_agg(std::shared_ptr<const DefaultExecutor> exec,
if (agg_val[row] != -1) {
return;
}
ValueType max_weight_agg = zero<ValueType>();
using value_type = device_type<ValueType>;
auto max_weight_agg = zero<value_type>();
IndexType strongest_agg = -1;
for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) {
auto col = col_idxs[idx];
Expand Down Expand Up @@ -304,7 +306,8 @@ void assign_to_exist_agg(std::shared_ptr<const DefaultExecutor> exec,
if (agg_val[row] != -1) {
return;
}
ValueType max_weight_agg = zero<ValueType>();
using value_type = device_type<ValueType>;
auto max_weight_agg = zero<value_type>();
IndexType strongest_agg = -1;
for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) {
auto col = col_idxs[idx];
Expand Down
8 changes: 4 additions & 4 deletions cuda/factorization/par_ilut_filter_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ void threshold_filter(syn::value_list<int, subwarp_size>,
if (num_blocks > 0) {
kernel::threshold_filter_nnz<subwarp_size>
<<<num_blocks, default_block_size>>>(
old_row_ptrs, as_cuda_type(old_vals), num_rows, threshold,
new_row_ptrs, lower);
old_row_ptrs, as_cuda_type(old_vals), num_rows,
as_cuda_type(threshold), new_row_ptrs, lower);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we actually need that? As far as I can see, gko::half should be the same on CPU vs. GPU.
Are you changing the type so that gko::half is a different type on CPU and GPU?
Looks like that is your plan in #1257.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I will use __half on the GPU but gko::half on CPU.

}

// build row pointers
Expand Down Expand Up @@ -126,8 +126,8 @@ void threshold_filter(syn::value_list<int, subwarp_size>,
kernel::threshold_filter<subwarp_size>
<<<num_blocks, default_block_size>>>(
old_row_ptrs, old_col_idxs, as_cuda_type(old_vals), num_rows,
threshold, new_row_ptrs, new_row_idxs, new_col_idxs,
as_cuda_type(new_vals), lower);
as_cuda_type(threshold), new_row_ptrs, new_row_idxs,
new_col_idxs, as_cuda_type(new_vals), lower);
}
}

Expand Down
6 changes: 3 additions & 3 deletions cuda/factorization/par_ilut_select_common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,12 @@ void sampleselect_count(std::shared_ptr<const DefaultExecutor> exec,
static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
// pick sample, build searchtree
kernel::build_searchtree<<<1, bucket_count>>>(as_cuda_type(values), size,
tree);
as_cuda_type(tree));
// determine bucket sizes
if (num_blocks > 0) {
kernel::count_buckets<<<num_blocks, default_block_size>>>(
as_cuda_type(values), size, tree, partial_counts, oracles,
items_per_thread);
as_cuda_type(values), size, as_cuda_type(tree), partial_counts,
oracles, items_per_thread);
}
// compute prefix sum and total sum over block-local values
kernel::block_prefix_sum<<<bucket_count, default_block_size>>>(
Expand Down
6 changes: 3 additions & 3 deletions cuda/factorization/par_ilut_select_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ void sampleselect_filter(const ValueType* values, IndexType size,
static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
if (num_blocks > 0) {
kernel::filter_bucket<<<num_blocks, default_block_size>>>(
as_cuda_type(values), size, bucket, oracles, partial_counts, out,
items_per_thread);
as_cuda_type(values), size, bucket, oracles, partial_counts,
as_cuda_type(out), items_per_thread);
}
}

Expand Down Expand Up @@ -172,7 +172,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
// base case
auto out_ptr = reinterpret_cast<AbsType*>(tmp1.get_data());
kernel::basecase_select<<<1, kernel::basecase_block_size>>>(
tmp22, bucket.size, rank, out_ptr);
as_cuda_type(tmp22), bucket.size, rank, as_cuda_type(out_ptr));
threshold = exec->copy_val_to_host(out_ptr);
}

Expand Down
12 changes: 4 additions & 8 deletions cuda/matrix/csr_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -298,10 +298,8 @@ void load_balance_spmv(std::shared_ptr<const CudaExecutor> exec,
a->get_const_col_idxs(),
as_cuda_type(a->get_const_row_ptrs()),
as_cuda_type(a->get_const_srow()),
as_cuda_type(b->get_const_values()),
as_cuda_type(b->get_stride()),
as_cuda_type(c->get_values()),
as_cuda_type(c->get_stride()));
as_cuda_type(b->get_const_values()), b->get_stride(),
as_cuda_type(c->get_values()), c->get_stride());
}
} else {
if (csr_grid.x > 0 && csr_grid.y > 0) {
Expand All @@ -311,10 +309,8 @@ void load_balance_spmv(std::shared_ptr<const CudaExecutor> exec,
a->get_const_col_idxs(),
as_cuda_type(a->get_const_row_ptrs()),
as_cuda_type(a->get_const_srow()),
as_cuda_type(b->get_const_values()),
as_cuda_type(b->get_stride()),
as_cuda_type(c->get_values()),
as_cuda_type(c->get_stride()));
as_cuda_type(b->get_const_values()), b->get_stride(),
as_cuda_type(c->get_values()), c->get_stride());
}
}
}
Expand Down
7 changes: 3 additions & 4 deletions cuda/solver/cb_gmres_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const CudaExecutor> exec,
stride_arnoldi, as_cuda_type(hessenberg_iter->get_values()),
stride_hessenberg, iter + 1, acc::as_cuda_range(krylov_bases),
as_cuda_type(stop_status), as_cuda_type(reorth_status),
as_cuda_type(num_reorth->get_data()));
num_reorth->get_data());
num_reorth_host = exec->copy_val_to_host(num_reorth->get_const_data());
}

Expand Down Expand Up @@ -388,8 +388,7 @@ void givens_rotation(std::shared_ptr<const CudaExecutor> exec,
givens_sin->get_stride(), as_cuda_type(givens_cos->get_values()),
givens_cos->get_stride(), as_cuda_type(residual_norm->get_values()),
as_cuda_type(residual_norm_collection->get_values()),
residual_norm_collection->get_stride(),
as_cuda_type(stop_status->get_const_data()));
residual_norm_collection->get_stride(), stop_status->get_const_data());
}


Expand All @@ -412,7 +411,7 @@ void arnoldi(std::shared_ptr<const CudaExecutor> exec,
static_cast<unsigned int>(
ceildiv(final_iter_nums->get_num_elems(), default_block_size)),
default_block_size>>>(as_cuda_type(final_iter_nums->get_data()),
as_cuda_type(stop_status->get_const_data()),
stop_status->get_const_data(),
final_iter_nums->get_num_elems());
finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter,
buffer_iter, arnoldi_norm, iter,
Expand Down
21 changes: 10 additions & 11 deletions cuda/solver/idr_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ void solve_lower_triangular(const size_type nrhs,
subspace_dim, nrhs, as_cuda_type(m->get_const_values()),
m->get_stride(), as_cuda_type(f->get_const_values()), f->get_stride(),
as_cuda_type(c->get_values()), c->get_stride(),
as_cuda_type(stop_status->get_const_data()));
stop_status->get_const_data());
}


Expand Down Expand Up @@ -161,7 +161,7 @@ void update_g_and_u(std::shared_ptr<const CudaExecutor> exec,
multidot_kernel<<<grid_dim, block_dim>>>(
size, nrhs, as_cuda_type(p_i), as_cuda_type(g_k->get_values()),
g_k->get_stride(), as_cuda_type(alpha->get_values()),
as_cuda_type(stop_status->get_const_data()));
stop_status->get_const_data());
} else {
cublas::dot(exec->get_cublas_handle(), size, p_i, 1,
g_k->get_values(), g_k->get_stride(),
Expand All @@ -175,14 +175,14 @@ void update_g_and_u(std::shared_ptr<const CudaExecutor> exec,
as_cuda_type(g->get_const_values()), g->get_stride(),
as_cuda_type(g_k->get_values()), g_k->get_stride(),
as_cuda_type(u->get_values()), u->get_stride(),
as_cuda_type(stop_status->get_const_data()));
stop_status->get_const_data());
}
update_g_kernel<default_block_size>
<<<ceildiv(size * g_k->get_stride(), default_block_size),
default_block_size>>>(
k, size, nrhs, as_cuda_type(g_k->get_const_values()),
g_k->get_stride(), as_cuda_type(g->get_values()), g->get_stride(),
as_cuda_type(stop_status->get_const_data()));
stop_status->get_const_data());
}


Expand Down Expand Up @@ -212,7 +212,7 @@ void update_m(std::shared_ptr<const CudaExecutor> exec, const size_type nrhs,
multidot_kernel<<<grid_dim, block_dim>>>(
size, nrhs, as_cuda_type(p_i),
as_cuda_type(g_k->get_const_values()), g_k->get_stride(),
as_cuda_type(m_i), as_cuda_type(stop_status->get_const_data()));
as_cuda_type(m_i), stop_status->get_const_data());
} else {
cublas::dot(exec->get_cublas_handle(), size, p_i, 1,
g_k->get_const_values(), g_k->get_stride(), m_i);
Expand Down Expand Up @@ -242,7 +242,7 @@ void update_x_r_and_f(std::shared_ptr<const CudaExecutor> exec,
as_cuda_type(f->get_values()), f->get_stride(),
as_cuda_type(r->get_values()), r->get_stride(),
as_cuda_type(x->get_values()), x->get_stride(),
as_cuda_type(stop_status->get_const_data()));
stop_status->get_const_data());
components::fill_array(exec, f->get_values() + k * f->get_stride(), nrhs,
zero<ValueType>());
}
Expand Down Expand Up @@ -286,7 +286,7 @@ void step_1(std::shared_ptr<const CudaExecutor> exec, const size_type nrhs,
as_cuda_type(c->get_const_values()), c->get_stride(),
as_cuda_type(g->get_const_values()), g->get_stride(),
as_cuda_type(v->get_values()), v->get_stride(),
as_cuda_type(stop_status->get_const_data()));
stop_status->get_const_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL);
Expand All @@ -313,7 +313,7 @@ void step_2(std::shared_ptr<const CudaExecutor> exec, const size_type nrhs,
preconditioned_vector->get_stride(),
as_cuda_type(c->get_const_values()), c->get_stride(),
as_cuda_type(u->get_values()), u->get_stride(),
as_cuda_type(stop_status->get_const_data()));
stop_status->get_const_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL);
Expand Down Expand Up @@ -345,10 +345,9 @@ void compute_omega(
{
const auto grid_dim = ceildiv(nrhs, config::warp_size);
compute_omega_kernel<<<grid_dim, config::warp_size>>>(
nrhs, kappa, as_cuda_type(tht->get_const_values()),
nrhs, as_cuda_type(kappa), as_cuda_type(tht->get_const_values()),
as_cuda_type(residual_norm->get_const_values()),
as_cuda_type(omega->get_values()),
as_cuda_type(stop_status->get_const_data()));
as_cuda_type(omega->get_values()), stop_status->get_const_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
Expand Down
2 changes: 1 addition & 1 deletion cuda/solver/multigrid_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ void kcycle_check_stop(std::shared_ptr<const DefaultExecutor> exec,
if (grid > 0) {
kernel::kcycle_check_stop_kernel<<<grid, default_block_size>>>(
nrhs, as_cuda_type(old_norm->get_const_values()),
as_cuda_type(new_norm->get_const_values()), rel_tol,
as_cuda_type(new_norm->get_const_values()), as_cuda_type(rel_tol),
as_cuda_type(dis_stop.get_data()));
}
is_stop = exec->copy_val_to_host(dis_stop.get_const_data());
Expand Down
4 changes: 2 additions & 2 deletions cuda/stop/residual_norm_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ void residual_norm(std::shared_ptr<const CudaExecutor> exec,

if (grid_size > 0) {
residual_norm_kernel<<<grid_size, block_size>>>(
tau->get_size()[1], rel_residual_goal,
tau->get_size()[1], as_cuda_type(rel_residual_goal),
as_cuda_type(tau->get_const_values()),
as_cuda_type(orig_tau->get_const_values()), stoppingId,
setFinalized, as_cuda_type(stop_status->get_data()),
Expand Down Expand Up @@ -185,7 +185,7 @@ void implicit_residual_norm(

if (grid_size > 0) {
implicit_residual_norm_kernel<<<grid_size, block_size>>>(
tau->get_size()[1], rel_residual_goal,
tau->get_size()[1], as_cuda_type(rel_residual_goal),
as_cuda_type(tau->get_const_values()),
as_cuda_type(orig_tau->get_const_values()), stoppingId,
setFinalized, as_cuda_type(stop_status->get_data()),
Expand Down
8 changes: 5 additions & 3 deletions hip/factorization/par_ilut_filter_kernel.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,8 @@ void threshold_filter(syn::value_list<int, subwarp_size>,
hipLaunchKernelGGL(
HIP_KERNEL_NAME(kernel::threshold_filter_nnz<subwarp_size>),
num_blocks, default_block_size, 0, 0, old_row_ptrs,
as_hip_type(old_vals), num_rows, threshold, new_row_ptrs, lower);
as_hip_type(old_vals), num_rows, as_hip_type(threshold),
new_row_ptrs, lower);
}

// build row pointers
Expand Down Expand Up @@ -129,8 +130,9 @@ void threshold_filter(syn::value_list<int, subwarp_size>,
hipLaunchKernelGGL(
HIP_KERNEL_NAME(kernel::threshold_filter<subwarp_size>), num_blocks,
default_block_size, 0, 0, old_row_ptrs, old_col_idxs,
as_hip_type(old_vals), num_rows, threshold, new_row_ptrs,
new_row_idxs, new_col_idxs, as_hip_type(new_vals), lower);
as_hip_type(old_vals), num_rows, as_hip_type(threshold),
new_row_ptrs, new_row_idxs, new_col_idxs, as_hip_type(new_vals),
lower);
}
}

Expand Down
6 changes: 4 additions & 2 deletions hip/factorization/par_ilut_select_common.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,14 @@ void sampleselect_count(std::shared_ptr<const DefaultExecutor> exec,
static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
// pick sample, build searchtree
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::build_searchtree), 1,
bucket_count, 0, 0, as_hip_type(values), size, tree);
bucket_count, 0, 0, as_hip_type(values), size,
as_hip_type(tree));
// determine bucket sizes
if (num_blocks > 0) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::count_buckets), num_blocks,
default_block_size, 0, 0, as_hip_type(values), size,
tree, partial_counts, oracles, items_per_thread);
as_hip_type(tree), partial_counts, oracles,
items_per_thread);
}
// compute prefix sum and total sum over block-local values
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::block_prefix_sum), bucket_count,
Expand Down
6 changes: 3 additions & 3 deletions hip/factorization/par_ilut_select_kernel.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ void sampleselect_filter(const ValueType* values, IndexType size,
if (num_blocks > 0) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::filter_bucket), num_blocks,
default_block_size, 0, 0, as_hip_type(values), size,
bucket, oracles, partial_counts, out,
bucket, oracles, partial_counts, as_hip_type(out),
items_per_thread);
}
}
Expand Down Expand Up @@ -176,8 +176,8 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
// base case
auto out_ptr = reinterpret_cast<AbsType*>(tmp1.get_data());
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::basecase_select), 1,
kernel::basecase_block_size, 0, 0, tmp22, bucket.size,
rank, out_ptr);
kernel::basecase_block_size, 0, 0, as_hip_type(tmp22),
bucket.size, rank, as_hip_type(out_ptr));
threshold = exec->copy_val_to_host(out_ptr);
}

Expand Down
10 changes: 4 additions & 6 deletions hip/matrix/csr_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,9 +303,8 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
as_hip_type(a->get_const_row_ptrs()),
as_hip_type(a->get_const_srow()),
as_hip_type(b->get_const_values()),
as_hip_type(b->get_stride()), as_hip_type(c->get_values()),
as_hip_type(c->get_stride()));
as_hip_type(b->get_const_values()), b->get_stride(),
as_hip_type(c->get_values()), c->get_stride());
}
} else if (a->get_strategy()->get_name() == "merge_path") {
int items_per_thread =
Expand Down Expand Up @@ -397,9 +396,8 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
as_hip_type(a->get_const_row_ptrs()),
as_hip_type(a->get_const_srow()),
as_hip_type(b->get_const_values()),
as_hip_type(b->get_stride()), as_hip_type(c->get_values()),
as_hip_type(c->get_stride()));
as_hip_type(b->get_const_values()), b->get_stride(),
as_hip_type(c->get_values()), c->get_stride());
}
} else if (a->get_strategy()->get_name() == "merge_path") {
int items_per_thread =
Expand Down
8 changes: 3 additions & 5 deletions hip/solver/cb_gmres_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const HipExecutor> exec,
stride_arnoldi, as_hip_type(hessenberg_iter->get_values()),
stride_hessenberg, iter + 1, acc::as_hip_range(krylov_bases),
as_hip_type(stop_status), as_hip_type(reorth_status),
as_hip_type(num_reorth->get_data()));
num_reorth->get_data());
num_reorth_host = exec->copy_val_to_host(num_reorth->get_const_data());
// num_reorth_host := number of next_krylov vector to be
// reorthogonalization
Expand Down Expand Up @@ -401,8 +401,7 @@ void givens_rotation(std::shared_ptr<const HipExecutor> exec,
givens_sin->get_stride(), as_hip_type(givens_cos->get_values()),
givens_cos->get_stride(), as_hip_type(residual_norm->get_values()),
as_hip_type(residual_norm_collection->get_values()),
residual_norm_collection->get_stride(),
as_hip_type(stop_status->get_const_data()));
residual_norm_collection->get_stride(), stop_status->get_const_data());
}


Expand All @@ -426,8 +425,7 @@ void arnoldi(std::shared_ptr<const HipExecutor> exec,
static_cast<unsigned int>(
ceildiv(final_iter_nums->get_num_elems(), default_block_size)),
default_block_size, 0, 0, as_hip_type(final_iter_nums->get_data()),
as_hip_type(stop_status->get_const_data()),
final_iter_nums->get_num_elems());
stop_status->get_const_data(), final_iter_nums->get_num_elems());
finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter,
buffer_iter, arnoldi_norm, iter,
stop_status->get_const_data(), reorth_status->get_data(),
Expand Down
Loading