Skip to content

Commit 6654849

Browse files
authored
Feature: implement k continuity initialization strategy & kernel (#6171)
* Feature: implement k continuity initialization strategy & kernel optimization in davidson-subspcae algorithm - add k continuity initialization strategy in planewave basis - implement heterogenous computation branching between CPU and DCU - implement optimized eigenvalue operations for GPU & DCU - implement optimized preconditioner for GPU & DCU - implement optimized normalization op for GPU & DCU * add tests & docs * fix brace error
1 parent 025e574 commit 6654849

File tree

23 files changed

+1371
-140
lines changed

23 files changed

+1371
-140
lines changed

docs/advanced/input_files/input-main.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
- [pw\_seed](#pw_seed)
3939
- [pw\_diag\_thr](#pw_diag_thr)
4040
- [diago\_smooth\_ethr](#diago_smooth_ethr)
41+
- [use\_k\_continuity](#use_k_continuity)
4142
- [pw\_diag\_nmax](#pw_diag_nmax)
4243
- [pw\_diag\_ndim](#pw_diag_ndim)
4344
- [diag\_subspace](#diag_subspace)
@@ -824,6 +825,18 @@ These variables are used to control the plane wave related parameters.
824825
- **Description**: If `TRUE`, the smooth threshold strategy, which applies a larger threshold (10e-5) for the empty states, will be implemented in the diagonalization methods. (This strategy should not affect total energy, forces, and other ground-state properties, but computational efficiency will be improved.) If `FALSE`, the smooth threshold strategy will not be applied.
825826
- **Default**: false
826827

828+
### use_k_continuity
829+
830+
- **Type**: Boolean
831+
- **Availability**: Used only for plane wave basis set.
832+
- **Description**: Whether to use k-point continuity for initializing wave functions. When enabled, this strategy exploits the similarity between wavefunctions at neighboring k-points by propagating the wavefunction from a previously initialized k-point to a new k-point, significantly reducing the computational cost of the initial guess.
833+
834+
**Important constraints:**
835+
- Must be used together with `diago_smooth_ethr = 1` for optimal performance
836+
837+
This feature is particularly useful for calculations with dense k-point sampling where the computational cost of wavefunction initialization becomes significant.
838+
- **Default**: false
839+
827840
### pw_diag_nmax
828841

829842
- **Type**: Integer

source/module_base/kernels/cuda/math_kernel_op_vec.cu

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,19 @@ __global__ void vector_mul_vector_kernel(const int size,
5252
}
5353
}
5454

55+
template <typename T>
56+
__global__ void vector_div_constant_kernel(const int size,
57+
T* result,
58+
const T* vector,
59+
const typename GetTypeReal<T>::type constant)
60+
{
61+
int i = blockIdx.x * blockDim.x + threadIdx.x;
62+
if (i < size)
63+
{
64+
result[i] = vector[i] / constant;
65+
}
66+
}
67+
5568
template <typename T>
5669
__global__ void vector_div_vector_kernel(const int size,
5770
T* result,
@@ -127,6 +140,55 @@ void vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>::operator
127140
vector_mul_real_wrapper(dim, result, vector, constant);
128141
}
129142

143+
// vector operator: result[i] = vector[i] / constant
144+
template <>
145+
void vector_div_constant_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
146+
double* result,
147+
const double* vector,
148+
const double constant)
149+
{
150+
// In small cases, 1024 threads per block will only utilize 17 blocks, much less than 40
151+
int thread = thread_per_block;
152+
int block = (dim + thread - 1) / thread;
153+
vector_div_constant_kernel<double><<<block, thread>>>(dim, result, vector, constant);
154+
155+
cudaCheckOnDebug();
156+
}
157+
158+
template <typename FPTYPE>
159+
inline void vector_div_constant_wrapper(const int& dim,
160+
std::complex<FPTYPE>* result,
161+
const std::complex<FPTYPE>* vector,
162+
const FPTYPE constant)
163+
{
164+
thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
165+
const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);
166+
167+
int thread = thread_per_block;
168+
int block = (dim + thread - 1) / thread;
169+
vector_div_constant_kernel<thrust::complex<FPTYPE>><<<block, thread>>>(dim, result_tmp, vector_tmp, constant);
170+
171+
cudaCheckOnDebug();
172+
}
173+
174+
template <>
175+
void vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
176+
std::complex<float>* result,
177+
const std::complex<float>* vector,
178+
const float constant)
179+
{
180+
vector_div_constant_wrapper(dim, result, vector, constant);
181+
}
182+
183+
template <>
184+
void vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& dim,
185+
std::complex<double>* result,
186+
const std::complex<double>* vector,
187+
const double constant)
188+
{
189+
vector_div_constant_wrapper(dim, result, vector, constant);
190+
}
191+
130192
// vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex)
131193
template <>
132194
void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
@@ -306,6 +368,10 @@ template struct vector_mul_real_op<std::complex<float>, base_device::DEVICE_GPU>
306368
template struct vector_mul_real_op<double, base_device::DEVICE_GPU>;
307369
template struct vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>;
308370

371+
template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
372+
template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
373+
template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
374+
309375
template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
310376
template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
311377
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;

source/module_base/kernels/math_kernel_op.h

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,21 @@ template <typename T, typename Device> struct vector_mul_vector_op {
9898
void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
9999
};
100100

101+
// vector operator: result[i] = vector[i] / constant
102+
template <typename T, typename Device> struct vector_div_constant_op {
103+
using Real = typename GetTypeReal<T>::type;
104+
/// @brief result[i] = vector[i] / constant
105+
///
106+
/// Input Parameters
107+
/// \param dim : array size
108+
/// \param vector : input array
109+
/// \param constant : input constant
110+
///
111+
/// Output Parameters
112+
/// \param result : output array
113+
void operator()(const int& dim, T* result, const T* vector, const Real constant);
114+
};
115+
101116
// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
102117
template <typename T, typename Device> struct vector_div_vector_op {
103118
using Real = typename GetTypeReal<T>::type;
@@ -284,6 +299,48 @@ template <typename T, typename Device> struct matrixCopy {
284299
void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB);
285300
};
286301

302+
template <typename T, typename Device>
303+
struct apply_eigenvalues_op {
304+
using Real = typename GetTypeReal<T>::type;
305+
306+
void operator()(const Device *d, const int &nbase, const int &nbase_x, const int &notconv,
307+
T *result, const T *vectors, const Real *eigenvalues);
308+
};
309+
310+
template <typename T, typename Device>
311+
struct precondition_op {
312+
using Real = typename GetTypeReal<T>::type;
313+
void operator()(const Device* d,
314+
const int& dim,
315+
T* psi_iter,
316+
const int& nbase,
317+
const int& notconv,
318+
const Real* precondition,
319+
const Real* eigenvalues);
320+
};
321+
322+
template <typename T, typename Device>
323+
struct normalize_op {
324+
using Real = typename GetTypeReal<T>::type;
325+
void operator()(const Device* d,
326+
const int& dim,
327+
T* psi_iter,
328+
const int& nbase,
329+
const int& notconv,
330+
Real* psi_norm = nullptr);
331+
};
332+
333+
template <typename T>
334+
struct normalize_op<T, base_device::DEVICE_GPU> {
335+
using Real = typename GetTypeReal<T>::type;
336+
void operator()(const base_device::DEVICE_GPU* d,
337+
const int& dim,
338+
T* psi_iter,
339+
const int& nbase,
340+
const int& notconv,
341+
Real* psi_norm);
342+
};
343+
287344
#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
288345
// Partially specialize functor for base_device::GpuDevice.
289346
template <typename T> struct dot_real_op<T, base_device::DEVICE_GPU> {
@@ -306,6 +363,12 @@ template <typename T> struct vector_mul_vector_op<T, base_device::DEVICE_GPU> {
306363
void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
307364
};
308365

366+
// vector operator: result[i] = vector[i] / constant
367+
template <typename T> struct vector_div_constant_op<T, base_device::DEVICE_GPU> {
368+
using Real = typename GetTypeReal<T>::type;
369+
void operator()(const int& dim, T* result, const T* vector, const Real constant);
370+
};
371+
309372
// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
310373
template <typename T> struct vector_div_vector_op<T, base_device::DEVICE_GPU> {
311374
using Real = typename GetTypeReal<T>::type;
@@ -334,6 +397,26 @@ template <typename T> struct matrixCopy<T, base_device::DEVICE_GPU> {
334397
void createGpuBlasHandle();
335398
void destoryBLAShandle();
336399

400+
// vector operator: result[i] = -lambda[i] * vector[i]
401+
template <typename T> struct apply_eigenvalues_op<T, base_device::DEVICE_GPU> {
402+
using Real = typename GetTypeReal<T>::type;
403+
404+
void operator()(const base_device::DEVICE_GPU *d, const int &nbase, const int &nbase_x, const int &notconv,
405+
T *result, const T *vectors, const Real *eigenvalues);
406+
};
407+
408+
template <typename T>
409+
struct precondition_op<T, base_device::DEVICE_GPU> {
410+
using Real = typename GetTypeReal<T>::type;
411+
void operator()(const base_device::DEVICE_GPU* d,
412+
const int& dim,
413+
T* psi_iter,
414+
const int& nbase,
415+
const int& notconv,
416+
const Real* precondition,
417+
const Real* eigenvalues);
418+
};
419+
337420
#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
338421
} // namespace hsolver
339422

source/module_base/kernels/math_kernel_op_vec.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,22 @@ struct vector_mul_vector_op<T, base_device::DEVICE_CPU>
6060
}
6161
};
6262

63+
template <typename T>
64+
struct vector_div_constant_op<T, base_device::DEVICE_CPU>
65+
{
66+
using Real = typename GetTypeReal<T>::type;
67+
void operator()(const int& dim, T* result, const T* vector, const Real constant)
68+
{
69+
#ifdef _OPENMP
70+
#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
71+
#endif
72+
for (int i = 0; i < dim; i++)
73+
{
74+
result[i] = vector[i] / constant;
75+
}
76+
}
77+
};
78+
6379
template <typename T>
6480
struct vector_div_vector_op<T, base_device::DEVICE_CPU>
6581
{
@@ -159,6 +175,10 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_CP
159175
template struct vector_mul_vector_op<double, base_device::DEVICE_CPU>;
160176
template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
161177

178+
template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_CPU>;
179+
template struct vector_div_constant_op<double, base_device::DEVICE_CPU>;
180+
template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_CPU>;
181+
162182
template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
163183
template struct vector_div_vector_op<double, base_device::DEVICE_CPU>;
164184
template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;

source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,19 @@ __launch_bounds__(1024) __global__ void vector_mul_vector_kernel(const int size,
5050
}
5151
}
5252

53+
template <typename T>
54+
__launch_bounds__(1024) __global__ void vector_div_constant_kernel(const int size,
55+
T* result,
56+
const T* vector,
57+
const typename GetTypeReal<T>::type constant)
58+
{
59+
int i = blockIdx.x * blockDim.x + threadIdx.x;
60+
if (i < size)
61+
{
62+
result[i] = vector[i] / constant;
63+
}
64+
}
65+
5366
template <typename T>
5467
__launch_bounds__(1024) __global__ void vector_div_vector_kernel(const int size,
5568
T* result,
@@ -142,6 +155,69 @@ void vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>::operator
142155
hipCheckOnDebug();
143156
}
144157

158+
// vector operator: result[i] = vector[i] / constant
159+
template <>
160+
void vector_div_constant_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
161+
double* result,
162+
const double* vector,
163+
const double constant)
164+
{
165+
int thread = 1024;
166+
int block = (dim + thread - 1) / thread;
167+
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel<double>),
168+
dim3(block),
169+
dim3(thread),
170+
0,
171+
0,
172+
dim,
173+
result,
174+
vector,
175+
constant);
176+
177+
hipCheckOnDebug();
178+
}
179+
180+
template <typename FPTYPE>
181+
inline void vector_div_constant_wrapper(const int& dim,
182+
std::complex<FPTYPE>* result,
183+
const std::complex<FPTYPE>* vector,
184+
const FPTYPE constant)
185+
{
186+
thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
187+
const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);
188+
int thread = 1024;
189+
int block = (dim + thread - 1) / thread;
190+
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel<thrust::complex<FPTYPE>>),
191+
dim3(block),
192+
dim3(thread),
193+
0,
194+
0,
195+
dim,
196+
result_tmp,
197+
vector_tmp,
198+
constant);
199+
200+
hipCheckOnDebug();
201+
}
202+
203+
template <>
204+
void vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
205+
std::complex<float>* result,
206+
const std::complex<float>* vector,
207+
const float constant)
208+
{
209+
vector_div_constant_wrapper(dim, result, vector, constant);
210+
}
211+
212+
template <>
213+
void vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& dim,
214+
std::complex<double>* result,
215+
const std::complex<double>* vector,
216+
const double constant)
217+
{
218+
vector_div_constant_wrapper(dim, result, vector, constant);
219+
}
220+
145221
// vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex)
146222
template <>
147223
void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
@@ -357,6 +433,10 @@ template struct vector_mul_real_op<std::complex<float>, base_device::DEVICE_GPU>
357433
template struct vector_mul_real_op<double, base_device::DEVICE_GPU>;
358434
template struct vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>;
359435

436+
template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
437+
template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
438+
template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
439+
360440
template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
361441
template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
362442
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;

source/module_base/module_device/device.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
#include <base/macros/macros.h>
77
#include <cstring>
8-
8+
#include <iostream>
99
#ifdef __MPI
1010
#include "mpi.h"
1111
#endif
@@ -166,6 +166,11 @@ int device_count = -1;
166166
cudaGetDeviceCount(&device_count);
167167
#elif defined(__ROCM)
168168
hipGetDeviceCount(&device_count);
169+
/***auto start_time = std::chrono::high_resolution_clock::now();
170+
std::cout << "Starting hipGetDeviceCount.." << std::endl;
171+
auto end_time = std::chrono::high_resolution_clock::now();
172+
auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
173+
std::cout << "hipGetDeviceCount took " << duration.count() << "seconds" << std::endl;***/
169174
#endif
170175
if (device_count <= 0)
171176
{
@@ -711,4 +716,4 @@ void record_device_memory<base_device::DEVICE_GPU>(
711716
#endif
712717

713718
} // end of namespace information
714-
} // end of namespace base_device
719+
} // end of namespace base_device

0 commit comments

Comments
 (0)