deepmodeling
diff --git a/‎docs/advanced/input_files/input-main.md
Lines changed: 13 additions & 0 deletions b/‎docs/advanced/input_files/input-main.md
Lines changed: 13 additions & 0 deletions
diff --git a/‎source/module_base/kernels/cuda/math_kernel_op_vec.cu
Lines changed: 66 additions & 0 deletions b/‎source/module_base/kernels/cuda/math_kernel_op_vec.cu
Lines changed: 66 additions & 0 deletions
diff --git a/‎source/module_base/kernels/math_kernel_op.h
Lines changed: 83 additions & 0 deletions b/‎source/module_base/kernels/math_kernel_op.h
Lines changed: 83 additions & 0 deletions
diff --git a/‎source/module_base/kernels/math_kernel_op_vec.cpp
Lines changed: 20 additions & 0 deletions b/‎source/module_base/kernels/math_kernel_op_vec.cpp
Lines changed: 20 additions & 0 deletions
diff --git a/‎source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu
Lines changed: 80 additions & 0 deletions b/‎source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu
Lines changed: 80 additions & 0 deletions
diff --git a/‎source/module_base/module_device/device.cpp
Lines changed: 7 additions & 2 deletions b/‎source/module_base/module_device/device.cpp
Lines changed: 7 additions & 2 deletions
@@ -38,6 +38,7 @@
     - [pw\_seed](#pw_seed)
     - [pw\_diag\_thr](#pw_diag_thr)
     - [diago\_smooth\_ethr](#diago_smooth_ethr)
+    - [use\_k\_continuity](#use_k_continuity)
     - [pw\_diag\_nmax](#pw_diag_nmax)
     - [pw\_diag\_ndim](#pw_diag_ndim)
     - [diag\_subspace](#diag_subspace)
@@ -824,6 +825,18 @@ These variables are used to control the plane wave related parameters.
 - **Description**: If `TRUE`, the smooth threshold strategy, which applies a larger threshold (10e-5) for the empty states, will be implemented in the diagonalization methods. (This strategy should not affect total energy, forces, and other ground-state properties, but computational efficiency will be improved.) If `FALSE`, the smooth threshold strategy will not be applied.
 - **Default**: false
 
+### use_k_continuity
+
+- **Type**: Boolean
+- **Availability**: Used only for plane wave basis set.
+- **Description**: Whether to use k-point continuity for initializing wave functions. When enabled, this strategy exploits the similarity between wave functions at neighboring k-points by propagating the wave function from a previously initialized k-point to a new k-point, significantly reducing the computational cost of the initial guess.
+
+  **Important constraints:**
+  - Must be used together with `diago_smooth_ethr = 1` for optimal performance
+
+  This feature is particularly useful for calculations with dense k-point sampling where the computational cost of wavefunction initialization becomes significant.
+- **Default**: false
+
 ### pw_diag_nmax
 
 - **Type**: Integer
 
@@ -52,6 +52,19 @@ __global__ void vector_mul_vector_kernel(const int size,
     }
 }
 
+/// Element-wise division of an array by a real scalar:
+/// result[i] = vector[i] / constant for every i in [0, size).
+/// Each thread handles at most one element; threads whose global index is
+/// not below `size` do nothing.
+template <typename T>
+__global__ void vector_div_constant_kernel(const int size,
+                                         T* result,
+                                         const T* vector,
+                                         const typename GetTypeReal<T>::type constant)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= size)
+    {
+        return;
+    }
+    result[idx] = vector[idx] / constant;
+}
+
 template <typename T>
 __global__ void vector_div_vector_kernel(const int size,
                                          T* result,
@@ -127,6 +140,55 @@ void vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>::operator
     vector_mul_real_wrapper(dim, result, vector, constant);
 }
 
+// vector operator: result[i] = vector[i] / constant
+/// GPU specialization for real double-precision arrays.
+///
+/// \param dim : number of elements to process
+/// \param result : output array, handed directly to the kernel
+/// \param vector : input array, handed directly to the kernel
+/// \param constant : real divisor applied to every element
+template <>
+void vector_div_constant_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                     double* result,
+                                                                     const double* vector,
+                                                                     const double constant)
+{
+    // Nothing to do for an empty array. Returning early also avoids launching
+    // a grid with zero (or negative) blocks, which is not a valid launch
+    // configuration.
+    if (dim <= 0)
+    {
+        return;
+    }
+
+    // One thread per element; round the block count up so the tail is covered.
+    const int thread = thread_per_block;
+    const int block = (dim + thread - 1) / thread;
+    vector_div_constant_kernel<double><<<block, thread>>>(dim, result, vector, constant);
+
+    cudaCheckOnDebug();
+}
+
+/// Launch helper for the complex cases: result[i] = vector[i] / constant.
+///
+/// The std::complex pointers are reinterpreted as thrust::complex pointers
+/// before being passed to vector_div_constant_kernel.
+///
+/// \param dim : number of elements to process
+/// \param result : output array
+/// \param vector : input array
+/// \param constant : real divisor applied to every element
+template <typename FPTYPE>
+inline void vector_div_constant_wrapper(const int& dim,
+                                    std::complex<FPTYPE>* result,
+                                    const std::complex<FPTYPE>* vector,
+                                    const FPTYPE constant)
+{
+    // Nothing to do for an empty array. Returning early also avoids launching
+    // a grid with zero (or negative) blocks, which is not a valid launch
+    // configuration.
+    if (dim <= 0)
+    {
+        return;
+    }
+
+    thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
+    const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);
+
+    // One thread per element; round the block count up so the tail is covered.
+    const int thread = thread_per_block;
+    const int block = (dim + thread - 1) / thread;
+    vector_div_constant_kernel<thrust::complex<FPTYPE>><<<block, thread>>>(dim, result_tmp, vector_tmp, constant);
+
+    cudaCheckOnDebug();
+}
+
+/// GPU specialization for single-precision complex arrays; forwards all
+/// arguments unchanged to vector_div_constant_wrapper.
+template <>
+void vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(
+    const int& dim,
+    std::complex<float>* result,
+    const std::complex<float>* vector,
+    const float constant)
+{
+    vector_div_constant_wrapper<float>(dim, result, vector, constant);
+}
+
+/// GPU specialization for double-precision complex arrays; forwards all
+/// arguments unchanged to vector_div_constant_wrapper.
+template <>
+void vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
+    const int& dim,
+    std::complex<double>* result,
+    const std::complex<double>* vector,
+    const double constant)
+{
+    vector_div_constant_wrapper<double>(dim, result, vector, constant);
+}
+
 // vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex)
 template <>
 void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
@@ -306,6 +368,10 @@ template struct vector_mul_real_op<std::complex<float>, base_device::DEVICE_GPU>
 template struct vector_mul_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 
+// Explicit instantiations of the GPU vector_div_constant_op: real double and both complex precisions.
+template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
+template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
+
 template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 
@@ -98,6 +98,21 @@ template <typename T, typename Device> struct vector_mul_vector_op {
   void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
 };
 
+// vector operator: result[i] = vector[i] / constant
+template <typename T, typename Device>
+struct vector_div_constant_op
+{
+  /// Scalar type of the divisor, obtained from GetTypeReal<T>.
+  using Real = typename GetTypeReal<T>::type;
+
+  /// @brief Divide every element of an array by one constant:
+  ///        result[i] = vector[i] / constant
+  ///
+  /// Input Parameters
+  /// \param dim : number of elements
+  /// \param vector : array to be divided
+  /// \param constant : divisor shared by all elements
+  ///
+  /// Output Parameters
+  /// \param result : array receiving the quotients
+  void operator()(const int& dim, T* result, const T* vector, const Real constant);
+};
+
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
 template <typename T, typename Device> struct vector_div_vector_op {
   using Real = typename GetTypeReal<T>::type;
@@ -284,6 +299,48 @@ template <typename T, typename Device> struct matrixCopy {
   void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB);
 };
 
+/// Generic declaration of the apply-eigenvalues functor: takes a set of
+/// vectors plus an array of eigenvalues and writes into `result`.
+/// NOTE(review): the exact formula and the meaning of nbase / nbase_x /
+/// notconv are fixed by the implementations, which are not declared here.
+template <typename T, typename Device>
+struct apply_eigenvalues_op
+{
+    using Real = typename GetTypeReal<T>::type;
+
+    void operator()(const Device* d,
+                    const int& nbase,
+                    const int& nbase_x,
+                    const int& notconv,
+                    T* result,
+                    const T* vectors,
+                    const Real* eigenvalues);
+};
+
+/// Generic declaration of the preconditioning functor; it receives a mutable
+/// `psi_iter` array together with read-only preconditioner and eigenvalue arrays.
+/// NOTE(review): the update rule and the layout implied by dim / nbase /
+/// notconv are fixed by the implementations, which are not declared here.
+template <typename T, typename Device>
+struct precondition_op
+{
+    using Real = typename GetTypeReal<T>::type;
+
+    void operator()(const Device* d,
+                    const int& dim,
+                    T* psi_iter,
+                    const int& nbase,
+                    const int& notconv,
+                    const Real* precondition,
+                    const Real* eigenvalues);
+};
+
+/// Generic declaration of the normalization functor operating on `psi_iter`.
+/// `psi_norm` is optional and defaults to nullptr.
+/// NOTE(review): presumably psi_norm receives the computed norms when it is
+/// non-null - confirm against the implementation.
+template <typename T, typename Device>
+struct normalize_op
+{
+    using Real = typename GetTypeReal<T>::type;
+
+    void operator()(const Device* d,
+                    const int& dim,
+                    T* psi_iter,
+                    const int& nbase,
+                    const int& notconv,
+                    Real* psi_norm = nullptr);
+};
+
+/// Partial specialization of normalize_op for base_device::DEVICE_GPU.
+///
+/// `psi_norm` defaults to nullptr, mirroring the generic template, so a call
+/// that omits it compiles for the GPU device as it does for any other.
+/// NOTE(review): unlike the other DEVICE_GPU specializations in this header,
+/// this one is declared outside the `#if __CUDA || ... || __UT_USE_ROCM`
+/// guard - confirm whether that is intentional.
+template <typename T>
+struct normalize_op<T, base_device::DEVICE_GPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const base_device::DEVICE_GPU* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   Real* psi_norm = nullptr);
+};
+
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 // Partially specialize functor for base_device::GpuDevice.
 template <typename T> struct dot_real_op<T, base_device::DEVICE_GPU> {
@@ -306,6 +363,12 @@ template <typename T> struct vector_mul_vector_op<T, base_device::DEVICE_GPU> {
   void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
 };
 
+// vector operator: result[i] = vector[i] / constant
+// Partial specialization for base_device::DEVICE_GPU.
+template <typename T>
+struct vector_div_constant_op<T, base_device::DEVICE_GPU>
+{
+  using Real = typename GetTypeReal<T>::type;
+
+  /// \param dim : number of elements
+  /// \param result : output array
+  /// \param vector : input array
+  /// \param constant : divisor shared by all elements
+  void operator()(const int& dim, T* result, const T* vector, const Real constant);
+};
+
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
 template <typename T> struct vector_div_vector_op<T, base_device::DEVICE_GPU> {
   using Real = typename GetTypeReal<T>::type;
@@ -334,6 +397,26 @@ template <typename T> struct matrixCopy<T, base_device::DEVICE_GPU> {
 void createGpuBlasHandle();
 void destoryBLAShandle();
 
+// vector operator: result[i] = -lambda[i] * vector[i]
+// Partial specialization for base_device::DEVICE_GPU.
+// NOTE(review): the formula above is carried over from the original comment;
+// verify it against the kernel implementation.
+template <typename T>
+struct apply_eigenvalues_op<T, base_device::DEVICE_GPU>
+{
+    using Real = typename GetTypeReal<T>::type;
+
+    void operator()(const base_device::DEVICE_GPU* d,
+                    const int& nbase,
+                    const int& nbase_x,
+                    const int& notconv,
+                    T* result,
+                    const T* vectors,
+                    const Real* eigenvalues);
+};
+
+// Partial specialization of precondition_op for base_device::DEVICE_GPU;
+// the parameter list matches the generic template.
+template <typename T>
+struct precondition_op<T, base_device::DEVICE_GPU>
+{
+    using Real = typename GetTypeReal<T>::type;
+
+    void operator()(const base_device::DEVICE_GPU* d,
+                    const int& dim,
+                    T* psi_iter,
+                    const int& nbase,
+                    const int& notconv,
+                    const Real* precondition,
+                    const Real* eigenvalues);
+};
+
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 } // namespace hsolver
 
 
@@ -60,6 +60,22 @@ struct vector_mul_vector_op<T, base_device::DEVICE_CPU>
     }
 };
 
+/// CPU implementation: result[i] = vector[i] / constant for i in [0, dim).
+template <typename T>
+struct vector_div_constant_op<T, base_device::DEVICE_CPU>
+{
+    using Real = typename GetTypeReal<T>::type;
+
+    /// \param dim : number of elements
+    /// \param result : output array
+    /// \param vector : input array
+    /// \param constant : divisor shared by all elements
+    void operator()(const int& dim, T* result, const T* vector, const Real constant)
+    {
+        // Each element is computed independently; with OpenMP enabled the
+        // loop is shared among threads in static chunks.
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
+#endif
+        for (int idx = 0; idx < dim; ++idx)
+        {
+            result[idx] = vector[idx] / constant;
+        }
+    }
+};
+
 template <typename T>
 struct vector_div_vector_op<T, base_device::DEVICE_CPU>
 {
@@ -159,6 +175,10 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_CP
 template struct vector_mul_vector_op<double, base_device::DEVICE_CPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
 
+// Explicit instantiations of the CPU vector_div_constant_op: real double and both complex precisions.
+template struct vector_div_constant_op<double, base_device::DEVICE_CPU>;
+template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_CPU>;
+
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_CPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
 
@@ -50,6 +50,19 @@ __launch_bounds__(1024) __global__ void vector_mul_vector_kernel(const int size,
     }
 }
 
+/// Element-wise division of an array by a real scalar:
+/// result[i] = vector[i] / constant for every i in [0, size).
+/// Each thread handles at most one element; threads whose global index is
+/// not below `size` do nothing. Declared with launch bounds of 1024 threads
+/// per block.
+template <typename T>
+__launch_bounds__(1024) __global__ void vector_div_constant_kernel(const int size,
+                                                                 T* result,
+                                                                 const T* vector,
+                                                                 const typename GetTypeReal<T>::type constant)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= size)
+    {
+        return;
+    }
+    result[idx] = vector[idx] / constant;
+}
+
 template <typename T>
 __launch_bounds__(1024) __global__ void vector_div_vector_kernel(const int size,
                                                                  T* result,
@@ -142,6 +155,69 @@ void vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>::operator
     hipCheckOnDebug();
 }
 
+// vector operator: result[i] = vector[i] / constant
+/// GPU (HIP) specialization for real double-precision arrays.
+///
+/// \param dim : number of elements to process
+/// \param result : output array, handed directly to the kernel
+/// \param vector : input array, handed directly to the kernel
+/// \param constant : real divisor applied to every element
+template <>
+void vector_div_constant_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                     double* result,
+                                                                     const double* vector,
+                                                                     const double constant)
+{
+    // Nothing to do for an empty array. Returning early also avoids launching
+    // a grid with zero (or negative) blocks, which is not a valid launch
+    // configuration.
+    if (dim <= 0)
+    {
+        return;
+    }
+
+    // One thread per element; 1024 matches the kernel's __launch_bounds__.
+    const int thread = 1024;
+    const int block = (dim + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel<double>),
+                       dim3(block),
+                       dim3(thread),
+                       0,
+                       0,
+                       dim,
+                       result,
+                       vector,
+                       constant);
+
+    hipCheckOnDebug();
+}
+
+/// Launch helper for the complex cases: result[i] = vector[i] / constant.
+///
+/// The std::complex pointers are reinterpreted as thrust::complex pointers
+/// before being passed to vector_div_constant_kernel.
+///
+/// \param dim : number of elements to process
+/// \param result : output array
+/// \param vector : input array
+/// \param constant : real divisor applied to every element
+template <typename FPTYPE>
+inline void vector_div_constant_wrapper(const int& dim,
+                                    std::complex<FPTYPE>* result,
+                                    const std::complex<FPTYPE>* vector,
+                                    const FPTYPE constant)
+{
+    // Nothing to do for an empty array. Returning early also avoids launching
+    // a grid with zero (or negative) blocks, which is not a valid launch
+    // configuration.
+    if (dim <= 0)
+    {
+        return;
+    }
+
+    thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
+    const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);
+
+    // One thread per element; 1024 matches the kernel's __launch_bounds__.
+    const int thread = 1024;
+    const int block = (dim + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel<thrust::complex<FPTYPE>>),
+                       dim3(block),
+                       dim3(thread),
+                       0,
+                       0,
+                       dim,
+                       result_tmp,
+                       vector_tmp,
+                       constant);
+
+    hipCheckOnDebug();
+}
+
+/// GPU (HIP) specialization for single-precision complex arrays; forwards all
+/// arguments unchanged to vector_div_constant_wrapper.
+template <>
+void vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(
+    const int& dim,
+    std::complex<float>* result,
+    const std::complex<float>* vector,
+    const float constant)
+{
+    vector_div_constant_wrapper<float>(dim, result, vector, constant);
+}
+
+/// GPU (HIP) specialization for double-precision complex arrays; forwards all
+/// arguments unchanged to vector_div_constant_wrapper.
+template <>
+void vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
+    const int& dim,
+    std::complex<double>* result,
+    const std::complex<double>* vector,
+    const double constant)
+{
+    vector_div_constant_wrapper<double>(dim, result, vector, constant);
+}
+
 // vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex)
 template <>
 void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
@@ -357,6 +433,10 @@ template struct vector_mul_real_op<std::complex<float>, base_device::DEVICE_GPU>
 template struct vector_mul_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 
+// Explicit instantiations of the GPU vector_div_constant_op: real double and both complex precisions.
+template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
+template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
+
 template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 
@@ -5,7 +5,7 @@
 
 #include <base/macros/macros.h>
 #include <cstring>
-
+#include <iostream>
 #ifdef __MPI
 #include "mpi.h"
 #endif
@@ -166,6 +166,11 @@ int device_count = -1;
 cudaGetDeviceCount(&device_count);
 #elif defined(__ROCM)
 hipGetDeviceCount(&device_count);
+/***auto start_time = std::chrono::high_resolution_clock::now();
+std::cout << "Starting hipGetDeviceCount.." << std::endl;
+auto end_time = std::chrono::high_resolution_clock::now();
+auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+std::cout << "hipGetDeviceCount took " << duration.count() << "seconds" << std::endl;***/
 #endif
 if (device_count <= 0)
 {
@@ -711,4 +716,4 @@ void record_device_memory<base_device::DEVICE_GPU>(
 #endif
 
 } // end of namespace information
-} // end of namespace base_device
+} // end of namespace base_device