diff --git a/source/Makefile.Objects b/source/Makefile.Objects
index a28ed8d11d..02721be142 100644
--- a/source/Makefile.Objects
+++ b/source/Makefile.Objects
@@ -354,6 +354,7 @@ OBJS_HAMILT_LCAO=hamilt_lcao.o\
     overlap_new.o\
     td_ekinetic_lcao.o\
     td_nonlocal_lcao.o\
+    td_pot_hybrid.o\
     veff_lcao.o\
     meta_lcao.o\
     op_dftu_lcao.o\
@@ -623,8 +624,9 @@ OBJS_LCAO=evolve_elec.o\
       propagator_cn2.o\
       propagator_taylor.o\
       propagator_etrs.o\
-      td_velocity.o\
-      td_current.o\
+      td_folding.o\
+      td_info.o\
+      velocity_op.o\
       snap_psibeta_half_tddft.o\
       solve_propagation.o\
       upsi.o\
diff --git a/source/module_lr/lr_spectrum.h b/source/module_lr/lr_spectrum.h
index aee905ccdf..9a286c95dd 100644
--- a/source/module_lr/lr_spectrum.h
+++ b/source/module_lr/lr_spectrum.h
@@ -5,7 +5,7 @@
 #include "source_estate/module_dm/density_matrix.h"
 #include "module_lr/utils/lr_util.h"
 #include "source_basis/module_nao/two_center_bundle.h"
-#include "source_lcao/module_tddft/td_current.h"
+#include "source_lcao/module_tddft/velocity_op.h"
 namespace LR
 {
     template<typename T>
@@ -52,8 +52,8 @@ namespace LR
         /// calculate the transition dipole of all states in length gauge
         void cal_transition_dipoles_length();
         /// calculate the transition dipole of state S in velocity gauge: $i(\sum_{iak}X^S_{iak}<ik|v|ak>)/\Omega_S$
-        ModuleBase::Vector3<T> cal_transition_dipole_istate_velocity_R(const int istate, const TD_current& vR);
-        ModuleBase::Vector3<T> cal_transition_dipole_istate_velocity_k(const int istate, const TD_current& vR);
+        ModuleBase::Vector3<T> cal_transition_dipole_istate_velocity_R(const int istate, const Velocity_op<std::complex<double>>& vR);
+        ModuleBase::Vector3<T> cal_transition_dipole_istate_velocity_k(const int istate, const Velocity_op<std::complex<double>>& vR);
         /// calculate the transition dipole of all states in velocity gauge
         void cal_transition_dipoles_velocity();
         double cal_mean_squared_dipole(ModuleBase::Vector3<T> dipole);
diff --git a/source/module_lr/lr_spectrum_velocity.cpp b/source/module_lr/lr_spectrum_velocity.cpp
index 81c35c5820..bbc7ae2dca 100644
--- a/source/module_lr/lr_spectrum_velocity.cpp
+++ b/source/module_lr/lr_spectrum_velocity.cpp
@@ -7,17 +7,17 @@
 namespace LR
 {
     /// get the velocity matrix v(R)
-    inline TD_current get_velocity_matrix_R(const UnitCell& ucell,
+    inline Velocity_op<std::complex<double>> get_velocity_matrix_R(const UnitCell& ucell,
         const Grid_Driver& gd,
         const Parallel_Orbitals& pmat,
         const TwoCenterBundle& two_center_bundle)
     {
-        // convert the orbital object to the old class for TD_current
+        // convert the orbital object to the old class for Velocity_op
         LCAO_Orbitals orb;
         const auto& inp = PARAM.inp;
         two_center_bundle.to_LCAO_Orbitals(orb, inp.lcao_ecut, inp.lcao_dk, inp.lcao_dr, inp.lcao_rmax);
         // actually this class calculates the velocity matrix v(R) at A=0
-        TD_current vR(&ucell, &gd, &pmat, orb, two_center_bundle.overlap_orb.get());
+        Velocity_op<std::complex<double>> vR(&ucell, &gd, &pmat, orb, two_center_bundle.overlap_orb.get());
         vR.calculate_vcomm_r(); // $<\mu, 0|[Vnl, r]|\nu, R>$
         vR.calculate_grad_term();   // $<\mu, 0|\nabla|\nu, R>$
         return vR;
@@ -47,7 +47,7 @@ namespace LR
 
     /// this algorithm has bug in multi-k cases, just for test
     template<typename T>
-    ModuleBase::Vector3<T> LR::LR_Spectrum<T>::cal_transition_dipole_istate_velocity_R(const int istate, const TD_current& vR)
+    ModuleBase::Vector3<T> LR::LR_Spectrum<T>::cal_transition_dipole_istate_velocity_R(const int istate, const Velocity_op<std::complex<double>>& vR)
     {
         // transition density matrix D(R)
         const elecstate::DensityMatrix<T, T>& DM_trans = this->cal_transition_density_matrix(istate);
@@ -69,7 +69,7 @@ namespace LR
 
     // this algorithm is actually in use
     template<typename T>
-    ModuleBase::Vector3<T> LR::LR_Spectrum<T>::cal_transition_dipole_istate_velocity_k(const int istate, const TD_current& vR)
+    ModuleBase::Vector3<T> LR::LR_Spectrum<T>::cal_transition_dipole_istate_velocity_k(const int istate, const Velocity_op<std::complex<double>>& vR)
     {
         // transition density matrix D(R)
         const elecstate::DensityMatrix<T, T>& DM_trans = this->cal_transition_density_matrix(istate, this->X, false);
@@ -97,7 +97,7 @@ namespace LR
     template<typename T>
     void LR::LR_Spectrum<T>::cal_transition_dipoles_velocity()
     {
-        const TD_current& vR = get_velocity_matrix_R(ucell, gd_, pmat, two_center_bundle_);     // velocity matrix v(R)
+        const Velocity_op<std::complex<double>>& vR = get_velocity_matrix_R(ucell, gd_, pmat, two_center_bundle_);     // velocity matrix v(R)
         transition_dipole_.resize(nstate);
         this->mean_squared_transition_dipole_.resize(nstate);
         for (int istate = 0;istate < nstate;++istate)
@@ -148,7 +148,7 @@ namespace LR
     void LR::LR_Spectrum<T>::test_transition_dipoles_velocity_ks(const double* const ks_eig)
     {
         // velocity matrix v(R)
-        const TD_current& vR = get_velocity_matrix_R(ucell, gd_, pmat, two_center_bundle_);
+        const Velocity_op<std::complex<double>>& vR = get_velocity_matrix_R(ucell, gd_, pmat, two_center_bundle_);
         //  (e_c-e_v) of KS eigenvalues
         std::vector<double> eig_ks_diff(this->ldim);
         for (int is = 0;is < this->nspin_x;++is)
@@ -183,4 +183,4 @@ namespace LR
     }
 }
 template class LR::LR_Spectrum<double>;
-template class LR::LR_Spectrum<std::complex<double>>;
\ No newline at end of file
+template class LR::LR_Spectrum<std::complex<double>>;
diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h
index 16db768542..a781ebe035 100644
--- a/source/module_parameter/input_parameter.h
+++ b/source/module_parameter/input_parameter.h
@@ -283,7 +283,8 @@ struct Input_para
     double bessel_descriptor_sigma = 0.1;           ///< spherical bessel smearing_sigma
 
     // ==============   #Parameters (9.rt-tddft) ===========================
-    double td_force_dt = 0.02; ///<"fs"
+    double td_dt = -1.0;            ///< time step for propagation
+    int estep_per_md = 1;           ///< number of electronic steps per MD step
     bool td_vext = false;      ///< add extern potential or not
     // std::string td_vext_dire = "1";   ///< vext direction
     std::vector<int> td_vext_dire = {1}; ///< vector of vext direction
diff --git a/source/module_parameter/system_parameter.h b/source/module_parameter/system_parameter.h
index 8df781ad0b..f607e42a63 100644
--- a/source/module_parameter/system_parameter.h
+++ b/source/module_parameter/system_parameter.h
@@ -43,6 +43,7 @@ struct System_para
     std::string global_readin_dir = "";     ///< global readin directory
     std::string global_stru_dir = "";       ///< global structure directory
     std::string global_matrix_dir = "";     ///< global matrix directory
+    std::string global_wfc_dir = "";       ///< global wavefunction directory
     std::string global_mlkedf_descriptor_dir = ""; ///< global ML KEDF descriptor directory
     std::string global_deepks_label_elec_dir = ""; ///< global directory for DeePKS labels during electronic steps 
     std::string log_file = "log";           ///< log file name
@@ -65,4 +66,4 @@ struct System_para
 
     bool search_pbc = true;                 ///< whether to search for periodic boundary conditions, force set to true
 };
-#endif
\ No newline at end of file
+#endif
diff --git a/source/source_base/global_file.cpp b/source/source_base/global_file.cpp
index b77fec489c..6d7f5a84a9 100644
--- a/source/source_base/global_file.cpp
+++ b/source/source_base/global_file.cpp
@@ -25,6 +25,7 @@ void ModuleBase::Global_File::make_dir_out(
     const std::string &suffix,
 	const std::string &calculation,
     const bool &out_dir,
+    const bool &out_wfc_dir,
     const int rank,
     const bool &restart,
     const bool out_alllog)
@@ -153,6 +154,46 @@ void ModuleBase::Global_File::make_dir_out(
 #endif
     }
 
+    if(out_wfc_dir)
+    {
+        int make_dir_wfc = 0;
+        std::string command1 =  "test -d " + PARAM.globalv.global_wfc_dir + " || mkdir " + PARAM.globalv.global_wfc_dir;
+
+        times = 0;
+        while(times<GlobalV::NPROC)
+        {
+            if(rank==times)
+            {
+                if ( system( command1.c_str() ) == 0 )
+                {
+                    std::cout << " MAKE THE WFC DIR    : " << PARAM.globalv.global_wfc_dir << std::endl;
+                    make_dir_wfc = 1;
+                }
+                else
+                {
+                    std::cout << " PROC " << rank << " CAN NOT MAKE THE WFC DIR !!! " << std::endl;
+                    make_dir_wfc = 0;
+                }
+            }
+#ifdef __MPI
+            Parallel_Reduce::reduce_all(make_dir_wfc);
+#endif
+            if(make_dir_wfc>0) { break;
+}
+            ++times;
+        }
+
+#ifdef __MPI
+        if(make_dir_wfc==0)
+        {
+            std::cout << " CAN NOT MAKE THE WFC DIR......." << std::endl;
+            ModuleBase::QUIT();
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+#endif
+    }
+
+
     if(PARAM.inp.of_ml_gene_data == 1)
     {
         int make_dir_descrip = 0;
diff --git a/source/source_base/global_file.h b/source/source_base/global_file.h
index 9b6895388c..2b52b838ce 100644
--- a/source/source_base/global_file.h
+++ b/source/source_base/global_file.h
@@ -23,6 +23,7 @@ namespace Global_File
 	void make_dir_out(const std::string &suffix,
 		const std::string &calculation,
         const bool &out_dir,
+        const bool &out_wfc_dir,
 		const int rank,
         const bool &restart,
 		const bool out_alllog = false); 
diff --git a/source/source_base/test/global_file_test.cpp b/source/source_base/test/global_file_test.cpp
index b0e34f9050..6102c115a5 100644
--- a/source/source_base/test/global_file_test.cpp
+++ b/source/source_base/test/global_file_test.cpp
@@ -33,7 +33,7 @@ TEST_F(GlobalFile,mkdirout)
 {
 	    std::string output;
 	    testing::internal::CaptureStdout();
-	    ModuleBase::Global_File::make_dir_out("Si","m",false,0,true,true);
+	    ModuleBase::Global_File::make_dir_out("Si","m",false,false,0,true,true);
 	    output = testing::internal::GetCapturedStdout();
 	    EXPECT_THAT(output,testing::HasSubstr("MAKE THE DIR"));
 	    GlobalV::ofs_warning.close();
@@ -43,7 +43,7 @@ TEST_F(GlobalFile,mkdirout)
         remove(dd.c_str());
 
 	    testing::internal::CaptureStdout();
-	    ModuleBase::Global_File::make_dir_out("Si","md",false,0,true,false);
+	    ModuleBase::Global_File::make_dir_out("Si","md",false,false,0,true,false);
 	    output = testing::internal::GetCapturedStdout();
 	    EXPECT_THAT(output,testing::HasSubstr("MAKE THE STRU DIR"));
 	    EXPECT_TRUE(GlobalV::ofs_running.is_open());
@@ -53,7 +53,7 @@ TEST_F(GlobalFile,mkdirout)
         remove(bb.c_str());
 
 	    testing::internal::CaptureStdout();
-	    ModuleBase::Global_File::make_dir_out("Si","md",true,0,true,true);
+	    ModuleBase::Global_File::make_dir_out("Si","md",true,false,0,true,true);
 	    output = testing::internal::GetCapturedStdout();
 	    EXPECT_THAT(output,testing::HasSubstr("MAKE THE MATRIX DIR"));
 	    EXPECT_TRUE(GlobalV::ofs_running.is_open());
diff --git a/source/source_basis/module_ao/ORB_nonlocal_lm.h b/source/source_basis/module_ao/ORB_nonlocal_lm.h
index 31ed209f14..b3504065ae 100644
--- a/source/source_basis/module_ao/ORB_nonlocal_lm.h
+++ b/source/source_basis/module_ao/ORB_nonlocal_lm.h
@@ -45,6 +45,9 @@ class Numerical_Nonlocal_Lm
 	const double& getKpoint(const int &ik) const { return this->k_radial[ik]; }
 	const double* getBeta_k() const { return this->beta_k; }
 	const double& getBeta_k(const int &ik) const { return this->beta_k[ik]; }
+
+	const int& getNk() const { return nk; }
+	const double& getDruniform() const { return dr_uniform; }
 	
     // enables deep copy
 	Numerical_Nonlocal_Lm& operator= (const Numerical_Nonlocal_Lm& nol );
diff --git a/source/source_esolver/esolver.cpp b/source/source_esolver/esolver.cpp
index bac35d027b..fcf8927d17 100644
--- a/source/source_esolver/esolver.cpp
+++ b/source/source_esolver/esolver.cpp
@@ -230,13 +230,26 @@ ESolver* init_esolver(const Input_para& inp, UnitCell& ucell)
     }
     else if (esolver_type == "ksdft_lcao_tddft")
     {
-#if ((defined __CUDA) /* || (defined __ROCM) */)
-        if (PARAM.inp.device == "gpu")
+        if (PARAM.inp.nspin < 4)
         {
-            return new ESolver_KS_LCAO_TDDFT<base_device::DEVICE_GPU>();
+#if ((defined __CUDA) /* || (defined __ROCM) */)
+            if (PARAM.inp.device == "gpu")
+            {
+                return new ESolver_KS_LCAO_TDDFT<double, base_device::DEVICE_GPU>();
+            }
+#endif
+            return new ESolver_KS_LCAO_TDDFT<double, base_device::DEVICE_CPU>();
         }
+        else
+        {
+#if ((defined __CUDA) /* || (defined __ROCM) */)
+            if (PARAM.inp.device == "gpu")
+            {
+                return new ESolver_KS_LCAO_TDDFT<std::complex<double>, base_device::DEVICE_GPU>();
+            }
 #endif
-        return new ESolver_KS_LCAO_TDDFT<base_device::DEVICE_CPU>();
+            return new ESolver_KS_LCAO_TDDFT<std::complex<double>, base_device::DEVICE_CPU>();
+        }
     }
     else if (esolver_type == "lr_lcao")
     {
diff --git a/source/source_esolver/esolver_ks.cpp b/source/source_esolver/esolver_ks.cpp
index eaa69eb93b..8e457c9d09 100644
--- a/source/source_esolver/esolver_ks.cpp
+++ b/source/source_esolver/esolver_ks.cpp
@@ -287,7 +287,10 @@ void ESolver_KS<T, Device>::before_scf(UnitCell& ucell, const int istep)
 template <typename T, typename Device>
 void ESolver_KS<T, Device>::iter_init(UnitCell& ucell, const int istep, const int iter)
 {
-    ModuleIO::write_head(GlobalV::ofs_running, istep, iter, this->basisname);
+    if(PARAM.inp.esolver_type != "tddft")
+    {
+        ModuleIO::write_head(GlobalV::ofs_running, istep, iter, this->basisname);
+    }
 
 #ifdef __MPI
     iter_time = MPI_Wtime();
diff --git a/source/source_esolver/esolver_ks_lcao.cpp b/source/source_esolver/esolver_ks_lcao.cpp
index 30e918e83e..24cd14ffb6 100644
--- a/source/source_esolver/esolver_ks_lcao.cpp
+++ b/source/source_esolver/esolver_ks_lcao.cpp
@@ -161,7 +161,7 @@ void ESolver_KS_LCAO<TK, TR>::before_all_runners(UnitCell& ucell, const Input_pa
     }
 
     // 5) read psi from file
-    if (PARAM.inp.init_wfc == "file")
+    if (PARAM.inp.init_wfc == "file" && PARAM.inp.esolver_type != "tddft")
 	{
 		if (!ModuleIO::read_wfc_nao(PARAM.globalv.global_readin_dir, 
 					this->pv, 
diff --git a/source/source_esolver/esolver_ks_lcao_tddft.cpp b/source/source_esolver/esolver_ks_lcao_tddft.cpp
index 8c71cc2df2..9d3528ea2c 100644
--- a/source/source_esolver/esolver_ks_lcao_tddft.cpp
+++ b/source/source_esolver/esolver_ks_lcao_tddft.cpp
@@ -3,8 +3,10 @@
 #include "source_io/cal_r_overlap_R.h"
 #include "source_io/dipole_io.h"
 #include "source_io/td_current_io.h"
+#include "source_io/read_wfc_nao.h"
 #include "source_io/write_HS.h"
 #include "source_io/write_HS_R.h"
+#include "source_io/output_log.h"
 #include "source_estate/elecstate_tools.h"
 
 //--------------temporary----------------------------
@@ -18,7 +20,6 @@
 #include "source_estate/module_dm/density_matrix.h"
 #include "source_estate/occupy.h"
 #include "source_lcao/module_tddft/evolve_elec.h"
-#include "source_lcao/module_tddft/td_velocity.h"
 #include "source_pw/hamilt_pwdft/global.h"
 #include "source_io/print_info.h"
 
@@ -29,6 +30,7 @@
 #include "source_hsolver/hsolver_lcao.h"
 #include "module_parameter/parameter.h"
 #include "source_psi/psi.h"
+#include "source_estate/module_pot/H_TDDFT_pw.h"
 
 //-----force& stress-------------------
 #include "source_lcao/hamilt_lcaodft/FORCE_STRESS.h"
@@ -38,11 +40,11 @@
 namespace ModuleESolver
 {
 
-template <typename Device>
-ESolver_KS_LCAO_TDDFT<Device>::ESolver_KS_LCAO_TDDFT()
+template <typename TR, typename Device>
+ESolver_KS_LCAO_TDDFT<TR, Device>::ESolver_KS_LCAO_TDDFT()
 {
-    classname = "ESolver_rtTDDFT";
-    basisname = "LCAO";
+    this->classname = "ESolver_rtTDDFT";
+    this->basisname = "LCAO";
 
     // If the device is GPU, we must open use_tensor and use_lapack
     ct::DeviceType ct_device_type = ct::DeviceTypeToEnum<Device>::value;
@@ -53,13 +55,13 @@ ESolver_KS_LCAO_TDDFT<Device>::ESolver_KS_LCAO_TDDFT()
     }
 }
 
-template <typename Device>
-ESolver_KS_LCAO_TDDFT<Device>::~ESolver_KS_LCAO_TDDFT()
+template <typename TR, typename Device>
+ESolver_KS_LCAO_TDDFT<TR, Device>::~ESolver_KS_LCAO_TDDFT()
 {
     delete psi_laststep;
     if (Hk_laststep != nullptr)
     {
-        for (int ik = 0; ik < kv.get_nks(); ++ik)
+        for (int ik = 0; ik < this->kv.get_nks(); ++ik)
         {
             delete[] Hk_laststep[ik];
         }
@@ -67,38 +69,192 @@ ESolver_KS_LCAO_TDDFT<Device>::~ESolver_KS_LCAO_TDDFT()
     }
     if (Sk_laststep != nullptr)
     {
-        for (int ik = 0; ik < kv.get_nks(); ++ik)
+        for (int ik = 0; ik < this->kv.get_nks(); ++ik)
         {
             delete[] Sk_laststep[ik];
         }
         delete[] Sk_laststep;
     }
+    if (td_p != nullptr)
+    {
+        delete td_p;
+    }
+    TD_info::td_vel_op = nullptr;
 }
 
-template <typename Device>
-void ESolver_KS_LCAO_TDDFT<Device>::before_all_runners(UnitCell& ucell, const Input_para& inp)
+template <typename TR, typename Device>
+void ESolver_KS_LCAO_TDDFT<TR, Device>::before_all_runners(UnitCell& ucell, const Input_para& inp)
 {
     // 1) run before_all_runners in ESolver_KS_LCAO
-    ESolver_KS_LCAO<std::complex<double>, double>::before_all_runners(ucell, inp);
+    ESolver_KS_LCAO<std::complex<double>, TR>::before_all_runners(ucell, inp);
 
     // this line should be optimized
     // this->pelec = dynamic_cast<elecstate::ElecStateLCAO_TDDFT*>(this->pelec);
+
+    td_p  = new TD_info(&ucell);
+    TD_info::td_vel_op = td_p;
+    totstep += TD_info::estep_shift;
+
+    if (PARAM.inp.init_wfc == "file")
+	{
+		if (!ModuleIO::read_wfc_nao(PARAM.globalv.global_readin_dir, 
+					this->pv, 
+					*(this->psi), 
+					this->pelec, 
+                    this->pelec->klist->ik2iktot,
+                    this->pelec->klist->get_nkstot(),
+					PARAM.inp.nspin,
+                    TD_info::estep_shift))
+        {
+            ModuleBase::WARNING_QUIT("ESolver_KS_LCAO", "read electronic wave functions failed");
+        }
+    }
 }
+template <typename TR, typename Device>
+void ESolver_KS_LCAO_TDDFT<TR, Device>::runner(UnitCell& ucell, const int istep)
+{
+    ModuleBase::TITLE("ESolver_KS_LCAO_TDDFT", "runner");
+    ModuleBase::timer::tick(this->classname, "runner");
+
+    //----------------------------------------------------------------
+    // 1) before_scf (electronic iteration loops)
+    //----------------------------------------------------------------
+    this->before_scf(ucell, istep);
+    ModuleBase::GlobalFunc::DONE(GlobalV::ofs_running, "INIT SCF");
+
+    // things only initialize once
+    //this->pelec_td->first_evolve = true;
+    if(PARAM.inp.td_stype!=1 && TD_info::out_current)
+    {
+        // initialize the velocity operator
+        velocity_mat = new Velocity_op<TR>(&ucell, &(this->gd), &this->pv, this->orb_, this->two_center_bundle_.overlap_orb.get());
+        //calculate velocity operator
+        velocity_mat->calculate_grad_term();
+        velocity_mat->calculate_vcomm_r();
+    }
+    int estep_max = (istep == 0) ? PARAM.inp.estep_per_md +1 : PARAM.inp.estep_per_md;
+    //int estep_max = PARAM.inp.estep_per_md;
+    for(int estep =0; estep < estep_max; estep++)
+    {
+        // calculate total time step
+        this->totstep++;
+        this->print_step();
+        //update At
+        if(PARAM.inp.td_stype > 0)
+        {
+            elecstate::H_TDDFT_pw::update_At();
+            td_p->cal_cart_At(elecstate::H_TDDFT_pw::At);
+        }
+        ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "Cartesian vector potential Ax(t)", td_p->cart_At[0]);
+        ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "Cartesian vector potential Ay(t)", td_p->cart_At[1]);
+        ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "Cartesian vector potential Az(t)", td_p->cart_At[2]);
+        //std::cout<<"Et: "<<elecstate::H_TDDFT_pw::Et[0]<<" "<<elecstate::H_TDDFT_pw::Et[1]<<" "<<elecstate::H_TDDFT_pw::Et[2]<<std::endl;
+        if(estep!=0)
+        {
+            this->CE.update_all_dis(ucell);
+            this->CE.extrapolate_charge(&this->Pgrid,
+                                        ucell,
+                                        &this->chr,
+                                        &this->sf,
+                                        GlobalV::ofs_running,
+                                        GlobalV::ofs_warning);
+            //need to test if correct when estep>0
+            this->pelec->init_scf(totstep, ucell, this->Pgrid, this->sf.strucFac, this->locpp.numeric, ucell.symm);
+            /*if(PARAM.inp.td_stype == 2)
+            {
+                dynamic_cast<elecstate::ElecStateLCAO<std::complex<double>>*>(this->pelec)->get_DM()->cal_DMR_td(ucell, td_p->cart_At);
+            }
+            else
+            {
+                dynamic_cast<elecstate::ElecStateLCAO<std::complex<double>>*>(this->pelec)->get_DM()->cal_DMR();
+            }*/
+            
+            if(totstep <= PARAM.inp.td_tend + 1)
+            {
+                TD_info::evolve_once = true;
+            }
+        }
+        //----------------------------------------------------------------
+        // 2) SCF iterations
+        //----------------------------------------------------------------
+        bool conv_esolver = false;
+        this->niter = this->maxniter;
+        this->diag_ethr = PARAM.inp.pw_diag_thr;
+        for (int iter = 1; iter <= this->maxniter; ++iter)
+        {
+            ModuleIO::write_head_td(GlobalV::ofs_running, istep, estep, iter, this->basisname);
+            //----------------------------------------------------------------
+            // 3) initialization of SCF iterations
+            //----------------------------------------------------------------
+            this->iter_init(ucell, totstep, iter);
+
+            //----------------------------------------------------------------
+            // 4) use Hamiltonian to obtain charge density
+            //----------------------------------------------------------------
+            this->hamilt2rho(ucell, totstep, iter, this->diag_ethr);
+
+            //----------------------------------------------------------------
+            // 5) finish scf iterations
+            //----------------------------------------------------------------
+            this->iter_finish(ucell, totstep, iter, conv_esolver);
+
+            //----------------------------------------------------------------
+            // 6) check convergence
+            //----------------------------------------------------------------
+            if (conv_esolver || this->oscillate_esolver)
+            {
+                this->niter = iter;
+                if (this->oscillate_esolver)
+                {
+                    std::cout << " !! Density oscillation is found, STOP HERE !!" << std::endl;
+                }
+                break;
+            }
+        } // end scf iterations
 
-template <typename Device>
-void ESolver_KS_LCAO_TDDFT<Device>::hamilt2rho_single(UnitCell& ucell,
+        //----------------------------------------------------------------
+        // 7) after scf
+        //----------------------------------------------------------------
+        this->after_scf(ucell, totstep, conv_esolver);
+        if(!restart_done && PARAM.inp.mdp.md_restart)
+        {
+            estep += TD_info::estep_shift%PARAM.inp.estep_per_md;
+            restart_done = true;
+            if(estep==0)break;
+        }
+    }
+    if(PARAM.inp.td_stype!=1 && TD_info::out_current)
+    {
+        delete velocity_mat;
+    }
+    ModuleBase::timer::tick(this->classname, "runner");
+    return;
+}
+//output electronic step infos
+template <typename TR, typename Device>
+void ESolver_KS_LCAO_TDDFT<TR, Device>::print_step()
+{
+    std::cout << " -------------------------------------------" << std::endl;
+	GlobalV::ofs_running << "\n -------------------------------------------" << std::endl;
+    std::cout << " STEP OF ELECTRON EVOLVE : " << unsigned(totstep) << std::endl;
+	GlobalV::ofs_running << " STEP OF ELECTRON EVOLVE : " << unsigned(totstep) << std::endl;
+    std::cout << " -------------------------------------------" << std::endl;
+    GlobalV::ofs_running << " -------------------------------------------" << std::endl;
+}
+template <typename TR, typename Device>
+void ESolver_KS_LCAO_TDDFT<TR, Device>::hamilt2rho_single(UnitCell& ucell,
                                                           const int istep,
                                                           const int iter,
                                                           const double ethr)
 {
     if (PARAM.inp.init_wfc == "file")
     {
-        if (istep >= 1)
+        if (istep >= TD_info::estep_shift + 1)
         {
             module_tddft::Evolve_elec<Device>::solve_psi(istep,
                                                          PARAM.inp.nbands,
                                                          PARAM.globalv.nlocal,
-                                                         kv.get_nks(),
+                                                         this->kv.get_nks(),
                                                          this->p_hamilt,
                                                          this->pv,
                                                          this->psi,
@@ -111,16 +267,15 @@ void ESolver_KS_LCAO_TDDFT<Device>::hamilt2rho_single(UnitCell& ucell,
                                                          PARAM.inp.propagator,
                                                          use_tensor,
                                                          use_lapack);
-            this->weight_dm_rho();
         }
-        this->weight_dm_rho();
+        this->weight_dm_rho(ucell);
     }
-    else if (istep >= 2)
+    else if (istep >= 1)
     {
         module_tddft::Evolve_elec<Device>::solve_psi(istep,
                                                      PARAM.inp.nbands,
                                                      PARAM.globalv.nlocal,
-                                                     kv.get_nks(),
+                                                     this->kv.get_nks(),
                                                      this->p_hamilt,
                                                      this->pv,
                                                      this->psi,
@@ -133,7 +288,7 @@ void ESolver_KS_LCAO_TDDFT<Device>::hamilt2rho_single(UnitCell& ucell,
                                                      PARAM.inp.propagator,
                                                      use_tensor,
                                                      use_lapack);
-        this->weight_dm_rho();
+        this->weight_dm_rho(ucell);
     }
     else
     {
@@ -154,7 +309,7 @@ void ESolver_KS_LCAO_TDDFT<Device>::hamilt2rho_single(UnitCell& ucell,
         Symmetry_rho srho;
         for (int is = 0; is < PARAM.inp.nspin; is++)
         {
-            srho.begin(is, this->chr, pw_rho, ucell.symm);
+            srho.begin(is, this->chr, this->pw_rho, ucell.symm);
         }
     }
 
@@ -162,8 +317,8 @@ void ESolver_KS_LCAO_TDDFT<Device>::hamilt2rho_single(UnitCell& ucell,
     this->pelec->f_en.deband = this->pelec->cal_delta_eband(ucell);
 }
 
-template <typename Device>
-void ESolver_KS_LCAO_TDDFT<Device>::iter_finish(
+template <typename TR, typename Device>
+void ESolver_KS_LCAO_TDDFT<TR, Device>::iter_finish(
 		UnitCell& ucell, 
 		const int istep, 
 		int& iter,
@@ -179,7 +334,7 @@ void ESolver_KS_LCAO_TDDFT<Device>::iter_finish(
         GlobalV::ofs_running << std::setiosflags(std::ios::showpoint);
         GlobalV::ofs_running << std::left;
         std::setprecision(6);
-        for (int ik = 0; ik < kv.get_nks(); ik++)
+        for (int ik = 0; ik < this->kv.get_nks(); ik++)
         {
             for (int ib = 0; ib < PARAM.inp.nbands; ib++)
             {
@@ -192,11 +347,11 @@ void ESolver_KS_LCAO_TDDFT<Device>::iter_finish(
                              << std::endl;
     }
 
-    ESolver_KS_LCAO<std::complex<double>, double>::iter_finish(ucell, istep, iter, conv_esolver);
+    ESolver_KS_LCAO<std::complex<double>, TR>::iter_finish(ucell, istep, iter, conv_esolver);
 }
 
-template <typename Device>
-void ESolver_KS_LCAO_TDDFT<Device>::update_pot(UnitCell& ucell, 
+template <typename TR, typename Device>
+void ESolver_KS_LCAO_TDDFT<TR, Device>::update_pot(UnitCell& ucell, 
 		const int istep, 
 		const int iter, 
 		const bool conv_esolver)
@@ -220,7 +375,7 @@ void ESolver_KS_LCAO_TDDFT<Device>::update_pot(UnitCell& ucell,
     const int nlocal = PARAM.globalv.nlocal;
 
     // store wfc and Hk laststep
-    if (istep >= (PARAM.inp.init_wfc == "file" ? 0 : 1) && conv_esolver)
+    if (conv_esolver)
     {
         if (this->psi_laststep == nullptr)
         {
@@ -233,7 +388,7 @@ void ESolver_KS_LCAO_TDDFT<Device>::update_pot(UnitCell& ucell,
             ncol_tmp = nbands;
             nrow_tmp = nlocal;
 #endif
-            this->psi_laststep = new psi::Psi<std::complex<double>>(kv.get_nks(), ncol_tmp, nrow_tmp, kv.ngk, true);
+            this->psi_laststep = new psi::Psi<std::complex<double>>(this->kv.get_nks(), ncol_tmp, nrow_tmp, this->kv.ngk, true);
 
         }
 
@@ -245,8 +400,8 @@ void ESolver_KS_LCAO_TDDFT<Device>::update_pot(UnitCell& ucell,
 
             if (this->Hk_laststep == nullptr)
             {
-                this->Hk_laststep = new std::complex<double>*[kv.get_nks()];
-                for (int ik = 0; ik < kv.get_nks(); ++ik)
+                this->Hk_laststep = new std::complex<double>*[this->kv.get_nks()];
+                for (int ik = 0; ik < this->kv.get_nks(); ++ik)
                 {
                     // Allocate memory for Hk_laststep, if (use_tensor && use_lapack), should be global
                     this->Hk_laststep[ik] = new std::complex<double>[len_HS];
@@ -255,8 +410,8 @@ void ESolver_KS_LCAO_TDDFT<Device>::update_pot(UnitCell& ucell,
             }
             if (this->Sk_laststep == nullptr)
             {
-                this->Sk_laststep = new std::complex<double>*[kv.get_nks()];
-                for (int ik = 0; ik < kv.get_nks(); ++ik)
+                this->Sk_laststep = new std::complex<double>*[this->kv.get_nks()];
+                for (int ik = 0; ik < this->kv.get_nks(); ++ik)
                 {
                     // Allocate memory for Sk_laststep, if (use_tensor && use_lapack), should be global
                     this->Sk_laststep[ik] = new std::complex<double>[len_HS];
@@ -266,16 +421,16 @@ void ESolver_KS_LCAO_TDDFT<Device>::update_pot(UnitCell& ucell,
         }
 
         // put information to Hk_laststep and Sk_laststep
-        for (int ik = 0; ik < kv.get_nks(); ++ik)
+        for (int ik = 0; ik < this->kv.get_nks(); ++ik)
         {
             this->psi->fix_k(ik);
             this->psi_laststep->fix_k(ik);
 
             // copy the data from psi to psi_laststep
-            const int size0 = psi->get_nbands() * psi->get_nbasis();
+            const int size0 = this->psi->get_nbands() * this->psi->get_nbasis();
             for (int index = 0; index < size0; ++index)
             {
-                psi_laststep[0].get_pointer()[index] = psi[0].get_pointer()[index];
+                psi_laststep[0].get_pointer()[index] = this->psi[0].get_pointer()[index];
             }
 
             // store Hamiltonian
@@ -316,7 +471,7 @@ void ESolver_KS_LCAO_TDDFT<Device>::update_pot(UnitCell& ucell,
         }
 
         // calculate energy density matrix for tddft
-        if (istep >= (PARAM.inp.init_wfc == "file" ? 0 : 2) && PARAM.inp.td_edm == 0)
+        if (istep >= (PARAM.inp.init_wfc == "file" ? 0 : 1) && PARAM.inp.td_edm == 0)
         {
             elecstate::cal_edm_tddft(this->pv, this->pelec, this->kv, this->p_hamilt);
         }
@@ -336,7 +491,7 @@ void ESolver_KS_LCAO_TDDFT<Device>::update_pot(UnitCell& ucell,
         GlobalV::ofs_running << std::setprecision(6);
         GlobalV::ofs_running << std::setiosflags(std::ios::showpoint);
 
-        for (int ik = 0; ik < kv.get_nks(); ik++)
+        for (int ik = 0; ik < this->kv.get_nks(); ik++)
         {
             for (int ib = 0; ib < PARAM.inp.nbands; ib++)
             {
@@ -350,13 +505,13 @@ void ESolver_KS_LCAO_TDDFT<Device>::update_pot(UnitCell& ucell,
 */
 }
 
-template <typename Device>
-void ESolver_KS_LCAO_TDDFT<Device>::after_scf(UnitCell& ucell, const int istep, const bool conv_esolver)
+template <typename TR, typename Device>
+void ESolver_KS_LCAO_TDDFT<TR, Device>::after_scf(UnitCell& ucell, const int istep, const bool conv_esolver)
 {
     ModuleBase::TITLE("ESolver_LCAO_TDDFT", "after_scf");
     ModuleBase::timer::tick("ESolver_LCAO_TDDFT", "after_scf");
 
-    ESolver_KS_LCAO<std::complex<double>, double>::after_scf(ucell, istep, conv_esolver);
+    ESolver_KS_LCAO<std::complex<double>, TR>::after_scf(ucell, istep, conv_esolver);
 
     // (1) write dipole information
     for (int is = 0; is < PARAM.inp.nspin; is++)
@@ -373,31 +528,52 @@ void ESolver_KS_LCAO_TDDFT<Device>::after_scf(UnitCell& ucell, const int istep,
                                    ss_dipole.str());
         }
     }
-
-     // (2) write current information
-    if (TD_Velocity::out_current == true)
-    {
-        elecstate::DensityMatrix<std::complex<double>, double>* tmp_DM
+    elecstate::DensityMatrix<std::complex<double>, double>* tmp_DM
             = dynamic_cast<elecstate::ElecStateLCAO<std::complex<double>>*>(this->pelec)->get_DM();
-
-        ModuleIO::write_current(ucell,
-                                this->gd,
-                                istep,
-                                this->psi,
-                                pelec,
-                                kv,
-                                two_center_bundle_.overlap_orb.get(),
-                                tmp_DM->get_paraV_pointer(),
-                                orb_,
-                                this->RA);
+    // (2) write current information
+    if(TD_info::out_current)
+    {
+        if(TD_info::out_current_k)
+        {
+            ModuleIO::write_current_eachk(ucell,
+                                    istep,
+                                    this->psi,
+                                    this->pelec,
+                                    this->kv,
+                                    this->two_center_bundle_.overlap_orb.get(),
+                                    tmp_DM->get_paraV_pointer(),
+                                    this->orb_,
+                                    this->velocity_mat,
+                                    this->RA);
+        }
+        else
+        {
+            ModuleIO::write_current(ucell,
+                                    istep,
+                                    this->psi,
+                                    this->pelec,
+                                    this->kv,
+                                    this->two_center_bundle_.overlap_orb.get(),
+                                    tmp_DM->get_paraV_pointer(),
+                                    this->orb_,
+                                    this->velocity_mat,
+                                    this->RA);
+        }
     }
+    // (3) output energy for sub loop
+    std::cout << "Potential (Ry): " << std::setprecision(15) << this->pelec->f_en.etot <<std::endl;
 
-
+    // (4) output file for restart
+    if(istep % PARAM.inp.out_interval == 0)
+    {
+        td_p->out_restart_info(istep, elecstate::H_TDDFT_pw::At, elecstate::H_TDDFT_pw::At_laststep);
+    }
+    
     ModuleBase::timer::tick("ESolver_LCAO_TDDFT", "after_scf");
 }
 
-template <typename Device>
-void ESolver_KS_LCAO_TDDFT<Device>::weight_dm_rho()
+template <typename TR, typename Device>
+void ESolver_KS_LCAO_TDDFT<TR, Device>::weight_dm_rho(const UnitCell& ucell)
 {
     if (PARAM.inp.ocp == 1)
     {
@@ -417,15 +593,24 @@ void ESolver_KS_LCAO_TDDFT<Device>::weight_dm_rho()
 
     auto _pes = dynamic_cast<elecstate::ElecStateLCAO<std::complex<double>>*>(this->pelec);
     elecstate::cal_dm_psi(_pes->DM->get_paraV_pointer(), _pes->wg, this->psi[0], *(_pes->DM));
-    _pes->DM->cal_DMR();
+    if(PARAM.inp.td_stype == 2)
+    {
+        _pes->DM->cal_DMR_td(ucell, td_p->cart_At);
+    }
+    else
+    {
+         _pes->DM->cal_DMR();
+    }
 
     // get the real-space charge density
     this->pelec->psiToRho(this->psi[0]);
 }
 
-template class ESolver_KS_LCAO_TDDFT<base_device::DEVICE_CPU>;
+template class ESolver_KS_LCAO_TDDFT<double, base_device::DEVICE_CPU>;
+template class ESolver_KS_LCAO_TDDFT<std::complex<double>, base_device::DEVICE_CPU>;
 #if ((defined __CUDA) /* || (defined __ROCM) */)
-template class ESolver_KS_LCAO_TDDFT<base_device::DEVICE_GPU>;
+template class ESolver_KS_LCAO_TDDFT<double, base_device::DEVICE_GPU>;
+template class ESolver_KS_LCAO_TDDFT<std::complex<double>, base_device::DEVICE_GPU>;
 #endif
 
 } // namespace ModuleESolver
diff --git a/source/source_esolver/esolver_ks_lcao_tddft.h b/source/source_esolver/esolver_ks_lcao_tddft.h
index 26bd770dc2..9071b65600 100644
--- a/source/source_esolver/esolver_ks_lcao_tddft.h
+++ b/source/source_esolver/esolver_ks_lcao_tddft.h
@@ -5,6 +5,8 @@
 #include "source_base/scalapack_connector.h" // Cpxgemr2d
 #include "source_lcao/hamilt_lcaodft/record_adj.h"
 #include "source_psi/psi.h"
+#include "source_lcao/module_tddft/velocity_op.h"
+#include "source_lcao/module_tddft/td_info.h"
 
 namespace ModuleESolver
 {
@@ -47,8 +49,8 @@ void gatherMatrix(const int myid, const int root_proc, const hamilt::MatrixBlock
 }
 //------------------------ MPI gathering and distributing functions ------------------------//
 
-template <typename Device = base_device::DEVICE_CPU>
-class ESolver_KS_LCAO_TDDFT : public ESolver_KS_LCAO<std::complex<double>, double>
+template <typename TR, typename Device = base_device::DEVICE_CPU>
+class ESolver_KS_LCAO_TDDFT : public ESolver_KS_LCAO<std::complex<double>, TR>
 {
   public:
     ESolver_KS_LCAO_TDDFT();
@@ -58,6 +60,8 @@ class ESolver_KS_LCAO_TDDFT : public ESolver_KS_LCAO<std::complex<double>, doubl
     void before_all_runners(UnitCell& ucell, const Input_para& inp) override;
 
   protected:
+    virtual void runner(UnitCell& cell, const int istep) override;
+
     virtual void hamilt2rho_single(UnitCell& ucell, const int istep, const int iter, const double ethr) override;
 
     virtual void update_pot(UnitCell& ucell, const int istep, const int iter, const bool conv_esolver) override;
@@ -66,6 +70,7 @@ class ESolver_KS_LCAO_TDDFT : public ESolver_KS_LCAO<std::complex<double>, doubl
 
     virtual void after_scf(UnitCell& ucell, const int istep, const bool conv_esolver) override;
 
+    void print_step();
     //! wave functions of last time step
     psi::Psi<std::complex<double>>* psi_laststep = nullptr;
 
@@ -81,9 +86,21 @@ class ESolver_KS_LCAO_TDDFT : public ESolver_KS_LCAO<std::complex<double>, doubl
     bool use_tensor = false;
     bool use_lapack = false;
 
+    //! Total steps for evolving the wave function
+    int totstep = -1;
+
+    //! Velocity matrix for calculating current
+    Velocity_op<TR>* velocity_mat = nullptr;
+
+    TD_info* td_p = nullptr;
+
+    //! doubt
+    bool restart_done = false;
+
   private:
-    void weight_dm_rho();
+    void weight_dm_rho(const UnitCell& ucell);
 };
 
 } // namespace ModuleESolver
 #endif
+
diff --git a/source/source_estate/module_dm/density_matrix.cpp b/source/source_estate/module_dm/density_matrix.cpp
index 4a0d21b377..19d83a76d8 100644
--- a/source/source_estate/module_dm/density_matrix.cpp
+++ b/source/source_estate/module_dm/density_matrix.cpp
@@ -220,6 +220,178 @@ void DensityMatrix<std::complex<double>, double>::cal_DMR(const int ik_in)
     ModuleBase::timer::tick("DensityMatrix", "cal_DMR");
 }
 
+// calculate DMR from DMK using blas for multi-k calculation
+template <>
+void DensityMatrix<std::complex<double>, double>::cal_DMR_td(const UnitCell& ucell, const ModuleBase::Vector3<double> At, const int ik_in)
+{
+    ModuleBase::TITLE("DensityMatrix", "cal_DMR_td");
+    // To check whether DMR has been initialized
+#ifdef __DEBUG
+    assert(!this->_DMR.empty() && "DMR has not been initialized!");
+#endif
+
+    ModuleBase::timer::tick("DensityMatrix", "cal_DMR_td");
+    int ld_hk = this->_paraV->nrow;
+    for (int is = 1; is <= this->_nspin; ++is)
+    {
+        int ik_begin = this->_nk * (is - 1); // jump this->_nk for spin_down if nspin==2
+        hamilt::HContainer<double>* target_DMR = this->_DMR[is - 1];
+        // set zero since this function is called in every scf step
+        target_DMR->set_zero();
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic)
+#endif
+        for (int i = 0; i < target_DMR->size_atom_pairs(); ++i)
+        {
+            hamilt::AtomPair<double>& target_ap = target_DMR->get_atom_pair(i);
+            int iat1 = target_ap.get_atom_i();
+            int iat2 = target_ap.get_atom_j();
+            // get global indexes of whole matrix for each atom in this process
+            int row_ap = this->_paraV->atom_begin_row[iat1];
+            int col_ap = this->_paraV->atom_begin_col[iat2];
+            const int row_size = this->_paraV->get_row_size(iat1);
+            const int col_size = this->_paraV->get_col_size(iat2);
+            const int mat_size = row_size * col_size;
+            const int r_size = target_ap.get_R_size();
+            if (row_ap == -1 || col_ap == -1)
+            {
+                throw std::string("Atom-pair not belong this process");
+            }
+            std::vector<std::complex<double>> tmp_DMR;
+            if (PARAM.inp.nspin == 4)
+            {
+                tmp_DMR.resize(mat_size * r_size, 0);
+            }
+
+            // calculate kphase and target_mat_ptr
+            std::vector<std::complex<double>> kphase_vec(r_size * this->_nk);
+            std::vector<double*> target_DMR_mat_vec(r_size);
+            for(int ir = 0; ir < r_size; ++ir)
+            {
+                const ModuleBase::Vector3<int> r_index = target_ap.get_R_index(ir);
+                hamilt::BaseMatrix<double>* target_mat = target_ap.find_matrix(r_index);
+#ifdef __DEBUG
+                if (target_mat == nullptr)
+                {
+                    std::cout << "target_mat is nullptr" << std::endl;
+                    continue;
+                }
+#endif
+                target_DMR_mat_vec[ir] = target_mat->get_pointer();
+                double arg_td = 0.0;
+                //cal tddft phase for hybrid gague
+                ModuleBase::Vector3<double> dtau = ucell.cal_dtau(iat1, iat2, r_index);
+                arg_td = At * dtau * ucell.lat0;
+                for(int ik = 0; ik < this->_nk; ++ik)
+                {
+                    if(ik_in >= 0 && ik_in != ik) 
+                    { 
+                        continue;
+                    }
+                    // cal k_phase
+                    // if TK==std::complex<double>, kphase is e^{ikR}
+                    const ModuleBase::Vector3<double> dR(r_index[0], r_index[1], r_index[2]);
+                    const double arg = (this->_kvec_d[ik] * dR) * ModuleBase::TWO_PI + arg_td;
+                    double sinp, cosp;
+                    ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                    kphase_vec[ik * r_size + ir] = std::complex<double>(cosp, sinp);
+                }
+            }
+
+            std::vector<std::complex<double>> tmp_DMK_mat(mat_size);
+            // step_trace = 0 for NSPIN=1,2; ={0, 1, local_col, local_col+1} for NSPIN=4
+            // step_trace is used when nspin = 4;
+            int step_trace[4]{};
+            if(PARAM.inp.nspin == 4)
+            {
+                const int npol = 2;
+                for (int is = 0; is < npol; is++)
+                {
+                    for (int is2 = 0; is2 < npol; is2++)
+                    {
+                        step_trace[is * npol + is2] = target_ap.get_col_size() * is + is2;
+                    }
+                }
+            }
+            for(int ik = 0; ik < this->_nk; ++ik)
+            {
+                if(ik_in >= 0 && ik_in != ik) 
+                { 
+                    continue;
+                }
+
+                // copy column-major DMK to row-major tmp_DMK_mat (for the purpose of computational efficiency)
+                const std::complex<double>* DMK_mat_ptr = this->_DMK[ik + ik_begin].data() + col_ap * this->_paraV->nrow + row_ap;
+                for(int icol = 0; icol < col_size; ++icol)
+                {
+                    for(int irow = 0; irow < row_size; ++irow)
+                    {
+                        tmp_DMK_mat[irow * col_size + icol] = DMK_mat_ptr[icol * ld_hk + irow];
+                    }
+                }
+
+                // if nspin != 4, fill DMR
+                // if nspin == 4, fill tmp_DMR
+                for(int ir = 0; ir < r_size; ++ir)
+                {
+                    std::complex<double> kphase = kphase_vec[ik * r_size + ir];
+                    if(PARAM.inp.nspin != 4)
+                    {
+                        double* target_DMR_mat = target_DMR_mat_vec[ir];
+                        for(int i = 0; i < mat_size; i++)
+                        {
+                            target_DMR_mat[i] += kphase.real() * tmp_DMK_mat[i].real() 
+                                    - kphase.imag() * tmp_DMK_mat[i].imag();
+                        }
+                    } else if(PARAM.inp.nspin == 4)
+                    {
+                        std::complex<double>* tmp_DMR_mat = &tmp_DMR[ir * mat_size];
+                        BlasConnector::axpy(mat_size,
+                                            kphase,
+                                            tmp_DMK_mat.data(),
+                                            1,
+                                            tmp_DMR_mat,
+                                            1);
+                    }
+                }
+            }
+
+            // if nspin == 4
+            // copy tmp_DMR to fill target_DMR
+            if(PARAM.inp.nspin == 4)
+            {
+                std::complex<double> tmp[4]{};
+                for(int ir = 0; ir < r_size; ++ir)
+                {
+                    std::complex<double>* tmp_DMR_mat = &tmp_DMR[ir * mat_size];
+                    double* target_DMR_mat = target_DMR_mat_vec[ir];
+                    for (int irow = 0; irow < row_size; irow += 2)
+                    {
+                        for (int icol = 0; icol < col_size; icol += 2)
+                        {
+                            // catch the 4 spin component value of one orbital pair
+                            tmp[0] = tmp_DMR_mat[icol + step_trace[0]];
+                            tmp[1] = tmp_DMR_mat[icol + step_trace[1]];
+                            tmp[2] = tmp_DMR_mat[icol + step_trace[2]];
+                            tmp[3] = tmp_DMR_mat[icol + step_trace[3]];
+                            // transfer to Pauli matrix and save the real part
+                            // save them back to the target_mat
+                            target_DMR_mat[icol + step_trace[0]] = tmp[0].real() + tmp[3].real();
+                            target_DMR_mat[icol + step_trace[1]] = tmp[1].real() + tmp[2].real();
+                            target_DMR_mat[icol + step_trace[2]]
+                                = -tmp[1].imag() + tmp[2].imag(); // (i * (rho_updown - rho_downup)).real()
+                            target_DMR_mat[icol + step_trace[3]] = tmp[0].real() - tmp[3].real();
+                        }
+                        tmp_DMR_mat += col_size * 2;
+                        target_DMR_mat += col_size * 2;
+                    }
+                }
+            }
+        }
+    }
+    ModuleBase::timer::tick("DensityMatrix", "cal_DMR_td");
+}
+
 // calculate DMR from DMK using blas for multi-k calculation
 template <>
 void DensityMatrix<double, double>::cal_DMR_full(hamilt::HContainer<std::complex<double>>* dmR_out)const{}
diff --git a/source/source_estate/module_dm/density_matrix.h b/source/source_estate/module_dm/density_matrix.h
index d672dc8c04..7e1a7de97c 100644
--- a/source/source_estate/module_dm/density_matrix.h
+++ b/source/source_estate/module_dm/density_matrix.h
@@ -181,6 +181,13 @@ class DensityMatrix
      */
     void cal_DMR(const int ik_in = -1);
 
+    /**
+     * @brief calculate density matrix DMR with additional vector potential phase, used for hybrid gague tddft
+     * if ik_in < 0, calculate all k-points
+     * if ik_in >= 0, calculate only one k-point without summing over k-points
+     */
+    void cal_DMR_td(const UnitCell& ucell, const ModuleBase::Vector3<double> At, const int ik_in = -1);
+
     /**
      * @brief calculate complex density matrix DMR with both real and imaginary part for noncollinear-spin calculation
      * the stored dm(k) has been used to calculate the passin DMR
diff --git a/source/source_estate/module_pot/H_TDDFT_pw.cpp b/source/source_estate/module_pot/H_TDDFT_pw.cpp
index 1fae700031..b9f7327f00 100644
--- a/source/source_estate/module_pot/H_TDDFT_pw.cpp
+++ b/source/source_estate/module_pot/H_TDDFT_pw.cpp
@@ -44,7 +44,9 @@ double H_TDDFT_pw::lcut2;
 
 // velocity gauge
 ModuleBase::Vector3<double> H_TDDFT_pw::At;
-
+ModuleBase::Vector3<double> H_TDDFT_pw::At_laststep;
+// hybrid gague
+ModuleBase::Vector3<double> H_TDDFT_pw::Et;
 // time domain parameters
 
 // Gauss
@@ -83,15 +85,18 @@ std::vector<double> H_TDDFT_pw::heavi_amp; // Ry/bohr
 void H_TDDFT_pw::current_step_info(const std::string& file_dir, int& istep)
 {
     std::stringstream ssc;
-    ssc << file_dir << "Restart_md.dat";
+    ssc << file_dir << "Restart_td.dat";
     std::ifstream file(ssc.str().c_str());
 
     if (!file)
     {
-        ModuleBase::WARNING_QUIT("H_TDDFT_pw::current_step_info", "No Restart_md.dat!");
+        ModuleBase::WARNING_QUIT("H_TDDFT_pw::current_step_info", "No Restart_td.dat!");
     }
 
     file >> istep;
+    file >> At[0] >> At[1] >>At[2];
+    file >> At_laststep[0] >> At_laststep[1] >> At_laststep[2];
+    At_laststep=-At_laststep;
     file.close();
 }
 
@@ -99,8 +104,8 @@ void H_TDDFT_pw::cal_fixed_v(double* vl_pseudo)
 {
     ModuleBase::TITLE("H_TDDFT_pw", "cal_fixed_v");
 
-    // skip if velocity_gauge
-    if (stype == 1)
+    // skip if not length gague
+    if (stype != 0)
     {
         return;
     }
@@ -283,6 +288,10 @@ void H_TDDFT_pw::update_At()
     //std::cout << "calculate electric potential" << std::endl;
     // time evolve
     H_TDDFT_pw::istep++;
+    // midpoint rule should be used both in Hamiltonian and here.
+    At = At + At_laststep/2.0;
+    At_laststep.set(0.0, 0.0, 0.0);
+    Et.set(0.0, 0.0, 0.0);
 
     // judgement to skip vext
     if (!PARAM.inp.td_vext || istep > tend || istep < tstart)
@@ -329,7 +338,11 @@ void H_TDDFT_pw::update_At()
         switch (stype)
         {
         case 1:
-            At[direc - 1] -= out;
+            At_laststep[direc - 1] -= out;
+            break;
+        case 2:
+            At_laststep[direc-1] -= out;
+            Et[direc-1] += vext_time[0];
             break;
         default:
             std::cout << "space_domain_type of electric field is wrong" << std::endl;
@@ -349,6 +362,7 @@ void H_TDDFT_pw::update_At()
         // total count++
         count++;
     }
+    At = At + At_laststep/2.0;
 
     ModuleBase::timer::tick("H_TDDFT_pw", "update_At");
     return;
@@ -373,7 +387,7 @@ double H_TDDFT_pw::cal_v_time(int t_type, const bool last)
         break;
 
     case 3:
-        vext_time = cal_v_time_heaviside();
+        vext_time = cal_v_time_heaviside(last);
         break;
 
         // case 4:
@@ -460,7 +474,7 @@ double H_TDDFT_pw::cal_v_time_trigonometric(const bool last)
     return vext_time;
 }
 
-double H_TDDFT_pw::cal_v_time_heaviside()
+double H_TDDFT_pw::cal_v_time_heaviside(const bool last)
 {
     double t0 = *(heavi_t0.begin() + heavi_count);
     double amp = *(heavi_amp.begin() + heavi_count);
@@ -473,7 +487,7 @@ double H_TDDFT_pw::cal_v_time_heaviside()
     {
         vext_time = 0.0;
     }
-    heavi_count++;
+    if(last)heavi_count++;
 
     return vext_time;
 }
@@ -524,3 +538,4 @@ void H_TDDFT_pw::compute_force(const UnitCell& cell, ModuleBase::matrix& fe)
 }
 
 } // namespace elecstate
+
diff --git a/source/source_estate/module_pot/H_TDDFT_pw.h b/source/source_estate/module_pot/H_TDDFT_pw.h
index f3c2ed2216..cba7cc8b5b 100644
--- a/source/source_estate/module_pot/H_TDDFT_pw.h
+++ b/source/source_estate/module_pot/H_TDDFT_pw.h
@@ -72,6 +72,8 @@ class H_TDDFT_pw : public PotBase
 
     // velocity gauge, vector magnetic potential
     static ModuleBase::Vector3<double> At;
+    static ModuleBase::Vector3<double> At_laststep;
+    static ModuleBase::Vector3<double> Et;
 
     // time domain parameters
 
@@ -144,7 +146,7 @@ class H_TDDFT_pw : public PotBase
     static double cal_v_time_Gauss(const bool last);
     static double cal_v_time_trapezoid(const bool last);
     static double cal_v_time_trigonometric(const bool last);
-    static double cal_v_time_heaviside();
+    static double cal_v_time_heaviside(const bool last);
     // double cal_v_time_HHG();
 
     // get ncut number for At integral
@@ -156,3 +158,4 @@ class H_TDDFT_pw : public PotBase
 } // namespace elecstate
 
 #endif
+
diff --git a/source/source_hamilt/operator.h b/source/source_hamilt/operator.h
index 4f7c9e2a0f..4198220a02 100644
--- a/source/source_hamilt/operator.h
+++ b/source/source_hamilt/operator.h
@@ -26,7 +26,7 @@ enum class calculation_type
     lcao_exx,
     lcao_dftu,
     lcao_sc_lambda,
-    lcao_tddft_velocity,
+    lcao_tddft_periodic,
 };
 
 // Basic class for operator module,
diff --git a/source/source_io/cal_r_overlap_R.cpp b/source/source_io/cal_r_overlap_R.cpp
index 237723e06e..3c9a156a8a 100644
--- a/source/source_io/cal_r_overlap_R.cpp
+++ b/source/source_io/cal_r_overlap_R.cpp
@@ -5,6 +5,7 @@
 #include "source_base/timer.h"
 #include "source_cell/module_neighbor/sltk_grid_driver.h"
 #include "source_pw/hamilt_pwdft/global.h"
+#include "source_base/mathzone_add1.h"
 
 cal_r_overlap_R::cal_r_overlap_R()
 {
@@ -226,6 +227,238 @@ void cal_r_overlap_R::construct_orbs_and_orb_r(const UnitCell& ucell,
     }
 }
 
+void cal_r_overlap_R::construct_orbs_and_nonlocal_and_orb_r(const UnitCell& ucell,const LCAO_Orbitals& orb)
+{
+    const InfoNonlocal& infoNL_ = ucell.infoNL;
+
+    int orb_r_ntype = 0;
+    int mat_Nr = orb.Phi[0].PhiLN(0, 0).getNr();
+    int count_Nr = 0;
+
+    orbs.resize(orb.get_ntype());
+    for (int T = 0; T < orb.get_ntype(); ++T)
+    {
+        count_Nr = orb.Phi[T].PhiLN(0, 0).getNr();
+        if (count_Nr > mat_Nr)
+        {
+            mat_Nr = count_Nr;
+            orb_r_ntype = T;
+        }
+
+        orbs[T].resize(orb.Phi[T].getLmax() + 1);
+        for (int L = 0; L <= orb.Phi[T].getLmax(); ++L)
+        {
+            orbs[T][L].resize(orb.Phi[T].getNchi(L));
+            for (int N = 0; N < orb.Phi[T].getNchi(L); ++N)
+            {
+                const auto& orb_origin = orb.Phi[T].PhiLN(L, N);
+                orbs[T][L][N].set_orbital_info(orb_origin.getLabel(),
+                                               orb_origin.getType(),
+                                               orb_origin.getL(),
+                                               orb_origin.getChi(),
+                                               orb_origin.getNr(),
+                                               orb_origin.getRab(),
+                                               orb_origin.getRadial(),
+                                               Numerical_Orbital_Lm::Psi_Type::Psi,
+                                               orb_origin.getPsi(),
+                                               static_cast<int>(orb_origin.getNk() * kmesh_times) | 1,
+                                               orb_origin.getDk(),
+                                               orb_origin.getDruniform(),
+                                               false,
+                                               true,
+                                               PARAM.inp.cal_force);
+            }
+        }
+    }
+
+    orb_r.set_orbital_info(orbs[orb_r_ntype][0][0].getLabel(),  // atom label
+                           orb_r_ntype,                         // atom type
+                           1,                                   // angular momentum L
+                           1,                                   // number of orbitals of this L , just N
+                           orbs[orb_r_ntype][0][0].getNr(),     // number of radial mesh
+                           orbs[orb_r_ntype][0][0].getRab(),    // the mesh interval in radial mesh
+                           orbs[orb_r_ntype][0][0].getRadial(), // radial mesh value(a.u.)
+                           Numerical_Orbital_Lm::Psi_Type::Psi,
+                           orbs[orb_r_ntype][0][0].getRadial(), // radial wave function
+                           orbs[orb_r_ntype][0][0].getNk(),
+                           orbs[orb_r_ntype][0][0].getDk(),
+                           orbs[orb_r_ntype][0][0].getDruniform(),
+                           false,
+                           true,
+                           PARAM.inp.cal_force);
+
+    orbs_nonlocal.resize(orb.get_ntype());
+    for (int T = 0; T < orb.get_ntype(); ++T)
+    {
+        const int nproj = infoNL_.nproj[T];
+        orbs_nonlocal[T].resize(nproj);
+        for (int ip = 0; ip < nproj; ip++)
+        {
+            int nr = infoNL_.Beta[T].Proj[ip].getNr();
+            double dr_uniform = 0.01;
+	        int nr_uniform = static_cast<int>((infoNL_.Beta[T].Proj[ip].getRadial(nr-1) - infoNL_.Beta[T].Proj[ip].getRadial(0))/dr_uniform) + 1;
+            double* rad = new double[nr_uniform];
+            double* rab = new double[nr_uniform];
+            for (int ir = 0; ir < nr_uniform; ir++)
+            {
+                rad[ir] = ir*dr_uniform;
+                rab[ir] = dr_uniform;
+            }
+            double* y2 = new double[nr];
+            double* Beta_r_uniform = new double[nr_uniform];
+            double* dbeta_uniform = new double[nr_uniform];
+            ModuleBase::Mathzone_Add1::SplineD2(infoNL_.Beta[T].Proj[ip].getRadial(), infoNL_.Beta[T].Proj[ip].getBeta_r(), nr, 0.0, 0.0, y2);
+            ModuleBase::Mathzone_Add1::Cubic_Spline_Interpolation(
+                infoNL_.Beta[T].Proj[ip].getRadial(), 
+                infoNL_.Beta[T].Proj[ip].getBeta_r(), 
+                y2, 
+                nr, 
+                rad, 
+                nr_uniform, 
+                Beta_r_uniform, 
+                dbeta_uniform
+            );
+
+            // linear extrapolation at the zero point
+            if (infoNL_.Beta[T].Proj[ip].getRadial(0) > 1e-10)
+            {
+                double slope = (infoNL_.Beta[T].Proj[ip].getBeta_r(1) - infoNL_.Beta[T].Proj[ip].getBeta_r(0)) / (infoNL_.Beta[T].Proj[ip].getRadial(1) - infoNL_.Beta[T].Proj[ip].getRadial(0));
+                Beta_r_uniform[0] = infoNL_.Beta[T].Proj[ip].getBeta_r(0) - slope * infoNL_.Beta[T].Proj[ip].getRadial(0);
+            }
+
+            // Here, the operation beta_r / r is performed. To avoid divergence at r=0, beta_r(0) is set to beta_r(1). 
+            // However, this may introduce issues, so caution is needed.
+            for (int ir = 1; ir < nr_uniform; ir++)
+            {
+                Beta_r_uniform[ir] = Beta_r_uniform[ir] / rad[ir];
+            }
+            Beta_r_uniform[0] = Beta_r_uniform[1];
+
+            orbs_nonlocal[T][ip].set_orbital_info(infoNL_.Beta[T].getLabel(),
+                                                  infoNL_.Beta[T].getType(),
+                                                  infoNL_.Beta[T].Proj[ip].getL(),
+                                                  1,
+                                                  nr_uniform,
+                                                  rab,
+                                                  rad,
+                                                  Numerical_Orbital_Lm::Psi_Type::Psi,
+                                                  Beta_r_uniform,
+                                                  static_cast<int>(infoNL_.Beta[T].Proj[ip].getNk() * kmesh_times) | 1,
+                                                  infoNL_.Beta[T].Proj[ip].getDk(),
+                                                  infoNL_.Beta[T].Proj[ip].getDruniform(),
+                                                  false,
+                                                  true,
+                                                  PARAM.inp.cal_force);
+
+            delete [] rad;
+            delete [] rab;
+            delete [] y2;
+            delete [] Beta_r_uniform;
+            delete [] dbeta_uniform;
+        }
+    }
+
+    for (int TA = 0; TA < orb.get_ntype(); ++TA)
+    {
+        for (int TB = 0; TB < orb.get_ntype(); ++TB)
+        {
+            for (int LA = 0; LA <= orb.Phi[TA].getLmax(); ++LA)
+            {
+                for (int NA = 0; NA < orb.Phi[TA].getNchi(LA); ++NA)
+                {
+                    for (int ip = 0; ip < infoNL_.nproj[TB]; ip++)
+                    {
+                        center2_orb11_nonlocal[TA][TB][LA][NA].insert(
+                            std::make_pair(ip, Center2_Orb::Orb11(orbs[TA][LA][NA], orbs_nonlocal[TB][ip], psb_, MGT)));
+                    }
+                }
+            }
+        }
+    }
+
+    for (int TA = 0; TA < orb.get_ntype(); ++TA)
+    {
+        for (int TB = 0; TB < orb.get_ntype(); ++TB)
+        {
+            for (int LA = 0; LA <= orb.Phi[TA].getLmax(); ++LA)
+            {
+                for (int NA = 0; NA < orb.Phi[TA].getNchi(LA); ++NA)
+                {
+                    for (int ip = 0; ip < infoNL_.nproj[TB]; ip++)
+                    {
+                        center2_orb21_r_nonlocal[TA][TB][LA][NA].insert(std::make_pair(
+                            ip,
+                            Center2_Orb::Orb21(orbs[TA][LA][NA], orb_r, orbs_nonlocal[TB][ip], psb_, MGT)));
+                    }
+                }
+            }
+        }
+    }
+
+    for (auto& co1: center2_orb11_nonlocal)
+    {
+        for (auto& co2: co1.second)
+        {
+            for (auto& co3: co2.second)
+            {
+                for (auto& co4: co3.second)
+                {
+                    for (auto& co5: co4.second)
+                    {
+                        co5.second.init_radial_table();
+                    }
+                }
+            }
+        }
+    }
+
+    for (auto& co1: center2_orb21_r_nonlocal)
+    {
+        for (auto& co2: co1.second)
+        {
+            for (auto& co3: co2.second)
+            {
+                for (auto& co4: co3.second)
+                {
+                    for (auto& co5: co4.second)
+                    {
+                        co5.second.init_radial_table();
+                    }
+                }
+            }
+        }
+    }
+
+    iw2it.resize(PARAM.globalv.nlocal);
+    iw2ia.resize(PARAM.globalv.nlocal);
+    iw2iL.resize(PARAM.globalv.nlocal);
+    iw2iN.resize(PARAM.globalv.nlocal);
+    iw2im.resize(PARAM.globalv.nlocal);
+
+    int iw = 0;
+    for (int it = 0; it < ucell.ntype; it++)
+    {
+        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
+        {
+            for (int iL = 0; iL < ucell.atoms[it].nwl + 1; iL++)
+            {
+                for (int iN = 0; iN < ucell.atoms[it].l_nchi[iL]; iN++)
+                {
+                    for (int im = 0; im < (2 * iL + 1); im++)
+                    {
+                        iw2it[iw] = it;
+                        iw2ia[iw] = ia;
+                        iw2iL[iw] = iL;
+                        iw2iN[iw] = iN;
+                        iw2im[iw] = im;
+                        iw++;
+                    }
+                }
+            }
+        }
+    }
+}
+
 void cal_r_overlap_R::init(const UnitCell& ucell,const Parallel_Orbitals& pv, const LCAO_Orbitals& orb)
 {
     ModuleBase::TITLE("cal_r_overlap_R", "init");
@@ -239,6 +472,141 @@ void cal_r_overlap_R::init(const UnitCell& ucell,const Parallel_Orbitals& pv, co
     return;
 }
 
+void cal_r_overlap_R::init_nonlocal(const UnitCell& ucell,const Parallel_Orbitals& pv, const LCAO_Orbitals& orb)
+{
+    ModuleBase::TITLE("cal_r_overlap_R", "init_nonlocal");
+    ModuleBase::timer::tick("cal_r_overlap_R", "init_nonlocal");
+    this->ParaV = &pv;
+
+    initialize_orb_table(ucell,orb);
+    construct_orbs_and_nonlocal_and_orb_r(ucell,orb);
+
+    ModuleBase::timer::tick("cal_r_overlap_R", "init_nonlocal");
+    return;
+}
+
+ModuleBase::Vector3<double> cal_r_overlap_R::get_psi_r_psi(const ModuleBase::Vector3<double>& R1,
+                                                           const int& T1,
+                                                           const int& L1,
+                                                           const int& m1,
+                                                           const int& N1,
+                                                           const ModuleBase::Vector3<double>& R2,
+                                                           const int& T2,
+                                                           const int& L2,
+                                                           const int& m2,
+                                                           const int& N2)
+{
+    ModuleBase::Vector3<double> origin_point(0.0, 0.0, 0.0);
+    double factor = sqrt(ModuleBase::FOUR_PI / 3.0);
+    const ModuleBase::Vector3<double>& distance = R2 - R1;
+
+    double overlap_o = center2_orb11[T1][T2][L1][N1][L2].at(N2).cal_overlap(origin_point, distance, m1, m2);
+
+    double overlap_x = -1 * factor
+                       * center2_orb21_r[T1][T2][L1][N1][L2].at(N2).cal_overlap(origin_point,
+                                                                                distance,
+                                                                                m1,
+                                                                                1,
+                                                                                m2); // m =  1
+
+    double overlap_y = -1 * factor
+                       * center2_orb21_r[T1][T2][L1][N1][L2].at(N2).cal_overlap(origin_point,
+                                                                                distance,
+                                                                                m1,
+                                                                                2,
+                                                                                m2); // m = -1
+
+    double overlap_z = factor
+                       * center2_orb21_r[T1][T2][L1][N1][L2].at(N2).cal_overlap(origin_point,
+                                                                                distance,
+                                                                                m1,
+                                                                                0,
+                                                                                m2); // m =  0
+
+    ModuleBase::Vector3<double> temp_prp
+        = ModuleBase::Vector3<double>(overlap_x, overlap_y, overlap_z) + R1 * overlap_o;
+
+    return temp_prp;
+}
+
+void cal_r_overlap_R::get_psi_r_beta(const UnitCell& ucell,
+                                     std::vector<std::vector<double>>& nlm,
+                                     const ModuleBase::Vector3<double>& R1,
+                                     const int& T1,
+                                     const int& L1,
+                                     const int& m1,
+                                     const int& N1,
+                                     const ModuleBase::Vector3<double>& R2,
+                                     const int& T2)
+{
+    ModuleBase::Vector3<double> origin_point(0.0, 0.0, 0.0);
+    double factor = sqrt(ModuleBase::FOUR_PI / 3.0);
+    const ModuleBase::Vector3<double>& distance = R2 - R1;
+    const InfoNonlocal& infoNL_ = ucell.infoNL;
+    const int nproj = infoNL_.nproj[T2];
+    nlm.resize(4);
+    if (nproj == 0)
+    {
+        for(int i = 0;i < 4;i++)
+        {
+            nlm[i].resize(1);
+        }
+        return;
+    }
+
+    int natomwfc = 0;
+    for (int ip = 0; ip < nproj; ip++)
+    {
+        const int L2 = infoNL_.Beta[T2].Proj[ip].getL(); // mohan add 2021-05-07
+        natomwfc += 2 * L2 + 1;
+    }
+    for(int i = 0;i < 4;i++)
+    {
+        nlm[i].resize(natomwfc);
+    }
+    int index = 0;
+    for (int ip = 0; ip < nproj; ip++)
+    {
+        const int L2 = infoNL_.Beta[T2].Proj[ip].getL();
+        for (int m2 = 0; m2 < 2 * L2 + 1; m2++)
+        {
+            double overlap_o
+                = center2_orb11_nonlocal[T1][T2][L1][N1].at(ip).cal_overlap(origin_point, distance, m1, m2);
+
+            double overlap_x = -1 * factor
+                               * center2_orb21_r_nonlocal[T1][T2][L1][N1].at(ip).cal_overlap(origin_point,
+                                                                                             distance,
+                                                                                             m1,
+                                                                                             1,
+                                                                                             m2); // m =  1
+
+            double overlap_y = -1 * factor
+                               * center2_orb21_r_nonlocal[T1][T2][L1][N1].at(ip).cal_overlap(origin_point,
+                                                                                             distance,
+                                                                                             m1,
+                                                                                             2,
+                                                                                             m2); // m = -1
+
+            double overlap_z = factor
+                               * center2_orb21_r_nonlocal[T1][T2][L1][N1].at(ip).cal_overlap(origin_point,
+                                                                                             distance,
+                                                                                             m1,
+                                                                                             0,
+                                                                                             m2); // m =  0
+
+            //nlm[index] = ModuleBase::Vector3<double>(overlap_x, overlap_y, overlap_z) + R1 * overlap_o;
+
+            //nlm[index] = ModuleBase::Vector3<double>(overlap_o, overlap_y, overlap_z);// + R1 * overlap_o;
+            nlm[0][index] = overlap_o;
+            nlm[1][index] = overlap_x + (R1 * overlap_o).x;
+            nlm[2][index] = overlap_y + (R1 * overlap_o).y;
+            nlm[3][index] = overlap_z + (R1 * overlap_o).z;
+            index++;
+        }
+    }
+}
+
+
 void cal_r_overlap_R::out_rR(const UnitCell& ucell, const Grid_Driver& gd, const int& istep)
 {
     ModuleBase::TITLE("cal_r_overlap_R", "out_rR");
diff --git a/source/source_io/cal_r_overlap_R.h b/source/source_io/cal_r_overlap_R.h
index f3d39d6a87..de30c0b8f5 100644
--- a/source/source_io/cal_r_overlap_R.h
+++ b/source/source_io/cal_r_overlap_R.h
@@ -33,12 +33,37 @@ class cal_r_overlap_R
     bool binary = false;
 
     void init(const UnitCell& ucell,const Parallel_Orbitals& pv, const LCAO_Orbitals& orb);
+    void init_nonlocal(const UnitCell& ucell,const Parallel_Orbitals& pv, const LCAO_Orbitals& orb);
+    ModuleBase::Vector3<double> get_psi_r_psi(
+      const ModuleBase::Vector3<double>& R1,
+      const int& T1,
+      const int& L1,
+      const int& m1,
+      const int& N1,
+      const ModuleBase::Vector3<double>& R2,
+      const int& T2,
+      const int& L2,
+      const int& m2,
+      const int& N2
+    );
+    void get_psi_r_beta(
+      const UnitCell& ucell,
+      std::vector<std::vector<double>>& nlm,
+      const ModuleBase::Vector3<double>& R1,
+      const int& T1,
+      const int& L1,
+      const int& m1,
+      const int& N1,
+      const ModuleBase::Vector3<double>& R2,
+      const int& T2
+    );
     void out_rR(const UnitCell& ucell, const Grid_Driver& gd, const int& istep);
     void out_rR_other(const UnitCell& ucell, const int& istep, const std::set<Abfs::Vector3_Order<int>>& output_R_coor);
 
   private:
     void initialize_orb_table(const UnitCell& ucell, const LCAO_Orbitals& orb);
     void construct_orbs_and_orb_r(const UnitCell& ucell,const LCAO_Orbitals& orb);
+    void construct_orbs_and_nonlocal_and_orb_r(const UnitCell& ucell,const LCAO_Orbitals& orb);
 
     std::vector<int> iw2ia;
     std::vector<int> iw2iL;
@@ -51,6 +76,7 @@ class cal_r_overlap_R
 
     Numerical_Orbital_Lm orb_r;
     std::vector<std::vector<std::vector<Numerical_Orbital_Lm>>> orbs;
+    std::vector<std::vector<Numerical_Orbital_Lm>> orbs_nonlocal;
 
     std::map<
         size_t,
@@ -62,6 +88,16 @@ class cal_r_overlap_R
         std::map<size_t, std::map<size_t, std::map<size_t, std::map<size_t, std::map<size_t, Center2_Orb::Orb21>>>>>>
         center2_orb21_r;
 
+    std::map<
+        size_t,
+        std::map<size_t, std::map<size_t, std::map<size_t, std::map<size_t, Center2_Orb::Orb11>>>>>
+        center2_orb11_nonlocal;
+
+    std::map<
+        size_t,
+        std::map<size_t, std::map<size_t, std::map<size_t, std::map<size_t, Center2_Orb::Orb21>>>>>
+        center2_orb21_r_nonlocal;
+
     const Parallel_Orbitals* ParaV;
 };
 #endif
diff --git a/source/source_io/input_conv.cpp b/source/source_io/input_conv.cpp
index 4f4e6cbfd1..240ff80894 100644
--- a/source/source_io/input_conv.cpp
+++ b/source/source_io/input_conv.cpp
@@ -24,7 +24,7 @@
 #include "source_estate/module_pot/H_TDDFT_pw.h"
 #include "source_lcao/hamilt_lcaodft/FORCE_STRESS.h"
 #include "source_lcao/module_tddft/evolve_elec.h"
-#include "source_lcao/module_tddft/td_velocity.h"
+#include "source_lcao/module_tddft/td_info.h"
 #endif
 #ifdef __PEXSI
 #include "source_hsolver/module_pexsi/pexsi_solver.h"
@@ -57,24 +57,25 @@ std::vector<double> Input_Conv::convert_units(std::string params, double c) {
 void Input_Conv::read_td_efield()
 {
     elecstate::H_TDDFT_pw::stype = PARAM.inp.td_stype;
-    if (PARAM.inp.esolver_type == "tddft" && elecstate::H_TDDFT_pw::stype == 1)
-    {
-        TD_Velocity::tddft_velocity = true;
-    } else {
-        TD_Velocity::tddft_velocity = false;
-    }
     if (PARAM.inp.out_mat_hs2 == 1)
     {
-        TD_Velocity::out_mat_R = true;
+        TD_info::out_mat_R = true;
     } else {
-        TD_Velocity::out_mat_R = false;
+        TD_info::out_mat_R = false;
     }
     parse_expression(PARAM.inp.td_ttype, elecstate::H_TDDFT_pw::ttype);
 
     elecstate::H_TDDFT_pw::tstart = PARAM.inp.td_tstart;
     elecstate::H_TDDFT_pw::tend = PARAM.inp.td_tend;
+    if(PARAM.inp.td_dt!=-1.0)
+    {
+        elecstate::H_TDDFT_pw::dt = PARAM.inp.td_dt / ModuleBase::AU_to_FS;
+    }
+    else
+    {
+        elecstate::H_TDDFT_pw::dt = PARAM.mdp.md_dt / PARAM.inp.estep_per_md / ModuleBase::AU_to_FS;
+    }
 
-    elecstate::H_TDDFT_pw::dt = PARAM.mdp.md_dt / ModuleBase::AU_to_FS;
     elecstate::H_TDDFT_pw::dt_int = elecstate::H_TDDFT_pw::dt;
 
     // space domain parameters
@@ -247,10 +248,10 @@ void Input_Conv::Convert()
 // Fuxiang He add 2016-10-26
 //----------------------------------------------------------
 #ifdef __LCAO
-    TD_Velocity::out_current = PARAM.inp.out_current;
-    TD_Velocity::out_current_k = PARAM.inp.out_current_k;
-    TD_Velocity::out_vecpot = PARAM.inp.out_vecpot;
-    TD_Velocity::init_vecpot_file = PARAM.inp.init_vecpot_file;
+    TD_info::out_current = PARAM.inp.out_current;
+    TD_info::out_current_k = PARAM.inp.out_current_k;
+    TD_info::out_vecpot = PARAM.inp.out_vecpot;
+    TD_info::init_vecpot_file = PARAM.inp.init_vecpot_file;
     read_td_efield();
 #endif // __LCAO
 
diff --git a/source/source_io/output_log.cpp b/source/source_io/output_log.cpp
index 52f924b36d..bc202aec03 100644
--- a/source/source_io/output_log.cpp
+++ b/source/source_io/output_log.cpp
@@ -346,5 +346,17 @@ void write_head(std::ofstream& ofs, const int& istep, const int& iter, const std
 //    ofs << "\n " << basisname << " ALGORITHM --------------- ION=" << std::setw(4) << istep + 1
 //                << "  ELEC=" << std::setw(4) << iter << "--------------------------------\n";
 }
+void write_head_td(std::ofstream& ofs, const int& istep, const int& estep, const int& iter, const std::string& basisname)
+{
+    ofs << "\n";
+    ofs << " ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<< std::endl;
+    ofs << " --> IONIC RELAXATION STEP=" << std::setw(6) << istep+1
+        << " ELECTRON PROPAGATION STEP=" << std::setw(6) << estep
+        << "  ELECTRONIC ITERATION STEP=" << std::setw(6) << iter << "\n"; 
+    ofs << " ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<< std::endl;
+
+//    ofs << "\n " << basisname << " ALGORITHM --------------- ION=" << std::setw(4) << istep + 1
+//                << "  ELEC=" << std::setw(4) << estep << "  iter=" << std::setw(4)  << iter << "--------------------------------\n";
+}
 
 }// namespace ModuleIO
diff --git a/source/source_io/output_log.h b/source/source_io/output_log.h
index d1b476fd0f..7eba64b0c4 100644
--- a/source/source_io/output_log.h
+++ b/source/source_io/output_log.h
@@ -77,6 +77,14 @@ void print_stress(const std::string& name,
 /// @param basisname basis set name
 void write_head(std::ofstream& ofs_running, const int& istep, const int& iter, const std::string& basisname);
 
+/// @brief write head for scf iteration
+/// @param ofs_running output stream
+/// @param istep the ion step
+/// @param estep the electron step
+/// @param iter the scf iteration step
+/// @param basisname basis set name
+void write_head_td(std::ofstream& ofs_running, const int& istep, const int& estep, const int& iter, const std::string& basisname);
+
 } // namespace ModuleIO
 
 #endif
diff --git a/source/source_io/read_input.cpp b/source/source_io/read_input.cpp
index e8ee9ef3fd..1bd8520519 100644
--- a/source/source_io/read_input.cpp
+++ b/source/source_io/read_input.cpp
@@ -194,6 +194,7 @@ void ReadInput::create_directory(const Parameter& param)
     ModuleBase::Global_File::make_dir_out(param.input.suffix,
                                           param.input.calculation,
                                           out_dir,
+                                          param.input.out_wfc_lcao,
                                           this->rank,
                                           param.input.mdp.md_restart,
                                           param.input.out_alllog); // xiaohui add 2013-09-01
@@ -372,7 +373,7 @@ void ReadInput::write_txt_input(const Parameter& param, const std::string& filen
         {
             ofs << "\n#Parameters (8.DeepKS)" << std::endl;
         }
-        else if (p_item->label == "td_force_dt")
+        else if (p_item->label == "td_dt")
         {
             ofs << "\n#Parameters (9.rt-tddft)" << std::endl;
         }
diff --git a/source/source_io/read_input_item_md.cpp b/source/source_io/read_input_item_md.cpp
index a49c1bd3e6..4ba795d63a 100644
--- a/source/source_io/read_input_item_md.cpp
+++ b/source/source_io/read_input_item_md.cpp
@@ -23,7 +23,7 @@ void ReadInput::item_md()
         Input_Item item("md_nstep");
         item.annotation = "md steps";
         item.reset_value = [](const Input_Item& item, Parameter& para) {
-            if (para.input.mdp.md_nstep == 0)
+            if (para.input.mdp.md_nstep == 0 && para.input.esolver_type != "tddft")
             {
                 GlobalV::ofs_running << "md_nstep should be set. Autoset md_nstep to 50!" << std::endl;
                 para.input.mdp.md_nstep = 50;
@@ -40,6 +40,13 @@ void ReadInput::item_md()
                 ModuleBase::WARNING_QUIT("ReadInput", "time interval of MD calculation should be positive");
 }
         };
+        item.reset_value = [](const Input_Item& item, Parameter& para) {
+            if (para.input.td_dt != -1.0)
+            {
+                GlobalV::ofs_running << "td_dt exist, set md_dt with td_dt" << std::endl;
+                para.input.mdp.md_dt = para.input.td_dt * para.input.estep_per_md;
+            }
+        };
         read_sync_double(input.mdp.md_dt);
         this->add_item(item);
     }
@@ -403,4 +410,4 @@ void ReadInput::item_md()
         this->add_item(item);
     }
 }
-} // namespace ModuleIO
\ No newline at end of file
+} // namespace ModuleIO
diff --git a/source/source_io/read_input_item_system.cpp b/source/source_io/read_input_item_system.cpp
index d754f98465..ed8cd2f778 100644
--- a/source/source_io/read_input_item_system.cpp
+++ b/source/source_io/read_input_item_system.cpp
@@ -158,6 +158,10 @@ void ReadInput::item_system()
             {
                 para.input.symmetry = "0";
             }
+            if (para.input.esolver_type == "tddft")
+            {
+                para.input.symmetry = "-1";
+            }
             if (para.input.qo_switch)
             {
                 para.input.symmetry = "-1"; // disable kpoint reduce
diff --git a/source/source_io/read_input_item_tddft.cpp b/source/source_io/read_input_item_tddft.cpp
index 5a3ffe3214..0e44754db1 100644
--- a/source/source_io/read_input_item_tddft.cpp
+++ b/source/source_io/read_input_item_tddft.cpp
@@ -9,9 +9,28 @@ void ReadInput::item_rt_tddft()
 { 
     // real time TDDFT
     {
-        Input_Item item("td_force_dt");
-        item.annotation = "time of force change";
-        read_sync_double(input.td_force_dt);
+        Input_Item item("td_dt");
+        item.annotation = "time step for evolving wavefunction";
+        item.reset_value = [](const Input_Item& item, Parameter& para) {
+            if (para.input.td_dt == -1.0)
+            {
+                GlobalV::ofs_running << "td_dt don't exist, set td_dt with md_dt" << std::endl;
+                para.input.td_dt = para.input.mdp.md_dt / para.input.estep_per_md;
+            }
+        };
+        read_sync_double(input.td_dt); 
+        this->add_item(item);
+    }
+    {
+        Input_Item item("estep_per_md");
+        item.annotation = "steps of force change";
+        read_sync_int(input.estep_per_md);
+        this->add_item(item);
+    }
+    {
+        Input_Item item("td_dt");
+        item.annotation = "time step of propagation";
+        read_sync_double(input.td_dt);
         this->add_item(item);
     }
     {
@@ -388,4 +407,4 @@ void ReadInput::item_lr_tddft()
         this->add_item(item);
     }
 }
-}
\ No newline at end of file
+}
diff --git a/source/source_io/read_set_globalv.cpp b/source/source_io/read_set_globalv.cpp
index be49b0a514..02fb9c8fa0 100644
--- a/source/source_io/read_set_globalv.cpp
+++ b/source/source_io/read_set_globalv.cpp
@@ -93,6 +93,11 @@ void ReadInput::set_global_dir(const Input_para& inp, System_para& sys)
     sys.global_matrix_dir = sys.global_out_dir + "matrix/";
     sys.global_matrix_dir = to_dir(sys.global_matrix_dir);
 
+    /// get the global output directory
+    sys.global_wfc_dir = sys.global_out_dir + "WFC/";
+    sys.global_wfc_dir = to_dir(sys.global_wfc_dir);
+
+
     /// get the global ML KEDF descriptor directory
     sys.global_mlkedf_descriptor_dir = sys.global_out_dir + "MLKEDF_Descriptors/";
     sys.global_mlkedf_descriptor_dir = to_dir(sys.global_mlkedf_descriptor_dir);
@@ -144,6 +149,7 @@ void ReadInput::set_global_dir(const Input_para& inp, System_para& sys)
     Parallel_Common::bcast_string(sys.global_readin_dir);
     Parallel_Common::bcast_string(sys.global_stru_dir);
     Parallel_Common::bcast_string(sys.global_matrix_dir);
+    Parallel_Common::bcast_string(sys.global_wfc_dir);
     Parallel_Common::bcast_string(sys.global_in_stru);
 #endif
 }
diff --git a/source/source_io/read_wfc_nao.cpp b/source/source_io/read_wfc_nao.cpp
index 0a7a62f5c2..b330213d1d 100644
--- a/source/source_io/read_wfc_nao.cpp
+++ b/source/source_io/read_wfc_nao.cpp
@@ -30,6 +30,7 @@ bool ModuleIO::read_wfc_nao(
 	const std::vector<int> &ik2iktot,
 	const int nkstot,
 	const int nspin,
+    const int nstep,
     const int skip_band)
 {
     ModuleBase::TITLE("ModuleIO", "read_wfc_nao");
@@ -155,11 +156,10 @@ bool ModuleIO::read_wfc_nao(
         if (myrank == 0)
         {
             const bool out_app_flag = false;
-            const int istep = -1;
             std::stringstream error_message;
 
-            std::string ss = ModuleIO::filename_output(global_readin_dir,"wf","nao",
-                    ik,ik2iktot,nspin,nkstot,out_type,out_app_flag,gamma_only,istep);
+            std::string ss = ModuleIO::filename_output(global_readin_dir + "WFC/","wf","nao",
+                    ik,ik2iktot,nspin,nkstot,out_type,out_app_flag,gamma_only,nstep);
 
             read_success = read_one_file(ss, error_message, ik, ctot);
             errors = error_message.str();
@@ -207,6 +207,7 @@ template bool ModuleIO::read_wfc_nao<double>(const std::string& global_readin_di
 	const std::vector<int> &ik2iktot,
 	const int nkstot,
 	const int nspin,
+    const int nstep,
     const int skip_band);
 
 template bool ModuleIO::read_wfc_nao<std::complex<double>>(const std::string& global_readin_dir,
@@ -216,4 +217,5 @@ template bool ModuleIO::read_wfc_nao<std::complex<double>>(const std::string& gl
 	const std::vector<int> &ik2iktot,
 	const int nkstot,
 	const int nspin,
+    const int nstep,
 	const int skip_band);
diff --git a/source/source_io/read_wfc_nao.h b/source/source_io/read_wfc_nao.h
index 370071f04b..e0993d2482 100644
--- a/source/source_io/read_wfc_nao.h
+++ b/source/source_io/read_wfc_nao.h
@@ -44,6 +44,7 @@ bool read_wfc_nao(
 	const std::vector<int> &ik2iktot,
 	const int nkstot,
 	const int nspin,
+    const int nstep = -1,
     const int skip_band = 0);
 
 } // namespace ModuleIO
diff --git a/source/source_io/td_current_io.cpp b/source/source_io/td_current_io.cpp
index c9c1f6e9aa..23c7f5a163 100644
--- a/source/source_io/td_current_io.cpp
+++ b/source/source_io/td_current_io.cpp
@@ -10,30 +10,210 @@
 #include "source_estate/module_dm/cal_dm_psi.h"
 #include "source_estate/module_pot/H_TDDFT_pw.h"
 #include "source_lcao/hamilt_lcaodft/LCAO_domain.h"
-#include "source_lcao/module_tddft/td_current.h"
-#include "source_lcao/module_tddft/td_velocity.h"
+#include "source_lcao/module_tddft/td_info.h"
 #include "source_pw/hamilt_pwdft/global.h"
 #include "module_parameter/parameter.h"
 
 #ifdef __LCAO
+void ModuleIO::cal_tmp_DM(const UnitCell& ucell,
+                        elecstate::DensityMatrix<std::complex<double>, double>& DM_real,
+                        elecstate::DensityMatrix<std::complex<double>, double>& DM_imag,
+                        int nspin_dm)
+{
+    ModuleBase::TITLE("ModuleIO", "cal_tmp_DM");
+    ModuleBase::timer::tick("ModuleIO", "cal_tmp_DM");
+    for (int is = 1; is <= nspin_dm; ++is)
+    {
+        for (int ik = 0; ik < DM_real.get_DMK_nks() / nspin_dm; ++ik)
+        {
+            cal_tmp_DM_k(ucell, DM_real, DM_imag, ik, nspin_dm, is, false);
+        }
+    }
+    ModuleBase::timer::tick("ModuleIO", "cal_tmp_DM");
+}
+template <typename TR>
+void ModuleIO::write_current(const UnitCell& ucell,
+                             const int istep,
+                             const psi::Psi<std::complex<double>>* psi,
+                             const elecstate::ElecState* pelec,
+                             const K_Vectors& kv,
+                             const TwoCenterIntegrator* intor,
+                             const Parallel_Orbitals* pv,
+                             const LCAO_Orbitals& orb,
+                             const Velocity_op<TR>* cal_current,
+                             Record_adj& ra)
+{
+
+    ModuleBase::TITLE("ModuleIO", "write_current");
+    ModuleBase::timer::tick("ModuleIO", "write_current");
+    std::vector<hamilt::HContainer<std::complex<double>>*> current_term = {nullptr, nullptr, nullptr};
+    if (PARAM.inp.td_stype!=1)
+    {
+        for (int dir = 0; dir < 3; dir++)
+        {
+            current_term[dir] = cal_current->get_current_term_pointer(dir);
+        }
+    }
+    else
+    {
+        if (TD_info::td_vel_op == nullptr)
+        {
+            ModuleBase::WARNING_QUIT("ModuleIO::write_current", "velocity gague infos is null!");
+        }
+        for (int dir = 0; dir < 3; dir++)
+        {
+            current_term[dir] = TD_info::td_vel_op->get_current_term_pointer(dir);
+        }
+    }
+    double omega=ucell.omega;
+    // construct a DensityMatrix object
+    // Since the function cal_dm_psi do not suport DMR in complex type, I replace it with two DMR in double type. Should
+    // be refactored in the future.
+    const int nspin0 = PARAM.inp.nspin;
+    const int nspin_dm = std::map<int, int>({ {1,1},{2,2},{4,1} })[nspin0];
+    elecstate::DensityMatrix<std::complex<double>, double> DM_real(pv, nspin_dm, kv.kvec_d, kv.get_nks() / nspin_dm);
+    elecstate::DensityMatrix<std::complex<double>, double> DM_imag(pv, nspin_dm, kv.kvec_d, kv.get_nks() / nspin_dm);
+    // calculate DMK
+    elecstate::cal_dm_psi(DM_real.get_paraV_pointer(), pelec->wg, psi[0], DM_real);
+
+    // init DMR
+    DM_real.init_DMR(ra, &ucell);
+    DM_imag.init_DMR(ra, &ucell);
+    cal_tmp_DM(ucell, DM_real, DM_imag, nspin_dm);
+    //DM_real.sum_DMR_spin();
+    //DM_imag.sum_DMR_spin();
+
+    double current_total[3] = {0.0, 0.0, 0.0};
+#ifdef _OPENMP
+#pragma omp parallel
+    {
+        double local_current[3] = {0.0, 0.0, 0.0};
+#else
+        // ModuleBase::matrix& local_soverlap = soverlap;
+        double* local_current = current_total;
+#endif
+        ModuleBase::Vector3<double> tau1, dtau, tau2;
+
+#ifdef _OPENMP
+#pragma omp for schedule(dynamic)
+#endif
+        for (int iat = 0; iat < ucell.nat; iat++)
+        {
+            const int T1 = ucell.iat2it[iat];
+            Atom* atom1 = &ucell.atoms[T1];
+            const int I1 = ucell.iat2ia[iat];
+            // get iat1
+            int iat1 = ucell.itia2iat(T1, I1);
+            const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
+            for (int cb = 0; cb < ra.na_each[iat]; ++cb)
+            {
+                const int T2 = ra.info[iat][cb][3];
+                const int I2 = ra.info[iat][cb][4];
+
+                const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
+
+                Atom* atom2 = &ucell.atoms[T2];
+
+                // get iat2
+                int iat2 = ucell.itia2iat(T2, I2);
+                double Rx = ra.info[iat][cb][0];
+                double Ry = ra.info[iat][cb][1];
+                double Rz = ra.info[iat][cb][2];
+                //std::cout<< "iat1: " << iat1 << " iat2: " << iat2 << " Rx: " << Rx << " Ry: " << Ry << " Rz:" << Rz << std::endl;
+                //  get BaseMatrix
+                hamilt::BaseMatrix<double>* tmp_matrix_real
+                    = DM_real.get_DMR_pointer(1)->find_matrix(iat1, iat2, Rx, Ry, Rz);
+                hamilt::BaseMatrix<double>* tmp_matrix_imag
+                    = DM_imag.get_DMR_pointer(1)->find_matrix(iat1, iat2, Rx, Ry, Rz);
+                // refactor
+                hamilt::BaseMatrix<std::complex<double>>* tmp_m_rvx
+                    = current_term[0]->find_matrix(iat1, iat2, Rx, Ry, Rz);
+                hamilt::BaseMatrix<std::complex<double>>* tmp_m_rvy
+                    = current_term[1]->find_matrix(iat1, iat2, Rx, Ry, Rz);
+                hamilt::BaseMatrix<std::complex<double>>* tmp_m_rvz
+                    = current_term[2]->find_matrix(iat1, iat2, Rx, Ry, Rz);
+                if (tmp_matrix_real == nullptr)
+                {
+                    continue;
+                }
+                int row_ap = pv->atom_begin_row[iat1];
+                int col_ap = pv->atom_begin_col[iat2];
+                // get DMR
+                for (int mu = 0; mu < pv->get_row_size(iat1); ++mu)
+                {
+                    for (int nu = 0; nu < pv->get_col_size(iat2); ++nu)
+                    {
+                        double dm2d1_real = tmp_matrix_real->get_value(mu, nu);
+                        double dm2d1_imag = tmp_matrix_imag->get_value(mu, nu);
+
+                        std::complex<double> rvx = {0, 0};
+                        std::complex<double> rvy = {0, 0};
+                        std::complex<double> rvz = {0, 0};
+
+                        if (tmp_m_rvx != nullptr)
+                        {
+                            rvx = tmp_m_rvx->get_value(mu, nu);
+                            rvy = tmp_m_rvy->get_value(mu, nu);
+                            rvz = tmp_m_rvz->get_value(mu, nu);
+                        }
+                        //std::cout<<"mu: "<< mu <<" nu: "<< nu << std::endl;
+                        // std::cout<<"dm2d1_real: "<< dm2d1_real << " dm2d1_imag: "<< dm2d1_imag << std::endl;
+                        //std::cout<<"rvz: "<< rvz.real() << " " << rvz.imag() << std::endl;
+                        local_current[0] -= dm2d1_real * rvx.real() - dm2d1_imag * rvx.imag();    
+                        local_current[1] -= dm2d1_real * rvy.real() - dm2d1_imag * rvy.imag();
+                        local_current[2] -= dm2d1_real * rvz.real() - dm2d1_imag * rvz.imag();
+                    } // end kk
+                } // end jj
+            } // end cb
+        } // end iat
+#ifdef _OPENMP
+#pragma omp critical(cal_current_k_reduce)
+        {
+            for (int i = 0; i < 3; ++i)
+            {
+                current_total[i] += local_current[i];
+            }
+        }
+    }
+#endif
+    Parallel_Reduce::reduce_all(current_total, 3);
+    // write end
+    if (GlobalV::MY_RANK == 0)
+    {
+        std::string filename = PARAM.globalv.global_out_dir + "current_total.dat";
+        std::ofstream fout;
+        fout.open(filename, std::ios::app);
+        fout << std::setprecision(16);
+        fout << std::scientific;
+        fout << istep << " " << current_total[0]/omega << " " << current_total[1]/omega << " " << current_total[2]/omega << std::endl;
+        fout.close();
+    }
 
-void ModuleIO::cal_tmp_DM(elecstate::DensityMatrix<std::complex<double>, double>& DM_real,
+    ModuleBase::timer::tick("ModuleIO", "write_current");
+    return;
+}
+void ModuleIO::cal_tmp_DM_k(const UnitCell& ucell,
+                          elecstate::DensityMatrix<std::complex<double>, double>& DM_real,
                           elecstate::DensityMatrix<std::complex<double>, double>& DM_imag,
                           const int ik,
                           const int nspin,
-                          const int is)
+                          const int is,
+                          const bool reset)
 {
-    ModuleBase::TITLE("ModuleIO", "cal_tmp_DM");
-    ModuleBase::timer::tick("ModuleIO", "cal_tmp_DM");
+    ModuleBase::TITLE("ModuleIO", "cal_tmp_DM_k");
+    ModuleBase::timer::tick("ModuleIO", "cal_tmp_DM_k");
     int ld_hk = DM_real.get_paraV_pointer()->nrow;
     int ld_hk2 = 2 * ld_hk;
     // tmp for is
     int ik_begin = DM_real.get_DMK_nks() / nspin * (is - 1); // jump nk for spin_down if nspin==2
-
-    hamilt::HContainer<double>* tmp_DMR_real = DM_real.get_DMR_vector()[is - 1];
-    hamilt::HContainer<double>* tmp_DMR_imag = DM_imag.get_DMR_vector()[is - 1];
-    tmp_DMR_real->set_zero();
-    tmp_DMR_imag->set_zero();
+    //sum spin up and down into up
+    hamilt::HContainer<double>* tmp_DMR_real = DM_real.get_DMR_vector()[0];
+    hamilt::HContainer<double>* tmp_DMR_imag = DM_imag.get_DMR_vector()[0];
+    if(reset)
+    {
+        tmp_DMR_real->set_zero();
+        tmp_DMR_imag->set_zero();
+    }
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
@@ -46,6 +226,12 @@ void ModuleIO::cal_tmp_DM(elecstate::DensityMatrix<std::complex<double>, double>
         // get global indexes of whole matrix for each atom in this process
         int row_ap = DM_real.get_paraV_pointer()->atom_begin_row[iat1];
         int col_ap = DM_real.get_paraV_pointer()->atom_begin_col[iat2];
+        // SOC
+        std::vector<std::complex<double>> tmp_DMR;
+        if (PARAM.inp.nspin == 4)
+        {
+            tmp_DMR.resize(tmp_ap_real.get_size());
+        }
         for (int ir = 0; ir < tmp_ap_real.get_R_size(); ++ir)
         {
             const ModuleBase::Vector3<int> r_index = tmp_ap_real.get_R_index(ir);
@@ -61,10 +247,20 @@ void ModuleIO::cal_tmp_DM(elecstate::DensityMatrix<std::complex<double>, double>
             // only ik
             if (PARAM.inp.nspin != 4)
             {
+                double arg_td = 0.0;
+                if(elecstate::H_TDDFT_pw::stype == 2)
+                {
+                    //cal tddft phase for hybrid gague
+                    const int iat1 = tmp_ap_real.get_atom_i();
+                    const int iat2 = tmp_ap_real.get_atom_j();
+                    ModuleBase::Vector3<double> dtau = ucell.cal_dtau(iat1, iat2, r_index);
+                    double& tmp_lat0 = ucell.lat0;
+                    arg_td = TD_info::td_vel_op->cart_At * dtau * tmp_lat0;
+                }
                 // cal k_phase
                 // if TK==std::complex<double>, kphase is e^{ikR}
                 const ModuleBase::Vector3<double> dR(r_index.x, r_index.y, r_index.z);
-                const double arg = (DM_real.get_kvec_d()[ik] * dR) * ModuleBase::TWO_PI;
+                const double arg = (DM_real.get_kvec_d()[ik] * dR) * ModuleBase::TWO_PI + arg_td;
                 double sinp, cosp;
                 ModuleBase::libm::sincos(arg, &sinp, &cosp);
                 std::complex<double> kphase = std::complex<double>(cosp, sinp);
@@ -112,13 +308,97 @@ void ModuleIO::cal_tmp_DM(elecstate::DensityMatrix<std::complex<double>, double>
                     tmp_DMR_imag_pointer += DM_imag.get_paraV_pointer()->get_col_size(iat2);
                 }
             }
+            // treat DMR as pauli matrix when NSPIN=4
+            if (PARAM.inp.nspin == 4)
+            {
+                tmp_DMR.assign(tmp_ap_real.get_size(), std::complex<double>(0.0, 0.0));
+                {
+                    // cal k_phase
+                    // if TK==std::complex<double>, kphase is e^{ikR}
+                    const ModuleBase::Vector3<double> dR(r_index.x, r_index.y, r_index.z);
+                    double arg_td = 0.0;
+                    if(elecstate::H_TDDFT_pw::stype == 2)
+                    {
+                        //new
+                        //cal tddft phase for mixing gague
+                        const int iat1 = tmp_ap_real.get_atom_i();
+                        const int iat2 = tmp_ap_real.get_atom_j();
+                        ModuleBase::Vector3<double> dtau = ucell.cal_dtau(iat1, iat2, r_index);
+                        double& tmp_lat0 = ucell.lat0;
+                        arg_td = TD_info::td_vel_op->cart_At * dtau * tmp_lat0;
+                    }
+                    const double arg = (DM_real.get_kvec_d()[ik] * dR) * ModuleBase::TWO_PI + arg_td;
+                    double sinp, cosp;
+                    ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                    std::complex<double> kphase = std::complex<double>(cosp, sinp);
+                    // set DMR element
+                    std::complex<double>* tmp_DMR_pointer = tmp_DMR.data();
+                    std::complex<double>* tmp_DMK_pointer = DM_real.get_DMK_pointer(ik + ik_begin);;
+                    double* DMK_real_pointer = nullptr;
+                    double* DMK_imag_pointer = nullptr;
+                    // jump DMK to fill DMR
+                    // DMR is row-major, DMK is column-major
+                    tmp_DMK_pointer += col_ap * DM_real.get_paraV_pointer()->nrow + row_ap;
+                    for (int mu = 0; mu < tmp_ap_real.get_row_size(); ++mu)
+                    {
+                        BlasConnector::axpy(tmp_ap_real.get_col_size(),
+                                            kphase,
+                                            tmp_DMK_pointer,
+                                            ld_hk,
+                                            tmp_DMR_pointer,
+                                            1);
+                        tmp_DMK_pointer += 1;
+                        tmp_DMR_pointer += tmp_ap_real.get_col_size();
+                    }
+                }
+                int npol = 2;
+                // step_trace = 0 for NSPIN=1,2; ={0, 1, local_col, local_col+1} for NSPIN=4
+                int step_trace[4];
+                for (int is = 0; is < npol; is++)
+                {
+                    for (int is2 = 0; is2 < npol; is2++)
+                    {
+                        step_trace[is * npol + is2] = tmp_ap_real.get_col_size() * is + is2;
+                    }
+                }
+                std::complex<double> tmp[4];
+                double* target_DMR_real = tmp_matrix_real->get_pointer();
+                double* target_DMR_imag = tmp_matrix_imag->get_pointer();
+                std::complex<double>* tmp_DMR_pointer = tmp_DMR.data();
+                for (int irow = 0; irow < tmp_ap_real.get_row_size(); irow += 2)
+                {
+                    for (int icol = 0; icol < tmp_ap_real.get_col_size(); icol += 2)
+                    {
+                        // catch the 4 spin component value of one orbital pair
+                        tmp[0] = tmp_DMR_pointer[icol + step_trace[0]];
+                        tmp[1] = tmp_DMR_pointer[icol + step_trace[1]];
+                        tmp[2] = tmp_DMR_pointer[icol + step_trace[2]];
+                        tmp[3] = tmp_DMR_pointer[icol + step_trace[3]];
+                        // transfer to Pauli matrix and save the real part
+                        // save them back to the tmp_matrix
+                        target_DMR_real[icol + step_trace[0]] += tmp[0].real() + tmp[3].real();
+                        target_DMR_real[icol + step_trace[1]] += tmp[1].real() + tmp[2].real();
+                        target_DMR_real[icol + step_trace[2]]
+                            += -tmp[1].imag() + tmp[2].imag(); // (i * (rho_updown - rho_downup)).real()
+                        target_DMR_real[icol + step_trace[3]] += tmp[0].real() - tmp[3].real();
+                        //imag part
+                        target_DMR_imag[icol + step_trace[0]] += tmp[0].imag() + tmp[3].imag();
+                        target_DMR_imag[icol + step_trace[1]] += tmp[1].imag() + tmp[2].imag();
+                        target_DMR_imag[icol + step_trace[2]]
+                            += tmp[1].real() - tmp[2].real(); // (i * (rho_updown - rho_downup)).real()
+                        target_DMR_imag[icol + step_trace[3]] += tmp[0].imag() - tmp[3].imag();
+                    }
+                    tmp_DMR_pointer += tmp_ap_real.get_col_size() * 2;
+                    target_DMR_real += tmp_ap_real.get_col_size() * 2;
+                    target_DMR_imag += tmp_ap_real.get_col_size() * 2;
+                }
+            }
         }
     }
-    ModuleBase::timer::tick("ModuleIO", "cal_tmp_DM");
+    ModuleBase::timer::tick("ModuleIO", "cal_tmp_DM_k");
 }
-
-void ModuleIO::write_current(const UnitCell& ucell,
-                             const Grid_Driver& gd,
+template <typename TR>
+void ModuleIO::write_current_eachk(const UnitCell& ucell,
                              const int istep,
                              const psi::Psi<std::complex<double>>* psi,
                              const elecstate::ElecState* pelec,
@@ -126,19 +406,15 @@ void ModuleIO::write_current(const UnitCell& ucell,
                              const TwoCenterIntegrator* intor,
                              const Parallel_Orbitals* pv,
                              const LCAO_Orbitals& orb,
+                             const Velocity_op<TR>* cal_current,
                              Record_adj& ra)
 {
+
     ModuleBase::TITLE("ModuleIO", "write_current");
     ModuleBase::timer::tick("ModuleIO", "write_current");
-
-    TD_current* cal_current = nullptr;
     std::vector<hamilt::HContainer<std::complex<double>>*> current_term = {nullptr, nullptr, nullptr};
-
-    if (!TD_Velocity::tddft_velocity)
+    if (PARAM.inp.td_stype != 1)
     {
-        cal_current = new TD_current(&ucell, &gd, pv, orb, intor);
-        cal_current->calculate_vcomm_r();
-        cal_current->calculate_grad_term();
         for (int dir = 0; dir < 3; dir++)
         {
             current_term[dir] = cal_current->get_current_term_pointer(dir);
@@ -146,16 +422,16 @@ void ModuleIO::write_current(const UnitCell& ucell,
     }
     else
     {
-        if (TD_Velocity::td_vel_op == nullptr)
+        if (TD_info::td_vel_op == nullptr)
         {
-            ModuleBase::WARNING_QUIT("ModuleIO::write_current", "velocity gauge infos is null!");
+            ModuleBase::WARNING_QUIT("ModuleIO::write_current", "velocity gague infos is null!");
         }
         for (int dir = 0; dir < 3; dir++)
         {
-            current_term[dir] = TD_Velocity::td_vel_op->get_current_term_pointer(dir);
+            current_term[dir] = TD_info::td_vel_op->get_current_term_pointer(dir);
         }
     }
-
+    double omega=ucell.omega;
     // construct a DensityMatrix object
     // Since the function cal_dm_psi do not suport DMR in complex type, 
     // I replace it with two DMR in double type.
@@ -163,10 +439,8 @@ void ModuleIO::write_current(const UnitCell& ucell,
 
     const int nspin0 = PARAM.inp.nspin;
     const int nspin_dm = std::map<int, int>({ {1,1},{2,2},{4,1} })[nspin0];
-
     elecstate::DensityMatrix<std::complex<double>, double> DM_real(pv, nspin_dm, kv.kvec_d, kv.get_nks() / nspin_dm);
     elecstate::DensityMatrix<std::complex<double>, double> DM_imag(pv, nspin_dm, kv.kvec_d, kv.get_nks() / nspin_dm);
-
     // calculate DMK
     elecstate::cal_dm_psi(DM_real.get_paraV_pointer(), pelec->wg, psi[0], DM_real);
 
@@ -174,32 +448,23 @@ void ModuleIO::write_current(const UnitCell& ucell,
     DM_real.init_DMR(ra, &ucell);
     DM_imag.init_DMR(ra, &ucell);
 
-    int nks = DM_real.get_DMK_nks();
-    if (nspin0 == 2)
-    {
-        nks /= 2;
-    }
-
+    int nks = DM_real.get_DMK_nks() / nspin_dm;
     double current_total[3] = {0.0, 0.0, 0.0};
-    for (int is = 1; is <= nspin0; ++is)
+    for (int is = 1; is <= nspin_dm; ++is)
     {
         for (int ik = 0; ik < nks; ++ik)
         {
-            cal_tmp_DM(DM_real, DM_imag, ik, nspin0, is);
+            cal_tmp_DM_k(ucell, DM_real, DM_imag, ik, nspin_dm, is);
             // check later
             double current_ik[3] = {0.0, 0.0, 0.0};
-            int total_irr = 0;
-
 #ifdef _OPENMP
 #pragma omp parallel
             {
                 int num_threads = omp_get_num_threads();
                 double local_current_ik[3] = {0.0, 0.0, 0.0};
-                int local_total_irr = 0;
 #else
             // ModuleBase::matrix& local_soverlap = soverlap;
             double* local_current_ik = current_ik;
-            int& local_total_irr = total_irr;
 #endif
 
                 ModuleBase::Vector3<double> tau1, dtau, tau2;
@@ -214,8 +479,6 @@ void ModuleIO::write_current(const UnitCell& ucell,
                     const int I1 = ucell.iat2ia[iat];
                     // get iat1
                     int iat1 = ucell.itia2iat(T1, I1);
-
-                    int irr = pv->nlocstart[iat];
                     const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
                     for (int cb = 0; cb < ra.na_each[iat]; ++cb)
                     {
@@ -231,13 +494,12 @@ void ModuleIO::write_current(const UnitCell& ucell,
                         double Rx = ra.info[iat][cb][0];
                         double Ry = ra.info[iat][cb][1];
                         double Rz = ra.info[iat][cb][2];
-
+                        //std::cout<< "iat1: " << iat1 << " iat2: " << iat2 << " Rx: " << Rx << " Ry: " << Ry << " Rz:" << Rz << std::endl;
                         //  get BaseMatrix
                         hamilt::BaseMatrix<double>* tmp_matrix_real
                             = DM_real.get_DMR_pointer(is)->find_matrix(iat1, iat2, Rx, Ry, Rz);
                         hamilt::BaseMatrix<double>* tmp_matrix_imag
                             = DM_imag.get_DMR_pointer(is)->find_matrix(iat1, iat2, Rx, Ry, Rz);
-
                         // refactor
                         hamilt::BaseMatrix<std::complex<double>>* tmp_m_rvx
                             = current_term[0]->find_matrix(iat1, iat2, Rx, Ry, Rz);
@@ -245,15 +507,12 @@ void ModuleIO::write_current(const UnitCell& ucell,
                             = current_term[1]->find_matrix(iat1, iat2, Rx, Ry, Rz);
                         hamilt::BaseMatrix<std::complex<double>>* tmp_m_rvz
                             = current_term[2]->find_matrix(iat1, iat2, Rx, Ry, Rz);
-
                         if (tmp_matrix_real == nullptr)
                         {
                             continue;
                         }
-
                         int row_ap = pv->atom_begin_row[iat1];
                         int col_ap = pv->atom_begin_col[iat2];
-
                         // get DMR
                         for (int mu = 0; mu < pv->get_row_size(iat1); ++mu)
                         {
@@ -272,12 +531,12 @@ void ModuleIO::write_current(const UnitCell& ucell,
                                     rvy = tmp_m_rvy->get_value(mu, nu);
                                     rvz = tmp_m_rvz->get_value(mu, nu);
                                 }
-                                local_current_ik[0] -= dm2d1_real * rvx.real() - dm2d1_imag * rvx.imag();
+                                // std::cout<<"mu: "<< mu <<" nu: "<< nu << std::endl;
+                                // std::cout<<"dm2d1_real: "<< dm2d1_real << " dm2d1_imag: "<< dm2d1_imag << std::endl;
+                                // std::cout<<"rvz: "<< rvz.real() << " " << rvz.imag() << std::endl;
+                                local_current_ik[0] -= dm2d1_real * rvx.real() - dm2d1_imag * rvx.imag();    
                                 local_current_ik[1] -= dm2d1_real * rvy.real() - dm2d1_imag * rvy.imag();
                                 local_current_ik[2] -= dm2d1_real * rvz.real() - dm2d1_imag * rvz.imag();
-
-                                ++local_total_irr;
-                                ++irr;
                             } // end kk
                         } // end jj
                     } // end cb
@@ -285,7 +544,6 @@ void ModuleIO::write_current(const UnitCell& ucell,
 #ifdef _OPENMP
 #pragma omp critical(cal_current_k_reduce)
                 {
-                    total_irr += local_total_irr;
                     for (int i = 0; i < 3; ++i)
                     {
                         current_ik[i] += local_current_ik[i];
@@ -298,9 +556,8 @@ void ModuleIO::write_current(const UnitCell& ucell,
             {
                 current_total[i] += current_ik[i];
             }
-
             // MPI_Reduce(local_current_ik, current_ik, 3, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-            if (GlobalV::MY_RANK == 0 && TD_Velocity::out_current_k)
+            if (GlobalV::MY_RANK == 0 && TD_info::out_current_k)
             {
                 std::string filename = PARAM.globalv.global_out_dir + "current_spin" + std::to_string(is) + "_ik"
                                        + std::to_string(ik) + ".dat";
@@ -308,7 +565,7 @@ void ModuleIO::write_current(const UnitCell& ucell,
                 fout.open(filename, std::ios::app);
                 fout << std::setprecision(16);
                 fout << std::scientific;
-                fout << istep << " " << current_ik[0] << " " << current_ik[1] << " " << current_ik[2] << std::endl;
+                fout << istep << " " << current_ik[0]/omega << " " << current_ik[1]/omega << " " << current_ik[2]/omega << std::endl;
                 fout.close();
             }
             // write end
@@ -321,15 +578,57 @@ void ModuleIO::write_current(const UnitCell& ucell,
         fout.open(filename, std::ios::app);
         fout << std::setprecision(16);
         fout << std::scientific;
-        fout << istep << " " << current_total[0] << " " << current_total[1] << " " << current_total[2] << std::endl;
+        fout << istep << " " << current_total[0]/omega << " " << current_total[1]/omega << " " << current_total[2]/omega << std::endl;
         fout.close();
     }
-    if (!TD_Velocity::tddft_velocity)
-    {
-        delete cal_current;
-    }
 
     ModuleBase::timer::tick("ModuleIO", "write_current");
     return;
 }
+template 
+void ModuleIO::write_current_eachk<double>(
+                        const UnitCell& ucell,
+                        const int istep,
+                        const psi::Psi<std::complex<double>>* psi,
+                        const elecstate::ElecState* pelec,
+                        const K_Vectors& kv,
+                        const TwoCenterIntegrator* intor,
+                        const Parallel_Orbitals* pv,
+                        const LCAO_Orbitals& orb,
+                        const Velocity_op<double>* cal_current,
+                        Record_adj& ra);
+template 
+void ModuleIO::write_current_eachk<std::complex<double>>(const UnitCell& ucell,
+                        const int istep,
+                        const psi::Psi<std::complex<double>>* psi,
+                        const elecstate::ElecState* pelec,
+                        const K_Vectors& kv,
+                        const TwoCenterIntegrator* intor,
+                        const Parallel_Orbitals* pv,
+                        const LCAO_Orbitals& orb,
+                        const Velocity_op<std::complex<double>>* cal_current,
+                        Record_adj& ra);
+template 
+void ModuleIO::write_current<double>(const UnitCell& ucell,
+                const int istep,
+                const psi::Psi<std::complex<double>>* psi,
+                const elecstate::ElecState* pelec,
+                const K_Vectors& kv,
+                const TwoCenterIntegrator* intor,
+                const Parallel_Orbitals* pv,
+                const LCAO_Orbitals& orb,
+                const Velocity_op<double>* cal_current,
+                Record_adj& ra);
+template 
+void ModuleIO::write_current<std::complex<double>>(const UnitCell& ucell,
+                const int istep,
+                const psi::Psi<std::complex<double>>* psi,
+                const elecstate::ElecState* pelec,
+                const K_Vectors& kv,
+                const TwoCenterIntegrator* intor,
+                const Parallel_Orbitals* pv,
+                const LCAO_Orbitals& orb,
+                const Velocity_op<std::complex<double>>* cal_current,
+                Record_adj& ra);
 #endif //__LCAO
+
diff --git a/source/source_io/td_current_io.h b/source/source_io/td_current_io.h
index 5f1424c3b8..9fd079fd42 100644
--- a/source/source_io/td_current_io.h
+++ b/source/source_io/td_current_io.h
@@ -5,30 +5,49 @@
 #include "source_estate/elecstate_lcao.h"
 #include "source_estate/module_dm/density_matrix.h"
 #include "source_psi/psi.h"
+#include "source_lcao/module_tddft/velocity_op.h"
 
 namespace ModuleIO
 {
 #ifdef __LCAO
 /// @brief func to output current, only used in tddft
+template <typename TR>
+void write_current_eachk(const UnitCell& ucell,
+                        const int istep,
+                        const psi::Psi<std::complex<double>>* psi,
+                        const elecstate::ElecState* pelec,
+                        const K_Vectors& kv,
+                        const TwoCenterIntegrator* intor,
+                        const Parallel_Orbitals* pv,
+                        const LCAO_Orbitals& orb,
+                        const Velocity_op<TR>* cal_current,
+                        Record_adj& ra);
+template <typename TR>
 void write_current(const UnitCell& ucell,
-                   const Grid_Driver& gd,
-                   const int istep,
-                   const psi::Psi<std::complex<double>>* psi,
-                   const elecstate::ElecState* pelec,
-                   const K_Vectors& kv,
-                   const TwoCenterIntegrator* intor,
-                   const Parallel_Orbitals* pv,
-                   const LCAO_Orbitals& orb,
-                   Record_adj& ra);
+                const int istep,
+                const psi::Psi<std::complex<double>>* psi,
+                const elecstate::ElecState* pelec,
+                const K_Vectors& kv,
+                const TwoCenterIntegrator* intor,
+                const Parallel_Orbitals* pv,
+                const LCAO_Orbitals& orb,
+                const Velocity_op<TR>* cal_current,
+                Record_adj& ra);
 
 /// @brief calculate sum_n[𝜌_(𝑛𝑘,𝜇𝜈)] for current calculation
-void cal_tmp_DM(elecstate::DensityMatrix<std::complex<double>, double>& DM_real,
+void cal_tmp_DM_k(const UnitCell& ucell,
+                elecstate::DensityMatrix<std::complex<double>, double>& DM_real,
                 elecstate::DensityMatrix<std::complex<double>, double>& DM_imag,
                 const int ik,
                 const int nspin,
-                const int is);
+                const int is,
+                const bool reset = true);
+
+void cal_tmp_DM(const UnitCell& ucell,
+                elecstate::DensityMatrix<std::complex<double>, double>& DM_real,
+                elecstate::DensityMatrix<std::complex<double>, double>& DM_imag,
+                const int nspin);
 
 #endif // __LCAO
 } // namespace ModuleIO
-
-#endif
+#endif // W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_IO_TD_CURRENT_IO_H
diff --git a/source/source_io/write_HS_sparse.cpp b/source/source_io/write_HS_sparse.cpp
index 1ccb46c7f5..98ec8a0385 100644
--- a/source/source_io/write_HS_sparse.cpp
+++ b/source/source_io/write_HS_sparse.cpp
@@ -3,7 +3,7 @@
 #include "module_parameter/parameter.h"
 #include "source_base/parallel_reduce.h"
 #include "source_base/timer.h"
-#include "source_lcao/module_tddft/td_velocity.h"
+#include "source_lcao/module_tddft/td_info.h"
 #include "source_pw/hamilt_pwdft/global.h"
 #include "single_R_io.h"
 
@@ -48,12 +48,12 @@ void ModuleIO::save_HSR_sparse(const int& istep,
     for (auto& R_coor: all_R_coor_ptr) {
         if (PARAM.inp.nspin != 4) {
             for (int ispin = 0; ispin < spin_loop; ++ispin) {
-                if (TD_Velocity::tddft_velocity) {
+                if (PARAM.inp.esolver_type == "tddft" && PARAM.inp.td_stype == 1) {
                     auto iter
-                        = TD_Velocity::td_vel_op->HR_sparse_td_vel[ispin].find(
+                        = TD_info::td_vel_op->HR_sparse_td_vel[ispin].find(
                             R_coor);
                     if (iter
-                        != TD_Velocity::td_vel_op->HR_sparse_td_vel[ispin]
+                        != TD_info::td_vel_op->HR_sparse_td_vel[ispin]
                                .end()) {
                         for (auto& row_loop: iter->second) {
                             H_nonzero_num[ispin][count]
@@ -254,9 +254,9 @@ void ModuleIO::save_HSR_sparse(const int& istep,
                 // }
             } else {
                 if (PARAM.inp.nspin != 4) {
-                    if (TD_Velocity::tddft_velocity) {
+                    if (PARAM.inp.esolver_type == "tddft" && PARAM.inp.td_stype == 1) {
                         output_single_R(g1[ispin],
-                                        TD_Velocity::td_vel_op
+                                        TD_info::td_vel_op
                                             ->HR_sparse_td_vel[ispin][R_coor],
                                         sparse_thr,
                                         binary,
diff --git a/source/source_io/write_wfc_nao.cpp b/source/source_io/write_wfc_nao.cpp
index 162a280b13..8297b7e35c 100644
--- a/source/source_io/write_wfc_nao.cpp
+++ b/source/source_io/write_wfc_nao.cpp
@@ -269,7 +269,7 @@ void write_wfc_nao(const int out_type,
 
         if (myid == 0)
         {
-            std::string fn = filename_output(PARAM.globalv.global_out_dir,"wf","nao",ik,ik2iktot,nspin,nkstot,
+            std::string fn = filename_output(PARAM.globalv.global_wfc_dir,"wf","nao",ik,ik2iktot,nspin,nkstot,
               out_type,out_app_flag,gamma_only,istep);
 
             bool append_flag = (istep > 0 && out_app_flag);
diff --git a/source/source_lcao/hamilt_lcaodft/CMakeLists.txt b/source/source_lcao/hamilt_lcaodft/CMakeLists.txt
index 2f1cd951ae..ba00725b73 100644
--- a/source/source_lcao/hamilt_lcaodft/CMakeLists.txt
+++ b/source/source_lcao/hamilt_lcaodft/CMakeLists.txt
@@ -13,6 +13,7 @@ if(ENABLE_LCAO)
         operator_lcao/nonlocal_new.cpp
         operator_lcao/td_ekinetic_lcao.cpp
         operator_lcao/td_nonlocal_lcao.cpp
+        operator_lcao/td_pot_hybrid.cpp
         operator_lcao/dspin_lcao.cpp
         operator_lcao/dftu_lcao.cpp
         pulay_force_stress_center2.cpp
diff --git a/source/source_lcao/hamilt_lcaodft/hamilt_lcao.cpp b/source/source_lcao/hamilt_lcaodft/hamilt_lcao.cpp
index 0a4fd2a802..d1d0568e0f 100644
--- a/source/source_lcao/hamilt_lcaodft/hamilt_lcao.cpp
+++ b/source/source_lcao/hamilt_lcaodft/hamilt_lcao.cpp
@@ -38,6 +38,7 @@
 #include "operator_lcao/overlap_new.h"
 #include "operator_lcao/td_ekinetic_lcao.h"
 #include "operator_lcao/td_nonlocal_lcao.h"
+#include "operator_lcao/td_pot_hybrid.h"
 #include "operator_lcao/veff_lcao.h"
 
 namespace hamilt
@@ -344,12 +345,8 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
         }
 #endif
         // TDDFT_velocity_gauge
-        if (TD_Velocity::tddft_velocity)
+        if (PARAM.inp.esolver_type == "tddft" && PARAM.inp.td_stype == 1)
         {
-            if (!TD_Velocity::init_vecpot_file)
-            {
-                elecstate::H_TDDFT_pw::update_At();
-            }
             Operator<TK>* td_ekinetic = new TDEkinetic<OperatorLCAO<TK, TR>>(this->hsk,
                                                                              this->hR,
                                                                              this->kv,
@@ -359,10 +356,27 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
                                                                              two_center_bundle.overlap_orb.get());
             this->getOperator()->add(td_ekinetic);
 
-            Operator<TK>* td_nonlocal
-                = new TDNonlocal<OperatorLCAO<TK, TR>>(this->hsk, this->kv->kvec_d, this->hR, &ucell, orb, &grid_d);
+            Operator<TK>* td_nonlocal = new TDNonlocal<OperatorLCAO<TK, TR>>(this->hsk,
+                                                                             this->kv->kvec_d,
+                                                                             this->hR,
+                                                                             &ucell,
+                                                                             orb,
+                                                                             &grid_d);
             this->getOperator()->add(td_nonlocal);
         }
+        if (PARAM.inp.esolver_type == "tddft" && PARAM.inp.td_stype == 2)
+        {
+            Operator<TK>* td_pot_hybrid = new TD_pot_hybrid<OperatorLCAO<TK, TR>>(this->hsk,
+                                                                           this->kv,
+                                                                           this->hR,
+                                                                           this->sR,
+                                                                           orb,
+                                                                           &ucell,
+                                                                           orb.cutoffs(),
+                                                                           &grid_d,
+                                                                           two_center_bundle.kinetic_orb.get());
+            this->getOperator()->add(td_pot_hybrid);
+        }
         if (PARAM.inp.dft_plus_u)
         {
             Operator<TK>* dftu = nullptr;
diff --git a/source/source_lcao/hamilt_lcaodft/operator_lcao/CMakeLists.txt b/source/source_lcao/hamilt_lcaodft/operator_lcao/CMakeLists.txt
index 2e1afc328e..0e49a9dc43 100644
--- a/source/source_lcao/hamilt_lcaodft/operator_lcao/CMakeLists.txt
+++ b/source/source_lcao/hamilt_lcaodft/operator_lcao/CMakeLists.txt
@@ -11,6 +11,7 @@ add_library(
     nonlocal_new.cpp
     td_ekinetic_lcao.cpp
     td_nonlocal_lcao.cpp
+    td_pot_hybrid.cpp
     dspin_lcao.cpp
     dftu_lcao.cpp
 )
diff --git a/source/source_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.cpp b/source/source_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.cpp
index 4d119dfad8..44423f28ee 100644
--- a/source/source_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.cpp
+++ b/source/source_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.cpp
@@ -12,6 +12,8 @@
 #include "source_hsolver/diago_elpa_native.h"
 #endif
 
+#include "source_lcao/module_tddft/td_info.h"
+
 namespace hamilt {
 
 template <>
@@ -200,7 +202,7 @@ void OperatorLCAO<TK, TR>::init(const int ik_in) {
 
         break;
     }
-    case calculation_type::lcao_tddft_velocity: {
+    case calculation_type::lcao_tddft_periodic: {
         if (!this->hr_done) {
             // in cal_type=lcao_fixed, HR should be updated by each sub-chain
             // nodes
@@ -240,8 +242,8 @@ void OperatorLCAO<TK, TR>::init(const int ik_in) {
 }
 
 // contributeHk()
-template <typename TK, typename TR>
-void OperatorLCAO<TK, TR>::contributeHk(int ik) {
+template <>
+void OperatorLCAO<double, double>::contributeHk(int ik) {
     ModuleBase::TITLE("OperatorLCAO", "contributeHk");
     ModuleBase::timer::tick("OperatorLCAO", "contributeHk");
     if(ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
@@ -256,6 +258,37 @@ void OperatorLCAO<TK, TR>::contributeHk(int ik) {
     }
     ModuleBase::timer::tick("OperatorLCAO", "contributeHk");
 }
+// contributeHk()
+template <typename TK, typename TR>
+void OperatorLCAO<TK, TR>::contributeHk(int ik) {
+    ModuleBase::TITLE("OperatorLCAO", "contributeHk");
+    ModuleBase::timer::tick("OperatorLCAO", "contributeHk");
+    if(ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
+    {
+        const int nrow = this->hsk->get_pv()->get_row_size();
+        if(PARAM.inp.td_stype == 2)
+        {
+            TD_info::td_vel_op->folding_HR_td(*this->hR, this->hsk->get_hk(), this->kvec_d[ik], nrow, 1);
+        }
+        else
+        {
+            hamilt::folding_HR(*this->hR, this->hsk->get_hk(), this->kvec_d[ik], nrow, 1);
+        }
+    }
+    else
+    {
+        const int ncol = this->hsk->get_pv()->get_col_size();
+        if(PARAM.inp.td_stype == 2)
+        {
+            TD_info::td_vel_op->folding_HR_td(*this->hR, this->hsk->get_hk(), this->kvec_d[ik], ncol, 0);
+        }
+        else
+        {
+            hamilt::folding_HR(*this->hR, this->hsk->get_hk(), this->kvec_d[ik], ncol, 0);
+        }
+    }
+    ModuleBase::timer::tick("OperatorLCAO", "contributeHk");
+}
 
 template class OperatorLCAO<double, double>;
 template class OperatorLCAO<std::complex<double>, double>;
diff --git a/source/source_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp b/source/source_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
index abc6ce95fa..503cc584c4 100644
--- a/source/source_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
+++ b/source/source_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
@@ -7,6 +7,7 @@
 #include "source_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h"
 #include "source_lcao/module_hcontainer/hcontainer_funcs.h"
 #include <vector>
+#include "source_lcao/module_tddft/td_info.h"
 
 template <typename TK, typename TR>
 hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::OverlapNew(HS_Matrix_K<TK>* hsk_in,
@@ -188,11 +189,11 @@ void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
 }
 
 // contributeHk()
-template <typename TK, typename TR>
-void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::contributeHk(int ik)
+template <>
+void hamilt::OverlapNew<hamilt::OperatorLCAO<double, double>>::contributeHk(int ik)
 {
     //! if k vector is not changed, then do nothing and return, only for gamma_only case
-    if (this->kvec_d[ik] == this->kvec_d_old && std::is_same<TK, double>::value)
+    if (this->kvec_d[ik] == this->kvec_d_old)
     {
         return;
     }
@@ -217,7 +218,44 @@ void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::contributeHk(int ik)
 
     ModuleBase::timer::tick("OverlapNew", "contributeHk");
 }
+template <typename TK, typename TR>
+void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::contributeHk(int ik)
+{
+    ModuleBase::TITLE("OverlapNew", "contributeHk");
+    ModuleBase::timer::tick("OverlapNew", "contributeHk");
+    
+    //! set SK to zero and then calculate SK for each k vector
+    this->hsk->set_zero_sk();
+    if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
+    {
+        const int nrow = this->SR->get_atom_pair(0).get_paraV()->get_row_size();
+        if(PARAM.inp.td_stype == 2)
+        {
+            TD_info::td_vel_op->folding_HR_td(*this->SR, this->hsk->get_sk(), this->kvec_d[ik], nrow, 1);
+        }
+        else
+        {
+            hamilt::folding_HR(*this->SR, this->hsk->get_sk(), this->kvec_d[ik], nrow, 1);
+        }
+    }
+    else
+    {
+        const int ncol = this->SR->get_atom_pair(0).get_paraV()->get_col_size();
+        if(PARAM.inp.td_stype == 2)
+        {
+            TD_info::td_vel_op->folding_HR_td(*this->SR, this->hsk->get_sk(), this->kvec_d[ik], ncol, 0);
+        }
+        else
+        {
+            hamilt::folding_HR(*this->SR, this->hsk->get_sk(), this->kvec_d[ik], ncol, 0);
+        }
+    }
+    
+    // update kvec_d_old
+    this->kvec_d_old = this->kvec_d[ik];
 
+    ModuleBase::timer::tick("OverlapNew", "contributeHk");
+}
 template <typename TK, typename TR>
 TK* hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::getSk()
 {
diff --git a/source/source_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp b/source/source_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp
index b83608d64f..320e1a8f60 100644
--- a/source/source_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp
+++ b/source/source_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp
@@ -25,9 +25,8 @@ TDEkinetic<OperatorLCAO<TK, TR>>::TDEkinetic(HS_Matrix_K<TK>* hsk_in,
     : OperatorLCAO<TK, TR>(hsk_in, kv_in->kvec_d, hR_in), orb_cutoff_(orb_cutoff), kv(kv_in), intor_(intor)
 {
     this->ucell = ucell_in;
-    this->cal_type = calculation_type::lcao_tddft_velocity;
+    this->cal_type = calculation_type::lcao_tddft_periodic;
     this->Grid = GridD_in;
-    this->init_td();
     // initialize HR to get adjs info.
     this->initialize_HR(Grid);
 }
@@ -39,28 +38,19 @@ TDEkinetic<OperatorLCAO<TK, TR>>::~TDEkinetic()
     {
         delete this->hR_tmp;
     }
-    TD_Velocity::td_vel_op = nullptr;
+    TD_info::td_vel_op = nullptr;
 }
 
 // term A^2*S
 template <typename TK, typename TR>
-void TDEkinetic<OperatorLCAO<TK, TR>>::td_ekinetic_scalar(std::complex<double>* Hloc,const TR& overlap, int nnr)
-{
-    return;
-}
-
-// term A^2*S
-template <>
-void TDEkinetic<OperatorLCAO<std::complex<double>, double>>::td_ekinetic_scalar(std::complex<double>* Hloc,
-                                                                                const double& overlap,
-                                                                                int nnr)
+void TDEkinetic<OperatorLCAO<TK, TR>>::td_ekinetic_scalar(std::complex<double>* Hloc,
+                                                            const TR& overlap,
+                                                            int nnr)
 {
     // the correction term A^2/2. From Hatree to Ry, it needs to be multiplied by 2.0
-    std::complex<double> tmp = {cart_At.norm2() * overlap, 0};
-    Hloc[nnr] += tmp;
+    Hloc[nnr] += cart_At.norm2() * overlap;
     return;
 }
-
 // term A dot ∇
 template <typename TK, typename TR>
 void TDEkinetic<OperatorLCAO<TK, TR>>::td_ekinetic_grad(std::complex<double>* Hloc,
@@ -106,12 +96,12 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::calculate_HR()
             hamilt::BaseMatrix<std::complex<double>>* tmp = this->hR_tmp->find_matrix(iat1, iat2, R_index2);
             if (tmp != nullptr)
             {
-                if (TD_Velocity::out_current)
+                if (TD_info::out_current)
                 {
                     std::complex<double>* tmp_c[3] = {nullptr, nullptr, nullptr};
                     for (int i = 0; i < 3; i++)
                     {
-                        tmp_c[i] = td_velocity.get_current_term_pointer(i)->find_matrix(iat1, iat2, R_index2)->get_pointer();
+                        tmp_c[i] = TD_info::td_vel_op->get_current_term_pointer(i)->find_matrix(iat1, iat2, R_index2)->get_pointer();
                     }
                     this->cal_HR_IJR(iat1, iat2, paraV, dtau, tmp->get_pointer(), tmp_c);
                 }
@@ -233,19 +223,12 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::cal_HR_IJR(const int& iat1,
 		}
     }
 }
-// init two center integrals and vector potential for td_ekintic term
+//update vector potential for td_ekintic term
 template <typename TK, typename TR>
-void TDEkinetic<OperatorLCAO<TK, TR>>::init_td()
+void TDEkinetic<OperatorLCAO<TK, TR>>::update_td()
 {
-    TD_Velocity::td_vel_op = &td_velocity;
-    // calculate At in cartesian coorinates.
-    td_velocity.cal_cart_At(elecstate::H_TDDFT_pw::At);
-    this->cart_At = td_velocity.cart_At;
-
-    // mohan update 2025-04-20
-    ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "Cartesian vector potential Ax(t)", cart_At[0]);
-    ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "Cartesian vector potential Ay(t)", cart_At[1]);
-    ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "Cartesian vector potential Az(t)", cart_At[2]);
+    //std::cout<<"velocity"<<std::endl;
+    this->cart_At = TD_info::td_vel_op->cart_At;
 }
 
 template <typename TK, typename TR>
@@ -343,7 +326,7 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::contributeHR()
     {
         return;
     }
-    if (!this->hR_tmp_done)
+    if (!this->hR_tmp_done || TD_info::evolve_once)
     {
         const Parallel_Orbitals* paraV = this->hR->get_atom_pair(0).get_paraV();
         // if this Operator is the first node of the sub_chain, then hR_tmp is nullptr
@@ -360,11 +343,13 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::contributeHR()
             static_cast<OperatorLCAO<TK, TR>*>(this->next_sub_op)->set_HR_fixed(this->hR_tmp);
         }
         // initialize current term if needed
-        if (TD_Velocity::out_current)
+        if (TD_info::out_current)
         {
-            td_velocity.initialize_current_term(this->hR_tmp, paraV);
+            TD_info::td_vel_op->initialize_current_term(this->hR_tmp, paraV);
         }
         // calculate the values in hR_tmp
+        this->update_td();
+        this->hR_tmp->set_zero();
         this->calculate_HR();
         this->hR_tmp_done = true;
     }
@@ -376,12 +361,7 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::contributeHR()
 template <typename TK, typename TR>
 void TDEkinetic<OperatorLCAO<TK, TR>>::contributeHk(int ik)
 {
-    return;
-}
-template <>
-void TDEkinetic<OperatorLCAO<std::complex<double>, double>>::contributeHk(int ik)
-{
-    if (TD_Velocity::tddft_velocity == false)
+    if (PARAM.inp.td_stype != 1)
     {
         return;
     }
@@ -392,12 +372,7 @@ void TDEkinetic<OperatorLCAO<std::complex<double>, double>>::contributeHk(int ik
         const Parallel_Orbitals* paraV = this->hR_tmp->get_atom_pair(0).get_paraV();
         // save HR data for output
         int spin_tot = PARAM.inp.nspin;
-
-        if (spin_tot == 4)
-        {
-
-        }
-        else if (!output_hR_done && TD_Velocity::out_mat_R)
+        if (!output_hR_done && TD_info::out_mat_R)
         {
             for (int spin_now = 0; spin_now < spin_tot; spin_now++)
             {
@@ -405,11 +380,10 @@ void TDEkinetic<OperatorLCAO<std::complex<double>, double>>::contributeHk(int ik
                                                  spin_now,
                                                  1e-10,
                                                  *hR_tmp,
-                                                 td_velocity.HR_sparse_td_vel[spin_now]);
+                                                 TD_info::td_vel_op->HR_sparse_td_vel[spin_now]);
             }
             output_hR_done = true;
         }
-
         // folding inside HR to HK
         if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
         {
@@ -426,7 +400,6 @@ void TDEkinetic<OperatorLCAO<std::complex<double>, double>>::contributeHk(int ik
     }
 }
 
-template class TDEkinetic<hamilt::OperatorLCAO<double, double>>;
 template class TDEkinetic<hamilt::OperatorLCAO<std::complex<double>, double>>;
 template class TDEkinetic<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>;
 
diff --git a/source/source_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.h b/source/source_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.h
index 591815aae7..f429328dc1 100644
--- a/source/source_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.h
+++ b/source/source_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.h
@@ -5,7 +5,7 @@
 #include "source_cell/klist.h"
 #include "source_cell/module_neighbor/sltk_grid_driver.h"
 #include "source_lcao/module_hcontainer/hcontainer.h"
-#include "source_lcao/module_tddft/td_velocity.h"
+#include "source_lcao/module_tddft/td_info.h"
 #include "operator_lcao.h"
 #include <vector>
 
@@ -48,8 +48,8 @@ class TDEkinetic<OperatorLCAO<TK,TR>> : public OperatorLCAO<TK, TR>
 
     virtual void contributeHk(int ik) override;
 
-    /// @brief init two center integrals and vector potential for td_ekintic term
-    void init_td();
+    /// @brief update vector potential
+    void update_td();
 
     /**
      * @brief initialize HR, search the nearest neighbor atoms
@@ -83,7 +83,6 @@ class TDEkinetic<OperatorLCAO<TK,TR>> : public OperatorLCAO<TK, TR>
 
     virtual void set_HR_fixed(void*) override;
 
-    TD_Velocity td_velocity;
 
   private:
 
@@ -124,3 +123,4 @@ class TDEkinetic<OperatorLCAO<TK,TR>> : public OperatorLCAO<TK, TR>
 
 } // namespace hamilt
 #endif
+
diff --git a/source/source_lcao/hamilt_lcaodft/operator_lcao/td_nonlocal_lcao.cpp b/source/source_lcao/hamilt_lcaodft/operator_lcao/td_nonlocal_lcao.cpp
index 207fa31f1a..e24b71b8cb 100644
--- a/source/source_lcao/hamilt_lcaodft/operator_lcao/td_nonlocal_lcao.cpp
+++ b/source/source_lcao/hamilt_lcaodft/operator_lcao/td_nonlocal_lcao.cpp
@@ -22,14 +22,13 @@ hamilt::TDNonlocal<hamilt::OperatorLCAO<TK, TR>>::TDNonlocal(HS_Matrix_K<TK>* hs
                                                              const Grid_Driver* GridD_in)
     : hamilt::OperatorLCAO<TK, TR>(hsk_in, kvec_d_in, hR_in), orb_(orb)
 {
-    this->cal_type = calculation_type::lcao_tddft_velocity;
+    this->cal_type = calculation_type::lcao_tddft_periodic;
     this->ucell = ucell_in;
     this->Grid = GridD_in;
 #ifdef __DEBUG
     assert(this->ucell != nullptr);
 #endif
     // initialize HR to get adjs info.
-    this->init_td();
     this->initialize_HR(Grid);
 }
 
@@ -43,10 +42,10 @@ hamilt::TDNonlocal<hamilt::OperatorLCAO<TK, TR>>::~TDNonlocal()
     }
 }
 template <typename TK, typename TR>
-void hamilt::TDNonlocal<hamilt::OperatorLCAO<TK, TR>>::init_td()
+void hamilt::TDNonlocal<hamilt::OperatorLCAO<TK, TR>>::update_td()
 {
     // calculate At in cartesian coorinates.
-    this->cart_At = TD_Velocity::td_vel_op->cart_At;
+    this->cart_At = TD_info::td_vel_op->cart_At;
 }
 // initialize_HR()
 template <typename TK, typename TR>
@@ -130,7 +129,7 @@ void hamilt::TDNonlocal<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
 
     const Parallel_Orbitals* paraV = this->hR_tmp->get_atom_pair(0).get_paraV();
     const int npol = this->ucell->get_npol();
-    const int nlm_dim = TD_Velocity::out_current ? 4 : 1;
+    const int nlm_dim = TD_info::out_current ? 4 : 1;
     // 1. calculate <psi|beta> for each pair of atoms
 
     for (int iat0 = 0; iat0 < this->ucell->nat; iat0++)
@@ -182,7 +181,7 @@ void hamilt::TDNonlocal<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
                                                           tau0 * this->ucell->lat0,
                                                           T0,
                                                           cart_At,
-                                                          TD_Velocity::out_current);
+                                                          TD_info::out_current);
                     for (int dir = 0; dir < nlm_dim; dir++)
                     {
                         nlm_tot[ad][dir].insert({all_indexes[iw1l], nlm[dir]});
@@ -245,12 +244,12 @@ void hamilt::TDNonlocal<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
                     // if not found , skip this pair of atoms
                     if (tmp != nullptr)
                     {
-                        if (TD_Velocity::out_current)
+                        if (TD_info::out_current)
                         {
                             std::complex<double>* tmp_c[3] = {nullptr, nullptr, nullptr};
                             for (int i = 0; i < 3; i++)
                             {
-                                tmp_c[i] = TD_Velocity::td_vel_op->get_current_term_pointer(i)
+                                tmp_c[i] = TD_info::td_vel_op->get_current_term_pointer(i)
                                                 ->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2])
                                                 ->get_pointer();
                             }
@@ -295,7 +294,7 @@ void hamilt::TDNonlocal<hamilt::OperatorLCAO<TK, TR>>::cal_HR_IJR(
     std::complex<double>* data_pointer,
     std::complex<double>** data_pointer_c)
 {
-    const int nlm_dim = TD_Velocity::out_current ? 4 : 1;
+    const int nlm_dim = TD_info::out_current ? 4 : 1;
     // npol is the number of polarizations,
     // 1 for non-magnetic (one Hamiltonian matrix only has spin-up or spin-down),
     // 2 for magnetic (one Hamiltonian matrix has both spin-up and spin-down)
@@ -411,7 +410,7 @@ void hamilt::TDNonlocal<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
 
     ModuleBase::timer::tick("TDNonlocal", "contributeHR");
 
-    if (!this->hR_tmp_done)
+    if (!this->hR_tmp_done || TD_info::evolve_once)
     {
         if (this->hR_tmp == nullptr)
         {
@@ -427,8 +426,10 @@ void hamilt::TDNonlocal<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
         }
 
         // calculate the values in hR_tmp
+        this->update_td();
         this->calculate_HR();
         this->hR_tmp_done = true;
+        TD_info::evolve_once = false;
     }
 
     ModuleBase::timer::tick("TDNonlocal", "contributeHR");
@@ -442,35 +443,5 @@ void hamilt::TDNonlocal<hamilt::OperatorLCAO<TK, TR>>::contributeHk(int ik)
     return;
 }
 
-
-template <>
-void hamilt::TDNonlocal<hamilt::OperatorLCAO<std::complex<double>, double>>::contributeHk(int ik)
-{
-    if (TD_Velocity::tddft_velocity == false)
-    {
-        return;
-    }
-    else
-    {
-        ModuleBase::TITLE("TDNonlocal", "contributeHk");
-        ModuleBase::timer::tick("TDNonlocal", "contributeHk");
-
-        // folding inside HR to HK
-        if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
-        {
-            const int nrow = this->hsk->get_pv()->get_row_size();
-            folding_HR(*this->hR_tmp, this->hsk->get_hk(), this->kvec_d[ik], nrow, 1);
-        }
-        else
-        {
-            const int ncol = this->hsk->get_pv()->get_col_size();
-            folding_HR(*this->hR_tmp, this->hsk->get_hk(), this->kvec_d[ik], ncol, 0);
-        }
-
-        ModuleBase::timer::tick("TDNonlocal", "contributeHk");
-    }
-}
-
-template class hamilt::TDNonlocal<hamilt::OperatorLCAO<double, double>>;
 template class hamilt::TDNonlocal<hamilt::OperatorLCAO<std::complex<double>, double>>;
 template class hamilt::TDNonlocal<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>;
diff --git a/source/source_lcao/hamilt_lcaodft/operator_lcao/td_nonlocal_lcao.h b/source/source_lcao/hamilt_lcaodft/operator_lcao/td_nonlocal_lcao.h
index 16af65660d..72c1f76dee 100644
--- a/source/source_lcao/hamilt_lcaodft/operator_lcao/td_nonlocal_lcao.h
+++ b/source/source_lcao/hamilt_lcaodft/operator_lcao/td_nonlocal_lcao.h
@@ -6,7 +6,7 @@
 #include "source_estate/module_pot/H_TDDFT_pw.h"
 #include "source_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h"
 #include "source_lcao/module_hcontainer/hcontainer.h"
-#include "source_lcao/module_tddft/td_velocity.h"
+#include "source_lcao/module_tddft/td_info.h"
 
 #include <unordered_map>
 
@@ -78,7 +78,7 @@ class TDNonlocal<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
      */
     void initialize_HR_tmp(const Parallel_Orbitals* paraV);
     /// @brief init vector potential for td_nonlocal term
-    void init_td();
+    void update_td();
     /**
      * @brief calculate the non-local pseudopotential matrix with specific <I,J,R> atom-pairs
      * nearest neighbor atoms don't need to be calculated again
diff --git a/source/source_lcao/hamilt_lcaodft/operator_lcao/td_pot_hybrid.cpp b/source/source_lcao/hamilt_lcaodft/operator_lcao/td_pot_hybrid.cpp
new file mode 100644
index 0000000000..d45f68d53d
--- /dev/null
+++ b/source/source_lcao/hamilt_lcaodft/operator_lcao/td_pot_hybrid.cpp
@@ -0,0 +1,300 @@
+#include "td_pot_hybrid.h"
+
+#include "source_base/timer.h"
+#include "source_base/tool_title.h"
+#include "source_cell/module_neighbor/sltk_grid_driver.h"
+#include "source_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h"
+#include "source_lcao/module_hcontainer/hcontainer_funcs.h"
+#include "source_pw/hamilt_pwdft/global.h"
+
+// Constructor
+template <typename TK, typename TR>
+cal_r_overlap_R hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<TK, TR>>::r_calculator;
+
+template <typename TK, typename TR>
+hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<TK, TR>>::TD_pot_hybrid(
+    HS_Matrix_K<TK>* hsk_in,
+    const K_Vectors* kv_in,
+    hamilt::HContainer<TR>* hR_in,
+    hamilt::HContainer<TR>* SR_in,
+    const LCAO_Orbitals& orb,
+    const UnitCell* ucell_in,
+    const std::vector<double>& orb_cutoff,
+    const Grid_Driver* GridD_in,
+    const TwoCenterIntegrator* intor)
+    : hamilt::OperatorLCAO<TK, TR>(hsk_in, kv_in->kvec_d, hR_in), SR(SR_in), orb_(orb), orb_cutoff_(orb_cutoff), intor_(intor)
+{
+    this->cal_type = calculation_type::lcao_tddft_periodic;
+    this->ucell = ucell_in;
+#ifdef __DEBUG
+    assert(this->ucell != nullptr);
+    assert(this->hsk != nullptr);
+#endif
+    this->init_td();
+    // initialize HR to allocate sparse Ekinetic matrix memory
+    this->initialize_HR(GridD_in);
+}
+
+// destructor
+template <typename TK, typename TR>
+hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<TK, TR>>::~TD_pot_hybrid()
+{
+    if (this->allocated)
+    {
+        delete this->HR_fixed;
+    }
+    /*if(TD_info::td_vel_op!=nullptr)
+    {
+        TD_info::td_vel_op->hk_hybrid = nullptr;
+    }*/
+}
+
+// initialize_HR()
+template <typename TK, typename TR>
+void hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(const Grid_Driver* GridD)
+{
+    ModuleBase::TITLE("TD_pot_hybrid", "initialize_HR");
+    ModuleBase::timer::tick("TD_pot_hybrid", "initialize_HR");
+
+    auto* paraV = this->hR->get_paraV();// get parallel orbitals from HR
+    // TODO: if paraV is nullptr, AtomPair can not use paraV for constructor, I will repair it in the future.
+
+    for (int iat1 = 0; iat1 < ucell->nat; iat1++)
+    {
+        auto tau1 = ucell->get_tau(iat1);
+        int T1, I1;
+        ucell->iat2iait(iat1, &I1, &T1);
+        AdjacentAtomInfo adjs;
+        GridD->Find_atom(*ucell, tau1, T1, I1, &adjs);
+        std::vector<bool> is_adj(adjs.adj_num + 1, false);
+        for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1)
+        {
+            const int T2 = adjs.ntype[ad1];
+            const int I2 = adjs.natom[ad1];
+            const int iat2 = ucell->itia2iat(T2, I2);
+            if (paraV->get_row_size(iat1) <= 0 || paraV->get_col_size(iat2) <= 0)
+            {
+                continue;
+            }
+            const ModuleBase::Vector3<int>& R_index2 = adjs.box[ad1];
+            // choose the real adjacent atoms
+            // Note: the distance of atoms should less than the cutoff radius,
+            // When equal, the theoretical value of matrix element is zero,
+            // but the calculated value is not zero due to the numerical error, which would lead to result changes.
+            if (this->ucell->cal_dtau(iat1, iat2, R_index2).norm() * this->ucell->lat0
+                < orb_cutoff_[T1] + orb_cutoff_[T2])
+            {
+                is_adj[ad1] = true;
+            }
+        }
+        filter_adjs(is_adj, adjs);
+        this->adjs_all.push_back(adjs);
+        for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
+        {
+            const int T2 = adjs.ntype[ad];
+            const int I2 = adjs.natom[ad];
+            int iat2 = ucell->itia2iat(T2, I2);
+            ModuleBase::Vector3<int>& R_index = adjs.box[ad];
+            hamilt::AtomPair<TR> tmp(iat1, iat2, R_index, paraV);
+            this->hR->insert_pair(tmp);
+        }
+    }
+    // allocate the memory of BaseMatrix in HR, and set the new values to zero
+    this->hR->allocate(nullptr, true);
+
+    ModuleBase::timer::tick("TD_pot_hybrid", "initialize_HR");
+}
+
+template <typename TK, typename TR>
+void hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
+{
+    ModuleBase::TITLE("TD_pot_hybrid", "calculate_HR");
+    if (this->HR_fixed == nullptr || this->HR_fixed->size_atom_pairs() <= 0)
+    {
+        ModuleBase::WARNING_QUIT("hamilt::TD_pot_hybrid::calculate_HR", "HR_fixed is nullptr or empty");
+    }
+    ModuleBase::timer::tick("TD_pot_hybrid", "calculate_HR");
+
+    const Parallel_Orbitals* paraV = this->HR_fixed->get_atom_pair(0).get_paraV();
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for (int iat1 = 0; iat1 < this->ucell->nat; iat1++)
+    {
+        auto tau1 = ucell->get_tau(iat1);
+        int T1, I1;
+        ucell->iat2iait(iat1, &I1, &T1);
+        AdjacentAtomInfo& adjs = this->adjs_all[iat1];
+        for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
+        {
+            const int T2 = adjs.ntype[ad];
+            const int I2 = adjs.natom[ad];
+            const int iat2 = ucell->itia2iat(T2, I2);
+            const ModuleBase::Vector3<int>& R_index2 = adjs.box[ad];
+            ModuleBase::Vector3<double> dtau = this->ucell->cal_dtau(iat1, iat2, R_index2);
+
+            hamilt::BaseMatrix<TR>* tmp = this->HR_fixed->find_matrix(iat1, iat2, R_index2);
+            hamilt::BaseMatrix<TR>* tmp_overlap = this->SR->find_matrix(iat1, iat2, R_index2);
+            if (tmp != nullptr)
+            {
+                this->cal_HR_IJR(iat1, iat2, paraV, dtau, tmp->get_pointer(), tmp_overlap->get_pointer());
+            }
+            else
+            {
+                ModuleBase::WARNING_QUIT("hamilt::TD_pot_hybrid::calculate_HR", "R_index not found in HR");
+            }
+        }
+    }
+
+    ModuleBase::timer::tick("TD_pot_hybrid", "calculate_HR");
+}
+
+// cal_HR_IJR()
+template <typename TK, typename TR>
+void hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<TK, TR>>::cal_HR_IJR(const int& iat1,
+                                                                   const int& iat2,
+                                                                   const Parallel_Orbitals* paraV,
+                                                                   const ModuleBase::Vector3<double>& dtau,
+                                                                   TR* hr_mat_p,
+                                                                   TR* sr_p)
+{
+    // ---------------------------------------------
+    // get info of orbitals of atom1 and atom2 from ucell
+    // ---------------------------------------------
+    int T1, I1;
+    this->ucell->iat2iait(iat1, &I1, &T1);
+    int T2, I2;
+    this->ucell->iat2iait(iat2, &I2, &T2);
+    Atom& atom1 = this->ucell->atoms[T1];
+    Atom& atom2 = this->ucell->atoms[T2];
+
+    // npol is the number of polarizations,
+    // 1 for non-magnetic (one Hamiltonian matrix only has spin-up or spin-down),
+    // 2 for magnetic (one Hamiltonian matrix has both spin-up and spin-down)
+    const int npol = this->ucell->get_npol();
+
+    const int* iw2l1 = atom1.iw2l.data();
+    const int* iw2n1 = atom1.iw2n.data();
+    const int* iw2m1 = atom1.iw2m.data();
+    const int* iw2l2 = atom2.iw2l.data();
+    const int* iw2n2 = atom2.iw2n.data();
+    const int* iw2m2 = atom2.iw2m.data();
+
+    // ---------------------------------------------
+    // calculate the Ekinetic matrix for each pair of orbitals
+    // ---------------------------------------------
+    double olm[3] = {0, 0, 0};
+    auto row_indexes = paraV->get_indexes_row(iat1);
+    auto col_indexes = paraV->get_indexes_col(iat2);
+    const int step_trace = col_indexes.size() + 1;
+
+    const ModuleBase::Vector3<double>& tau1 = this->ucell->get_tau(iat1);
+    const ModuleBase::Vector3<double> tau2 = tau1 + dtau;
+    for (int iw1l = 0; iw1l < row_indexes.size(); iw1l += npol)
+    {
+        const int iw1 = row_indexes[iw1l] / npol;
+        const int L1 = iw2l1[iw1];
+        const int N1 = iw2n1[iw1];
+        const int m1 = iw2m1[iw1];
+
+        // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
+        int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
+
+        for (int iw2l = 0; iw2l < col_indexes.size(); iw2l += npol)
+        {
+            const int iw2 = col_indexes[iw2l] / npol;
+            const int L2 = iw2l2[iw2];
+            const int N2 = iw2n2[iw2];
+            const int m2 = iw2m2[iw2];
+
+            ModuleBase::Vector3<double> tmp_r = r_calculator.get_psi_r_psi(tau1 * this->ucell->lat0, T1, L1, m1, N1, tau2 * this->ucell->lat0, T2, L2, m2, N2);
+            // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
+            int M2 = (m2 % 2 == 0) ? -m2 / 2 : (m2 + 1) / 2;
+
+            for (int ipol = 0; ipol < npol; ipol++)
+            {
+                hr_mat_p[ipol * step_trace] += tmp_r * Et;
+                hr_mat_p[ipol * step_trace] -= ((dtau + tau1) * Et) * sr_p[ipol * step_trace] * this->ucell->lat0;
+            }
+            hr_mat_p += npol;
+            sr_p += npol;
+        }
+        hr_mat_p += (npol - 1) * col_indexes.size();
+        sr_p += (npol - 1) * col_indexes.size();
+    }
+}
+// init two center integrals and vector potential for td_ekintic term
+template <typename TK, typename TR>
+void hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<TK, TR>>::init_td()
+{
+    // initialize the r_calculator
+    if(TD_info::td_vel_op->get_istep()==(TD_info::estep_shift-1))
+    {
+        //std::cout << "init_r_overlap" <<std::endl;
+        r_calculator.init(*ucell, *this->hR->get_paraV(), orb_);
+    }
+    //hk_hybrid.resize(this->hR->get_paraV()->nloc);
+}
+template <typename TK, typename TR>
+void hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<TK, TR>>::update_td()
+{
+    //std::cout<<"hybrid gague" <<std::endl;
+    this->cart_At = TD_info::td_vel_op->cart_At;
+    //std::cout<<"At: "<< TD_info::td_vel_op->cart_At[0] <<" "<<TD_info::td_vel_op->cart_At[1]<<" "<<TD_info::td_vel_op->cart_At[2]<<" "<<std::endl;
+    Et = elecstate::H_TDDFT_pw::Et;
+}
+// set_HR_fixed()
+template <typename TK, typename TR>
+void hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<TK, TR>>::set_HR_fixed(void* HR_fixed_in)
+{
+    this->HR_fixed = static_cast<hamilt::HContainer<TR>*>(HR_fixed_in);
+    this->allocated = false;
+}
+
+// contributeHR()
+template <typename TK, typename TR>
+void hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
+{
+    ModuleBase::TITLE("TD_pot_hybrid", "contributeHR");
+    ModuleBase::timer::tick("TD_pot_hybrid", "contributeHR");
+
+    if (!this->HR_fixed_done || TD_info::evolve_once)
+    {
+        // if this Operator is the first node of the sub_chain, then HR_fixed is nullptr
+        if (this->HR_fixed == nullptr)
+        {
+            this->HR_fixed = new hamilt::HContainer<TR>(*this->hR);
+            this->HR_fixed->set_zero();
+            this->allocated = true;
+        }
+        if (this->next_sub_op != nullptr)
+        {
+            // pass pointer of HR_fixed to the next node
+            static_cast<OperatorLCAO<TK, TR>*>(this->next_sub_op)->set_HR_fixed(this->HR_fixed);
+        }
+        // calculate the values in HR_fixed
+        this->update_td();
+        this->HR_fixed->set_zero();
+        this->calculate_HR();
+        this->HR_fixed_done = true;
+        TD_info::evolve_once = false;
+    }
+    // last node of sub-chain, add HR_fixed into HR
+    if (this->next_sub_op == nullptr)
+    {
+        this->hR->add(*(this->HR_fixed));
+    }
+
+    ModuleBase::timer::tick("TD_pot_hybrid", "contributeHR");
+    return;
+}
+
+//ETD
+// contributeHk()
+template <typename TK, typename TR>
+void hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<TK, TR>>::contributeHk(int ik) {
+    return;
+}
+
+template class hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<std::complex<double>, double>>;
+template class hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>;
diff --git a/source/source_lcao/hamilt_lcaodft/operator_lcao/td_pot_hybrid.h b/source/source_lcao/hamilt_lcaodft/operator_lcao/td_pot_hybrid.h
new file mode 100644
index 0000000000..349b847137
--- /dev/null
+++ b/source/source_lcao/hamilt_lcaodft/operator_lcao/td_pot_hybrid.h
@@ -0,0 +1,129 @@
+#ifndef TD_POT_HYBRID_H
+#define TD_POT_HYBRID_H
+#include "source_basis/module_ao/parallel_orbitals.h"
+#include "source_basis/module_nao/two_center_integrator.h"
+#include "source_cell/klist.h"
+#include "source_cell/module_neighbor/sltk_grid_driver.h"
+#include "source_cell/unitcell.h"
+#include "source_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h"
+#include "source_lcao/module_hcontainer/hcontainer.h"
+#include <vector>
+#include "source_io/cal_r_overlap_R.h"
+#include "source_lcao/module_tddft/td_info.h"
+#include "source_estate/module_pot/H_TDDFT_pw.h"
+
+namespace hamilt
+{
+
+#ifndef __TD_POT_HYBRIDTEMPLATE
+#define __TD_POT_HYBRIDTEMPLATE
+
+/// The EkineticNew class template inherits from class T
+/// it is used to calculate the electronic kinetic
+/// Template parameters:
+/// - T: base class, it would be OperatorLCAO<TK> or OperatorPW<TK>
+/// - TR: data type of real space Hamiltonian, it would be double or std::complex<double>
+template <class T>
+class TD_pot_hybrid : public T
+{
+};
+
+#endif
+
+/// EkineticNew class template specialization for OperatorLCAO<TK> base class
+/// It is used to calculate the electronic kinetic matrix in real space and fold it to k-space
+/// HR = <psi_{mu, 0}|-\Nabla^2|psi_{nu, R}>
+/// HK = <psi_{mu, k}|-\Nabla^2|psi_{nu, k}> = \sum_{R} e^{ikR} HR
+/// Template parameters:
+/// - TK: data type of k-space Hamiltonian
+/// - TR: data type of real space Hamiltonian
+template <typename TK, typename TR>
+class TD_pot_hybrid<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
+{
+  public:
+    /**
+     * @brief Construct a new EkineticNew object
+     */
+    TD_pot_hybrid<OperatorLCAO<TK, TR>>(HS_Matrix_K<TK>* hsk_in,
+                                      const K_Vectors* kv_in,
+                                      HContainer<TR>* hR_in,
+                                      HContainer<TR>* SR_in,
+                                      const LCAO_Orbitals& orb,
+                                      const UnitCell* ucell_in,
+                                      const std::vector<double>& orb_cutoff,
+                                      const Grid_Driver* GridD_in,
+                                      const TwoCenterIntegrator* intor);
+
+    /**
+     * @brief Destroy the EkineticNew object
+     */
+    ~TD_pot_hybrid<OperatorLCAO<TK, TR>>();
+
+    /**
+     * @brief contributeHR() is used to calculate the HR matrix
+     * <phi_{\mu, 0}|-\Nabla^2|phi_{\nu, R}>
+     */
+    virtual void contributeHR() override;
+    //ETD
+    virtual void contributeHk(int ik) override;
+    //ETD
+
+    virtual void set_HR_fixed(void*) override;
+
+
+  private:
+    const UnitCell* ucell = nullptr;
+    std::vector<double> orb_cutoff_;
+    const LCAO_Orbitals& orb_;
+
+    hamilt::HContainer<TR>* HR_fixed = nullptr;
+
+    hamilt::HContainer<TR>* SR = nullptr;
+
+    const TwoCenterIntegrator* intor_ = nullptr;
+
+    bool allocated = false;
+
+    bool HR_fixed_done = false;
+    //tddft part
+    static cal_r_overlap_R r_calculator;
+    //ETD
+    //std::vector<std::complex<double>> hk_hybrid;
+    //ETD
+    /// @brief Store the vector potential for td_ekinetic term
+    ModuleBase::Vector3<double> cart_At;
+    ModuleBase::Vector3<double> Et;
+
+
+    /**
+     * @brief initialize HR, search the nearest neighbor atoms
+     * HContainer is used to store the electronic kinetic matrix with specific <I,J,R> atom-pairs
+     * the size of HR will be fixed after initialization
+     */
+    void initialize_HR(const Grid_Driver* GridD_in);
+
+    void init_td();
+    void update_td();
+
+    /**
+     * @brief calculate the electronic kinetic matrix with specific <I,J,R> atom-pairs
+     * use the adjs_all to calculate the HR matrix
+     */
+    void calculate_HR();
+
+    /**
+     * @brief calculate the HR local matrix of <I,J,R> atom pair
+     */
+    void cal_HR_IJR(const int& iat1,
+                    const int& iat2,
+                    const Parallel_Orbitals* paraV,
+                    const ModuleBase::Vector3<double>& dtau,
+                    TR* hr_mat_p,
+                    TR* sr_p);
+
+    /// @brief exact the nearest neighbor atoms from all adjacent atoms
+    std::vector<AdjacentAtomInfo> adjs_all;
+};
+
+} // namespace hamilt
+#endif
diff --git a/source/source_lcao/hamilt_lcaodft/spar_hsr.cpp b/source/source_lcao/hamilt_lcaodft/spar_hsr.cpp
index 6e64be1c5f..22237a7041 100644
--- a/source/source_lcao/hamilt_lcaodft/spar_hsr.cpp
+++ b/source/source_lcao/hamilt_lcaodft/spar_hsr.cpp
@@ -1,7 +1,7 @@
 #include "spar_hsr.h"
 
 #include "source_lcao/module_hcontainer/hcontainer.h"
-#include "source_lcao/module_tddft/td_velocity.h"
+#include "source_lcao/module_tddft/td_info.h"
 #include "module_parameter/parameter.h"
 #include "spar_dh.h"
 #include "spar_exx.h"
@@ -96,13 +96,13 @@ void sparse_format::cal_HSR(const UnitCell& ucell,
 
         HS_Arrays.all_R_coor = get_R_range(*(p_ham_lcao->getHR()));
 
-        if (TD_Velocity::tddft_velocity)
+        if (PARAM.inp.esolver_type == "tddft" && PARAM.inp.td_stype == 1)
         {
             sparse_format::cal_HContainer_td(pv,
                                              current_spin,
                                              sparse_thr,
                                              *(p_ham_lcao->getHR()),
-                                             TD_Velocity::td_vel_op->HR_sparse_td_vel[current_spin]);
+                                             TD_info::td_vel_op->HR_sparse_td_vel[current_spin]);
         }
         else
         {
@@ -334,9 +334,9 @@ void sparse_format::clear_zero_elements(LCAO_HS_Arrays& HS_Arrays, const int& cu
                 }
             }
         }
-        if (TD_Velocity::tddft_velocity)
+        if (PARAM.inp.esolver_type == "tddft" && PARAM.inp.td_stype == 1)
         {
-            for (auto& R_loop: TD_Velocity::td_vel_op->HR_sparse_td_vel[current_spin])
+            for (auto& R_loop: TD_info::td_vel_op->HR_sparse_td_vel[current_spin])
             {
                 for (auto& row_loop: R_loop.second)
                 {
@@ -451,4 +451,4 @@ void sparse_format::destroy_HS_R_sparse(LCAO_HS_Arrays& HS_Arrays)
     // all_R_coor.swap(empty_all_R_coor);
 
     return;
-}
\ No newline at end of file
+}
diff --git a/source/source_lcao/module_tddft/CMakeLists.txt b/source/source_lcao/module_tddft/CMakeLists.txt
index e81ceb3368..58bc834a5f 100644
--- a/source/source_lcao/module_tddft/CMakeLists.txt
+++ b/source/source_lcao/module_tddft/CMakeLists.txt
@@ -10,9 +10,10 @@ if(ENABLE_LCAO)
         propagator_taylor.cpp
         propagator_etrs.cpp
         upsi.cpp
-        td_velocity.cpp
-        td_current.cpp
+        td_info.cpp
+        velocity_op.cpp
         snap_psibeta_half_tddft.cpp
+        td_folding.cpp
         solve_propagation.cpp
     )
 
diff --git a/source/source_lcao/module_tddft/evolve_elec.h b/source/source_lcao/module_tddft/evolve_elec.h
index 4763129f6e..30e55b859c 100644
--- a/source/source_lcao/module_tddft/evolve_elec.h
+++ b/source/source_lcao/module_tddft/evolve_elec.h
@@ -143,7 +143,8 @@ class Evolve_elec
     friend class ModuleESolver::ESolver_KS_LCAO<std::complex<double>, double>;
 
     // Template parameter is needed for the friend class declaration
-    friend class ModuleESolver::ESolver_KS_LCAO_TDDFT<Device>;
+    friend class ModuleESolver::ESolver_KS_LCAO_TDDFT<double, Device>;
+    friend class ModuleESolver::ESolver_KS_LCAO_TDDFT<std::complex<double>, Device>;
 
   public:
     Evolve_elec();
diff --git a/source/source_lcao/module_tddft/evolve_psi.cpp b/source/source_lcao/module_tddft/evolve_psi.cpp
index 269bd44fa7..8a4b0816e3 100644
--- a/source/source_lcao/module_tddft/evolve_psi.cpp
+++ b/source/source_lcao/module_tddft/evolve_psi.cpp
@@ -76,7 +76,7 @@ void evolve_psi(const int nband,
         /// @brief compute U_operator
         /// @input Stmp, Htmp, print_matrix
         /// @output U_operator
-        Propagator prop(propagator, pv, PARAM.mdp.md_dt);
+        Propagator prop(propagator, pv, PARAM.inp.td_dt);
         prop.compute_propagator(nlocal, Stmp, Htmp, H_laststep, U_operator, ofs_running, print_matrix);
     }
 
@@ -93,7 +93,7 @@ void evolve_psi(const int nband,
         /// @brief solve the propagation equation
         /// @input Stmp, Htmp, psi_k_laststep
         /// @output psi_k
-        solve_propagation(pv, nband, nlocal, PARAM.mdp.md_dt / ModuleBase::AU_to_FS, Stmp, Htmp, psi_k_laststep, psi_k);
+        solve_propagation(pv, nband, nlocal,  PARAM.inp.td_dt, Stmp, Htmp, psi_k_laststep, psi_k);
     }
 
     // (4)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
diff --git a/source/source_lcao/module_tddft/solve_propagation.cpp b/source/source_lcao/module_tddft/solve_propagation.cpp
index f8df6a6c71..5271f2fec3 100644
--- a/source/source_lcao/module_tddft/solve_propagation.cpp
+++ b/source/source_lcao/module_tddft/solve_propagation.cpp
@@ -4,6 +4,7 @@
 
 #include "source_base/lapack_connector.h"
 #include "source_base/scalapack_connector.h"
+#include "source_pw/hamilt_pwdft/global.h"
 
 namespace module_tddft
 {
@@ -25,14 +26,16 @@ void solve_propagation(const Parallel_Orbitals* pv,
     std::complex<double>* operator_B = new std::complex<double>[pv->nloc];
     ModuleBase::GlobalFunc::ZEROS(operator_B, pv->nloc);
     BlasConnector::copy(pv->nloc, Htmp, 1, operator_B, 1);
+
+    const double dt_au = dt / ModuleBase::AU_to_FS;
     
     // ->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
     // (2) compute operator_A & operator_B by GEADD
     // operator_A = Stmp + i*para * Htmp;   beta2 = para = 0.25 * dt
     // operator_B = Stmp - i*para * Htmp;     beta1 = - para = -0.25 * dt
     std::complex<double> alpha = {1.0, 0.0};
-    std::complex<double> beta1 = {0.0, -0.25 * dt};
-    std::complex<double> beta2 = {0.0, 0.25 * dt};
+    std::complex<double> beta1 = {0.0, -0.25 * dt_au};
+    std::complex<double> beta2 = {0.0, 0.25 * dt_au};
 
     ScalapackConnector::geadd('N',
                               nlocal,
diff --git a/source/source_lcao/module_tddft/td_folding.cpp b/source/source_lcao/module_tddft/td_folding.cpp
new file mode 100644
index 0000000000..a190ade46b
--- /dev/null
+++ b/source/source_lcao/module_tddft/td_folding.cpp
@@ -0,0 +1,53 @@
+#include "td_info.h"
+#include "source_base/libm/libm.h"
+#include "source_lcao/module_tddft/td_info.h"
+template<typename TR>
+void TD_info::folding_HR_td(const hamilt::HContainer<TR>& hR,
+                std::complex<double>* hk,
+                const ModuleBase::Vector3<double>& kvec_d_in,
+                const int ncol,
+                const int hk_type)
+{
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for (int i = 0; i < hR.size_atom_pairs(); ++i)
+    {
+        hamilt::AtomPair<TR>& tmp = hR.get_atom_pair(i);
+        for(int ir = 0;ir < tmp.get_R_size(); ++ir )
+        {
+            const ModuleBase::Vector3<int> r_index = tmp.get_R_index(ir);
+
+            //new
+            //cal tddft phase for hybrid gague
+            const int iat1 = tmp.get_atom_i();
+            const int iat2 = tmp.get_atom_j();
+            ModuleBase::Vector3<double> dtau = ucell->cal_dtau(iat1, iat2, r_index);
+            const double arg_td = cart_At * dtau * ucell->lat0;
+            //new
+
+            // cal k_phase
+            // if TK==std::complex<double>, kphase is e^{ikR}
+            const ModuleBase::Vector3<double> dR(r_index.x, r_index.y, r_index.z);
+            const double arg = (kvec_d_in * dR) * ModuleBase::TWO_PI + arg_td;
+            double sinp, cosp;
+            ModuleBase::libm::sincos(arg, &sinp, &cosp);
+            std::complex<double> kphase = std::complex<double>(cosp, sinp);
+
+            tmp.find_R(r_index);
+            tmp.add_to_matrix(hk, ncol, kphase, hk_type);
+        }
+    }
+}
+template
+void TD_info::folding_HR_td<double>(const hamilt::HContainer<double>& hR,
+                std::complex<double>* hk,
+                const ModuleBase::Vector3<double>& kvec_d_in,
+                const int ncol,
+                const int hk_type);
+template
+void TD_info::folding_HR_td<std::complex<double>>(const hamilt::HContainer<std::complex<double>>& hR,
+                std::complex<double>* hk,
+                const ModuleBase::Vector3<double>& kvec_d_in,
+                const int ncol,
+                const int hk_type);
\ No newline at end of file
diff --git a/source/source_lcao/module_tddft/td_info.cpp b/source/source_lcao/module_tddft/td_info.cpp
new file mode 100644
index 0000000000..26e2bab0a5
--- /dev/null
+++ b/source/source_lcao/module_tddft/td_info.cpp
@@ -0,0 +1,227 @@
+#include "td_info.h"
+
+#include "source_estate/module_pot/H_TDDFT_pw.h"
+#include "module_parameter/parameter.h"
+
+bool TD_info::out_mat_R = false;
+bool TD_info::out_vecpot = false;
+bool TD_info::out_current = false;
+bool TD_info::out_current_k = false;
+bool TD_info::init_vecpot_file = false;
+bool TD_info::evolve_once = false;
+
+TD_info* TD_info::td_vel_op = nullptr;
+
+int TD_info::estep_shift = 0;
+int TD_info::istep = -1;
+int TD_info::max_istep = -1;
+std::vector<ModuleBase::Vector3<double>> TD_info::At_from_file;
+
+TD_info::TD_info(const UnitCell* ucell_in)
+{
+    this->ucell = ucell_in;
+    if (init_vecpot_file && istep == -1)
+    {
+        this->read_cart_At();
+    }
+    //read in restart step
+    if(PARAM.inp.mdp.md_restart)
+    {
+        std::stringstream ssc;
+        ssc << PARAM.globalv.global_readin_dir << "Restart_td.dat";
+        std::ifstream file(ssc.str().c_str());
+        if (!file)
+        {
+            ModuleBase::WARNING_QUIT("TD_info::TD_info", "No Restart_td.dat!");
+        }
+        file >> estep_shift;
+        std::cout<<"estep_shift"<<estep_shift<<std::endl;
+    }
+    this->istep += estep_shift;
+    return;
+}
+TD_info::~TD_info()
+{
+    if(elecstate::H_TDDFT_pw::stype == 1)
+    {
+        this->destroy_HS_R_td_sparse();
+    }
+    for (int dir = 0; dir < 3; dir++)
+    {
+        if (this->current_term[dir] != nullptr)
+        {
+            delete this->current_term[dir];
+        }
+    }
+}
+
+void TD_info::output_cart_At(const std::string& out_dir)
+{
+    if (GlobalV::MY_RANK == 0)
+    {
+        std::string out_file;
+        // generate the output file name
+        out_file = out_dir + "At.dat";
+        std::ofstream ofs;
+        // output title
+        if (istep == estep_shift)
+        {
+            ofs.open(out_file.c_str(), std::ofstream::out);
+            ofs << std::left << std::setw(8) << "#istep" << std::setw(15) << "A_x" << std::setw(15) << "A_y"
+                << std::setw(15) << "A_z" << std::endl;
+        }
+        else
+        {
+            ofs.open(out_file.c_str(), std::ofstream::app);
+        }
+        // output the vector potential
+        ofs << std::left << std::setw(8) << istep;
+        // divide by 2.0 to get the atomic unit
+        for (int i = 0; i < 3; i++)
+        {
+            ofs << std::scientific << std::setprecision(4) << std::setw(15) << cart_At[i];
+        }
+        ofs << std::endl;
+        ofs.close();
+    }
+    return;
+}
+
+void TD_info::cal_cart_At(const ModuleBase::Vector3<double>& At)
+{
+    istep++;
+    if (init_vecpot_file)
+    {
+        this->cart_At = At_from_file[istep > max_istep ? max_istep : istep];
+    }
+    else
+    {
+        // transfrom into atomic unit
+        this->cart_At = At / 2.0;
+    }
+    // output the vector potential if needed
+    if (out_vecpot == true)
+    {
+        this->output_cart_At(PARAM.globalv.global_out_dir);
+    }
+}
+
+void TD_info::read_cart_At(void)
+{
+    std::string in_file;
+    // generate the input file name
+    in_file = "At.dat";
+    std::ifstream ifs(in_file.c_str());
+    // check if the file is exist
+    if (!ifs)
+    {
+        ModuleBase::WARNING_QUIT("TD_info::read_cart_At", "Cannot open Vector potential file!");
+    }
+    std::string line;
+    std::vector<std::string> str_vec;
+    // use tmp to skip the istep number
+    int tmp = 0;
+    while (std::getline(ifs, line))
+    {
+        // A tmporary vector3 to store the data of this line
+        ModuleBase::Vector3<double> At;
+        if (line[0] == '#')
+        {
+            continue;
+        }
+        std::istringstream iss(line);
+        // skip the istep number
+        if (!(iss >> tmp))
+        {
+            ModuleBase::WARNING_QUIT("TD_info::read_cart_At", "Error reading istep!");
+        }
+        // read the vector potential
+        double component = 0;
+        // Read three components
+        for (int i = 0; i < 3; i++)
+        {
+            if (!(iss >> component))
+            {
+                ModuleBase::WARNING_QUIT("TD_info::read_cart_At",
+                                         "Error reading component " + std::to_string(i + 1) + " for istep "
+                                             + std::to_string(tmp) + "!");
+            }
+            At[i] = component;
+        }
+        // add the tmporary vector3 to the vector potential vector
+        At_from_file.push_back(At);
+    }
+    // set the max_istep
+    max_istep = At_from_file.size() - 1;
+    ifs.close();
+
+    return;
+}
+void TD_info::out_restart_info(const int nstep, 
+                      const ModuleBase::Vector3<double>& At_current, 
+                      const ModuleBase::Vector3<double>& At_laststep)
+{
+    if (GlobalV::MY_RANK == 0)
+    {
+        // open file
+        std::string outdir = PARAM.globalv.global_out_dir + "Restart_td.dat";
+        std::ofstream outFile(outdir);
+        if (!outFile) {
+            ModuleBase::WARNING_QUIT("out_restart_info", "no Restart_td.dat!");
+        }
+        // write data
+        outFile << nstep << std::endl;
+        outFile << At_current[0] << " " << At_current[1] << " " << At_current[2] << std::endl;
+        outFile << At_laststep[0] << " " << At_laststep[1] << " " << At_laststep[2] << std::endl;
+        outFile.close();
+    }
+    
+
+    return;
+}
+
+void TD_info::initialize_current_term(const hamilt::HContainer<std::complex<double>>* HR,
+                                          const Parallel_Orbitals* paraV)
+{
+    ModuleBase::TITLE("TD_info", "initialize_current_term");
+    ModuleBase::timer::tick("TD_info", "initialize_current_term");
+
+    for (int dir = 0; dir < 3; dir++)
+    {
+        if (this->current_term[dir] == nullptr)
+            this->current_term[dir] = new hamilt::HContainer<std::complex<double>>(paraV);
+    }
+
+    for (int i = 0; i < HR->size_atom_pairs(); ++i)
+    {
+        hamilt::AtomPair<std::complex<double>>& tmp = HR->get_atom_pair(i);
+        for (int ir = 0; ir < tmp.get_R_size(); ++ir)
+        {
+            const ModuleBase::Vector3<int> R_index = tmp.get_R_index(ir);
+            const int iat1 = tmp.get_atom_i();
+            const int iat2 = tmp.get_atom_j();
+
+            hamilt::AtomPair<std::complex<double>> tmp1(iat1, iat2, R_index, paraV);
+            for (int dir = 0; dir < 3; dir++)
+            {
+                this->current_term[dir]->insert_pair(tmp1);
+            }
+        }
+    }
+    for (int dir = 0; dir < 3; dir++)
+    {
+        this->current_term[dir]->allocate(nullptr, true);
+    }
+
+    ModuleBase::timer::tick("TD_info", "initialize_current_term");
+}
+
+void TD_info::destroy_HS_R_td_sparse(void)
+{
+    std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, std::complex<double>>>>
+        empty_HR_sparse_td_vel_up;
+    std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, std::complex<double>>>>
+        empty_HR_sparse_td_vel_down;
+    HR_sparse_td_vel[0].swap(empty_HR_sparse_td_vel_up);
+    HR_sparse_td_vel[1].swap(empty_HR_sparse_td_vel_down);
+}
diff --git a/source/source_lcao/module_tddft/td_info.h b/source/source_lcao/module_tddft/td_info.h
new file mode 100644
index 0000000000..60e4273266
--- /dev/null
+++ b/source/source_lcao/module_tddft/td_info.h
@@ -0,0 +1,108 @@
+#ifndef TD_INFO_H
+#define TD_INFO_H
+#include "source_base/abfs-vector3_order.h"
+#include "source_base/timer.h"
+#include "source_lcao/module_hcontainer/hcontainer.h"
+
+#include <map>
+// Class to store TDDFT infos, mainly for periodic system.
+class TD_info
+{
+  public:
+    TD_info(const UnitCell* ucell_in);
+    ~TD_info();
+
+    void init();
+
+    /// @brief switch to control the output of HR
+    static bool out_mat_R;
+
+    /// @brief pointer to the only TD_info object itself
+    static TD_info* td_vel_op;
+
+    /// @brief switch to control the output of At
+    static bool out_vecpot;
+
+    /// @brief switch to control the output of current
+    static bool out_current;
+
+    /// @brief switch to control the format of the output current, in total or in each k-point
+    static bool out_current_k;
+
+    /// @brief switch to control the source of At
+    static bool init_vecpot_file;
+
+    /// @brief if need to calculate more than once
+    static bool evolve_once;
+
+    /// @brief Restart step
+    static int estep_shift;
+
+    /// @brief Store the vector potential for tddft calculation
+    ModuleBase::Vector3<double> cart_At;
+
+    /// @brief calculate the At in cartesian coordinate
+    void cal_cart_At(const ModuleBase::Vector3<double>& At);
+
+    /// @brief output RT-TDDFT info for restart
+    void out_restart_info(const int nstep, 
+                          const ModuleBase::Vector3<double>& At_current, 
+                          const ModuleBase::Vector3<double>& At_laststep);
+
+    // allocate memory for current term.
+    void initialize_current_term(const hamilt::HContainer<std::complex<double>>* HR, const Parallel_Orbitals* paraV);
+
+    hamilt::HContainer<std::complex<double>>* get_current_term_pointer(const int& i) const
+    {
+        return this->current_term[i];
+    }
+
+
+    // folding HR to hk, for hybrid gague
+    template<typename TR>
+    void folding_HR_td(const hamilt::HContainer<TR>& hR,
+                std::complex<double>* hk,
+                const ModuleBase::Vector3<double>& kvec_d_in,
+                const int ncol,
+                const int hk_type);
+
+    int get_istep()
+    {
+      return istep;
+    }
+
+    const UnitCell* get_ucell()
+    {
+        return this->ucell;
+    }
+
+    // For TDDFT velocity gauge, to fix the output of HR
+    std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, std::complex<double>>>> HR_sparse_td_vel[2];
+
+  private:
+    /// @brief pointer to the unit cell
+    const UnitCell* ucell = nullptr;
+
+    /// @brief read At from output file
+    void read_cart_At();
+
+    /// @brief output cart_At to output file
+    void output_cart_At(const std::string& out_dir);
+
+    /// @brief store isteps now
+    static int istep;
+
+    /// @brief total steps of read in At
+    static int max_istep;
+
+    /// @brief store the read in At_data
+    static std::vector<ModuleBase::Vector3<double>> At_from_file;
+
+    /// @brief destory HSR data stored
+    void destroy_HS_R_td_sparse();
+
+    /// @brief part of Momentum operator, -i∇ - i[r,Vnl]. Used to calculate current.
+    std::vector<hamilt::HContainer<std::complex<double>>*> current_term = {nullptr, nullptr, nullptr};
+};
+
+#endif
diff --git a/source/source_lcao/module_tddft/velocity_op.cpp b/source/source_lcao/module_tddft/velocity_op.cpp
new file mode 100644
index 0000000000..dcdc80d059
--- /dev/null
+++ b/source/source_lcao/module_tddft/velocity_op.cpp
@@ -0,0 +1,532 @@
+#include "velocity_op.h"
+#include "source_base/timer.h"
+#include "source_base/tool_title.h"
+#include "source_lcao/module_tddft/snap_psibeta_half_tddft.h"
+#ifdef _OPENMP
+#include <unordered_set>
+#include <omp.h>
+#endif
+#include "module_parameter/parameter.h"
+template <typename TR>
+cal_r_overlap_R Velocity_op<TR>::r_calculator;
+template <typename TR>
+bool Velocity_op<TR>::init_done = false;
+template <typename TR>
+Velocity_op<TR>::Velocity_op(const UnitCell* ucell_in,
+                      const Grid_Driver* GridD_in,
+                      const Parallel_Orbitals* paraV,
+                      const LCAO_Orbitals& orb,
+                      const TwoCenterIntegrator* intor)
+    : ucell(ucell_in), paraV(paraV) , orb_(orb), intor_(intor)
+{   
+    // for length gague, the A(t) = 0 for all the time.
+    this->cart_At = ModuleBase::Vector3<double>(0,0,0);
+    this->initialize_grad_term(GridD_in, paraV);
+    this->initialize_vcomm_r(GridD_in, paraV);
+}
+template <typename TR>
+Velocity_op<TR>::~Velocity_op()
+{
+    for (int dir=0;dir<3;dir++)
+    {
+        delete this->current_term[dir];
+    }
+}
+//allocate space for current_term
+template <typename TR>
+void Velocity_op<TR>::initialize_vcomm_r(const Grid_Driver* GridD, const Parallel_Orbitals* paraV)
+{
+    ModuleBase::TITLE("Velocity_op", "initialize_vcomm_r");
+    ModuleBase::timer::tick("Velocity_op", "initialize_vcomm_r");
+    if(!init_done)
+    {
+        std::cout << "init_r_overlap_nonlocal" <<std::endl;
+        r_calculator.init_nonlocal(*ucell, *paraV, orb_);
+        init_done = true;
+    }
+
+    for (int dir=0;dir<3;dir++)
+    {
+        if (this->current_term[dir] == nullptr)
+        this->current_term[dir] = new hamilt::HContainer<std::complex<double>>(paraV);
+    }
+    this->adjs_vcommr.clear();
+    this->adjs_vcommr.reserve(this->ucell->nat);
+    for (int iat0 = 0; iat0 < ucell->nat; iat0++)
+    {
+        auto tau0 = ucell->get_tau(iat0);
+        int T0, I0;
+        ucell->iat2iait(iat0, &I0, &T0);
+        AdjacentAtomInfo adjs;
+        GridD->Find_atom(*ucell, tau0, T0, I0, &adjs);
+        std::vector<bool> is_adj(adjs.adj_num + 1, false);
+        for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1)
+        {
+            const int T1 = adjs.ntype[ad1];
+            const int I1 = adjs.natom[ad1];
+            const int iat1 = ucell->itia2iat(T1, I1);
+            const ModuleBase::Vector3<double>& tau1 = adjs.adjacent_tau[ad1];
+            const ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
+            // choose the real adjacent atoms
+            // Note: the distance of atoms should less than the cutoff radius, 
+            // When equal, the theoretical value of matrix element is zero, 
+            // but the calculated value is not zero due to the numerical error, which would lead to result changes.
+            if (this->ucell->cal_dtau(iat0, iat1, R_index1).norm() * this->ucell->lat0
+                < orb_.Phi[T1].getRcut() + this->ucell->infoNL.Beta[T0].get_rcut_max())
+            {
+                is_adj[ad1] = true;
+            }
+        }
+        filter_adjs(is_adj, adjs);
+        this->adjs_vcommr.push_back(adjs);
+        for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1)
+        {
+            const int T1 = adjs.ntype[ad1];
+            const int I1 = adjs.natom[ad1];
+            const int iat1 = ucell->itia2iat(T1, I1);
+            const ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
+            for (int ad2 = 0; ad2 < adjs.adj_num + 1; ++ad2)
+            {
+                const int T2 = adjs.ntype[ad2];
+                const int I2 = adjs.natom[ad2];
+                const int iat2 = ucell->itia2iat(T2, I2);
+                ModuleBase::Vector3<int>& R_index2 = adjs.box[ad2];
+                if (paraV->get_col_size(iat2) <= 0 || paraV->get_row_size(iat1) <= 0)
+                {
+                    continue;
+                }
+                hamilt::AtomPair<std::complex<double>> tmp(iat1,
+                                         iat2,
+                                         R_index2.x - R_index1.x,
+                                         R_index2.y - R_index1.y,
+                                         R_index2.z - R_index1.z,
+                                         paraV);
+                for (int dir=0;dir<3;dir++)
+                {
+                    this->current_term[dir]->insert_pair(tmp);
+                }
+            }
+        }
+    }
+    // allocate the memory of BaseMatrix in cal_vcomm_r_IJR, and set the new values to zero
+    for (int dir=0;dir<3;dir++)
+    {
+        this->current_term[dir]->allocate(nullptr, true);
+    }
+    ModuleBase::timer::tick("Velocity_op", "initialize_vcomm_r");
+}
+template <typename TR>
+void Velocity_op<TR>::initialize_grad_term(const Grid_Driver* GridD, const Parallel_Orbitals* paraV)
+{
+    ModuleBase::TITLE("Velocity_op", "initialize_grad_term");
+    ModuleBase::timer::tick("Velocity_op", "initialize_grad_term");
+    for (int dir=0;dir<3;dir++)
+    {
+        if (this->current_term[dir] == nullptr)
+        this->current_term[dir] = new hamilt::HContainer<std::complex<double>>(paraV);
+    }
+    this->adjs_grad.clear();
+    this->adjs_grad.reserve(this->ucell->nat);
+    for (int iat1 = 0; iat1 < ucell->nat; iat1++)
+    {
+        auto tau1 = ucell->get_tau(iat1);
+        int T1, I1;
+        ucell->iat2iait(iat1, &I1, &T1);
+        AdjacentAtomInfo adjs;
+        GridD->Find_atom(*ucell, tau1, T1, I1, &adjs);
+        std::vector<bool> is_adj(adjs.adj_num + 1, false);
+        for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1)
+        {
+            const int T2 = adjs.ntype[ad1];
+            const int I2 = adjs.natom[ad1];
+            const int iat2 = ucell->itia2iat(T2, I2);
+            if (paraV->get_row_size(iat1) <= 0 || paraV->get_col_size(iat2) <= 0)
+            {
+                continue;
+            }
+            const ModuleBase::Vector3<int>& R_index2 = adjs.box[ad1];
+            // choose the real adjacent atoms
+            // Note: the distance of atoms should less than the cutoff radius, 
+            // When equal, the theoretical value of matrix element is zero, 
+            // but the calculated value is not zero due to the numerical error, which would lead to result changes.
+            if (this->ucell->cal_dtau(iat1, iat2, R_index2).norm() * this->ucell->lat0
+                < orb_.Phi[T1].getRcut() + orb_.Phi[T2].getRcut())
+            {
+                is_adj[ad1] = true;
+            }
+        }
+        filter_adjs(is_adj, adjs);
+        this->adjs_grad.push_back(adjs);
+        for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
+        {
+            const int T2 = adjs.ntype[ad];
+            const int I2 = adjs.natom[ad];
+            int iat2 = ucell->itia2iat(T2, I2);
+            ModuleBase::Vector3<int>& R_index = adjs.box[ad];
+            hamilt::AtomPair<std::complex<double>> tmp(iat1, iat2, R_index.x, R_index.y, R_index.z, paraV);
+            for (int dir=0;dir<3;dir++)
+            {
+                this->current_term[dir]->insert_pair(tmp);
+            }
+        }
+    }
+    // allocate the memory of BaseMatrix in HR, and set the new values to zero
+    for (int dir=0;dir<3;dir++)
+    {
+        this->current_term[dir]->allocate(nullptr, true);
+    }
+
+    ModuleBase::timer::tick("Velocity_op", "initialize_grad_term");
+}
+template <typename TR>
+void Velocity_op<TR>::calculate_vcomm_r()
+{
+    ModuleBase::TITLE("Velocity_op", "calculate_vcomm_r");
+    ModuleBase::timer::tick("Velocity_op", "calculate_vcomm_r");
+
+    const Parallel_Orbitals* paraV = this->current_term[0]->get_atom_pair(0).get_paraV();
+    const int npol = this->ucell->get_npol();
+
+    // 1. calculate <psi|beta> for each pair of atoms
+    for (int iat0 = 0; iat0 < this->ucell->nat; iat0++)
+    {
+        auto tau0 = ucell->get_tau(iat0);
+        int T0, I0;
+        ucell->iat2iait(iat0, &I0, &T0);
+        AdjacentAtomInfo& adjs = this->adjs_vcommr[iat0];
+        std::vector<std::vector<std::unordered_map<int, std::vector<double>>>> nlm_tot;
+        nlm_tot.resize(adjs.adj_num + 1);
+        for (int i = 0; i < adjs.adj_num + 1; i++)
+        {
+            nlm_tot[i].resize(4);
+        }
+
+        #pragma omp parallel
+        {
+            #pragma omp for schedule(dynamic)
+            for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
+            {
+                const int T1 = adjs.ntype[ad];
+                const int I1 = adjs.natom[ad];
+                const int iat1 = ucell->itia2iat(T1, I1);
+                const ModuleBase::Vector3<double>& tau1 = adjs.adjacent_tau[ad];
+                const Atom* atom1 = &ucell->atoms[T1];
+                auto all_indexes = paraV->get_indexes_row(iat1);
+                auto col_indexes = paraV->get_indexes_col(iat1);
+                // insert col_indexes into all_indexes to get universal set with no repeat elements
+                all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end());
+                std::sort(all_indexes.begin(), all_indexes.end());
+                all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end());
+                for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += npol)
+                {
+                    const int iw1 = all_indexes[iw1l] / npol;
+                    //std::vector<std::vector<std::complex<double>>> nlm;
+                    std::vector<std::vector<double>> nlm;
+                    // nlm is a vector of vectors, but size of outer vector is only 1 when out_current is false
+                    // and size of outer vector is 4 when out_current is true (3 for <psi|r_i * exp(-iAr)|beta>, 1 for
+                    // <psi|exp(-iAr)|beta>) inner loop : all projectors (L0,M0)
+
+                    // snap_psibeta_half_tddft() are used to calculate <psi|exp(-iAr)|beta>
+                    // and <psi|rexp(-iAr)|beta> as well if current are needed
+                    ModuleBase::Vector3<double> dtau = tau0 - tau1;
+
+                    r_calculator.get_psi_r_beta(*ucell,
+                                                nlm,
+                                                tau1 * this->ucell->lat0,
+                                                T1,
+                                                atom1->iw2l[iw1],
+                                                atom1->iw2m[iw1],
+                                                atom1->iw2n[iw1],
+                                                tau0 * this->ucell->lat0,
+                                                T0);
+                    for (int dir = 0; dir < 4; dir++)
+                    {
+                        nlm_tot[ad][dir].insert({all_indexes[iw1l], nlm[dir]});
+                    }
+                }
+            }
+
+    #ifdef _OPENMP
+            // record the iat number of the adjacent atoms
+            std::set<int> ad_atom_set;
+            for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
+            {
+                const int T1 = adjs.ntype[ad];
+                const int I1 = adjs.natom[ad];
+                const int iat1 = ucell->itia2iat(T1, I1);
+                ad_atom_set.insert(iat1);
+            }
+
+            // split the ad_atom_set into num_threads parts
+            const int num_threads = omp_get_num_threads();
+            const int thread_id = omp_get_thread_num();
+            std::set<int> ad_atom_set_thread;
+            int i = 0;
+            for(const auto iat1 : ad_atom_set)
+            {
+                if (i % num_threads == thread_id)
+                {
+                    ad_atom_set_thread.insert(iat1);
+                }
+                i++;
+            }
+    #endif
+    // 2. calculate <psi_I|beta>D<beta|psi_{J,R}> for each pair of <IJR> atoms
+            for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1)
+            {
+                const int T1 = adjs.ntype[ad1];
+                const int I1 = adjs.natom[ad1];
+                const int iat1 = ucell->itia2iat(T1, I1);
+    #ifdef _OPENMP
+                if (ad_atom_set_thread.find(iat1) == ad_atom_set_thread.end())
+                    {
+                        continue;
+                    }
+    #endif
+                ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
+                for (int ad2 = 0; ad2 < adjs.adj_num + 1; ++ad2)
+                {
+                    const int T2 = adjs.ntype[ad2];
+                    const int I2 = adjs.natom[ad2];
+                    const int iat2 = ucell->itia2iat(T2, I2);
+                    ModuleBase::Vector3<int>& R_index2 = adjs.box[ad2];
+                    ModuleBase::Vector3<int> R_vector(R_index2[0] - R_index1[0],
+                                                    R_index2[1] - R_index1[1],
+                                                    R_index2[2] - R_index1[2]);
+                    std::complex<double>* tmp_c[3] = {nullptr, nullptr, nullptr};
+                    for (int i = 0; i < 3; i++)
+                    {
+                        tmp_c[i] = this->current_term[i]->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2])->get_pointer();
+                    }
+                    // if not found , skip this pair of atoms
+                    if (tmp_c[0] != nullptr)
+                    {
+                        this->cal_vcomm_r_IJR(iat1,
+                                            iat2,
+                                            T0,
+                                            paraV,
+                                            nlm_tot[ad1],
+                                            nlm_tot[ad2],
+                                            tmp_c);
+                    }
+                }
+            }
+        }
+    }
+    ModuleBase::timer::tick("Velocity_op", "calculate_vcomm_r");
+}
+
+// cal_HR_IJR()
+template <typename TR>
+void Velocity_op<TR>::cal_vcomm_r_IJR(
+    const int& iat1,
+    const int& iat2,
+    const int& T0,
+    const Parallel_Orbitals* paraV,
+    const std::vector<std::unordered_map<int, std::vector<double>>>& nlm1_all,
+    const std::vector<std::unordered_map<int, std::vector<double>>>& nlm2_all,
+    std::complex<double>** current_mat_p)
+{
+    // npol is the number of polarizations,
+    // 1 for non-magnetic (one Hamiltonian matrix only has spin-up or spin-down),
+    // 2 for magnetic (one Hamiltonian matrix has both spin-up and spin-down)
+    const int npol = this->ucell->get_npol();
+    // ---------------------------------------------
+    // calculate the Nonlocal matrix for each pair of orbitals
+    // ---------------------------------------------
+    auto row_indexes = paraV->get_indexes_row(iat1);
+    auto col_indexes = paraV->get_indexes_col(iat2);
+    // step_trace = 0 for NSPIN=1,2; ={0, 1, local_col, local_col+1} for NSPIN=4
+    std::vector<int> step_trace(npol * npol, 0);
+    for (int is = 0; is < npol; is++)
+    {
+        for (int is2 = 0; is2 < npol; is2++)
+        {
+            step_trace[is * npol + is2] = col_indexes.size() * is + is2;
+        }
+    }
+    // calculate the local matrix
+    const TR* tmp_d = nullptr;
+    for (int iw1l = 0; iw1l < row_indexes.size(); iw1l += npol)
+    {
+        // const std::vector<std::complex<double>>* nlm1 = &(nlm1_all[0].find(row_indexes[iw1l])->second);
+        //std::vector<const std::vector<std::complex<double>>*> nlm1;
+        std::vector<const std::vector<double>*> nlm1;
+        for (int dir = 0; dir < 4; dir++)
+        {
+            nlm1.push_back(&(nlm1_all[dir].find(row_indexes[iw1l])->second));
+        }
+
+        for (int iw2l = 0; iw2l < col_indexes.size(); iw2l += npol)
+        {
+            //std::vector<const std::vector<std::complex<double>>*> nlm2;
+            std::vector<const std::vector<double>*> nlm2;
+            for (int dir = 0; dir < 4; dir++)
+            {
+                nlm2.push_back(&(nlm2_all[dir].find(col_indexes[iw2l])->second));
+            }
+
+#ifdef __DEBUG
+            assert(nlm1.size() == nlm2.size());
+#endif
+            for (int is = 0; is < npol * npol; ++is)
+            {
+                for (int dir = 0; dir < 3; dir++)
+                {
+                    std::complex<double> nlm_r_tmp = std::complex<double>{0, 0};
+                    std::complex<double> imag_unit = std::complex<double>{0, 1};
+                    for (int no = 0; no < this->ucell->atoms[T0].ncpp.non_zero_count_soc[is]; no++)
+                    {
+                        const int p1 = this->ucell->atoms[T0].ncpp.index1_soc[is][no];
+                        const int p2 = this->ucell->atoms[T0].ncpp.index2_soc[is][no];
+                        this->ucell->atoms[T0].ncpp.get_d(is, p1, p2, tmp_d);
+                        //<psi|rexp(-iAr)|beta><beta|exp(iAr)|psi>-<psi|exp(-iAr)|beta><beta|rexp(iAr)|psi>
+                        // multiply d in the end
+                        TR tmp = (nlm1[dir + 1]->at(p1) * nlm2[0]->at(p2)
+                                        - nlm1[0]->at(p1) * nlm2[dir + 1]->at(p2))
+                                        * (*tmp_d);
+                        nlm_r_tmp += tmp;
+                    }
+                    // -i[r,Vnl], 2.0 due to the unit transformation
+                    current_mat_p[dir][step_trace[is]] -= imag_unit * nlm_r_tmp / 2.0;
+                }
+            }
+            for (int dir = 0; dir < 3; dir++)
+            {
+                current_mat_p[dir] += npol;
+            }
+        }
+        for (int dir = 0; dir < 3; dir++)
+        {
+            current_mat_p[dir] += (npol - 1) * col_indexes.size();
+        }
+    }
+}
+template <typename TR>
+void Velocity_op<TR>::calculate_grad_term()
+{
+    ModuleBase::TITLE("Velocity_op", "calculate_grad_term");
+    if(this->current_term[0]==nullptr || this->current_term[0]->size_atom_pairs()<=0)
+    {
+        ModuleBase::WARNING_QUIT("Velocity_op::calculate_grad_term", "grad_term is nullptr or empty");
+    }
+    ModuleBase::timer::tick("Velocity_op", "calculate_grad_term");
+
+    const Parallel_Orbitals* paraV = this->current_term[0]->get_atom_pair(0).get_paraV();
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for (int iat1 = 0; iat1 < this->ucell->nat; iat1++)
+    {
+        auto tau1 = ucell->get_tau(iat1);
+        int T1, I1;
+        ucell->iat2iait(iat1, &I1, &T1);
+        AdjacentAtomInfo& adjs = this->adjs_grad[iat1];
+        for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
+        {
+            const int T2 = adjs.ntype[ad];
+            const int I2 = adjs.natom[ad];
+            const int iat2 = ucell->itia2iat(T2, I2);
+            const ModuleBase::Vector3<int>& R_index2 = adjs.box[ad];
+            ModuleBase::Vector3<double> dtau = this->ucell->cal_dtau(iat1, iat2, R_index2);
+
+            std::complex<double>* tmp_c[3] = {nullptr, nullptr, nullptr};
+            for (int i = 0; i < 3; i++)
+            {
+                tmp_c[i] = this->current_term[i]->find_matrix(iat1, iat2, R_index2)->get_pointer();
+            }
+            if (tmp_c[0] != nullptr)
+            {
+                this->cal_grad_IJR(iat1, iat2, paraV, dtau, tmp_c);
+            }
+            else
+            {
+                ModuleBase::WARNING_QUIT("Velocity_op::calculate_grad_term", "R_index not found in HR");
+            }
+        }
+    }
+    ModuleBase::timer::tick("Velocity_op", "calculate_grad_term");
+}
+template <typename TR>
+void Velocity_op<TR>::cal_grad_IJR(const int& iat1,
+                              const int& iat2,
+                              const Parallel_Orbitals* paraV,
+                              const ModuleBase::Vector3<double>& dtau,
+                              std::complex<double>** current_mat_p)
+{
+    // ---------------------------------------------
+    // get info of orbitals of atom1 and atom2 from ucell
+    // ---------------------------------------------
+    int T1, I1;
+    this->ucell->iat2iait(iat1, &I1, &T1);
+    int T2, I2;
+    this->ucell->iat2iait(iat2, &I2, &T2);
+    Atom& atom1 = this->ucell->atoms[T1];
+    Atom& atom2 = this->ucell->atoms[T2];
+
+    // npol is the number of polarizations,
+    // 1 for non-magnetic (one Hamiltonian matrix only has spin-up or spin-down),
+    // 2 for magnetic (one Hamiltonian matrix has both spin-up and spin-down)
+    const int npol = this->ucell->get_npol();
+
+    const int* iw2l1 = atom1.iw2l.data();
+    const int* iw2n1 = atom1.iw2n.data();
+    const int* iw2m1 = atom1.iw2m.data();
+    const int* iw2l2 = atom2.iw2l.data();
+    const int* iw2n2 = atom2.iw2n.data();
+    const int* iw2m2 = atom2.iw2m.data();
+    // ---------------------------------------------
+    // get tau1 (in cell <0,0,0>) and tau2 (in cell R)
+    // in principle, only dtau is needed in this function
+    // snap_psipsi should be refactored to use dtau directly
+    // ---------------------------------------------
+    const ModuleBase::Vector3<double>& tau1 = this->ucell->get_tau(iat1);
+    const ModuleBase::Vector3<double> tau2 = tau1 + dtau;
+    // ---------------------------------------------
+    // calculate the Ekinetic matrix for each pair of orbitals
+    // ---------------------------------------------
+    double grad[3] = {0, 0, 0};
+    auto row_indexes = paraV->get_indexes_row(iat1);
+    auto col_indexes = paraV->get_indexes_col(iat2);
+    const int step_trace = col_indexes.size() + 1;
+    for(int iw1l = 0; iw1l < row_indexes.size(); iw1l += npol)
+    {
+        const int iw1 = row_indexes[iw1l] / npol;
+        const int L1 = iw2l1[iw1];
+        const int N1 = iw2n1[iw1];
+        const int m1 = iw2m1[iw1];
+        // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
+        int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
+
+        for (int iw2l = 0; iw2l < col_indexes.size(); iw2l += npol)
+        {
+            const int iw2 = col_indexes[iw2l] / npol;
+            const int L2 = iw2l2[iw2];
+            const int N2 = iw2n2[iw2];
+            const int m2 = iw2m2[iw2];
+            // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
+            int M2 = (m2 % 2 == 0) ? -m2 / 2 : (m2 + 1) / 2;
+
+            // calculate <psi|∇R|psi>, which equals to -<psi|∇r|psi>.
+            intor_->calculate(T1, L1, N1, M1, T2, L2, N2, M2, dtau * this->ucell->lat0, nullptr, grad);
+            ModuleBase::Vector3<double> grad_overlap(grad[0], grad[1], grad[2]);
+
+            for (int dir = 0; dir < 3; dir++)
+            {
+                for (int ipol = 0; ipol < npol; ipol++)
+                {
+                    // part of Momentum operator, -i∇r,used to calculate the current
+                    // here is actually i∇R
+                    current_mat_p[dir][ipol * step_trace] += std::complex<double>(0, grad_overlap[dir]);
+                }
+                current_mat_p[dir] += npol;
+            }
+        }
+        for (int dir = 0; dir < 3; dir++)
+        {
+            current_mat_p[dir] += (npol - 1) * col_indexes.size();
+        }   
+    }
+}
+template class Velocity_op<double>;
+template class Velocity_op<std::complex<double>>;
diff --git a/source/source_lcao/module_tddft/velocity_op.h b/source/source_lcao/module_tddft/velocity_op.h
new file mode 100644
index 0000000000..7c3986a431
--- /dev/null
+++ b/source/source_lcao/module_tddft/velocity_op.h
@@ -0,0 +1,79 @@
+#ifndef TD_VELOCITY_OP_H
+#define TD_VELOCITY_OP_H
+#include <unordered_map>
+#include "source_basis/module_ao/parallel_orbitals.h"
+#include "source_cell/module_neighbor/sltk_grid_driver.h"
+#include "source_cell/unitcell.h"
+#include "source_lcao/module_hcontainer/hcontainer.h"
+#include "source_basis/module_nao/two_center_integrator.h"
+#include "source_base/vector3.h"
+#include "source_io/cal_r_overlap_R.h"
+
+//design to calculate velocity operator
+template <typename TR>
+class Velocity_op
+{
+  public:
+    Velocity_op(const UnitCell* ucell_in,
+               const Grid_Driver* GridD_in,
+               const Parallel_Orbitals* paraV,
+               const LCAO_Orbitals& orb,
+               const TwoCenterIntegrator* intor);
+    ~Velocity_op();
+
+    hamilt::HContainer<std::complex<double>>* get_current_term_pointer(const int& i)const 
+    {
+        return this->current_term[i];
+    }
+    void calculate_vcomm_r();
+    void calculate_grad_term();
+
+  private:
+    const UnitCell* ucell = nullptr;
+
+    const Parallel_Orbitals* paraV = nullptr;
+
+    const LCAO_Orbitals& orb_;
+
+    /// @brief Store real space hamiltonian. TD term should include imaginary part, thus it has to be complex type. Only shared between TD operators.
+    std::vector<hamilt::HContainer<std::complex<double>>*> current_term = {nullptr, nullptr, nullptr};
+    
+    const TwoCenterIntegrator* intor_ = nullptr;
+    const TwoCenterIntegrator* intorbeta_ = nullptr;
+
+    /**
+     * @brief initialize HR, search the nearest neighbor atoms
+     * HContainer is used to store the non-local pseudopotential matrix with specific <I,J,R> atom-pairs
+     * the size of HR will be fixed after initialization
+     */
+    void initialize_vcomm_r(const Grid_Driver* GridD_in, const Parallel_Orbitals* paraV);
+    void initialize_grad_term(const Grid_Driver* GridD_in, const Parallel_Orbitals* paraV);
+
+    /**
+     * @brief calculate the HR local matrix of <I,J,R> atom pair
+     */
+    void cal_vcomm_r_IJR(const int& iat1,
+                         const int& iat2,
+                         const int& T0,
+                         const Parallel_Orbitals* paraV,
+                         const std::vector<std::unordered_map<int, std::vector<double>>>& nlm1_all,
+                         const std::vector<std::unordered_map<int, std::vector<double>>>& nlm2_all,
+                         std::complex<double>** current_mat_p);
+    void cal_grad_IJR(const int& iat1,
+                      const int& iat2,
+                      const Parallel_Orbitals* paraV,
+                      const ModuleBase::Vector3<double>& dtau,
+                      std::complex<double>** current_mat_p);
+
+    /// @brief exact the nearest neighbor atoms from all adjacent atoms
+    std::vector<AdjacentAtomInfo> adjs_vcommr;
+    std::vector<AdjacentAtomInfo> adjs_grad;
+
+    /// @brief Store the vector potential for td_ekinetic term
+    ModuleBase::Vector3<double> cart_At;
+    static cal_r_overlap_R r_calculator;
+    static bool init_done;
+};
+
+
+#endif // TD_CURRENT_H