Skip to content

Commit b959593

Browse files
committed
sin,cos precalc
1 parent 5ad0313 commit b959593

File tree

3 files changed

+41
-15
lines changed

3 files changed

+41
-15
lines changed

multi/go_leo_prof.sh

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
#SBATCH --qos=boost_qos_dbg
1111
#SBATCH --error=test.err
1212
#### if mapping is on
13-
####SBATCH --cpus-per-task=8
13+
#SBATCH --cpus-per-task=8
1414

1515

1616
#module load nvhpc/24.3
@@ -38,10 +38,16 @@ export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH
3838
#srun -n 4 nsys profile -t cuda,nvtx,mpi --output=nsys_report_rank%t ./mhit36nsys profile --multiprocess=true -t cuda,nvtx,mpi -o report $
3939

4040
# profile + node mapping + nic
41-
#mpirun -np 4 --map-by node:PE=8 --rank-by core nsys profile -t cuda,nvtx,mpi,openacc --nic-metrics=true ./binder.sh ./mhit36
41+
mpirun -np 4 --map-by node:PE=8 --rank-by core nsys profile -t cuda,nvtx,mpi,openacc --nic-metrics=true ./binder.sh ./mhit36
4242

4343
# for nsight compute report
4444
#mpirun -n 4 ncu --kernel-name main_659 --set=full --import-source=yes -o profile -f --launch-skip 3 --launch-count 1 "./mhit36"
4545

46+
# for nsight compute report - all kernels
47+
# mpirun -n 4 ncu --kernel-name regex:main_ --set=full --import-source=yes --launch-skip 70 --launch-count 18 -o reportall.%p ./mhit36
48+
49+
# for nsight compute report - all kernels + mapping + nic
50+
# mpirun -np 4 --map-by node:PE=8 --rank-by core ncu --kernel-name regex:main_ --set=full --import-source=yes --launch-skip 70 --launch-count 18 -o reportall.%p ./binder.sh ./mhit36
51+
4652

4753

multi/main.f90

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ program main
1414

1515

1616
implicit none
17+
! timer for scaling test
18+
real :: t_start, t_end, elapsed
1719
! grid dimensions
1820
integer :: comm_backend
1921
integer :: pr, pc
@@ -230,6 +232,14 @@ program main
230232
enddo
231233
! allocate k_d on the device (later on remove and use OpenACC + managed memory?)
232234
allocate(kx_d, source=kx)
235+
236+
allocate(mysin(nx), mycos(nx))
237+
do i=1,nx
238+
! precompute sin(k0*x(i)) once here so it is not re-evaluated every time it is needed
239+
mysin(i)=sin(k0*x(i))
240+
! precompute cos(k0*x(i)) once here so it is not re-evaluated every time it is needed
241+
mycos(i)=cos(k0*x(i))
242+
enddo
233243
!########################################################################################################################################
234244
! 1. INITIALIZATION AND cuDECOMP AUTOTUNING : END
235245
!########################################################################################################################################
@@ -370,12 +380,12 @@ program main
370380
!Save initial fields (only if a fresh start)
371381
if (restart .eq. 0) then
372382
if (rank.eq.0) write(*,*) "Save initial fields"
373-
call writefield(tstart,1)
374-
call writefield(tstart,2)
375-
call writefield(tstart,3)
383+
!call writefield(tstart,1)
384+
!call writefield(tstart,2)
385+
!call writefield(tstart,3)
376386
!call writefield(tstart,4)
377387
#if phiflag == 1
378-
call writefield(tstart,5)
388+
!call writefield(tstart,5)
379389
#endif
380390
endif
381391
!########################################################################################################################################
@@ -399,6 +409,8 @@ program main
399409
gamma=1.d0*gumax
400410
!$acc data copyin(piX)
401411
!$acc data create(rhsu_o, rhsv_o, rhsw_o)
412+
!$acc data copyin(mysin, mycos)
413+
call cpu_time(t_start)
402414
! Start temporal loop
403415
do t=tstart,tfin
404416
! Create custom label for each marker
@@ -558,7 +570,7 @@ program main
558570
! Projection step, convective terms
559571
! 5.1a Convective terms NS
560572
! Loop on inner nodes
561-
!$acc parallel loop tile(16,4,2)
573+
!$acc parallel loop tile(16,4,2) present(mysin, mycos)
562574
do k=1+halo_ext, piX%shape(3)-halo_ext
563575
do j=1+halo_ext, piX%shape(2)-halo_ext
564576
do i=1,nx
@@ -612,9 +624,12 @@ program main
612624
kg = piX%lo(3) + k - 1
613625
jg = piX%lo(2) + j - 1
614626
! ABC forcing
615-
rhsu(i,j,k)= rhsu(i,j,k) + f3*sin(k0*x(kg))+f2*cos(k0*x(jg))
616-
rhsv(i,j,k)= rhsv(i,j,k) + f1*sin(k0*x(i))+f3*cos(k0*x(kg))
617-
rhsw(i,j,k)= rhsw(i,j,k) + f2*sin(k0*x(jg))+f1*cos(k0*x(i))
627+
! rhsu(i,j,k)= rhsu(i,j,k) + f3*sin(k0*x(kg))+f2*cos(k0*x(jg))
628+
rhsu(i,j,k)= rhsu(i,j,k) + f3*mysin(kg)+f2*mycos(jg)
629+
! rhsv(i,j,k)= rhsv(i,j,k) + f1*sin(k0*x(i))+f3*cos(k0*x(kg))
630+
rhsv(i,j,k)= rhsv(i,j,k) + f1*mysin(i)+f3*mycos(kg)
631+
! rhsw(i,j,k)= rhsw(i,j,k) + f2*sin(k0*x(jg))+f1*cos(k0*x(i))
632+
rhsw(i,j,k)= rhsw(i,j,k) + f2*mysin(jg)+f1*mycos(i)
618633
! TG Forcing
619634
!rhsu(i,j,k)= rhsu(i,j,k) + f1*sin(k0*x(i))*cos(k0*x(j))*cos(k0*x(k))
620635
!rhsv(i,j,k)= rhsv(i,j,k) - f1*cos(k0*x(i))*sin(k0*x(j))*sin(k0*x(k))
@@ -1002,13 +1017,13 @@ program main
10021017
if (mod(t,dump) .eq. 0) then
10031018
if (rank .eq. 0) write(*,*) "Saving output files"
10041019
! write velocity and pressure fields (1-4)
1005-
call writefield(t,1)
1006-
call writefield(t,2)
1007-
call writefield(t,3)
1020+
!call writefield(t,1)
1021+
!call writefield(t,2)
1022+
!call writefield(t,3)
10081023
!call writefield(t,4)
10091024
#if phiflag == 1
10101025
! write phase-field (5)
1011-
call writefield(t,5)
1026+
!call writefield(t,5)
10121027
#endif
10131028
endif
10141029
!########################################################################################################################################
@@ -1018,12 +1033,16 @@ program main
10181033
call nvtxEndRange
10191034
!call nvtxEndRange
10201035
enddo
1036+
call cpu_time(t_end)
1037+
elapsed = t_end-t_start
1038+
if (rank .eq. 0) write(*,*) 'Elapsed time (seconds):', elapsed
1039+
!$acc end data
10211040
!$acc end data
10221041
!$acc end data
10231042

10241043
! Remove allocated variables (add new)
10251044
deallocate(u,v,w)
1026-
deallocate(tanh_psi)
1045+
deallocate(tanh_psi, mysin, mycos)
10271046
deallocate(rhsu,rhsv,rhsw)
10281047
deallocate(rhsu_o,rhsv_o,rhsw_o)
10291048
deallocate(phi,rhsphi,normx,normy,normz)

multi/module.f90

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ module velocity
5151
double precision :: uc, vc, wc, umax, gumax=1.0d0, cou, alpha, beta
5252
double precision :: h11, h12, h13, h21, h22, h23, h31, h32, h33
5353
double precision :: umean, vmean, wmean, gumean, gvmean, gwmean
54+
double precision, allocatable :: mysin(:), mycos(:)
5455
end module velocity
5556

5657

0 commit comments

Comments
 (0)