diff --git a/Makefile b/Makefile
index f2d8b80e7..d77a3ef21 100755
--- a/Makefile
+++ b/Makefile
@@ -198,7 +198,7 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
 SAMPLE_ENABLED := 1
 
-ALL_LDFLAGS := -lnetcdf -I
+ALL_LDFLAGS := -I
 ALL_LDFLAGS += $(ALL_CCFLAGS)
 ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
 ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
@@ -207,10 +207,14 @@ ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
 INCLUDES := -I/usr/includes
 LIBRARIES :=
 
+# Add NetCDF include library
+INCLUDES += $(shell nc-config --cflags)
+ALL_LDFLAGS += $(shell nc-config --libs)
+
 ################################################################################
 # Gencode arguments
-SMS ?= 35 50 52 60
+SMS ?= 50 52 60 75
 #SMS ?= 20 30 35
 ifeq ($(SMS),)
 $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
@@ -318,7 +322,7 @@ ReadInput.o:./src/ReadInput.cu
 Read_netcdf.o:./src/Read_netcdf.cu
 	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
-
+
 Reimann.o:./src/Reimann.cu
 	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
@@ -338,8 +342,7 @@ Write_netcdf.o:./src/Write_netcdf.cu
 	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
 
 Write_txtlog.o:./src/Write_txtlog.cpp
-	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
-
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
 Spherical.o:./src/Spherical.cu
 	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
diff --git a/src/Advection.cu b/src/Advection.cu
index 2f19765ce..f9622533d 100755
--- a/src/Advection.cu
+++ b/src/Advection.cu
@@ -251,11 +251,11 @@ template __global__ void AdvkernelGPU(Param XParam, BlockP XBlock,
 	T ho, uo, vo;
 	T dhi = XAdv.dh[i];
-	T edt = dt;// dhi > T(0.0) ? dt : min(dt, hold / (T(-1.0) * dhi));
-
+	T edt = XParam.ForceMassConserve ? dt : dhi >= T(0.0) ? dt : min(dt, max(hold, XParam.eps) / abs(dhi));
+
+	//ho = max(hold + edt * dhi,T(0.0));
 	ho = hold + edt * dhi;
-
 	if (ho > eps)
 	{
 		// uo = (hold * XEv.u[i] + edt * XAdv.dhu[i]) / ho;
@@ -307,7 +307,7 @@ template __host__ void AdvkernelCPU(Param XParam, BlockP XBlock,
 	T dhi = XAdv.dh[i];
-	T edt = dt;// dhi > T(0.0) ? dt : min(dt, hold / (T(-1.0) * dhi));
+	T edt = XParam.ForceMassConserve ? dt : dhi >= T(0.0) ? dt : min(dt, max(hold, XParam.eps) / abs(dhi));
 	ho = hold + edt * dhi;
@@ -475,7 +475,7 @@ template __host__ T CalctimestepGPU(Param XParam,Loop XLoop, BlockP
 	//GPU Harris reduction #3. 8.3x reduction #0 Note #7 if a lot faster
 	// This was successfully tested with a range of grid size
-	//reducemax3 << > >(dtmax_g, arrmax_g, nx*ny)
+	//reducemax3 <<>>(dtmax_g, arrmax_g, nx*ny)
 	int maxThreads = 256;
 	int threads = (s < maxThreads * 2) ?
nextPow2((s + 1) / 2) : maxThreads; @@ -485,7 +485,7 @@ template __host__ T CalctimestepGPU(Param XParam,Loop XLoop, BlockP dim3 gridDimLine(blocks, 1, 1); - reducemin3 << > > (XTime.dtmax, XTime.arrmin, s); + reducemin3 <<>> (XTime.dtmax, XTime.arrmin, s); CUDA_CHECK(cudaDeviceSynchronize()); @@ -503,7 +503,7 @@ template __host__ T CalctimestepGPU(Param XParam,Loop XLoop, BlockP CUDA_CHECK(cudaMemcpy(XTime.dtmax, XTime.arrmin, s * sizeof(T), cudaMemcpyDeviceToDevice)); - reducemin3 << > > (XTime.dtmax, XTime.arrmin, s); + reducemin3 <<>> (XTime.dtmax, XTime.arrmin, s); CUDA_CHECK(cudaDeviceSynchronize()); s = (s + (threads * 2 - 1)) / (threads * 2); diff --git a/src/Arrays.h b/src/Arrays.h index f71e604ae..4c59b63c9 100755 --- a/src/Arrays.h +++ b/src/Arrays.h @@ -82,6 +82,23 @@ struct maskinfo }; +template +struct RiverInfo +{ + int nbir; + int nburmax; // size of (max number of) unique block with rivers + int nribmax; // size of (max number of) rivers in one block + int* Xbidir; // array of block id for each river size(nburmax,nribmax) + int* Xridib; // array of river id in each block size(nburmax,nribmax) + T* xstart; + T* xend; + T* ystart; + T *yend; + T* qnow; // qnow is a pin mapped and so both pointers are needed here + T* qnow_g; // this simplify the code later + +}; + // outzone info used to actually write the nc files (one nc file by zone, the default zone is the full domain) struct outzoneB @@ -125,7 +142,7 @@ struct AdaptP - +template struct BndblockP { int nblkriver, nblkTs, nbndblkleft, nbndblkright, nbndblktop, nbndblkbot; @@ -140,12 +157,15 @@ struct BndblockP int* top; int* bot; - + RiverInfo Riverinfo; }; - +struct RiverBlk +{ + std::vector block; +}; @@ -208,7 +228,7 @@ struct Model AdaptP adapt; - BndblockP bndblk; + BndblockP bndblk; diff --git a/src/Boundary.cu b/src/Boundary.cu index b49382284..47b4945e7 100644 --- a/src/Boundary.cu +++ b/src/Boundary.cu @@ -95,6 +95,11 @@ template void FlowbndFlux(Param XParam, double totaltime, BlockP XB T taper=T(1.0); if (bndseg.on) { + if (XParam.bndtaper > 0.0) + { + taper = min((totaltime - XParam.inittime) / XParam.bndtaper, 1.0); + } + if (bndseg.uniform) { int SLstepinbnd = 1; @@ -110,10 +115,7 @@ template void FlowbndFlux(Param XParam, double totaltime, BlockP XB zsbnd = interptime(bndseg.data[SLstepinbnd].wspeed, bndseg.data[SLstepinbnd - 1].wspeed, bndseg.data[SLstepinbnd].time - bndseg.data[SLstepinbnd - 1].time, totaltime - bndseg.data[SLstepinbnd - 1].time); - if (XParam.bndtaper > 0.0) - { - taper = min(totaltime / XParam.bndtaper, 1.0); - } + } else { @@ -131,25 +133,25 @@ template void FlowbndFlux(Param XParam, double totaltime, BlockP XB //Left //template __global__ void bndFluxGPUSide(Param XParam, bndsegmentside side, BlockP XBlock, DynForcingP Atmp, DynForcingP Zsmap, bool uniform, float zsbnd, T * zs, T * h, T * un, T * ut, T * Fh, T * Fq, T * Ss) //bndFluxGPUSide <<< gridDimBBND, blockDim, 0 >>> (XParam, bndseg.left, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), XEv.zs, XEv.h, un, ut, Fh, Fq, S); - bndFluxGPUSide << < gridDimBBNDLeft, blockDim, 0 >> > (XParam, bndseg.left, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.u, XEv.v, XFlux.Fhu, XFlux.Fqux, XFlux.Su); + bndFluxGPUSide <<< gridDimBBNDLeft, blockDim, 0 >>> (XParam, bndseg.left, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.u, XEv.v, XFlux.Fhu, XFlux.Fqux, XFlux.Su); CUDA_CHECK(cudaDeviceSynchronize()); } //if 
(bndseg.right.nblk > 0) { //Right - bndFluxGPUSide << < gridDimBBNDRight, blockDim, 0 >> > (XParam, bndseg.right, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.u, XEv.v, XFlux.Fhu, XFlux.Fqux, XFlux.Su); + bndFluxGPUSide <<< gridDimBBNDRight, blockDim, 0 >>> (XParam, bndseg.right, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.u, XEv.v, XFlux.Fhu, XFlux.Fqux, XFlux.Su); CUDA_CHECK(cudaDeviceSynchronize()); } //if (bndseg.top.nblk > 0) { //top - bndFluxGPUSide << < gridDimBBNDTop, blockDim, 0 >> > (XParam, bndseg.top, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.v, XEv.u, XFlux.Fhv, XFlux.Fqvy, XFlux.Sv); + bndFluxGPUSide <<< gridDimBBNDTop, blockDim, 0 >>> (XParam, bndseg.top, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.v, XEv.u, XFlux.Fhv, XFlux.Fqvy, XFlux.Sv); CUDA_CHECK(cudaDeviceSynchronize()); } //if (bndseg.bot.nblk > 0) { //bot - bndFluxGPUSide << < gridDimBBNDBot, blockDim, 0 >> > (XParam, bndseg.bot, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.v, XEv.u, XFlux.Fhv, XFlux.Fqvy, XFlux.Sv); + bndFluxGPUSide <<< gridDimBBNDBot, blockDim, 0 >>> (XParam, bndseg.bot, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.v, XEv.u, XFlux.Fhv, XFlux.Fqvy, XFlux.Sv); CUDA_CHECK(cudaDeviceSynchronize()); } } @@ -219,7 +221,7 @@ template void FlowbndFluxold(Param XParam, double totaltime, BlockP if (XParam.GPUDEVICE >= 0) { - //bndFluxGPU << < gridDimBBND, blockDim, 0 >> > (XParam, side, XBlock, Atmp, float(itime), XEv.zs, XEv.h, un, ut, Fh, Fq, S); + //bndFluxGPU <<< gridDimBBND, blockDim, 0 >>> (XParam, side, XBlock, Atmp, float(itime), XEv.zs, XEv.h, un, ut, Fh, Fq, S); //CUDA_CHECK(cudaDeviceSynchronize()); } else @@ -290,9 +292,9 @@ template __global__ void bndFluxGPUSide(Param XParam, bndsegmentside s } - - + zsbnd = zsbnd + XParam.zsoffset; + int inside = Inside(halowidth, blkmemwidth, side.isright, side.istop, ix, iy, ib); @@ -443,6 +445,9 @@ template void bndFluxGPUSideCPU(Param XParam, bndsegmentside side, Blo zsbnd = interp2BUQ(XParam.xo + xx, XParam.yo + yy, Zsmap); } + + zsbnd = zsbnd + XParam.zsoffset; + int i = memloc(halowidth, blkmemwidth, ix, iy, ib); int inside = Inside(halowidth, blkmemwidth, side.isright, side.istop, ix, iy, ib); diff --git a/src/ConserveElevation.cu b/src/ConserveElevation.cu index 940630b15..a2ecd572a 100755 --- a/src/ConserveElevation.cu +++ b/src/ConserveElevation.cu @@ -31,13 +31,13 @@ template void conserveElevationGPU(Param XParam, BlockP XBlock, Evo dim3 gridDim(XParam.nblk, 1, 1); - conserveElevationLeft << > > (XParam, XBlock, XEv, zb); + conserveElevationLeft <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - conserveElevationRight << > > (XParam, XBlock, XEv, zb); + conserveElevationRight <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - conserveElevationTop << > > (XParam, XBlock, XEv, zb); + conserveElevationTop <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - conserveElevationBot << > > (XParam, XBlock, XEv, zb); + conserveElevationBot <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -329,13 +329,13 @@ template void WetDryProlongationGPU(Param XParam, BlockP XBlock, Ev //WetDryProlongationGPUBot - WetDryProlongationGPULeft << > > (XParam, XBlock, XEv, zb); + 
WetDryProlongationGPULeft <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetDryProlongationGPURight << > > (XParam, XBlock, XEv, zb); + WetDryProlongationGPURight <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetDryProlongationGPUTop << > > (XParam, XBlock, XEv, zb); + WetDryProlongationGPUTop <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetDryProlongationGPUBot << > > (XParam, XBlock, XEv, zb); + WetDryProlongationGPUBot <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -350,13 +350,13 @@ template void WetDryRestrictionGPU(Param XParam, BlockP XBlock, Evo //WetDryProlongationGPUBot - WetDryRestrictionGPULeft << > > (XParam, XBlock, XEv, zb); + WetDryRestrictionGPULeft <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetDryRestrictionGPURight << > > (XParam, XBlock, XEv, zb); + WetDryRestrictionGPURight <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetDryRestrictionGPUTop << > > (XParam, XBlock, XEv, zb); + WetDryRestrictionGPUTop <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetDryRestrictionGPUBot << > > (XParam, XBlock, XEv, zb); + WetDryRestrictionGPUBot <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); } diff --git a/src/FlowGPU.cu b/src/FlowGPU.cu index 55ff88197..ab2d03bd2 100644 --- a/src/FlowGPU.cu +++ b/src/FlowGPU.cu @@ -35,7 +35,7 @@ template void FlowGPU(Param XParam, Loop& XLoop, Forcing XFo //Calc dpdx and dpdy - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.Patm, XModel.datmpdx, XModel.datmpdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.Patm, XModel.datmpdx, XModel.datmpdy); CUDA_CHECK(cudaDeviceSynchronize()); @@ -45,7 +45,7 @@ template void FlowGPU(Param XParam, Loop& XLoop, Forcing XFo refine_linearGPU(XParam, XModel.blocks, XModel.Patm, XModel.datmpdx, XModel.datmpdy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.Patm, XModel.datmpdx, XModel.datmpdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.Patm, XModel.datmpdx, XModel.datmpdy); CUDA_CHECK(cudaDeviceSynchronize()); gradientHaloGPU(XParam, XModel.blocks, XModel.Patm, XModel.datmpdx, XModel.datmpdy); @@ -83,15 +83,15 @@ template void FlowGPU(Param XParam, Loop& XLoop, Forcing XFo if (XParam.engine == 1) { // X- direction - UpdateButtingerXGPU << < gridDim, blockDimKX, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); + UpdateButtingerXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); //updateKurgXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceXGPU <<< gridDim, blockDimKX, 0, XLoop.streams[0] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); CUDA_CHECK(cudaDeviceSynchronize()); // Y- direction - UpdateButtingerYGPU << < gridDim, blockDimKY, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); + UpdateButtingerYGPU <<< gridDim, 
blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); //updateKurgYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceYGPU <<< gridDim, blockDimKY, 0, XLoop.streams[1] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); - // //updateKurgY << < XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >> > (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); + // //updateKurgY <<< XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >>> (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -104,20 +104,20 @@ template void FlowGPU(Param XParam, Loop& XLoop, Forcing XFo // Y- direction updateKurgYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceYGPU <<< gridDim, blockDimKY, 0, XLoop.streams[1] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); - // //updateKurgY << < XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >> > (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); + // //updateKurgY <<< XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >>> (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); CUDA_CHECK(cudaDeviceSynchronize()); } else if (XParam.engine == 3) { // - updateKurgXATMGPU << < gridDim, blockDimKX, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdx); + updateKurgXATMGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdx); // //AddSlopeSourceXGPU <<< gridDim, blockDimKX, 0, XLoop.streams[0] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); CUDA_CHECK(cudaDeviceSynchronize()); // Y- direction - updateKurgYATMGPU << < gridDim, blockDimKY, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdy); + updateKurgYATMGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdy); // //AddSlopeSourceYGPU <<< gridDim, blockDimKY, 0, XLoop.streams[1] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); - // //updateKurgY << < XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >> > (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); + // //updateKurgY <<< XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >>> (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -185,12 +185,12 @@ template void FlowGPU(Param XParam, Loop& XLoop, Forcing XFo if (XParam.engine == 1) { // X- direction - UpdateButtingerXGPU << < gridDim, blockDimKX, 0 >> > (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); + UpdateButtingerXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); //updateKurgXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, 
XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.zb); CUDA_CHECK(cudaDeviceSynchronize()); // Y- direction - UpdateButtingerYGPU << < gridDim, blockDimKY, 0 >> > (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); + UpdateButtingerYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); //updateKurgYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.zb); CUDA_CHECK(cudaDeviceSynchronize()); @@ -210,11 +210,11 @@ template void FlowGPU(Param XParam, Loop& XLoop, Forcing XFo { // // - updateKurgXATMGPU << < gridDim, blockDimKX, 0 >> > (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdx); + updateKurgXATMGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdx); CUDA_CHECK(cudaDeviceSynchronize()); // Y- direction - updateKurgYATMGPU << < gridDim, blockDimKY, 0 >> > (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdy); + updateKurgYATMGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdy); CUDA_CHECK(cudaDeviceSynchronize()); } //============================================ @@ -273,19 +273,19 @@ template void FlowGPU(Param XParam, Loop& XLoop, Forcing XFo if (!XForcing.Rain.inputfile.empty()) { - AddrainforcingImplicitGPU << < gridDim, blockDim, 0 >> > (XParam,XLoop, XModel.blocks, XForcing.Rain, XModel.evolv); + AddrainforcingImplicitGPU <<< gridDim, blockDim, 0 >>> (XParam,XLoop, XModel.blocks, XForcing.Rain, XModel.evolv); CUDA_CHECK(cudaDeviceSynchronize()); } if (XParam.infiltration) { - AddinfiltrationImplicitGPU << < gridDim, blockDim, 0 >> > (XParam, XLoop, XModel.blocks, XModel.il, XModel.cl, XModel.evolv, XModel.hgw); + AddinfiltrationImplicitGPU <<< gridDim, blockDim, 0 >>> (XParam, XLoop, XModel.blocks, XModel.il, XModel.cl, XModel.evolv, XModel.hgw); CUDA_CHECK(cudaDeviceSynchronize()); } if (XParam.VelThreshold > 0.0) { - TheresholdVelGPU << < gridDim, blockDim, 0 >> > (XParam, XModel.blocks, XModel.evolv); + TheresholdVelGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.evolv); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -328,7 +328,7 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing if (XParam.atmpforcing) { //Update atm press forcing - AddPatmforcingGPU << < gridDim, blockDim, 0 >> > (XParam, XModel.blocks, XForcing.Atmp, XModel); + AddPatmforcingGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XForcing.Atmp, XModel); CUDA_CHECK(cudaDeviceSynchronize()); //Fill atmp halo @@ -339,7 +339,7 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing cudaStreamDestroy(atmpstreams[0]); //Calc dpdx and dpdy - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.Patm, XModel.datmpdx, XModel.datmpdy); + gradient <<< gridDim, 
blockDim, 0 >>> (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.Patm, XModel.datmpdx, XModel.datmpdy); CUDA_CHECK(cudaDeviceSynchronize()); gradientHaloGPU(XParam, XModel.blocks, XModel.Patm, XModel.datmpdx, XModel.datmpdy); // @@ -347,7 +347,7 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing refine_linearGPU(XParam, XModel.blocks, XModel.Patm, XModel.datmpdx, XModel.datmpdy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.Patm, XModel.datmpdx, XModel.datmpdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.Patm, XModel.datmpdx, XModel.datmpdy); CUDA_CHECK(cudaDeviceSynchronize()); gradientHaloGPU(XParam, XModel.blocks, XModel.Patm, XModel.datmpdx, XModel.datmpdy); @@ -367,7 +367,7 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing //============================================ // Reset DTmax - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel.blocks.active, XLoop.hugeposval, XModel.time.dtmax); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel.blocks.active, XLoop.hugeposval, XModel.time.dtmax); CUDA_CHECK(cudaDeviceSynchronize()); //============================================ @@ -385,41 +385,41 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing if (XParam.engine == 1) { // X- direction - UpdateButtingerXGPU << < gridDim, blockDimKX, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); + UpdateButtingerXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); //updateKurgXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceXGPU <<< gridDim, blockDimKX, 0, XLoop.streams[0] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); CUDA_CHECK(cudaDeviceSynchronize()); // Y- direction - UpdateButtingerYGPU << < gridDim, blockDimKY, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); + UpdateButtingerYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); //updateKurgYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceYGPU <<< gridDim, blockDimKY, 0, XLoop.streams[1] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); - // //updateKurgY << < XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >> > (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); + // //updateKurgY <<< XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >>> (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); CUDA_CHECK(cudaDeviceSynchronize()); } else if (XParam.engine == 2) { // X- direction - updateKurgXGPU << < gridDim, blockDimKX, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); + updateKurgXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceXGPU <<< gridDim, blockDimKX, 0, 
XLoop.streams[0] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); CUDA_CHECK(cudaDeviceSynchronize()); // Y- direction - updateKurgYGPU << < gridDim, blockDimKY, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); + updateKurgYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceYGPU <<< gridDim, blockDimKY, 0, XLoop.streams[1] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); - // //updateKurgY << < XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >> > (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); + // //updateKurgY <<< XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >>> (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); CUDA_CHECK(cudaDeviceSynchronize()); } else if (XParam.engine == 3) { // - updateKurgXATMGPU << < gridDim, blockDimKX, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdx); + updateKurgXATMGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdx); // //AddSlopeSourceXGPU <<< gridDim, blockDimKX, 0, XLoop.streams[0] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); CUDA_CHECK(cudaDeviceSynchronize()); // Y- direction - updateKurgYATMGPU << < gridDim, blockDimKY, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdy); + updateKurgYATMGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdy); // //AddSlopeSourceYGPU <<< gridDim, blockDimKY, 0, XLoop.streams[1] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); - // //updateKurgY << < XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >> > (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); + // //updateKurgY <<< XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >>> (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -437,7 +437,7 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing //============================================ // Update advection terms (dh dhu dhv) - updateEVGPU << < gridDim, blockDim, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.flux, XModel.adv); + updateEVGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.flux, XModel.adv); CUDA_CHECK(cudaDeviceSynchronize()); //============================================ @@ -448,7 +448,7 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing //} if (!XForcing.UWind.inputfile.empty())//&& !XForcing.UWind.inputfile.empty() { - AddwindforcingGPU << < gridDim, blockDim, 0 >> > (XParam, XModel.blocks, XForcing.UWind, XForcing.VWind, XModel.adv); + AddwindforcingGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XForcing.UWind, XForcing.VWind, XModel.adv); CUDA_CHECK(cudaDeviceSynchronize()); } if (XForcing.rivers.size() > 0) @@ -458,13 +458,13 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing //============================================ //Update evolving 
variable by 1/2 time step - AdvkernelGPU << < gridDim, blockDim, 0 >> > (XParam, XModel.blocks, XModel.time.dt * T(0.5), XModel.zb, XModel.evolv, XModel.adv, XModel.evolv_o); + AdvkernelGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.time.dt * T(0.5), XModel.zb, XModel.evolv, XModel.adv, XModel.evolv_o); CUDA_CHECK(cudaDeviceSynchronize()); //============================================ // Add bottom friction - bottomfrictionGPU << < gridDim, blockDim, 0 >> > (XParam, XModel.blocks, XModel.time.dt, XModel.cf, XModel.evolv_o); + bottomfrictionGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.time.dt, XModel.cf, XModel.evolv_o); //XiafrictionGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.time.dt, XModel.cf, XModel.evolv, XModel.evolv_o); @@ -472,18 +472,18 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing //============================================ //Copy updated evolving variable back - cleanupGPU << < gridDim, blockDim, 0 >> > (XParam, XModel.blocks, XModel.evolv_o, XModel.evolv); + cleanupGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.evolv_o, XModel.evolv); CUDA_CHECK(cudaDeviceSynchronize()); if (!XForcing.Rain.inputfile.empty()) { - AddrainforcingImplicitGPU << < gridDim, blockDim, 0 >> > (XParam, XLoop, XModel.blocks, XForcing.Rain, XModel.evolv); + AddrainforcingImplicitGPU <<< gridDim, blockDim, 0 >>> (XParam, XLoop, XModel.blocks, XForcing.Rain, XModel.evolv); CUDA_CHECK(cudaDeviceSynchronize()); } if (XParam.VelThreshold > 0.0) { - TheresholdVelGPU << < gridDim, blockDim, 0 >> > (XParam, XModel.blocks, XModel.evolv); + TheresholdVelGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.evolv); CUDA_CHECK(cudaDeviceSynchronize()); } diff --git a/src/Gradients.cu b/src/Gradients.cu index 40610ff11..65dbce32a 100644 --- a/src/Gradients.cu +++ b/src/Gradients.cu @@ -25,20 +25,20 @@ template void gradientGPU(Param XParam, BlockPXBlock, EvolvingP dim3 gridDim(XParam.nblk, 1, 1); - //gradient << < gridDim, blockDim, 0, streams[1] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx, XGrad.dhdy); + //gradient <<< gridDim, blockDim, 0, streams[1] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx, XGrad.dhdy); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[2] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); + //gradient <<< gridDim, blockDim, 0, streams[2] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[3] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); - gradient 
<< < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx, XGrad.dudy); + //gradient <<< gridDim, blockDim, 0, streams[3] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx, XGrad.dudy); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[0] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx, XGrad.dvdy); + //gradient <<< gridDim, blockDim, 0, streams[0] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx, XGrad.dvdy); CUDA_CHECK(cudaDeviceSynchronize()); @@ -77,67 +77,67 @@ template void gradientGPU(Param XParam, BlockPXBlock, EvolvingP conserveElevationGPU(XParam, XBlock, XEv, zb); } - RecalculateZsGPU << < gridDim, blockDimfull, 0 >> > (XParam, XBlock, XEv, zb); + RecalculateZsGPU <<< gridDim, blockDimfull, 0 >>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[1] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx, XGrad.dhdy); + //gradient <<< gridDim, blockDim, 0, streams[1] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx, XGrad.dhdy); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[2] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); + //gradient <<< gridDim, blockDim, 0, streams[2] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[3] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx, XGrad.dudy); + //gradient <<< gridDim, blockDim, 0, streams[3] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx, XGrad.dudy); 
CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[0] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx, XGrad.dvdy); + //gradient <<< gridDim, blockDim, 0, streams[0] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx, XGrad.dvdy); CUDA_CHECK(cudaDeviceSynchronize()); /* - gradientedgeX << < gridDim, blockDimLR2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx); + gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeX << < gridDim, blockDimLR, 0 >> > (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx); + //gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeY << < gridDim, blockDimBT2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdy); + gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeY << < gridDim, blockDimBT, 0>> > (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdy); + //gradientedgeY <<< gridDim, blockDimBT, 0>>> (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdy); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeX << < gridDim, blockDimLR2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx); + gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeX << < gridDim, blockDimLR, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx); + //gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeY << < gridDim, blockDimBT2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdy); + gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeY << < gridDim, blockDimBT, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdy); + //gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, 
XGrad.dzsdy); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeX << < gridDim, blockDimLR2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx); + gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeX << < gridDim, blockDimLR, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx); + //gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeY << < gridDim, blockDimBT2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudy); + gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeY << < gridDim, blockDimBT, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudy); + //gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudy); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeX << < gridDim, blockDimLR2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx); + gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeX << < gridDim, blockDimLR, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx); + //gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeY << < gridDim, blockDimBT2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdy); + gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeY << < gridDim, blockDimBT, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdy); + //gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdy); CUDA_CHECK(cudaDeviceSynchronize()); */ @@ -154,23 +154,23 @@ template void gradientGPU(Param XParam, BlockPXBlock, EvolvingP if (XParam.engine == 1) { // wet slope limiter - WetsloperesetXGPU << < gridDim, blockDim, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetXGPU <<< gridDim, blockDim, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetsloperesetYGPU << < gridDim, blockDim, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetYGPU <<< gridDim, blockDim, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); // ALso do the slope limiter on the halo - WetsloperesetHaloLeftGPU << < 
gridDim, blockDimLR, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetHaloLeftGPU <<< gridDim, blockDimLR, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetsloperesetHaloRightGPU << < gridDim, blockDimLR, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetHaloRightGPU <<< gridDim, blockDimLR, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetsloperesetHaloBotGPU << < gridDim, blockDimBT, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetHaloBotGPU <<< gridDim, blockDimBT, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetsloperesetHaloTopGPU << < gridDim, blockDimBT, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetHaloTopGPU <<< gridDim, blockDimBT, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -205,20 +205,20 @@ template void gradientGPUnew(Param XParam, BlockPXBlock, EvolvingP< CUDA_CHECK(cudaStreamCreate(&streams[i])); } - //gradient << < gridDim, blockDim, 0, streams[1] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); - gradientSMC << < gridDim, blockDim, 0, streams[0] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx, XGrad.dhdy); + //gradient <<< gridDim, blockDim, 0, streams[1] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); + gradientSMC <<< gridDim, blockDim, 0, streams[0] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx, XGrad.dhdy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[2] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); - gradientSMC << < gridDim, blockDim, 0, streams[1] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); + //gradient <<< gridDim, blockDim, 0, streams[2] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); + gradientSMC <<< gridDim, blockDim, 0, streams[1] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[3] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); - gradientSMC << < gridDim, blockDim, 0, streams[2] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx, XGrad.dudy); + //gradient <<< gridDim, blockDim, 0, streams[3] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); + gradientSMC <<< gridDim, blockDim, 0, streams[2] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx, XGrad.dudy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[0] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); - gradientSMC << < gridDim, blockDim, 0, streams[3] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx, XGrad.dvdy); + //gradient <<< gridDim, blockDim, 0, streams[0] >>> (XParam.halowidth, 
XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); + gradientSMC <<< gridDim, blockDim, 0, streams[3] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx, XGrad.dvdy); //CUDA_CHECK(cudaDeviceSynchronize()); @@ -259,24 +259,24 @@ template void gradientGPUnew(Param XParam, BlockPXBlock, EvolvingP< WetDryProlongationGPU(XParam, XBlock, XEv, zb); } - RecalculateZsGPU << < gridDim, blockDimfull, 0 >> > (XParam, XBlock, XEv, zb); + RecalculateZsGPU <<< gridDim, blockDimfull, 0 >>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); /* - //gradient << < gridDim, blockDim, 0, streams[1] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); - gradientSM << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); + //gradient <<< gridDim, blockDim, 0, streams[1] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); + gradientSM <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[2] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); - gradientSM << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); + //gradient <<< gridDim, blockDim, 0, streams[2] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); + gradientSM <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[3] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); - gradientSM << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); + //gradient <<< gridDim, blockDim, 0, streams[3] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); + gradientSM <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[0] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); - gradientSM << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); + //gradient <<< gridDim, blockDim, 0, streams[0] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); + gradientSM <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); CUDA_CHECK(cudaDeviceSynchronize()); */ /* @@ -292,44 +292,44 @@ template void gradientGPUnew(Param XParam, BlockPXBlock, EvolvingP< - gradientedgeX << < gridDim, blockDimLR2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, 
(T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx); + gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeX << < gridDim, blockDimLR, 0 >> > (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx); + //gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeY << < gridDim, blockDimBT2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdy); + gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeY << < gridDim, blockDimBT, 0>> > (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdy); + //gradientedgeY <<< gridDim, blockDimBT, 0>>> (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdy); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeX << < gridDim, blockDimLR2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx); + gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeX << < gridDim, blockDimLR, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx); + //gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeY << < gridDim, blockDimBT2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdy); + gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeY << < gridDim, blockDimBT, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdy); + //gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdy); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeX << < gridDim, blockDimLR2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx); + gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeX << < gridDim, blockDimLR, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx); + //gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeY << < gridDim, blockDimBT2, 0 
>> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudy); + gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeY << < gridDim, blockDimBT, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudy); + //gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudy); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeX << < gridDim, blockDimLR2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx); + gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeX << < gridDim, blockDimLR, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx); + //gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeY << < gridDim, blockDimBT2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdy); + gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeY << < gridDim, blockDimBT, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdy); + //gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdy); CUDA_CHECK(cudaDeviceSynchronize()); /* @@ -351,23 +351,23 @@ template void gradientGPUnew(Param XParam, BlockPXBlock, EvolvingP< if (XParam.engine == 1) { // wet slope limiter - WetsloperesetXGPU << < gridDim, blockDim, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetXGPU <<< gridDim, blockDim, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetsloperesetYGPU << < gridDim, blockDim, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetYGPU <<< gridDim, blockDim, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); // ALso do the slope limiter on the halo - WetsloperesetHaloLeftGPU << < gridDim, blockDimLR, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetHaloLeftGPU <<< gridDim, blockDimLR, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetsloperesetHaloRightGPU << < gridDim, blockDimLR, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetHaloRightGPU <<< gridDim, blockDimLR, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetsloperesetHaloBotGPU << < gridDim, blockDimBT, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetHaloBotGPU <<< gridDim, blockDimBT, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetsloperesetHaloTopGPU << < gridDim, blockDimBT, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetHaloTopGPU <<< gridDim, blockDimBT, 0 >>> (XParam, XBlock, XEv, XGrad, 
zb); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -2358,16 +2358,16 @@ template void gradientHaloGPU(Param XParam, BlockPXBlock, T* a, T* dim3 gridDim(XParam.nblk, 1, 1); - gradientHaloLeftGPU << < gridDim, blockDimL, 0 >> > (XParam, XBlock, a, dadx, dady); + gradientHaloLeftGPU <<< gridDim, blockDimL, 0 >>> (XParam, XBlock, a, dadx, dady); CUDA_CHECK(cudaDeviceSynchronize()); - gradientHaloRightGPU << < gridDim, blockDimL, 0 >> > (XParam, XBlock, a, dadx, dady); + gradientHaloRightGPU <<< gridDim, blockDimL, 0 >>> (XParam, XBlock, a, dadx, dady); CUDA_CHECK(cudaDeviceSynchronize()); - gradientHaloBotGPU << < gridDim, blockDimB, 0 >> > (XParam, XBlock, a, dadx, dady); + gradientHaloBotGPU <<< gridDim, blockDimB, 0 >>> (XParam, XBlock, a, dadx, dady); CUDA_CHECK(cudaDeviceSynchronize()); - gradientHaloTopGPU << < gridDim, blockDimB, 0 >> > (XParam, XBlock, a, dadx, dady); + gradientHaloTopGPU <<< gridDim, blockDimB, 0 >>> (XParam, XBlock, a, dadx, dady); CUDA_CHECK(cudaDeviceSynchronize()); @@ -2389,16 +2389,16 @@ template void gradientHaloGPUnew(Param XParam, BlockPXBlock, T* a, dim3 gridDim(ceil(XParam.nblk/2), 1, 1); - gradientHaloLeftGPUnew << < gridDim, blockDimL, 0, streams[0] >> > (XParam, XBlock, a, dadx, dady); + gradientHaloLeftGPUnew <<< gridDim, blockDimL, 0, streams[0] >>> (XParam, XBlock, a, dadx, dady); - gradientHaloRightGPUnew << < gridDim, blockDimL, 0, streams[1] >> > (XParam, XBlock, a, dadx, dady); + gradientHaloRightGPUnew <<< gridDim, blockDimL, 0, streams[1] >>> (XParam, XBlock, a, dadx, dady); - gradientHaloBotGPUnew << < gridDim, blockDimB, 0, streams[2] >> > (XParam, XBlock, a, dadx, dady); + gradientHaloBotGPUnew <<< gridDim, blockDimB, 0, streams[2] >>> (XParam, XBlock, a, dadx, dady); - gradientHaloTopGPUnew << < gridDim, blockDimB, 0, streams[3] >> > (XParam, XBlock, a, dadx, dady); + gradientHaloTopGPUnew <<< gridDim, blockDimB, 0, streams[3] >>> (XParam, XBlock, a, dadx, dady); //CUDA_CHECK(cudaDeviceSynchronize()); diff --git a/src/GridManip.cu b/src/GridManip.cu index 050acfb3c..dc6180b92 100755 --- a/src/GridManip.cu +++ b/src/GridManip.cu @@ -453,7 +453,7 @@ template void InterpstepCPU(int nx, int ny, int hdstep, double to template void InterpstepCPU(int nx, int ny, int hdstep, double totaltime, double hddt, double*& Ux, double* Uo, double* Un); -template __global__ void InterpstepGPU(int nx, int ny, int hdstp, T totaltime, T hddt, T*Ux, T* Uo, T* Un) +template __global__ void InterpstepGPU(int nx, int ny, T totaltime, T beforetime, T aftertime, T*Ux, T* Uo, T* Un) { unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x; unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y; @@ -464,19 +464,21 @@ template __global__ void InterpstepGPU(int nx, int ny, int hdstp, T to __shared__ T Uxn[16][16]; // __shared__ float Ums[16]; - + T hddt = aftertime - beforetime; if (ix < nx && iy < ny) { Uxo[tx][ty] = Uo[ix + nx * iy]/**Ums[tx]*/; Uxn[tx][ty] = Un[ix + nx * iy]/**Ums[tx]*/; - Ux[ix + nx * iy] = Uxo[tx][ty] + (totaltime - hddt * hdstp) * (Uxn[tx][ty] - Uxo[tx][ty]) / hddt; + Ux[ix + nx * iy] = Uxo[tx][ty] + (totaltime - beforetime) * (Uxn[tx][ty] - Uxo[tx][ty]) / (hddt); } } //template __global__ void InterpstepGPU(int nx, int ny, int hdstp, T totaltime, T hddt, T* Ux, T* Uo, T* Un); -template __global__ void InterpstepGPU(int nx, int ny, int hdstp, float totaltime, float hddt, float* Ux, float* Uo, float* Un); -template __global__ void InterpstepGPU(int nx, int ny, int hdstp, double totaltime, double hddt, double* Ux, double* Uo, double* Un); +template 
__global__ void InterpstepGPU(int nx, int ny, float totaltime, float beforetime, float aftertime, float* Ux, float* Uo, float* Un); +template __global__ void InterpstepGPU(int nx, int ny, double totaltime, double beforetime, double aftertime, double* Ux, double* Uo, double* Un); + + template void Copy2CartCPU(int nx, int ny, T* dest, T* src) { diff --git a/src/GridManip.h b/src/GridManip.h index 4ee385ad4..0bd725fdf 100755 --- a/src/GridManip.h +++ b/src/GridManip.h @@ -23,7 +23,8 @@ template T interp2BUQ(T x, T y, F forcing); template T interp2BUQ(T x, T y, T dx, F forcing); template void InterpstepCPU(int nx, int ny, int hdstep, F totaltime, F hddt, T*& Ux, T* Uo, T* Un); -template __global__ void InterpstepGPU(int nx, int ny, int hdstp, T totaltime, T hddt, T* Ux, T* Uo, T* Un); +//template __global__ void InterpstepGPU(int nx, int ny, int hdstp, T totaltime, T hddt, T* Ux, T* Uo, T* Un); +template __global__ void InterpstepGPU(int nx, int ny, T totaltime, T beforetime, T aftertime, T* Ux, T* Uo, T* Un); template void Copy2CartCPU(int nx, int ny, T* dest, T* src); diff --git a/src/Halo.cu b/src/Halo.cu index a867ff832..93ea8aeef 100644 --- a/src/Halo.cu +++ b/src/Halo.cu @@ -214,17 +214,17 @@ template void fillHaloGPU(Param XParam, BlockP XBlock, cudaStream_t dim3 blockDimHaloBT(XParam.blkwidth, 1, 1); dim3 gridDim(XParam.nblk, 1, 1); - fillLeft << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z); - //fillLeft << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z); + fillLeft <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z); + //fillLeft <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z); CUDA_CHECK(cudaDeviceSynchronize()); - fillRight << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); - //fillRight << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); + fillRight <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); + //fillRight <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); CUDA_CHECK(cudaDeviceSynchronize()); - fillBot << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z); - //fillBot << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z); + fillBot <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z); + //fillBot <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z); CUDA_CHECK(cudaDeviceSynchronize()); - fillTop << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); - //fillTop << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, 
XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); + fillTop <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); + //fillTop <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); CUDA_CHECK(cudaDeviceSynchronize()); //CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -244,17 +244,17 @@ template void fillHaloGPUnew(Param XParam, BlockP XBlock, cudaStrea dim3 blockDimHaloBTx2(XParam.blkwidth, 2, 1); dim3 gridDimx2(ceil(XParam.nblk/2), 1, 1); - //fillLeftnew << > > (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z); - fillLeft << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z); + //fillLeftnew <<>> (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z); + fillLeft <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z); CUDA_CHECK(cudaDeviceSynchronize()); - //fillRightnew << > > (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); - fillRight << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); + //fillRightnew <<>> (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); + fillRight <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); CUDA_CHECK(cudaDeviceSynchronize()); - //fillBotnew << > > (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z); - fillBot << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z); + //fillBotnew <<>> (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z); + fillBot <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z); CUDA_CHECK(cudaDeviceSynchronize()); - //fillTopnew << > > (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); - fillTop << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); + //fillTopnew <<>> (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); + fillTop <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); CUDA_CHECK(cudaDeviceSynchronize()); //CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -350,13 +350,13 @@ template void fillHaloTopRightGPU(Param XParam, BlockP XBlock, cuda dim3 gridDim(XParam.nblk, 1, 1); - //fillLeft << > > (XParam.halowidth, XBlock.active, 
XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, a); - //fillRightFlux << > > (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); - HaloFluxGPULR << > > (XParam, XBlock, z); + //fillLeft <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, a); + //fillRightFlux <<>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); + HaloFluxGPULR <<>> (XParam, XBlock, z); - //fillBot << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, a); - //fillTopFlux << > > (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); - HaloFluxGPUBT << > > (XParam, XBlock, z); + //fillBot <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, a); + //fillTopFlux <<>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); + HaloFluxGPUBT <<>> (XParam, XBlock, z); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -371,13 +371,13 @@ template void fillHaloLeftRightGPU(Param XParam, BlockP XBlock, cud dim3 gridDim(XParam.nblk, 1, 1); - //fillLeft << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, a); - //fillRightFlux << > > (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); - HaloFluxGPULR << > > (XParam, XBlock, z); + //fillLeft <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, a); + //fillRightFlux <<>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); + HaloFluxGPULR <<>> (XParam, XBlock, z); - //fillBot << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, a); - //fillTopFlux << > > (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); - //HaloFluxGPUBT << > > (XParam, XBlock, z); + //fillBot <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, a); + //fillTopFlux <<>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); + //HaloFluxGPUBT <<>> (XParam, XBlock, z); //CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -391,7 +391,7 @@ template void fillHaloLeftRightGPUnew(Param XParam, BlockP XBlock, //dim3 blockDimHaloBT(16, 1, 1); dim3 gridDim(ceil(XParam.nblk/2), 1, 1); - HaloFluxGPULRnew << > > (XParam, XBlock, z); + HaloFluxGPULRnew <<>> (XParam, XBlock, z); @@ -407,13 +407,13 @@ template void fillHaloBotTopGPU(Param XParam, BlockP XBlock, cudaSt dim3 gridDim(XParam.nblk, 1, 1); - //fillLeft << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, 
XBlock.TopRight, a); - //fillRightFlux << > > (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); - //HaloFluxGPULR << > > (XParam, XBlock, z); + //fillLeft <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, a); + //fillRightFlux <<>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); + //HaloFluxGPULR <<>> (XParam, XBlock, z); - //fillBot << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, a); - //fillTopFlux << > > (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); - HaloFluxGPUBT << > > (XParam, XBlock, z); + //fillBot <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, a); + //fillTopFlux <<>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); + HaloFluxGPUBT <<>> (XParam, XBlock, z); //CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -428,7 +428,7 @@ template void fillHaloBotTopGPUnew(Param XParam, BlockP XBlock, cud dim3 gridDim(ceil(XParam.nblk/2), 1, 1); - HaloFluxGPUBTnew << > > (XParam, XBlock, z); + HaloFluxGPUBTnew <<>> (XParam, XBlock, z); } @@ -549,15 +549,15 @@ template void fillHaloGPU(Param XParam, BlockP XBlock, EvolvingP WetDryRestrictionGPU(XParam, XBlock, Xev, zb); } - RecalculateZsGPU << < gridDimfull, blockDimfull, 0 >> > (XParam, XBlock, Xev, zb); + RecalculateZsGPU <<< gridDimfull, blockDimfull, 0 >>> (XParam, XBlock, Xev, zb); CUDA_CHECK(cudaDeviceSynchronize()); //if (XBlock.mask.nblk > 0) //{ - // maskbndGPUleft << > > (XParam, XBlock, Xev, zb); - // maskbndGPUtop << > > (XParam, XBlock, Xev, zb); - // maskbndGPUright << > > (XParam, XBlock, Xev, zb); - // maskbndGPUtop << > > (XParam, XBlock, Xev, zb); + // maskbndGPUleft <<>> (XParam, XBlock, Xev, zb); + // maskbndGPUtop <<>> (XParam, XBlock, Xev, zb); + // maskbndGPUright <<>> (XParam, XBlock, Xev, zb); + // maskbndGPUtop <<>> (XParam, XBlock, Xev, zb); // //CUDA_CHECK(cudaDeviceSynchronize()); //} @@ -716,10 +716,10 @@ template void fillHaloGPU(Param XParam, BlockP XBlock, FluxP Flu // Below has now moved to its own function //if (XBlock.mask.nblk > 0) //{ - // maskbndGPUFluxleft << > > (XParam, XBlock, Flux); - // maskbndGPUFluxtop << > > (XParam, XBlock, Flux); - // maskbndGPUFluxright << > > (XParam, XBlock, Flux); - // maskbndGPUFluxbot << > > (XParam, XBlock, Flux); + // maskbndGPUFluxleft <<>> (XParam, XBlock, Flux); + // maskbndGPUFluxtop <<>> (XParam, XBlock, Flux); + // maskbndGPUFluxright <<>> (XParam, XBlock, Flux); + // maskbndGPUFluxbot <<>> (XParam, XBlock, Flux); // //CUDA_CHECK(cudaDeviceSynchronize()); //} @@ -750,10 +750,10 @@ template void bndmaskGPU(Param XParam, BlockP XBlock, EvolvingP dim3 gridDim(XBlock.mask.nblk, 1, 1); if (XBlock.mask.nblk > 0) { - maskbndGPUFluxleft << > > (XParam, XBlock, Xev, Flux); - maskbndGPUFluxtop << > > (XParam, XBlock, Flux); - maskbndGPUFluxright << > > (XParam, XBlock, Flux); - maskbndGPUFluxbot << > > (XParam, XBlock, Flux); + maskbndGPUFluxleft <<>> (XParam, XBlock, Xev, Flux); + maskbndGPUFluxtop <<>> (XParam, XBlock, Flux); + maskbndGPUFluxright <<>> 
(XParam, XBlock, Flux); + maskbndGPUFluxbot <<>> (XParam, XBlock, Flux); //CUDA_CHECK(cudaDeviceSynchronize()); } @@ -1071,9 +1071,9 @@ template void refine_linearGPU(Param XParam, BlockP XBlock, T* z, T dim3 gridDim(XParam.nblk, 1, 1); refine_linear_LeftGPU<<>>(XParam, XBlock, z, dzdx, dzdy); - refine_linear_RightGPU << > > (XParam, XBlock, z, dzdx, dzdy); - refine_linear_TopGPU << > > (XParam, XBlock, z, dzdx, dzdy); - refine_linear_BotGPU << > > (XParam, XBlock, z, dzdx, dzdy); + refine_linear_RightGPU <<>> (XParam, XBlock, z, dzdx, dzdy); + refine_linear_TopGPU <<>> (XParam, XBlock, z, dzdx, dzdy); + refine_linear_BotGPU <<>> (XParam, XBlock, z, dzdx, dzdy); CUDA_CHECK(cudaDeviceSynchronize()); } diff --git a/src/InitEvolv.cu b/src/InitEvolv.cu index fe85c5a8a..3dac1b64b 100755 --- a/src/InitEvolv.cu +++ b/src/InitEvolv.cu @@ -30,7 +30,7 @@ template void initevolv(Param XParam, BlockP XBlock,Forcing hotstartsucess = readhotstartfile(XParam, XBlock, XEv, zb); //add offset if present - if (!std::isnan(XParam.zsoffset)) // apply specified zsoffset + if (T(XParam.zsoffset) != T(0.0)) // apply specified zsoffset { printf("\t\tadd offset to zs and hh... "); // @@ -104,7 +104,7 @@ template int coldstart(Param XParam, BlockP XBlock, T* zb, EvolvingP & XEv) { T zzini = std::isnan(XParam.zsinit)? T(0.0): T(XParam.zsinit); - T zzoffset = std::isnan(XParam.zsoffset) ? T(0.0) : T(XParam.zsoffset); + T zzoffset = T(XParam.zsoffset); diff --git a/src/InitialConditions.cu b/src/InitialConditions.cu index ca65a2bc8..21da1f436 100644 --- a/src/InitialConditions.cu +++ b/src/InitialConditions.cu @@ -144,14 +144,14 @@ template void InitzbgradientGPU(Param XParam, Model XModel) cudaStreamDestroy(streams[0]); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.zb, XModel.grad.dzbdx, XModel.grad.dzbdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.zb, XModel.grad.dzbdx, XModel.grad.dzbdy); CUDA_CHECK(cudaDeviceSynchronize()); gradientHaloGPU(XParam, XModel.blocks, XModel.zb, XModel.grad.dzbdx, XModel.grad.dzbdy); refine_linearGPU(XParam, XModel.blocks, XModel.zb, XModel.grad.dzbdx, XModel.grad.dzbdy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.zb, XModel.grad.dzbdx, XModel.grad.dzbdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.zb, XModel.grad.dzbdx, XModel.grad.dzbdy); CUDA_CHECK(cudaDeviceSynchronize()); gradientHaloGPU(XParam, XModel.blocks, XModel.zb, XModel.grad.dzbdx, XModel.grad.dzbdy); @@ -216,7 +216,7 @@ void InitTSOutput(Param XParam) } } -template void FindTSoutNodes(Param& XParam, BlockP XBlock, BndblockP & bnd) +template void FindTSoutNodes(Param& XParam, BlockP XBlock, BndblockP & bnd) { int ib; T levdx,x,y,blkxmin,blkxmax,blkymin,blkymax,dxblk; @@ -263,8 +263,8 @@ template void FindTSoutNodes(Param& XParam, BlockP XBlock, Bndblock } -template void FindTSoutNodes(Param& XParam, BlockP XBlock, BndblockP& bnd); -template void FindTSoutNodes(Param& XParam, BlockP XBlock, BndblockP& bnd); +template void FindTSoutNodes(Param& XParam, BlockP XBlock, BndblockP& bnd); +template void FindTSoutNodes(Param& XParam, BlockP XBlock, BndblockP& bnd); @@ -336,7 +336,7 @@ template void InitRivers(Param XParam, Forcing 
&XForcing, Model XForcing.rivers[Rin].i = idis; XForcing.rivers[Rin].j = jdis; XForcing.rivers[Rin].block = blockdis; - XForcing.rivers[Rin].disarea = dischargeArea; // That is not valid for spherical grids + XForcing.rivers[Rin].disarea = dischargeArea; // That is valid for spherical grids @@ -352,8 +352,7 @@ template void InitRivers(Param XParam, Forcing &XForcing, Model } } - - + //Now identify sort and unique blocks where rivers are being inserted std::vector activeRiverBlk; @@ -376,6 +375,135 @@ template void InitRivers(Param XParam, Forcing &XForcing, Model { XModel.bndblk.river[b] = activeRiverBlk[b]; } + + // Setup the river info + + int nburmax = activeRiverBlk.size(); + int nribmax = 0; + for (int b = 0; b < activeRiverBlk.size(); b++) + { + int bur = activeRiverBlk[b]; + int nriverinblock = 0; + for (int Rin = 0; Rin < XForcing.rivers.size(); Rin++) + { + std::vector uniqblockforriver = XForcing.rivers[Rin].block; + + std::sort(uniqblockforriver.begin(), uniqblockforriver.end()); + uniqblockforriver.erase(std::unique(uniqblockforriver.begin(), uniqblockforriver.end()), uniqblockforriver.end()); + + for (int bir = 0; bir < uniqblockforriver.size(); bir++) + { + if (uniqblockforriver[bir] == bur) + { + nriverinblock = nriverinblock + 1; + } + } + + } + nribmax = max(nribmax, nriverinblock); + } + + // Allocate Qnow as pinned memory + AllocateMappedMemCPU(XForcing.rivers.size(), 1, XParam.GPUDEVICE,XModel.bndblk.Riverinfo.qnow); + AllocateCPU(nribmax, nburmax, XModel.bndblk.Riverinfo.xstart, XModel.bndblk.Riverinfo.xend, XModel.bndblk.Riverinfo.ystart, XModel.bndblk.Riverinfo.yend); + FillCPU(nribmax, nburmax, T(-1.0), XModel.bndblk.Riverinfo.xstart); + FillCPU(nribmax, nburmax, T(-1.0), XModel.bndblk.Riverinfo.xend); + FillCPU(nribmax, nburmax, T(-1.0), XModel.bndblk.Riverinfo.ystart); + FillCPU(nribmax, nburmax, T(-1.0), XModel.bndblk.Riverinfo.yend); + + + + // Allocate XXbidir and Xridib + ReallocArray(nribmax, nburmax, XModel.bndblk.Riverinfo.Xbidir); + ReallocArray(nribmax, nburmax, XModel.bndblk.Riverinfo.Xridib); + + // Fill them with a flag value + FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.Xbidir); + FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.Xridib); + + //Xbidir is an array that stores block id where n rivers apply + //along the row of Xbidir block id is unique. 
meaning that a block id ith two river injection will appear on two seperate row of Xbidir + //The number of column (size of row 1) in xbidir is nburmax = length(uniq(blockwith river injected)) + // + + //Xridib is an array that stores River id that a river is injected for the corresponding block id in Xbidir + + + XModel.bndblk.Riverinfo.nribmax = nribmax; + XModel.bndblk.Riverinfo.nburmax = nburmax; + + std::vector blocksalreadyin; + RiverBlk emptyvec; + for (int iblk = 0; iblk < nribmax; iblk++) + { + + blocksalreadyin.push_back(emptyvec); + + } + + //(n, 10) + // + std::vector iriv(nribmax,0); + for (int Rin = 0; Rin < XForcing.rivers.size(); Rin++) + { + std::vector uniqblockforriver = XForcing.rivers[Rin].block; + + std::sort(uniqblockforriver.begin(), uniqblockforriver.end()); + uniqblockforriver.erase(std::unique(uniqblockforriver.begin(), uniqblockforriver.end()), uniqblockforriver.end()); + + + + for (int bir = 0; bir < uniqblockforriver.size(); bir++) + { + + for (int iribm = 0; iribm < nribmax; iribm++) + { + + if (std::find(blocksalreadyin[iribm].block.begin(), blocksalreadyin[iribm].block.end(), uniqblockforriver[bir]) != blocksalreadyin[iribm].block.end()) + { + //block found already listed in that line; + + continue; + } + else + { + //not found; + // write to the array + XModel.bndblk.Riverinfo.Xbidir[iriv[iribm] + iribm * nburmax] = uniqblockforriver[bir]; + XModel.bndblk.Riverinfo.Xridib[iriv[iribm] + iribm * nburmax] = Rin; + + iriv[iribm] = iriv[iribm] + 1; + + // add it to the list + blocksalreadyin[iribm].block.push_back(uniqblockforriver[bir]); + + + + break; + } + } + + } + + } + for (int iribm = 0; iribm < nribmax; iribm++) + { + for (int ibur = 0; ibur < nburmax; ibur++) + { + int indx = ibur + iribm * nburmax; + int Rin = XModel.bndblk.Riverinfo.Xridib[indx]; + if (Rin > -1) + { + XModel.bndblk.Riverinfo.xstart[indx] = XForcing.rivers[Rin].xstart; + XModel.bndblk.Riverinfo.xend[indx] = XForcing.rivers[Rin].xend; + XModel.bndblk.Riverinfo.ystart[indx] = XForcing.rivers[Rin].ystart; + XModel.bndblk.Riverinfo.yend[indx] = XForcing.rivers[Rin].yend; + } + } + } + + + } @@ -949,6 +1077,8 @@ template void Initbndblks(Param& XParam, Forcing& XForcing, Blo } XForcing.bndseg[s].nblk = segcount; + log("\nBoundary Segment " + std::to_string(s) + " : " + XForcing.bndseg[s].inputfile + " has " + std::to_string(segcount) + " blocks "); + XForcing.bndseg[s].left.nblk = leftcount; XForcing.bndseg[s].right.nblk = rightcount; XForcing.bndseg[s].top.nblk = topcount; diff --git a/src/InitialConditions.h b/src/InitialConditions.h index d97a4121f..61ef02f85 100755 --- a/src/InitialConditions.h +++ b/src/InitialConditions.h @@ -23,7 +23,7 @@ template void initoutput(Param &XParam, Model& XModel); void InitTSOutput(Param XParam); //template void Initbnds(Param XParam, Forcing XForcing, Model& XModel); -template void FindTSoutNodes(Param& XParam, BlockP XBlock, BndblockP& bnd); +template void FindTSoutNodes(Param& XParam, BlockP XBlock, BndblockP& bnd); template void Calcbndblks(Param& XParam, Forcing& XForcing, BlockP XBlock); template void Findbndblks(Param XParam, Model XModel, Forcing& XForcing); template void Initoutzone(Param& XParam, BlockP& XBlock); diff --git a/src/Kurganov.cu b/src/Kurganov.cu index 4bb080808..93111f7ac 100755 --- a/src/Kurganov.cu +++ b/src/Kurganov.cu @@ -1,14 +1,32 @@ #include "Kurganov.h" +#define SHARED_MEM_HALO_WIDTH 1 +#define STATIC_MAX_BLOCK_X 16 +#define STATIC_MAX_BLOCK_Y 16 template __global__ void updateKurgXGPU(Param XParam, BlockP XBlock, 
EvolvingP XEv, GradientsP XGrad, FluxP XFlux, T* dtmax, T*zb) { + // Shared memory declarations + __shared__ T s_h[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_zs[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_u[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_v[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_dhdx[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_dzsdx[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_dudx[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_dvdx[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; - unsigned int halowidth = XParam.halowidth; - unsigned int blkmemwidth = blockDim.y + halowidth * 2; + unsigned int halowidth = XParam.halowidth; // Runtime halowidth + unsigned int blkmemwidth = blockDim.y + halowidth * 2; // Keep original, used by memloc //unsigned int blksize = blkmemwidth * blkmemwidth; int ix = threadIdx.x; int iy = threadIdx.y; + + // Safety check for static shared memory bounds + if (ix >= STATIC_MAX_BLOCK_X || iy >= STATIC_MAX_BLOCK_Y) { + return; + } + unsigned int ibl = blockIdx.x; unsigned int ib = XBlock.active[ibl]; @@ -29,128 +47,135 @@ template __global__ void updateKurgXGPU(Param XParam, BlockP XBlock T CFL = T(XParam.CFL); // This is based on kurganov and Petrova 2007 + // Global memory indices + int global_idx_center = memloc(halowidth, blkmemwidth, ix, iy, ib); + int global_idx_left_halo = memloc(halowidth, blkmemwidth, ix - 1, iy, ib); // ix-1 for halo + + // Shared memory indices: data for (ix,iy) at s_array[iy][ix+1], halo for (ix-1,iy) at s_array[iy][ix] + int s_idx_current = ix + SHARED_MEM_HALO_WIDTH; // ix + 1 + int s_idx_halo = ix; // ix + + // Load data into shared memory + s_h[iy][s_idx_current] = XEv.h[global_idx_center]; + s_zs[iy][s_idx_current] = XEv.zs[global_idx_center]; + s_u[iy][s_idx_current] = XEv.u[global_idx_center]; + s_v[iy][s_idx_current] = XEv.v[global_idx_center]; + s_dhdx[iy][s_idx_current] = XGrad.dhdx[global_idx_center]; + s_dzsdx[iy][s_idx_current]= XGrad.dzsdx[global_idx_center]; + s_dudx[iy][s_idx_current] = XGrad.dudx[global_idx_center]; + s_dvdx[iy][s_idx_current] = XGrad.dvdx[global_idx_center]; + + if (ix == 0) { + s_h[iy][s_idx_halo] = XEv.h[global_idx_left_halo]; + s_zs[iy][s_idx_halo] = XEv.zs[global_idx_left_halo]; + s_u[iy][s_idx_halo] = XEv.u[global_idx_left_halo]; + s_v[iy][s_idx_halo] = XEv.v[global_idx_left_halo]; + s_dhdx[iy][s_idx_halo] = XGrad.dhdx[global_idx_left_halo]; + s_dzsdx[iy][s_idx_halo]= XGrad.dzsdx[global_idx_left_halo]; + s_dudx[iy][s_idx_halo] = XGrad.dudx[global_idx_left_halo]; + s_dvdx[iy][s_idx_halo] = XGrad.dvdx[global_idx_left_halo]; + } - int i = memloc(halowidth, blkmemwidth, ix, iy, ib); - int ileft = memloc(halowidth, blkmemwidth, ix-1, iy, ib); + __syncthreads(); T ybo = T(XParam.yo + XBlock.yo[ib]); - T dhdxi = XGrad.dhdx[i]; - T dhdxmin = XGrad.dhdx[ileft]; + // Access data from shared memory + T dhdxi = s_dhdx[iy][s_idx_current]; + T dhdxmin = s_dhdx[iy][s_idx_halo]; T cm = XParam.spherical ? 
calcCM(T(XParam.Radius), delta, ybo, iy) : T(1.0); T fmu = T(1.0); - T hi = XEv.h[i]; - - T hn = XEv.h[ileft]; + T hi = s_h[iy][s_idx_current]; + T hn = s_h[iy][s_idx_halo]; - if (hi > eps || hn > eps) { T dx, zi, zl, zn, zr, zlr, hl, up, hp, hr, um, hm, sl, sr,ga; - // along X dx = delta * T(0.5); - zi = XEv.zs[i] - hi; - - //printf("%f\n", zi); - - - //zl = zi - dx*(dzsdx[i] - dhdx[i]); - zl = zi - dx * (XGrad.dzsdx[i] - dhdxi); - //printf("%f\n", zl); - - zn = XEv.zs[ileft] - hn; - - //printf("%f\n", zn); - zr = zn + dx * (XGrad.dzsdx[ileft] - dhdxmin); - + zi = s_zs[iy][s_idx_current] - hi; + zl = zi - dx * (s_dzsdx[iy][s_idx_current] - dhdxi); + + zn = s_zs[iy][s_idx_halo] - hn; + zr = zn + dx * (s_dzsdx[iy][s_idx_halo] - dhdxmin); zlr = max(zl, zr); - //hl = hi - dx*dhdx[i]; hl = hi - dx * dhdxi; - up = XEv.u[i] - dx * XGrad.dudx[i]; + up = s_u[iy][s_idx_current] - dx * s_dudx[iy][s_idx_current]; hp = max(T(0.0), hl + zl - zlr); hr = hn + dx * dhdxmin; - um = XEv.u[ileft] + dx * XGrad.dudx[ileft]; + um = s_u[iy][s_idx_halo] + dx * s_dudx[iy][s_idx_halo]; hm = max(T(0.0), hr + zr - zlr); ga = g * T(0.5); - T fh, fu, fv, dt; - - //solver below also modifies fh and fu dt = KurgSolver(g, delta, epsi, CFL, cm, fmu, hp, hm, up, um, fh, fu); - - if (dt < dtmax[i]) + + // dtmax is an output array, write to global memory using global_idx_center + if (dt < dtmax[global_idx_center]) { - dtmax[i] = dt; + dtmax[global_idx_center] = dt; } - - - if (fh > T(0.0)) { - fv = (XEv.v[ileft] + dx * XGrad.dvdx[ileft]) * fh;// Eq 3.7 third term? (X direction) + fv = (s_v[iy][s_idx_halo] + dx * s_dvdx[iy][s_idx_halo]) * fh; } else { - fv = (XEv.v[i] - dx * XGrad.dvdx[i]) * fh; - } - //fv = (fh > 0.f ? vv[xminus + iy*nx] + dx*dvdx[xminus + iy*nx] : vv[i] - dx*dvdx[i])*fh; - //dtmax needs to be stored in an array and reduced at the end - //dtmax = dtmaxf; - //dtmaxtmp = min(dtmax, dtmaxtmp); - /*if (ix == 11 && iy == 0) - { - printf("a=%f\t b=%f\t c=%f\t d=%f\n", ap*(qm*um + ga*hm2), -am*(qp*up + ga*hp2),( ap*(qm*um + g*sq(hm) / 2.0f) - am*(qp*up + g*sq(hp) / 2.0f) + ap*am*(qp - qm) ) *ad/100.0f, ad); + fv = (s_v[iy][s_idx_current] - dx * s_dvdx[iy][s_idx_current]) * fh; } - */ - /* - #### Topographic source term - In the case of adaptive refinement, care must be taken to ensure - well-balancing at coarse/fine faces (see [notes/balanced.tm]()). */ - if ((ix == blockDim.y) && levRB < lev)//(ix==16) i.e. in the right halo + // Boundary condition accesses - keep as global. + // Initialize boundary variables with values from shared memory (or computed from shared). + T hi_boundary = hi; + T zi_boundary = zi; // zi is computed from shared memory values + T hn_boundary = hn; + T zn_boundary = zn; // zn is computed from shared memory values + + // Original code: `if ((ix == blockDim.y) && levRB < lev)` + // Assuming blockDim.y was a placeholder for the extent of the block in x-direction. + // Using STATIC_MAX_BLOCK_X - 1 for the rightmost thread in the current block. + if ((ix == STATIC_MAX_BLOCK_X - 1) && levRB < lev) { - int jj = LBRB == ib ? floor(iy * (T)0.5) : floor(iy * (T)0.5) + blockDim.y / 2; - int iright = memloc(halowidth, blkmemwidth, 0, jj, RB);; - hi = XEv.h[iright]; - zi = zb[iright]; + int jj = LBRB == ib ? 
floor(iy * (T)0.5) : floor(iy * (T)0.5) + STATIC_MAX_BLOCK_Y / 2; + int iright = memloc(halowidth, blkmemwidth, 0, jj, RB); // Accesses cell 0 of neighbor block RB + hi_boundary = XEv.h[iright]; + zi_boundary = zb[iright]; // Note: zb is a direct global pointer argument to the kernel } - if ((ix == 0) && levLB < lev)//(ix==16) i.e. in the right halo + // Original code: `if ((ix == 0) && levLB < lev)` + if ((ix == 0) && levLB < lev) { - int jj = RBLB == ib ? floor(iy * (T)0.5) : floor(iy * (T)0.5) + blockDim.y / 2; - int ilc = memloc(halowidth, blkmemwidth, blockDim.y - 1, jj, LB); - hn = XEv.h[ilc]; - zn = zb[ilc]; + int jj = RBLB == ib ? floor(iy * (T)0.5) : floor(iy * (T)0.5) + STATIC_MAX_BLOCK_Y / 2; + // Accesses last cell (STATIC_MAX_BLOCK_X - 1) of neighbor block LB + int ilc = memloc(halowidth, blkmemwidth, STATIC_MAX_BLOCK_X - 1, jj, LB); + hn_boundary = XEv.h[ilc]; + zn_boundary = zb[ilc]; // Note: zb is a direct global pointer argument } - sl = ga * (utils::sq(hp) - utils::sq(hl) + (hl + hi) * (zi - zl)); - sr = ga * (utils::sq(hm) - utils::sq(hr) + (hr + hn) * (zn - zr)); + sl = ga * (utils::sq(hp) - utils::sq(hl) + (hl + hi_boundary) * (zi_boundary - zl)); + sr = ga * (utils::sq(hm) - utils::sq(hr) + (hr + hn_boundary) * (zn_boundary - zr)); - ////Flux update - - XFlux.Fhu[i] = fmu * fh; - XFlux.Fqux[i] = fmu * (fu - sl); - XFlux.Su[i] = fmu * (fu - sr); - XFlux.Fqvx[i] = fmu * fv; + ////Flux update (writing to global memory, using global_idx_center) + XFlux.Fhu[global_idx_center] = fmu * fh; + XFlux.Fqux[global_idx_center] = fmu * (fu - sl); + XFlux.Su[global_idx_center] = fmu * (fu - sr); + XFlux.Fqvx[global_idx_center] = fmu * fv; } else { - dtmax[i] = T(1.0) / epsi; - XFlux.Fhu[i] = T(0.0); - XFlux.Fqux[i] = T(0.0); - XFlux.Su[i] = T(0.0); - XFlux.Fqvx[i] = T(0.0); + // dtmax is an output array, write to global memory using global_idx_center + dtmax[global_idx_center] = T(1.0) / epsi; + XFlux.Fhu[global_idx_center] = T(0.0); + XFlux.Fqux[global_idx_center] = T(0.0); + XFlux.Su[global_idx_center] = T(0.0); + XFlux.Fqvx[global_idx_center] = T(0.0); } - - } template __global__ void updateKurgXGPU(Param XParam, BlockP XBlock, EvolvingP XEv, GradientsP XGrad, FluxP XFlux, float* dtmax, float* zb); @@ -158,12 +183,29 @@ template __global__ void updateKurgXGPU(Param XParam, BlockP XBl template __global__ void updateKurgXATMGPU(Param XParam, BlockP XBlock, EvolvingP XEv, GradientsP XGrad, FluxP XFlux, T* dtmax, T* zb, T* Patm, T*dPdx) { - - unsigned int halowidth = XParam.halowidth; - unsigned int blkmemwidth = blockDim.y + halowidth * 2; + // Shared memory declarations + __shared__ T s_h[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_zs[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_u[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_v[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_dhdx[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_dzsdx[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_dudx[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_dvdx[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_Patm[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_dPdx[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + + unsigned int halowidth = XParam.halowidth; // Runtime halowidth + 
unsigned int blkmemwidth = blockDim.y + halowidth * 2; // Keep original, used by memloc //unsigned int blksize = blkmemwidth * blkmemwidth; int ix = threadIdx.x; int iy = threadIdx.y; + + // Safety check for static shared memory bounds + if (ix >= STATIC_MAX_BLOCK_X || iy >= STATIC_MAX_BLOCK_Y) { + return; + } + unsigned int ibl = blockIdx.x; unsigned int ib = XBlock.active[ibl]; @@ -184,130 +226,135 @@ template __global__ void updateKurgXATMGPU(Param XParam, BlockP XBl T CFL = T(XParam.CFL); // This is based on kurganov and Petrova 2007 + // Global memory indices + int global_idx_center = memloc(halowidth, blkmemwidth, ix, iy, ib); + int global_idx_left_halo = memloc(halowidth, blkmemwidth, ix - 1, iy, ib); // ix-1 for halo + + // Shared memory indices + int s_idx_current = ix + SHARED_MEM_HALO_WIDTH; // ix + 1 + int s_idx_halo = ix; // ix + + // Load data into shared memory + s_h[iy][s_idx_current] = XEv.h[global_idx_center]; + s_zs[iy][s_idx_current] = XEv.zs[global_idx_center]; + s_u[iy][s_idx_current] = XEv.u[global_idx_center]; + s_v[iy][s_idx_current] = XEv.v[global_idx_center]; + s_dhdx[iy][s_idx_current] = XGrad.dhdx[global_idx_center]; + s_dzsdx[iy][s_idx_current] = XGrad.dzsdx[global_idx_center]; + s_dudx[iy][s_idx_current] = XGrad.dudx[global_idx_center]; + s_dvdx[iy][s_idx_current] = XGrad.dvdx[global_idx_center]; + s_Patm[iy][s_idx_current] = Patm[global_idx_center]; + s_dPdx[iy][s_idx_current] = dPdx[global_idx_center]; + + if (ix == 0) { + s_h[iy][s_idx_halo] = XEv.h[global_idx_left_halo]; + s_zs[iy][s_idx_halo] = XEv.zs[global_idx_left_halo]; + s_u[iy][s_idx_halo] = XEv.u[global_idx_left_halo]; + s_v[iy][s_idx_halo] = XEv.v[global_idx_left_halo]; + s_dhdx[iy][s_idx_halo] = XGrad.dhdx[global_idx_left_halo]; + s_dzsdx[iy][s_idx_halo] = XGrad.dzsdx[global_idx_left_halo]; + s_dudx[iy][s_idx_halo] = XGrad.dudx[global_idx_left_halo]; + s_dvdx[iy][s_idx_halo] = XGrad.dvdx[global_idx_left_halo]; + s_Patm[iy][s_idx_halo] = Patm[global_idx_left_halo]; + s_dPdx[iy][s_idx_halo] = dPdx[global_idx_left_halo]; + } - int i = memloc(halowidth, blkmemwidth, ix, iy, ib); - int ileft = memloc(halowidth, blkmemwidth, ix - 1, iy, ib); + __syncthreads(); T ybo = T(XParam.yo + XBlock.yo[ib]); - - T dhdxi = XGrad.dhdx[i]; - T dhdxmin = XGrad.dhdx[ileft]; + // Access data from shared memory + T dhdxi = s_dhdx[iy][s_idx_current]; + T dhdxmin = s_dhdx[iy][s_idx_halo]; T cm = XParam.spherical ? 
calcCM(T(XParam.Radius), delta, ybo, iy) : T(1.0); T fmu = T(1.0); - T hi = XEv.h[i]; - - T hn = XEv.h[ileft]; - + T hi = s_h[iy][s_idx_current]; + T hn = s_h[iy][s_idx_halo]; if (hi > eps || hn > eps) { T dx, zi, zl, zn, zr, zlr, hl, up, hp, hr, um, hm, sl, sr, ga; - // along X dx = delta * T(0.5); - zi = XEv.zs[i] - hi + XParam.Pa2m * Patm[i]; - - //printf("%f\n", zi); - - - //zl = zi - dx*(dzsdx[i] - dhdx[i]); - zl = zi - dx * (XGrad.dzsdx[i] - dhdxi + XParam.Pa2m * dPdx[i]); - //printf("%f\n", zl); - - zn = XEv.zs[ileft] - hn + XParam.Pa2m * Patm[ileft]; - - //printf("%f\n", zn); - zr = zn + dx * (XGrad.dzsdx[ileft] - dhdxmin + XParam.Pa2m * dPdx[ileft]); - + zi = s_zs[iy][s_idx_current] - hi + XParam.Pa2m * s_Patm[iy][s_idx_current]; + zl = zi - dx * (s_dzsdx[iy][s_idx_current] - dhdxi + XParam.Pa2m * s_dPdx[iy][s_idx_current]); + + zn = s_zs[iy][s_idx_halo] - hn + XParam.Pa2m * s_Patm[iy][s_idx_halo]; + zr = zn + dx * (s_dzsdx[iy][s_idx_halo] - dhdxmin + XParam.Pa2m * s_dPdx[iy][s_idx_halo]); zlr = max(zl, zr); - //hl = hi - dx*dhdx[i]; hl = hi - dx * dhdxi; - up = XEv.u[i] - dx * XGrad.dudx[i]; + up = s_u[iy][s_idx_current] - dx * s_dudx[iy][s_idx_current]; hp = max(T(0.0), hl + zl - zlr); hr = hn + dx * dhdxmin; - um = XEv.u[ileft] + dx * XGrad.dudx[ileft]; + um = s_u[iy][s_idx_halo] + dx * s_dudx[iy][s_idx_halo]; hm = max(T(0.0), hr + zr - zlr); ga = g * T(0.5); - T fh, fu, fv, dt; - - //solver below also modifies fh and fu dt = KurgSolver(g, delta, epsi, CFL, cm, fmu, hp, hm, up, um, fh, fu); - if (dt < dtmax[i]) + if (dt < dtmax[global_idx_center]) // Output to global memory { - dtmax[i] = dt; + dtmax[global_idx_center] = dt; } - - - if (fh > T(0.0)) { - fv = (XEv.v[ileft] + dx * XGrad.dvdx[ileft]) * fh;// Eq 3.7 third term? (X direction) + fv = (s_v[iy][s_idx_halo] + dx * s_dvdx[iy][s_idx_halo]) * fh; } else { - fv = (XEv.v[i] - dx * XGrad.dvdx[i]) * fh; - } - //fv = (fh > 0.f ? vv[xminus + iy*nx] + dx*dvdx[xminus + iy*nx] : vv[i] - dx*dvdx[i])*fh; - //dtmax needs to be stored in an array and reduced at the end - //dtmax = dtmaxf; - //dtmaxtmp = min(dtmax, dtmaxtmp); - /*if (ix == 11 && iy == 0) - { - printf("a=%f\t b=%f\t c=%f\t d=%f\n", ap*(qm*um + ga*hm2), -am*(qp*up + ga*hp2),( ap*(qm*um + g*sq(hm) / 2.0f) - am*(qp*up + g*sq(hp) / 2.0f) + ap*am*(qp - qm) ) *ad/100.0f, ad); + fv = (s_v[iy][s_idx_current] - dx * s_dvdx[iy][s_idx_current]) * fh; } - */ - /* - #### Topographic source term + + // Boundary condition handling: Initialize with shared/local values, then overwrite if boundary global access occurs. + T hi_boundary = hi; + T zi_boundary = zi; // zi already incorporates s_Patm for current cell + T hn_boundary = hn; + T zn_boundary = zn; // zn already incorporates s_Patm for halo cell - In the case of adaptive refinement, care must be taken to ensure - well-balancing at coarse/fine faces (see [notes/balanced.tm]()). */ - if ((ix == blockDim.y) && levRB < lev)//(ix==16) i.e. in the right halo + if ((ix == STATIC_MAX_BLOCK_X - 1) && levRB < lev) { - int jj = LBRB == ib ? floor(iy * (T)0.5) : floor(iy * (T)0.5) + blockDim.y / 2; - int iright = memloc(halowidth, blkmemwidth, 0, jj, RB);; - hi = XEv.h[iright]; - zi = zb[iright] + XParam.Pa2m * Patm[iright]; + int jj = LBRB == ib ? floor(iy * (T)0.5) : floor(iy * (T)0.5) + STATIC_MAX_BLOCK_Y / 2; + int iright = memloc(halowidth, blkmemwidth, 0, jj, RB); + hi_boundary = XEv.h[iright]; + // Patm[iright] is a global access from a neighboring block. 
+ zi_boundary = zb[iright] + XParam.Pa2m * Patm[iright]; } - if ((ix == 0) && levLB < lev)//(ix==16) i.e. in the right halo + if ((ix == 0) && levLB < lev) { - int jj = RBLB == ib ? floor(iy * (T)0.5) : floor(iy * (T)0.5) + blockDim.y / 2; - int ilc = memloc(halowidth, blkmemwidth, blockDim.y - 1, jj, LB); - hn = XEv.h[ilc]; - zn = zb[ilc] + XParam.Pa2m * Patm[ilc]; + int jj = RBLB == ib ? floor(iy * (T)0.5) : floor(iy * (T)0.5) + STATIC_MAX_BLOCK_Y / 2; + int ilc = memloc(halowidth, blkmemwidth, STATIC_MAX_BLOCK_X - 1, jj, LB); + hn_boundary = XEv.h[ilc]; + // Patm[ilc] is a global access from a neighboring block. + zn_boundary = zb[ilc] + XParam.Pa2m * Patm[ilc]; } - sl = ga * (utils::sq(hp) - utils::sq(hl) + (hl + hi) * (zi - zl)); - sr = ga * (utils::sq(hm) - utils::sq(hr) + (hr + hn) * (zn - zr)); - - ////Flux update + sl = ga * (utils::sq(hp) - utils::sq(hl) + (hl + hi_boundary) * (zi_boundary - zl)); + sr = ga * (utils::sq(hm) - utils::sq(hr) + (hr + hn_boundary) * (zn_boundary - zr)); - XFlux.Fhu[i] = fmu * fh; - XFlux.Fqux[i] = fmu * (fu - sl); - XFlux.Su[i] = fmu * (fu - sr); - XFlux.Fqvx[i] = fmu * fv; + ////Flux update (writing to global memory) + XFlux.Fhu[global_idx_center] = fmu * fh; + XFlux.Fqux[global_idx_center] = fmu * (fu - sl); + XFlux.Su[global_idx_center] = fmu * (fu - sr); + XFlux.Fqvx[global_idx_center] = fmu * fv; } else { - dtmax[i] = T(1.0) / epsi; - XFlux.Fhu[i] = T(0.0); - XFlux.Fqux[i] = T(0.0); - XFlux.Su[i] = T(0.0); - XFlux.Fqvx[i] = T(0.0); + dtmax[global_idx_center] = T(1.0) / epsi; // Output to global memory + XFlux.Fhu[global_idx_center] = T(0.0); + XFlux.Fqux[global_idx_center] = T(0.0); + XFlux.Su[global_idx_center] = T(0.0); + XFlux.Fqvx[global_idx_center] = T(0.0); } - } template __global__ void updateKurgXATMGPU(Param XParam, BlockP XBlock, EvolvingP XEv, GradientsP XGrad, FluxP XFlux, float* dtmax, float* zb, float* Patm, float* dPdx); template __global__ void updateKurgXATMGPU(Param XParam, BlockP XBlock, EvolvingP XEv, GradientsP XGrad, FluxP XFlux, double* dtmax, double* zb, double* Patm, double* dPdx); @@ -315,11 +362,25 @@ template __global__ void updateKurgXATMGPU(Param XParam, BlockP template __global__ void AddSlopeSourceXGPU(Param XParam, BlockP XBlock, EvolvingP XEv, GradientsP XGrad, FluxP XFlux, T * zb) { - unsigned int halowidth = XParam.halowidth; - unsigned int blkmemwidth = blockDim.y + halowidth * 2; + // Shared memory declarations + __shared__ T s_h[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_zs[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_dhdx[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_dzsdx[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_Fqux[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + __shared__ T s_Su[STATIC_MAX_BLOCK_Y][STATIC_MAX_BLOCK_X + SHARED_MEM_HALO_WIDTH]; + + unsigned int halowidth = XParam.halowidth; // Runtime halowidth + unsigned int blkmemwidth = blockDim.y + halowidth * 2; // Keep original for memloc //unsigned int blksize = blkmemwidth * blkmemwidth; unsigned int ix = threadIdx.x; unsigned int iy = threadIdx.y; + + // Safety check for static shared memory bounds + if (ix >= STATIC_MAX_BLOCK_X || iy >= STATIC_MAX_BLOCK_Y) { + return; + } + unsigned int ibl = blockIdx.x; unsigned int ib = XBlock.active[ibl]; @@ -344,35 +405,50 @@ template __global__ void AddSlopeSourceXGPU(Param XParam, BlockP XB T ga = T(0.5) * g; - - - int i = 
memloc(halowidth, blkmemwidth, ix, iy, ib); - int ileft = memloc(halowidth, blkmemwidth, ix - 1, iy, ib); - + // Global memory indices + int global_idx_center = memloc(halowidth, blkmemwidth, ix, iy, ib); + int global_idx_left_halo = memloc(halowidth, blkmemwidth, ix - 1, iy, ib); // ix-1 for halo + + // Shared memory indices + int s_idx_current = ix + SHARED_MEM_HALO_WIDTH; // ix + 1 + int s_idx_halo = ix; // ix + + // Load data into shared memory + s_h[iy][s_idx_current] = XEv.h[global_idx_center]; + s_zs[iy][s_idx_current] = XEv.zs[global_idx_center]; + s_dhdx[iy][s_idx_current] = XGrad.dhdx[global_idx_center]; + s_dzsdx[iy][s_idx_current] = XGrad.dzsdx[global_idx_center]; + s_Fqux[iy][s_idx_current] = XFlux.Fqux[global_idx_center]; // Read current flux value + s_Su[iy][s_idx_current] = XFlux.Su[global_idx_center]; // Read current flux value + + if (ix == 0) { + s_h[iy][s_idx_halo] = XEv.h[global_idx_left_halo]; + s_zs[iy][s_idx_halo] = XEv.zs[global_idx_left_halo]; + s_dhdx[iy][s_idx_halo] = XGrad.dhdx[global_idx_left_halo]; + s_dzsdx[iy][s_idx_halo] = XGrad.dzsdx[global_idx_left_halo]; + // No halo load for s_Fqux, s_Su as they are modified in place based on current cell and its halo + } - T dhdxi = XGrad.dhdx[i]; - T dhdxmin = XGrad.dhdx[ileft]; - //T cm = T(1.0); + __syncthreads(); + + // Access data from shared memory + T dhdxi = s_dhdx[iy][s_idx_current]; + T dhdxmin = s_dhdx[iy][s_idx_halo]; + T hi = s_h[iy][s_idx_current]; + T hn = s_h[iy][s_idx_halo]; + //T cm = T(1.0); // Not used in this kernel as per original T fmu = T(1.0); T dx, zi, zl, zn, zr, zlr, hl, hp, hr, hm; - T hi = XEv.h[i]; - - T hn = XEv.h[ileft]; - if (hi > eps || hn > eps) { - - // along X these are same as in Kurgannov dx = delta * T(0.5); - zi = XEv.zs[i] - hi; + zi = s_zs[iy][s_idx_current] - hi; + zl = zi - dx * (s_dzsdx[iy][s_idx_current] - dhdxi); - zl = zi - dx * (XGrad.dzsdx[i] - dhdxi); - - zn = XEv.zs[ileft] - hn; - - zr = zn + dx * (XGrad.dzsdx[ileft] - dhdxmin); + zn = s_zs[iy][s_idx_halo] - hn; + zr = zn + dx * (s_dzsdx[iy][s_idx_halo] - dhdxmin); zlr = max(zl, zr); @@ -382,34 +458,47 @@ template __global__ void AddSlopeSourceXGPU(Param XParam, BlockP XB hr = hn + dx * dhdxmin; hm = max(T(0.0), hr + zr - zlr); + // Boundary condition handling + T hi_boundary = hi; + T zi_boundary = zi; + T hn_boundary = hn; + T zn_boundary = zn; - //#### Topographic source term - //In the case of adaptive refinement, care must be taken to ensure - // well - balancing at coarse / fine faces(see[notes / balanced.tm]()). * / - - if ((ix == blockDim.y) && levRB < lev)//(ix==16) i.e. in the right halo + if ((ix == STATIC_MAX_BLOCK_X - 1) && levRB < lev) { - int jj = LBRB == ib ? floor(iy * (T)0.5) : floor(iy * (T)0.5) + blockDim.y / 2; - int iright = memloc(halowidth, blkmemwidth, 0, jj, RB);; - hi = XEv.h[iright]; - zi = zb[iright]; + int jj = LBRB == ib ? floor(iy * (T)0.5) : floor(iy * (T)0.5) + STATIC_MAX_BLOCK_Y / 2; + int iright = memloc(halowidth, blkmemwidth, 0, jj, RB); + hi_boundary = XEv.h[iright]; + zi_boundary = zb[iright]; } - if ((ix == 0) && levLB < lev)//(ix==16) i.e. in the right halo + if ((ix == 0) && levLB < lev) { - int jj = RBLB == ib ? floor(iy * (T)0.5) : floor(iy * (T)0.5) + blockDim.y / 2; - int ilc = memloc(halowidth, blkmemwidth, blockDim.y - 1, jj, LB); - hn = XEv.h[ilc]; - zn = zb[ilc]; + int jj = RBLB == ib ? 
floor(iy * (T)0.5) : floor(iy * (T)0.5) + STATIC_MAX_BLOCK_Y / 2; + int ilc = memloc(halowidth, blkmemwidth, STATIC_MAX_BLOCK_X - 1, jj, LB); + hn_boundary = XEv.h[ilc]; + zn_boundary = zb[ilc]; } T sl, sr; - sl = ga * (utils::sq(hp) - utils::sq(hl) + (hl + hi) * (zi - zl)); - sr = ga * (utils::sq(hm) - utils::sq(hr) + (hr + hn) * (zn - zr)); + sl = ga * (utils::sq(hp) - utils::sq(hl) + (hl + hi_boundary) * (zi_boundary - zl)); + sr = ga * (utils::sq(hm) - utils::sq(hr) + (hr + hn_boundary) * (zn_boundary - zr)); + // Update flux components in shared memory + s_Fqux[iy][s_idx_current] = s_Fqux[iy][s_idx_current] - fmu * sl; + s_Su[iy][s_idx_current] = s_Su[iy][s_idx_current] - fmu * sr; - XFlux.Fqux[i] = XFlux.Fqux[i] - fmu * sl; - XFlux.Su[i] = XFlux.Su[i] - fmu * sr; + // Synchronize before writing back if there were cross-thread dependencies on these shared flux values. + // In this specific logic, each thread modifies its own s_Fqux[iy][s_idx_current] and s_Su[iy][s_idx_current]. + // However, for clarity or future modifications, a syncthreads can be defensive. + // __syncthreads(); // Optional here, as each thread writes to a distinct location based on its original global_idx_center. + + // Write results back to global memory + XFlux.Fqux[global_idx_center] = s_Fqux[iy][s_idx_current]; + XFlux.Su[global_idx_center] = s_Su[iy][s_idx_current]; } + // If (hi <= eps && hn <= eps), the original code does nothing to Fqux or Su, + // so no "else" branch is needed to write back unmodified Fqux/Su from shared memory, + // as they would have been loaded with their original values. } template __global__ void AddSlopeSourceXGPU(Param XParam, BlockP XBlock, EvolvingP XEv, GradientsP XGrad, FluxP XFlux, float* zb); template __global__ void AddSlopeSourceXGPU(Param XParam, BlockP XBlock, EvolvingP XEv, GradientsP XGrad, FluxP XFlux, double* zb); @@ -876,11 +965,27 @@ template __host__ void AddSlopeSourceXCPU(Param XParam, BlockP X template __global__ void updateKurgYGPU(Param XParam, BlockP XBlock, EvolvingP XEv, GradientsP XGrad, FluxP XFlux, T* dtmax, T* zb) { - unsigned int halowidth = XParam.halowidth; - unsigned int blkmemwidth = blockDim.x + halowidth * 2; + // Shared memory declarations for Y-direction halo + __shared__ T s_h[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_zs[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_u[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_v[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_dhdy[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_dzsdy[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_dudy[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_dvdy[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + + unsigned int halowidth = XParam.halowidth; // Runtime halowidth + unsigned int blkmemwidth = blockDim.x + halowidth * 2; // Keep original for memloc //unsigned int blksize = blkmemwidth * blkmemwidth; unsigned int ix = threadIdx.x; unsigned int iy = threadIdx.y; + + // Safety check for static shared memory bounds + if (ix >= STATIC_MAX_BLOCK_X || iy >= STATIC_MAX_BLOCK_Y) { + return; + } + unsigned int ibl = blockIdx.x; unsigned int ib = XBlock.active[ibl]; @@ -903,109 +1008,127 @@ template __global__ void updateKurgYGPU(Param XParam, BlockP XBlock T ybo = T(XParam.yo + XBlock.yo[ib]); - int i = memloc(halowidth, 
blkmemwidth, ix, iy, ib); - int ibot = memloc(halowidth, blkmemwidth, ix , iy-1, ib); + // Global memory indices + int global_idx_center = memloc(halowidth, blkmemwidth, ix, iy, ib); + int global_idx_bottom_halo = memloc(halowidth, blkmemwidth, ix, iy - 1, ib); // iy-1 for "bottom" halo + + // Shared memory indices + int s_iy_current = iy + SHARED_MEM_HALO_WIDTH; // iy + 1 + int s_iy_halo = iy; // iy (for halo at iy=0) + + // Load data into shared memory + s_h[s_iy_current][ix] = XEv.h[global_idx_center]; + s_zs[s_iy_current][ix] = XEv.zs[global_idx_center]; + s_u[s_iy_current][ix] = XEv.u[global_idx_center]; + s_v[s_iy_current][ix] = XEv.v[global_idx_center]; + s_dhdy[s_iy_current][ix] = XGrad.dhdy[global_idx_center]; + s_dzsdy[s_iy_current][ix] = XGrad.dzsdy[global_idx_center]; + s_dudy[s_iy_current][ix] = XGrad.dudy[global_idx_center]; + s_dvdy[s_iy_current][ix] = XGrad.dvdy[global_idx_center]; + + if (iy == 0) { // Load "bottom" halo (iy-1 data) + s_h[s_iy_halo][ix] = XEv.h[global_idx_bottom_halo]; + s_zs[s_iy_halo][ix] = XEv.zs[global_idx_bottom_halo]; + s_u[s_iy_halo][ix] = XEv.u[global_idx_bottom_halo]; + s_v[s_iy_halo][ix] = XEv.v[global_idx_bottom_halo]; + s_dhdy[s_iy_halo][ix] = XGrad.dhdy[global_idx_bottom_halo]; + s_dzsdy[s_iy_halo][ix] = XGrad.dzsdy[global_idx_bottom_halo]; + s_dudy[s_iy_halo][ix] = XGrad.dudy[global_idx_bottom_halo]; + s_dvdy[s_iy_halo][ix] = XGrad.dvdy[global_idx_bottom_halo]; + } + + __syncthreads(); T cm = XParam.spherical ? calcCM(T(XParam.Radius), delta, ybo, iy) : T(1.0); T fmv = XParam.spherical ? calcFM(T(XParam.Radius), delta, ybo, T(iy)) : T(1.0); - T dhdyi = XGrad.dhdy[i]; - T dhdymin = XGrad.dhdy[ibot]; - T hi = XEv.h[i]; - T hn = XEv.h[ibot]; + // Access data from shared memory + T dhdyi = s_dhdy[s_iy_current][ix]; + T dhdymin = s_dhdy[s_iy_halo][ix]; + T hi = s_h[s_iy_current][ix]; + T hn = s_h[s_iy_halo][ix]; T dx, zi, zl, zn, zr, zlr, hl, up, hp, hr, um, hm,ga; - - if (hi > eps || hn > eps) { - hn = XEv.h[ibot]; + // Note: 'hn' is already loaded from shared memory s_h[s_iy_halo][ix] dx = delta * T(0.5); - zi = XEv.zs[i] - hi; - zl = zi - dx * (XGrad.dzsdy[i] - dhdyi); - zn = XEv.zs[ibot] - hn; - zr = zn + dx * (XGrad.dzsdy[ibot] - dhdymin); + zi = s_zs[s_iy_current][ix] - hi; + zl = zi - dx * (s_dzsdy[s_iy_current][ix] - dhdyi); + zn = s_zs[s_iy_halo][ix] - hn; + zr = zn + dx * (s_dzsdy[s_iy_halo][ix] - dhdymin); zlr = max(zl, zr); hl = hi - dx * dhdyi; - up = XEv.v[i] - dx * XGrad.dvdy[i]; + up = s_v[s_iy_current][ix] - dx * s_dvdy[s_iy_current][ix]; // Current thread's v and dvdy hp = max(T(0.0), hl + zl - zlr); hr = hn + dx * dhdymin; - um = XEv.v[ibot] + dx * XGrad.dvdy[ibot]; + um = s_v[s_iy_halo][ix] + dx * s_dvdy[s_iy_halo][ix]; // Bottom halo's v and dvdy hm = max(T(0.0), hr + zr - zlr); - ga = g * T(0.5); - - //// Reimann solver T fh, fu, fv, sl, sr, dt; - //solver below also modifies fh and fu dt = KurgSolver(g, delta, epsi, CFL, cm, fmv, hp, hm, up, um, fh, fu); - if (dt < dtmax[i]) + if (dt < dtmax[global_idx_center]) // Output to global memory { - dtmax[i] = dt; + dtmax[global_idx_center] = dt; } - - if (fh > T(0.0)) { - fv = (XEv.u[ibot] + dx * XGrad.dudy[ibot]) * fh; + fv = (s_u[s_iy_halo][ix] + dx * s_dudy[s_iy_halo][ix]) * fh; // Bottom halo's u and dudy } else { - fv = (XEv.u[i] - dx * XGrad.dudy[i]) * fh; + fv = (s_u[s_iy_current][ix] - dx * s_dudy[s_iy_current][ix]) * fh; // Current thread's u and dudy } - //fv = (fh > 0.f ? 
uu[ix + yminus*nx] + dx*dudy[ix + yminus*nx] : uu[i] - dx*dudy[i])*fh; - /** - #### Topographic source term - - In the case of adaptive refinement, care must be taken to ensure - well-balancing at coarse/fine faces (see [notes/balanced.tm]()). */ - //sl = ga * (utils::sq(hp) - utils::sq(hl) + (hl + hi) * (zi - zl)); - //sr = ga * (utils::sq(hm) - utils::sq(hr) + (hr + hn) * (zn - zr)); - - //#### Topographic source term - //In the case of adaptive refinement, care must be taken to ensure - // well - balancing at coarse / fine faces(see[notes / balanced.tm]()). * / - - if ((iy == blockDim.x) && levTL < lev)//(ix==16) i.e. in the right halo + + T hi_boundary = hi; + T zi_boundary = zi; + T hn_boundary = hn; + T zn_boundary = zn; + + // Original: if ((iy == blockDim.x) && levTL < lev) + // Assuming blockDim.x here means the Y-extent of the block (STATIC_MAX_BLOCK_Y). + // This condition checks the "top" boundary of the current block. + if ((iy == STATIC_MAX_BLOCK_Y - 1) && levTL < lev) { - int jj = BLTL == ib ? floor(ix * (T)0.5) : floor(ix * (T)0.5) + blockDim.x / 2; - int itop = memloc(halowidth, blkmemwidth, jj, 0, TL);; - hi = XEv.h[itop]; - zi = zb[itop]; + int jj = BLTL == ib ? floor(ix * (T)0.5) : floor(ix * (T)0.5) + STATIC_MAX_BLOCK_X / 2; + int itop = memloc(halowidth, blkmemwidth, jj, 0, TL); // Accesses cell 0 (in Y) of neighbor block TL + hi_boundary = XEv.h[itop]; + zi_boundary = zb[itop]; } - if ((iy == 0) && levBL < lev)//(ix==16) i.e. in the right halo + // Original: if ((iy == 0) && levBL < lev) + // This condition checks the "bottom" boundary of the current block. + if ((iy == 0) && levBL < lev) { - int jj = TLBL == ib ? floor(ix * (T)0.5) : floor(ix * (T)0.5) + blockDim.x / 2; - int ibc = memloc(halowidth, blkmemwidth, jj, blockDim.x - 1, BL); - hn = XEv.h[ibc]; - zn = zb[ibc]; + int jj = TLBL == ib ? 
floor(ix * (T)0.5) : floor(ix * (T)0.5) + STATIC_MAX_BLOCK_X / 2; + // Accesses last cell (STATIC_MAX_BLOCK_Y - 1 in Y) of neighbor block BL + int ibc = memloc(halowidth, blkmemwidth, jj, STATIC_MAX_BLOCK_Y - 1, BL); + hn_boundary = XEv.h[ibc]; + zn_boundary = zb[ibc]; } - sl = ga * (utils::sq(hp) - utils::sq(hl) + (hl + hi) * (zi - zl)); - sr = ga * (utils::sq(hm) - utils::sq(hr) + (hr + hn) * (zn - zr)); + sl = ga * (utils::sq(hp) - utils::sq(hl) + (hl + hi_boundary) * (zi_boundary - zl)); + sr = ga * (utils::sq(hm) - utils::sq(hr) + (hr + hn_boundary) * (zn_boundary - zr)); - ////Flux update - - XFlux.Fhv[i] = fmv * fh; - XFlux.Fqvy[i] = fmv * (fu - sl); - XFlux.Sv[i] = fmv * (fu - sr); - XFlux.Fquy[i] = fmv * fv; + ////Flux update (writing to global memory) + XFlux.Fhv[global_idx_center] = fmv * fh; + XFlux.Fqvy[global_idx_center] = fmv * (fu - sl); + XFlux.Sv[global_idx_center] = fmv * (fu - sr); + XFlux.Fquy[global_idx_center] = fmv * fv; } else { - dtmax[i] = T(1.0) / epsi; - XFlux.Fhv[i] = T(0.0); - XFlux.Fqvy[i] = T(0.0); - XFlux.Sv[i] = T(0.0); - XFlux.Fquy[i] = T(0.0); + dtmax[global_idx_center] = T(1.0) / epsi; // Output to global memory + XFlux.Fhv[global_idx_center] = T(0.0); + XFlux.Fqvy[global_idx_center] = T(0.0); + XFlux.Sv[global_idx_center] = T(0.0); + XFlux.Fquy[global_idx_center] = T(0.0); } - } template __global__ void updateKurgYGPU(Param XParam, BlockP XBlock, EvolvingP XEv, GradientsP XGrad, FluxP XFlux, float* dtmax, float* zb); template __global__ void updateKurgYGPU(Param XParam, BlockP XBlock, EvolvingP XEv, GradientsP XGrad, FluxP XFlux, double* dtmax, double *zb); @@ -1013,11 +1136,29 @@ template __global__ void updateKurgYGPU(Param XParam, BlockP XBl template __global__ void updateKurgYATMGPU(Param XParam, BlockP XBlock, EvolvingP XEv, GradientsP XGrad, FluxP XFlux, T* dtmax, T* zb, T* Patm,T* dPdy) { - unsigned int halowidth = XParam.halowidth; - unsigned int blkmemwidth = blockDim.x + halowidth * 2; + // Shared memory declarations for Y-direction halo + __shared__ T s_h[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_zs[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_u[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_v[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_dhdy[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_dzsdy[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_dudy[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_dvdy[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_Patm[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_dPdy[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + + unsigned int halowidth = XParam.halowidth; // Runtime halowidth + unsigned int blkmemwidth = blockDim.x + halowidth * 2; // Keep original for memloc //unsigned int blksize = blkmemwidth * blkmemwidth; unsigned int ix = threadIdx.x; unsigned int iy = threadIdx.y; + + // Safety check for static shared memory bounds + if (ix >= STATIC_MAX_BLOCK_X || iy >= STATIC_MAX_BLOCK_Y) { + return; + } + unsigned int ibl = blockIdx.x; unsigned int ib = XBlock.active[ibl]; @@ -1040,116 +1181,134 @@ template __global__ void updateKurgYATMGPU(Param XParam, BlockP XBl T ybo = T(XParam.yo + XBlock.yo[ib]); - int i = memloc(halowidth, blkmemwidth, ix, iy, ib); - int ibot = 
memloc(halowidth, blkmemwidth, ix, iy - 1, ib); + // Global memory indices + int global_idx_center = memloc(halowidth, blkmemwidth, ix, iy, ib); + int global_idx_bottom_halo = memloc(halowidth, blkmemwidth, ix, iy - 1, ib); // iy-1 for "bottom" halo + + // Shared memory indices + int s_iy_current = iy + SHARED_MEM_HALO_WIDTH; // iy + 1 + int s_iy_halo = iy; // iy (for halo at iy=0) + + // Load data into shared memory + s_h[s_iy_current][ix] = XEv.h[global_idx_center]; + s_zs[s_iy_current][ix] = XEv.zs[global_idx_center]; + s_u[s_iy_current][ix] = XEv.u[global_idx_center]; + s_v[s_iy_current][ix] = XEv.v[global_idx_center]; + s_dhdy[s_iy_current][ix] = XGrad.dhdy[global_idx_center]; + s_dzsdy[s_iy_current][ix] = XGrad.dzsdy[global_idx_center]; + s_dudy[s_iy_current][ix] = XGrad.dudy[global_idx_center]; + s_dvdy[s_iy_current][ix] = XGrad.dvdy[global_idx_center]; + s_Patm[s_iy_current][ix] = Patm[global_idx_center]; + s_dPdy[s_iy_current][ix] = dPdy[global_idx_center]; + + if (iy == 0) { // Load "bottom" halo (iy-1 data) + s_h[s_iy_halo][ix] = XEv.h[global_idx_bottom_halo]; + s_zs[s_iy_halo][ix] = XEv.zs[global_idx_bottom_halo]; + s_u[s_iy_halo][ix] = XEv.u[global_idx_bottom_halo]; + s_v[s_iy_halo][ix] = XEv.v[global_idx_bottom_halo]; + s_dhdy[s_iy_halo][ix] = XGrad.dhdy[global_idx_bottom_halo]; + s_dzsdy[s_iy_halo][ix] = XGrad.dzsdy[global_idx_bottom_halo]; + s_dudy[s_iy_halo][ix] = XGrad.dudy[global_idx_bottom_halo]; + s_dvdy[s_iy_halo][ix] = XGrad.dvdy[global_idx_bottom_halo]; + s_Patm[s_iy_halo][ix] = Patm[global_idx_bottom_halo]; + s_dPdy[s_iy_halo][ix] = dPdy[global_idx_bottom_halo]; + } + + __syncthreads(); T cm = XParam.spherical ? calcCM(T(XParam.Radius), delta, ybo, iy) : T(1.0); T fmv = XParam.spherical ? calcFM(T(XParam.Radius), delta, ybo, T(iy)) : T(1.0); - T dhdyi = XGrad.dhdy[i]; - T dhdymin = XGrad.dhdy[ibot]; - T hi = XEv.h[i]; - T hn = XEv.h[ibot]; + // Access data from shared memory + T dhdyi = s_dhdy[s_iy_current][ix]; + T dhdymin = s_dhdy[s_iy_halo][ix]; + T hi = s_h[s_iy_current][ix]; + T hn = s_h[s_iy_halo][ix]; T dx, zi, zl, zn, zr, zlr, hl, up, hp, hr, um, hm, ga; - - if (hi > eps || hn > eps) { - hn = XEv.h[ibot]; + // hn is already from shared s_h[s_iy_halo][ix] dx = delta * T(0.5); - //zi = XEv.zs[i] - hi; - //zl = zi - dx * (XGrad.dzsdy[i] - dhdyi); - //zn = XEv.zs[ibot] - hn; - //zr = zn + dx * (XGrad.dzsdy[ibot] - dhdymin); - - zi = XEv.zs[i] - hi + XParam.Pa2m * Patm[i]; - zl = zi - dx * (XGrad.dzsdy[i] - dhdyi + XParam.Pa2m * dPdy[i]); - zn = XEv.zs[ibot] - hn + XParam.Pa2m * Patm[ibot]; - zr = zn + dx * (XGrad.dzsdy[ibot] - dhdymin + XParam.Pa2m * dPdy[ibot]); + zi = s_zs[s_iy_current][ix] - hi + XParam.Pa2m * s_Patm[s_iy_current][ix]; + zl = zi - dx * (s_dzsdy[s_iy_current][ix] - dhdyi + XParam.Pa2m * s_dPdy[s_iy_current][ix]); + zn = s_zs[s_iy_halo][ix] - hn + XParam.Pa2m * s_Patm[s_iy_halo][ix]; + zr = zn + dx * (s_dzsdy[s_iy_halo][ix] - dhdymin + XParam.Pa2m * s_dPdy[s_iy_halo][ix]); zlr = max(zl, zr); hl = hi - dx * dhdyi; - up = XEv.v[i] - dx * XGrad.dvdy[i]; + up = s_v[s_iy_current][ix] - dx * s_dvdy[s_iy_current][ix]; hp = max(T(0.0), hl + zl - zlr); hr = hn + dx * dhdymin; - um = XEv.v[ibot] + dx * XGrad.dvdy[ibot]; + um = s_v[s_iy_halo][ix] + dx * s_dvdy[s_iy_halo][ix]; hm = max(T(0.0), hr + zr - zlr); - ga = g * T(0.5); - - //// Reimann solver T fh, fu, fv, sl, sr, dt; - //solver below also modifies fh and fu dt = KurgSolver(g, delta, epsi, CFL, cm, fmv, hp, hm, up, um, fh, fu); - if (dt < dtmax[i]) + if (dt < dtmax[global_idx_center]) // Output 
to global memory { - dtmax[i] = dt; + dtmax[global_idx_center] = dt; } - - if (fh > T(0.0)) { - fv = (XEv.u[ibot] + dx * XGrad.dudy[ibot]) * fh; + fv = (s_u[s_iy_halo][ix] + dx * s_dudy[s_iy_halo][ix]) * fh; } else { - fv = (XEv.u[i] - dx * XGrad.dudy[i]) * fh; + fv = (s_u[s_iy_current][ix] - dx * s_dudy[s_iy_current][ix]) * fh; } - //fv = (fh > 0.f ? uu[ix + yminus*nx] + dx*dudy[ix + yminus*nx] : uu[i] - dx*dudy[i])*fh; - /** - #### Topographic source term - - In the case of adaptive refinement, care must be taken to ensure - well-balancing at coarse/fine faces (see [notes/balanced.tm]()). */ - //sl = ga * (utils::sq(hp) - utils::sq(hl) + (hl + hi) * (zi - zl)); - //sr = ga * (utils::sq(hm) - utils::sq(hr) + (hr + hn) * (zn - zr)); - - //#### Topographic source term - //In the case of adaptive refinement, care must be taken to ensure - // well - balancing at coarse / fine faces(see[notes / balanced.tm]()). * / - - if ((iy == blockDim.x) && levTL < lev)//(ix==16) i.e. in the right halo + + // Boundary condition handling + T hi_boundary = hi; + T zi_boundary = zi; // zi already incorporates s_Patm for current cell + T hn_boundary = hn; + T zn_boundary = zn; // zn already incorporates s_Patm for halo cell + + // Original: if ((iy == blockDim.x) && levTL < lev) + // Interpreted as top boundary of the current block. + if ((iy == STATIC_MAX_BLOCK_Y - 1) && levTL < lev) { - int jj = BLTL == ib ? floor(ix * (T)0.5) : floor(ix * (T)0.5) + blockDim.x / 2; - int itop = memloc(halowidth, blkmemwidth, jj, 0, TL);; - hi = XEv.h[itop]; - zi = zb[itop] + XParam.Pa2m * Patm[itop]; + int jj = BLTL == ib ? floor(ix * (T)0.5) : floor(ix * (T)0.5) + STATIC_MAX_BLOCK_X / 2; + int itop = memloc(halowidth, blkmemwidth, jj, 0, TL); + hi_boundary = XEv.h[itop]; + // Patm[itop] is a global access from a neighboring block (TL). + zi_boundary = zb[itop] + XParam.Pa2m * Patm[itop]; } - if ((iy == 0) && levBL < lev)//(ix==16) i.e. in the right halo + // Original: if ((iy == 0) && levBL < lev) + // Bottom boundary of the current block. + if ((iy == 0) && levBL < lev) { - int jj = TLBL == ib ? floor(ix * (T)0.5) : floor(ix * (T)0.5) + blockDim.x / 2; - int ibc = memloc(halowidth, blkmemwidth, jj, blockDim.x - 1, BL); - hn = XEv.h[ibc]; - zn = zb[ibc] + +XParam.Pa2m * Patm[ibc]; + int jj = TLBL == ib ? floor(ix * (T)0.5) : floor(ix * (T)0.5) + STATIC_MAX_BLOCK_X / 2; + int ibc = memloc(halowidth, blkmemwidth, jj, STATIC_MAX_BLOCK_Y - 1, BL); + hn_boundary = XEv.h[ibc]; + // Patm[ibc] is a global access from a neighboring block (BL). 
+ zn_boundary = zb[ibc] + XParam.Pa2m * Patm[ibc]; } - sl = ga * (utils::sq(hp) - utils::sq(hl) + (hl + hi) * (zi - zl)); - sr = ga * (utils::sq(hm) - utils::sq(hr) + (hr + hn) * (zn - zr)); - - ////Flux update + sl = ga * (utils::sq(hp) - utils::sq(hl) + (hl + hi_boundary) * (zi_boundary - zl)); + sr = ga * (utils::sq(hm) - utils::sq(hr) + (hr + hn_boundary) * (zn_boundary - zr)); - XFlux.Fhv[i] = fmv * fh; - XFlux.Fqvy[i] = fmv * (fu - sl); - XFlux.Sv[i] = fmv * (fu - sr); - XFlux.Fquy[i] = fmv * fv; + ////Flux update (writing to global memory) + XFlux.Fhv[global_idx_center] = fmv * fh; + XFlux.Fqvy[global_idx_center] = fmv * (fu - sl); + XFlux.Sv[global_idx_center] = fmv * (fu - sr); + XFlux.Fquy[global_idx_center] = fmv * fv; } else { - dtmax[i] = T(1.0) / epsi; - XFlux.Fhv[i] = T(0.0); - XFlux.Fqvy[i] = T(0.0); - XFlux.Sv[i] = T(0.0); - XFlux.Fquy[i] = T(0.0); + dtmax[global_idx_center] = T(1.0) / epsi; // Output to global memory + XFlux.Fhv[global_idx_center] = T(0.0); + XFlux.Fqvy[global_idx_center] = T(0.0); + XFlux.Sv[global_idx_center] = T(0.0); + XFlux.Fquy[global_idx_center] = T(0.0); } - } template __global__ void updateKurgYATMGPU(Param XParam, BlockP XBlock, EvolvingP XEv, GradientsP XGrad, FluxP XFlux, float* dtmax, float* zb, float* Patm, float* dPdy); template __global__ void updateKurgYATMGPU(Param XParam, BlockP XBlock, EvolvingP XEv, GradientsP XGrad, FluxP XFlux, double* dtmax, double* zb, double* Patm, double* dPdy); @@ -1158,11 +1317,25 @@ template __global__ void updateKurgYATMGPU(Param XParam, BlockP template __global__ void AddSlopeSourceYGPU(Param XParam, BlockP XBlock, EvolvingP XEv, GradientsP XGrad, FluxP XFlux, T* zb) { - unsigned int halowidth = XParam.halowidth; - unsigned int blkmemwidth = blockDim.x + halowidth * 2; + // Shared memory declarations for Y-direction halo + __shared__ T s_h[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_zs[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_dhdy[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_dzsdy[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_Fqvy[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + __shared__ T s_Sv[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X]; + + unsigned int halowidth = XParam.halowidth; // Runtime halowidth + unsigned int blkmemwidth = blockDim.x + halowidth * 2; // Keep original for memloc //unsigned int blksize = blkmemwidth * blkmemwidth; unsigned int ix = threadIdx.x; unsigned int iy = threadIdx.y; + + // Safety check for static shared memory bounds + if (ix >= STATIC_MAX_BLOCK_X || iy >= STATIC_MAX_BLOCK_Y) { + return; + } + unsigned int ibl = blockIdx.x; unsigned int ib = XBlock.active[ibl]; @@ -1188,33 +1361,56 @@ template __global__ void AddSlopeSourceYGPU(Param XParam, BlockP XB T ybo = T(XParam.yo + XBlock.yo[ib]); - - int i = memloc(halowidth, blkmemwidth, ix, iy, ib); - int ibot = memloc(halowidth, blkmemwidth, ix, iy - 1, ib); - - + // Global memory indices + int global_idx_center = memloc(halowidth, blkmemwidth, ix, iy, ib); + int global_idx_bottom_halo = memloc(halowidth, blkmemwidth, ix, iy - SHARED_MEM_HALO_WIDTH, ib); // iy-1 + + // Shared memory indices + int s_iy_current = iy + SHARED_MEM_HALO_WIDTH; // iy + 1 + int s_iy_halo = iy; // iy (for halo at iy=0) + + // Load data into shared memory + s_h[s_iy_current][ix] = XEv.h[global_idx_center]; + s_zs[s_iy_current][ix] = XEv.zs[global_idx_center]; + 
s_dhdy[s_iy_current][ix] = XGrad.dhdy[global_idx_center]; + s_dzsdy[s_iy_current][ix] = XGrad.dzsdy[global_idx_center]; + s_Fqvy[s_iy_current][ix] = XFlux.Fqvy[global_idx_center]; + s_Sv[s_iy_current][ix] = XFlux.Sv[global_idx_center]; + + if (iy == 0) { // Load "bottom" halo (iy-1 data) + s_h[s_iy_halo][ix] = XEv.h[global_idx_bottom_halo]; + s_zs[s_iy_halo][ix] = XEv.zs[global_idx_bottom_halo]; + s_dhdy[s_iy_halo][ix] = XGrad.dhdy[global_idx_bottom_halo]; + s_dzsdy[s_iy_halo][ix] = XGrad.dzsdy[global_idx_bottom_halo]; + // Fluxes s_Fqvy and s_Sv for halo are not strictly needed as they are outputs, + // but loading them or zeroing might prevent uninitialized reads if logic changes. + // For now, we'll match the pattern of loading inputs. + s_Fqvy[s_iy_halo][ix] = XFlux.Fqvy[global_idx_bottom_halo]; + s_Sv[s_iy_halo][ix] = XFlux.Sv[global_idx_bottom_halo]; + } + + __syncthreads(); - //T cm = T(1.0); + //T cm = T(1.0); // Not used in this kernel T fmv = XParam.spherical ? calcFM(T(XParam.Radius), delta, ybo, T(iy)) : T(1.0); T dx, zi, zl, zn, zr, zlr, hl, hp, hr, hm; - T dhdyi = XGrad.dhdy[i]; - T dhdymin = XGrad.dhdy[ibot]; - T hi = XEv.h[i]; - T hn = XEv.h[ibot]; - + // Access data from shared memory + T dhdyi = s_dhdy[s_iy_current][ix]; + T dhdymin = s_dhdy[s_iy_halo][ix]; + T hi = s_h[s_iy_current][ix]; + T hn = s_h[s_iy_halo][ix]; if (hi > eps || hn > eps) { - - // along X these are same as in Kurgannov dx = delta * T(0.5); - zi = XEv.zs[i] - hi; + zi = s_zs[s_iy_current][ix] - hi; + zl = zi - dx * (s_dzsdy[s_iy_current][ix] - dhdyi); - zl = zi - dx * (XGrad.dzsdy[i] - dhdyi); - zn = XEv.zs[ibot] - hn; - zr = zn + dx * (XGrad.dzsdy[ibot] - dhdymin); + zn = s_zs[s_iy_halo][ix] - hn; + zr = zn + dx * (s_dzsdy[s_iy_halo][ix] - dhdymin); + zlr = max(zl, zr); hl = hi - dx * dhdyi; @@ -1223,34 +1419,47 @@ template __global__ void AddSlopeSourceYGPU(Param XParam, BlockP XB hr = hn + dx * dhdymin; hm = max(T(0.0), hr + zr - zlr); + // Boundary condition handling + T hi_boundary = hi; + T zi_boundary = zi; + T hn_boundary = hn; + T zn_boundary = zn; - //#### Topographic source term - //In the case of adaptive refinement, care must be taken to ensure - // well - balancing at coarse / fine faces(see[notes / balanced.tm]()). * / - - if ((iy == blockDim.x) && levTL < lev)//(ix==16) i.e. in the right halo + // Original: if ((iy == blockDim.x) && levTL < lev) + // Interpreted as top boundary of the current block. + if ((iy == STATIC_MAX_BLOCK_Y - 1) && levTL < lev) { - int jj = BLTL == ib ? floor(ix * (T)0.5) : floor(ix * (T)0.5) + blockDim.x / 2; - int itop = memloc(halowidth, blkmemwidth, jj, 0, TL);; - hi = XEv.h[itop]; - zi = zb[itop]; + int jj = BLTL == ib ? floor(ix * (T)0.5) : floor(ix * (T)0.5) + STATIC_MAX_BLOCK_X / 2; + int itop = memloc(halowidth, blkmemwidth, jj, 0, TL); + hi_boundary = XEv.h[itop]; + zi_boundary = zb[itop]; } - if ((iy == 0) && levBL < lev)//(ix==16) i.e. in the right halo + // Original: if ((iy == 0) && levBL < lev) + // Bottom boundary of the current block. + if ((iy == 0) && levBL < lev) { - int jj = TLBL == ib ? floor(ix * (T)0.5) : floor(ix * (T)0.5) + blockDim.x / 2; - int ibc = memloc(halowidth, blkmemwidth, jj, blockDim.x - 1, BL); - hn = XEv.h[ibc]; - zn = zb[ibc]; + int jj = TLBL == ib ? 
floor(ix * (T)0.5) : floor(ix * (T)0.5) + STATIC_MAX_BLOCK_X / 2; + int ibc = memloc(halowidth, blkmemwidth, jj, STATIC_MAX_BLOCK_Y - 1, BL); + hn_boundary = XEv.h[ibc]; + zn_boundary = zb[ibc]; } T sl, sr; - sl = ga * (utils::sq(hp) - utils::sq(hl) + (hl + hi) * (zi - zl)); - sr = ga * (utils::sq(hm) - utils::sq(hr) + (hr + hn) * (zn - zr)); + sl = ga * (utils::sq(hp) - utils::sq(hl) + (hl + hi_boundary) * (zi_boundary - zl)); + sr = ga * (utils::sq(hm) - utils::sq(hr) + (hr + hn_boundary) * (zn_boundary - zr)); + + // Update flux components in shared memory + s_Fqvy[s_iy_current][ix] = s_Fqvy[s_iy_current][ix] - fmv * sl; + s_Sv[s_iy_current][ix] = s_Sv[s_iy_current][ix] - fmv * sr; + __syncthreads(); // Synchronize before writing back to global memory - XFlux.Fqvy[i] = XFlux.Fqvy[i] - fmv * sl; - XFlux.Sv[i] = XFlux.Sv[i] - fmv * sr; + // Write results back to global memory + XFlux.Fqvy[global_idx_center] = s_Fqvy[s_iy_current][ix]; + XFlux.Sv[global_idx_center] = s_Sv[s_iy_current][ix]; } + // If (hi <= eps && hn <= eps), the original code does nothing to Fqvy or Sv, + // so no "else" branch is needed to write back unmodified values. } template __global__ void AddSlopeSourceYGPU(Param XParam, BlockP XBlock, EvolvingP XEv, GradientsP XGrad, FluxP XFlux, float* zb); template __global__ void AddSlopeSourceYGPU(Param XParam, BlockP XBlock, EvolvingP XEv, GradientsP XGrad, FluxP XFlux, double* zb); diff --git a/src/Mainloop.cu b/src/Mainloop.cu index 8f5503720..e9e69dd1d 100755 --- a/src/Mainloop.cu +++ b/src/Mainloop.cu @@ -100,7 +100,7 @@ template void DebugLoop(Param& XParam, Forcing XForcing, Model< fillHaloGPU(XParam, XModel_g.blocks, XLoop.streams[0], XModel_g.zb); cudaStreamDestroy(XLoop.streams[0]); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.zb, XModel_g.grad.dzbdx, XModel_g.grad.dzbdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.zb, XModel_g.grad.dzbdx, XModel_g.grad.dzbdy); CUDA_CHECK(cudaDeviceSynchronize()); gradientHaloGPU(XParam, XModel_g.blocks, XModel_g.zb, XModel_g.grad.dzbdx, XModel_g.grad.dzbdy); @@ -280,7 +280,7 @@ template void pointoutputstep(Param XParam, Loop &XLoop, Model X XLoop.TSAllout[o].push_back(stepread); - storeTSout << > > (XParam,(int)XParam.TSnodesout.size(), o, XLoop.nTSsteps, XParam.TSnodesout[o].block, XParam.TSnodesout[o].i, XParam.TSnodesout[o].j, XModel_g.bndblk.Tsout, XModel_g.evolv, XModel_g.TSstore); + storeTSout <<>> (XParam,(int)XParam.TSnodesout.size(), o, XLoop.nTSsteps, XParam.TSnodesout[o].block, XParam.TSnodesout[o].i, XParam.TSnodesout[o].j, XModel_g.bndblk.Tsout, XModel_g.evolv, XModel_g.TSstore); CUDA_CHECK(cudaDeviceSynchronize()); } //CUDA_CHECK(cudaDeviceSynchronize()); diff --git a/src/Meanmax.cu b/src/Meanmax.cu index 777227126..58996dc04 100755 --- a/src/Meanmax.cu +++ b/src/Meanmax.cu @@ -12,11 +12,11 @@ template void Calcmeanmax(Param XParam, Loop& XLoop, Model XMode { if (XParam.GPUDEVICE >= 0) { - addavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmean.h, XModel_g.evolv.h); - addavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmean.zs, XModel_g.evolv.zs); - addavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmean.u, XModel_g.evolv.u); - addavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmean.v, 
XModel_g.evolv.v); - addUandhU_GPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evolv.h, XModel_g.evolv.u, XModel_g.evolv.v, XModel_g.evmean.U, XModel_g.evmean.hU); + addavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmean.h, XModel_g.evolv.h); + addavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmean.zs, XModel_g.evolv.zs); + addavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmean.u, XModel_g.evolv.u); + addavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmean.v, XModel_g.evolv.v); + addUandhU_GPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evolv.h, XModel_g.evolv.u, XModel_g.evolv.v, XModel_g.evmean.U, XModel_g.evmean.hU); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -39,12 +39,12 @@ template void Calcmeanmax(Param XParam, Loop& XLoop, Model XMode // devide by number of steps if (XParam.GPUDEVICE >= 0) { - divavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.h); - divavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.zs); - divavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.u); - divavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.v); - divavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.U); - divavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.hU); + divavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.h); + divavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.zs); + divavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.u); + divavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.v); + divavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.U); + divavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.hU); CUDA_CHECK(cudaDeviceSynchronize()); } else @@ -65,12 +65,12 @@ template void Calcmeanmax(Param XParam, Loop& XLoop, Model XMode { if (XParam.GPUDEVICE >= 0) { - max_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmax.h, XModel_g.evolv.h); - max_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmax.zs, XModel_g.evolv.zs); - max_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmax.u, XModel_g.evolv.u); - max_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmax.v, XModel_g.evolv.v); - max_Norm_GPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmax.U, XModel_g.evolv.u, XModel_g.evolv.v); - max_hU_GPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmax.hU, XModel_g.evolv.h, XModel_g.evolv.u, XModel_g.evolv.v); + max_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmax.h, XModel_g.evolv.h); + max_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmax.zs, XModel_g.evolv.zs); + max_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmax.u, XModel_g.evolv.u); + max_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmax.v, XModel_g.evolv.v); 
+ max_Norm_GPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmax.U, XModel_g.evolv.u, XModel_g.evolv.v); + max_hU_GPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmax.hU, XModel_g.evolv.h, XModel_g.evolv.u, XModel_g.evolv.v); CUDA_CHECK(cudaDeviceSynchronize()); } else @@ -88,7 +88,7 @@ template void Calcmeanmax(Param XParam, Loop& XLoop, Model XMode if (XParam.GPUDEVICE >= 0) { // Add value GPU - addwettime_GPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.wettime, XModel_g.evolv.h, T(XParam.wet_threshold), T(XLoop.dt)); + addwettime_GPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.wettime, XModel_g.evolv.h, T(XParam.wet_threshold), T(XLoop.dt)); } else { @@ -169,17 +169,17 @@ template void resetmaxGPU(Param XParam, Loop XLoop, BlockP XBloc dim3 blockDim(XParam.blkwidth, XParam.blkwidth, 1); dim3 gridDim(XParam.nblk, 1, 1); - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.h); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.h); CUDA_CHECK(cudaDeviceSynchronize()); - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.zs); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.zs); CUDA_CHECK(cudaDeviceSynchronize()); - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.u); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.u); CUDA_CHECK(cudaDeviceSynchronize()); - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.v); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.v); CUDA_CHECK(cudaDeviceSynchronize()); - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.U); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.U); CUDA_CHECK(cudaDeviceSynchronize()); - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.hU); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.hU); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -217,12 +217,12 @@ template void resetmeanGPU(Param XParam, Loop XLoop, BlockP XBlo dim3 blockDim(XParam.blkwidth, XParam.blkwidth, 1); dim3 gridDim(XParam.nblk, 1, 1); // - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, T(0.0), XEv.h); - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, T(0.0), XEv.zs); - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, T(0.0), XEv.u); - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, T(0.0), XEv.v); - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, T(0.0), XEv.U); - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, T(0.0), XEv.hU); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, T(0.0), XEv.h); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, T(0.0), XEv.zs); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, T(0.0), XEv.u); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, T(0.0), XEv.v); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, T(0.0), 
XEv.U); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, T(0.0), XEv.hU); CUDA_CHECK(cudaDeviceSynchronize()); @@ -244,7 +244,7 @@ template void resetvalGPU(Param XParam, BlockP XBlock, T*& var, T v { dim3 blockDim(XParam.blkwidth, XParam.blkwidth, 1); dim3 gridDim(XParam.nblk, 1, 1); - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, val, var); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, val, var); CUDA_CHECK(cudaDeviceSynchronize()); } diff --git a/src/MemManagement.cu b/src/MemManagement.cu index f6415a682..b92ae70b6 100755 --- a/src/MemManagement.cu +++ b/src/MemManagement.cu @@ -1,6 +1,10 @@ #include "MemManagement.h" + +#define MEMORY_ALIGNMENT 4096 +#define ALIGN_UP(x,size) ( ((size_t)x+(size-1))&(~(size-1)) ) + __host__ int memloc(Param XParam, int i, int j, int ib) { return (i+XParam.halowidth) + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize; @@ -203,6 +207,9 @@ void AllocateCPU(int nblk, int blksize, Param XParam, Model& XModel) //this will be eventually reallocated later AllocateCPU(1, 1, XModel.bndblk.river); XModel.bndblk.nblkriver = 1; + + AllocateCPU(1, 1, XModel.bndblk.Riverinfo.Xbidir); + AllocateCPU(1, 1, XModel.bndblk.Riverinfo.Xridib); } // preallocate 1 block along all bnds //this will be eventually reallocated later @@ -359,6 +366,82 @@ template void ReallocArray(int nblk, int blksize, Param XParam, Model(int nblk, int blksize, Param XParam, Model& XModel); + + +template void AllocateMappedMemCPU(int nx, int ny,int gpudevice, T*& z) +{ + + bool bPinGenericMemory; + cudaDeviceProp deviceProp; +#if defined(__APPLE__) || defined(MACOSX) + bPinGenericMemory = false; // Generic Pinning of System Paged memory is not currently supported on Mac OSX +#else + bPinGenericMemory = true; +#endif + + // Here there should be a limit for cudar version less than 4.000 + + + if (bPinGenericMemory) + { + //printf("> Using Generic System Paged Memory (malloc)\n"); + } + else + { + //printf("> Using CUDA Host Allocated (cudaHostAlloc)\n"); + } + if (gpudevice >= 0) + { + cudaGetDeviceProperties(&deviceProp, gpudevice); + + if (!deviceProp.canMapHostMemory) + { + fprintf(stderr, "Device %d does not support mapping CPU host memory!\n", gpudevice); + bPinGenericMemory = false; + } + } + size_t bytes = nx * ny * sizeof(T); + if (bPinGenericMemory) + { + + + + T* a_UA = (T*)malloc(bytes + MEMORY_ALIGNMENT); + + + // We need to ensure memory is aligned to 4K (so we will need to padd memory accordingly) + z = (T*)ALIGN_UP(a_UA, MEMORY_ALIGNMENT); + + if (gpudevice >= 0) + { + CUDA_CHECK(cudaHostRegister(z, bytes, cudaHostRegisterMapped)); + } + + } + else + { + + //flags = cudaHostAllocMapped; + CUDA_CHECK(cudaHostAlloc((void**)&z, bytes, cudaHostAllocMapped)); + + + } + + +} +template void AllocateMappedMemCPU(int nx, int ny, int gpudevice, int*& z); +template void AllocateMappedMemCPU(int nx, int ny, int gpudevice, float*& z); +template void AllocateMappedMemCPU(int nx, int ny, int gpudevice, double*& z); + +template void AllocateMappedMemGPU(int nx, int ny, int gpudevice, T*& z_g, T* z) +{ + CUDA_CHECK(cudaHostGetDevicePointer((void**)&z_g, (void*)z, 0)); +} +template void AllocateMappedMemGPU(int nx, int ny, int gpudevice, int*& z_g, int* z); +template void AllocateMappedMemGPU(int nx, int ny, int gpudevice,float*& z_g, float* z); +template void AllocateMappedMemGPU(int nx, int ny, int gpudevice, double*& z_g, double* z); + + template void AllocateGPU(int nx, int ny, T*& z_g) { 
CUDA_CHECK(cudaMalloc((void**)& z_g, nx * ny * sizeof(T))); diff --git a/src/MemManagement.h b/src/MemManagement.h index 9eee09f4c..ae550777f 100755 --- a/src/MemManagement.h +++ b/src/MemManagement.h @@ -25,6 +25,9 @@ template void ReallocArray(int nblk, int blksize, EvolvingP& Ev); template void ReallocArray(int nblk, int blksize, EvolvingP_M& Ev); template void ReallocArray(int nblk, int blksize, Param XParam, Model& XModel); +template void AllocateMappedMemCPU(int nx, int ny, int gpudevice, T*& z); + + template __host__ void FillCPU(int nx, int ny, T fillval, T*& zb); int memloc(Param XParam, int i, int j, int ib); @@ -33,5 +36,6 @@ __host__ __device__ int memloc(int halowidth, int blkmemwidth, int i, int j, int template void AllocateGPU(int nblk, int blksize, Param XParam, Model& XModel); template void AllocateGPU(int nx, int ny, T*& z_g); +template void AllocateMappedMemGPU(int nx, int ny, int gpudevice, T*& z_g, T* z); // End of global definition #endif diff --git a/src/Param.h b/src/Param.h index 40b8d11fa..04efe05c9 100644 --- a/src/Param.h +++ b/src/Param.h @@ -31,11 +31,12 @@ class Param { bool conserveElevation = false; //Switch to force the conservation of zs instead of h at the interface between coarse and fine blocks bool wetdryfix = true; // Switch to remove wet/dry instability (i.e. true reoves instability and false leaves the model as is) + bool ForceMassConserve = false; // Switch to enforce mass conservation only useful on steep slope - bool leftbnd = false; // bnd is forced (i.e. not a wall or neuman) - bool rightbnd = false; // bnd is forced (i.e. not a wall or neuman) - bool topbnd = false; // bnd is forced (i.e. not a wall or neuman) - bool botbnd = false; // bnd is forced (i.e. not a wall or neuman) + bool leftbnd = false; // bnd is forced (i.e. not a wall or neuman) // Not in use anymore + bool rightbnd = false; // bnd is forced (i.e. not a wall or neuman) // Not in use anymore + bool topbnd = false; // bnd is forced (i.e. not a wall or neuman) // Not in use anymore + bool botbnd = false; // bnd is forced (i.e. not a wall or neuman) // Not in use anymore int aoibnd = 0; // Boundary type for AOI: 0=wall; 1 neumann; 3 absorbing @@ -84,6 +85,7 @@ class Param { double outputtimestep = 0.0; //Number of seconds between netCDF outputs, 0.0 for none double endtime = std::numeric_limits::max(); // Total runtime in s, will be calculated based on bnd input as min(length of the shortest time series, user defined) and should be shorter than any time-varying forcing double totaltime = 0.0; // Total simulation time in s + double inittime = 0.0; // initital model time. 
At start of simulation inittime==totaltime double dtinit = -1; // Maximum initial time steps in s (should be positive, advice 0.1 if dry domain initialement) double dtmin = 0.0005; //Minimum accepted time steps in s (a lower value will be concidered a crash of the code, and stop the run) double bndrelaxtime = 3600.0; // Realxation time for absorbing boundary diff --git a/src/ReadForcing.cu b/src/ReadForcing.cu index 057d42140..fbe75840f 100755 --- a/src/ReadForcing.cu +++ b/src/ReadForcing.cu @@ -671,7 +671,72 @@ Polygon readbndpolysegment(bndsegment bnd, Param XParam) double yo = XParam.yo; double ymax = XParam.ymax; - if (case_insensitive_compare(bnd.polyfile,"left")==0) + + if (case_insensitive_compare(bnd.polyfile, "all") == 0) + { + va.x = xo - epsbnd; va.y = yo - epsbnd; + vb.x = xmax + epsbnd; vb.y = yo - epsbnd; + vc.x = xmax + epsbnd; vc.y = ymax + epsbnd; + vd.x = xo - epsbnd; vd.y = ymax + epsbnd; + + bndpoly.vertices.push_back(va); + bndpoly.vertices.push_back(vb); + bndpoly.vertices.push_back(vc); + bndpoly.vertices.push_back(vd); + bndpoly.vertices.push_back(va); + bndpoly.xmin = va.x; + bndpoly.xmax = vb.x; + bndpoly.ymin = va.y; + bndpoly.ymax = vd.y; + } + else if (case_insensitive_compare(bnd.polyfile, "file") == 0) + { + if (bnd.WLmap.uniform == 1) + { + + log("Warning: 'file' keyword used for uniform boundary, using 'all' instead"); + + va.x = xo - epsbnd; va.y = yo - epsbnd; + vb.x = xmax + epsbnd; vb.y = yo - epsbnd; + vc.x = xmax + epsbnd; vc.y = ymax + epsbnd; + vd.x = xo - epsbnd; vd.y = ymax + epsbnd; + + bndpoly.vertices.push_back(va); + bndpoly.vertices.push_back(vb); + bndpoly.vertices.push_back(vc); + bndpoly.vertices.push_back(vd); + bndpoly.vertices.push_back(va); + bndpoly.xmin = va.x; + bndpoly.xmax = vb.x; + bndpoly.ymin = va.y; + bndpoly.ymax = vd.y; + } + else + { + DynForcingP Df = readforcinghead(bnd.WLmap, XParam); + + + + + va.x = Df.xo - epsbnd; va.y = Df.yo - epsbnd; + vb.x = Df.xmax + epsbnd; vb.y = Df.yo - epsbnd; + vc.x = Df.xmax + epsbnd; vc.y = Df.ymax + epsbnd; + vd.x = Df.xo - epsbnd; vd.y = Df.ymax + epsbnd; + + bndpoly.vertices.push_back(va); + bndpoly.vertices.push_back(vb); + bndpoly.vertices.push_back(vc); + bndpoly.vertices.push_back(vd); + bndpoly.vertices.push_back(va); + bndpoly.xmin = va.x; + bndpoly.xmax = vb.x; + bndpoly.ymin = va.y; + bndpoly.ymax = vd.y; + + } + + } + else if (case_insensitive_compare(bnd.polyfile,"left")==0) { va.x = xo - epsbnd; va.y = yo; vb.x = xo + epsbnd; vb.y = yo; diff --git a/src/ReadInput.cu b/src/ReadInput.cu index 8526a8fe1..3131d1cb9 100644 --- a/src/ReadInput.cu +++ b/src/ReadInput.cu @@ -336,6 +336,21 @@ Param readparamstr(std::string line, Param param) } + parameterstr = "bndrelaxtime"; + parametervalue = findparameter(parameterstr, line); + if (!parametervalue.empty()) + { + param.bndrelaxtime = std::stod(parametervalue); + + } + parameterstr = "bndfiltertime"; + parametervalue = findparameter(parameterstr, line); + if (!parametervalue.empty()) + { + param.bndfiltertime = std::stod(parametervalue); + + } + parameterstr = "CFL"; parametervalue = findparameter(parameterstr, line); if (!parametervalue.empty()) @@ -376,6 +391,17 @@ Param readparamstr(std::string line, Param param) } + paramvec = { "MassConservation", "MassCon","forcemassconservation","forcevolumeconservation","Volumeconservation","VolumeCon", "ForceMassConserve", "ForceVolConserve" }; + parametervalue = findparameter(paramvec, line); + if (!parametervalue.empty()) + { + //param.totaltime = std::stod(parametervalue); + 
param.totaltime = readinputtimetxt(parametervalue, param.reftime); + param.ForceMassConserve = readparambool(parametervalue, param.ForceMassConserve); + + } + + parameterstr = "dtinit"; parametervalue = findparameter(parameterstr, line); if (!parametervalue.empty()) @@ -1135,6 +1161,12 @@ void checkparamsanity(Param& XParam, Forcing& XForcing) XParam.blkmemwidth = XParam.blkwidth + 2 * XParam.halowidth; XParam.blksize = utils::sq(XParam.blkmemwidth); + /////////////////////////////////////////// + // zsoffset + /////////////////////////////////////////// + + XParam.zsoffset = std::isnan(XParam.zsoffset) ? 0.0 : XParam.zsoffset; + /////////////////////////////////////////// // Read Bathy Information /////////////////////////////////////////// @@ -1336,8 +1368,10 @@ void checkparamsanity(Param& XParam, Forcing& XForcing) XParam.reftime = "2000-01-01T00:00:00"; } + XParam.inittime = XParam.totaltime; + log("Reference time: " + XParam.reftime); - log("Model Initial time: " + std::to_string(XParam.totaltime)); + log("Model Initial time: " + std::to_string(XParam.totaltime) + " ; " ); log("Model end time: " + std::to_string(XParam.endtime)); diff --git a/src/Read_netcdf.cu b/src/Read_netcdf.cu index 1f7701fb1..bec9caa46 100755 --- a/src/Read_netcdf.cu +++ b/src/Read_netcdf.cu @@ -546,6 +546,7 @@ int readnctime2(int ncid,char * timecoordname,std::string refdate,size_t nt, dou for (int it = 0; it < nt; it++) { time[it] = time[it] * fac + offset; + //printf("%f\n", time[it]); } ///status = nc_close(ncid); diff --git a/src/Setup_GPU.cu b/src/Setup_GPU.cu index 3a6e72405..421c63f3f 100755 --- a/src/Setup_GPU.cu +++ b/src/Setup_GPU.cu @@ -100,6 +100,33 @@ template void SetupGPU(Param &XParam, Model XModel,Forcing & XModel_g.bndblk.nblkriver = XModel.bndblk.nblkriver; AllocateGPU(XModel.bndblk.nblkriver, 1, XModel_g.bndblk.river); CopytoGPU(XModel.bndblk.nblkriver, 1, XModel.bndblk.river, XModel_g.bndblk.river); + + int nribmax = XModel.bndblk.Riverinfo.nribmax; + int nburmax = XModel.bndblk.Riverinfo.nburmax; + + XModel_g.bndblk.Riverinfo.nribmax = nribmax; + XModel_g.bndblk.Riverinfo.nburmax = nburmax; + + + AllocateMappedMemGPU(XForcing.rivers.size(), 1,XParam.GPUDEVICE, XModel_g.bndblk.Riverinfo.qnow_g,XModel.bndblk.Riverinfo.qnow); + XModel_g.bndblk.Riverinfo.qnow = XModel.bndblk.Riverinfo.qnow; + + + AllocateGPU(nribmax, nburmax, XModel_g.bndblk.Riverinfo.Xbidir); + AllocateGPU(nribmax, nburmax, XModel_g.bndblk.Riverinfo.Xridib); + CopytoGPU(nribmax, nburmax, XModel.bndblk.Riverinfo.Xbidir, XModel_g.bndblk.Riverinfo.Xbidir); + CopytoGPU(nribmax, nburmax, XModel.bndblk.Riverinfo.Xridib, XModel_g.bndblk.Riverinfo.Xridib); + + AllocateGPU(nribmax, nburmax, XModel_g.bndblk.Riverinfo.xstart); + AllocateGPU(nribmax, nburmax, XModel_g.bndblk.Riverinfo.xend); + AllocateGPU(nribmax, nburmax, XModel_g.bndblk.Riverinfo.ystart); + AllocateGPU(nribmax, nburmax, XModel_g.bndblk.Riverinfo.yend); + + CopytoGPU(nribmax, nburmax, XModel.bndblk.Riverinfo.xstart, XModel_g.bndblk.Riverinfo.xstart); + CopytoGPU(nribmax, nburmax, XModel.bndblk.Riverinfo.xend, XModel_g.bndblk.Riverinfo.xend); + CopytoGPU(nribmax, nburmax, XModel.bndblk.Riverinfo.ystart, XModel_g.bndblk.Riverinfo.ystart); + CopytoGPU(nribmax, nburmax, XModel.bndblk.Riverinfo.yend, XModel_g.bndblk.Riverinfo.yend); + } // Reset GPU mean and max arrays diff --git a/src/Testing.cu b/src/Testing.cu index d793d79aa..7c417f078 100644 --- a/src/Testing.cu +++ b/src/Testing.cu @@ -315,6 +315,12 @@ template bool Testing(Param XParam, Forcing XForcing, Model result 
= (wallbndleft & wallbndright & wallbndbot & wallbndtop) ? "successful" : "failed"; log("\t\tAOI bnd wall test : " + result); } + + if (mytest == 993) + { + //pinned pageable Memory test + TestPinMem(XParam, XModel, XModel_g); + } if (mytest == 994) { Testzbinit(XParam, XForcing, XModel, XModel_g); @@ -693,6 +699,8 @@ template bool Rivertest(T zsnit, int gpu) XParam.outmean = true; XParam.outtwet = true; + XParam.ForceMassConserve = true; + // create Model setup Model XModel; Model XModel_g; @@ -917,6 +925,7 @@ template bool MassConserveSteepSlope(T zsnit, int gpu) XParam.frictionmodel = 1; XParam.conserveElevation = false; + XParam.ForceMassConserve = true; // Enforce GPU/CPU XParam.GPUDEVICE = gpu; @@ -1197,7 +1206,7 @@ template bool reductiontest(Param XParam, Model XModel, Model XM if (XParam.GPUDEVICE >= 0) { - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, XLoop.hugeposval, XModel_g.time.dtmax); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, XLoop.hugeposval, XModel_g.time.dtmax); CUDA_CHECK(cudaDeviceSynchronize()); CopytoGPU(XParam.nblkmem, XParam.blksize, XModel.time.dtmax, XModel_g.time.dtmax); @@ -1252,7 +1261,7 @@ template bool CPUGPUtest(Param XParam, Model XModel, Model XModel InitArrayBUQ(XParam, XModel.blocks, T(0.0), XModel.evolv.v); - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, T(0.0), XModel_g.zb); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, T(0.0), XModel_g.zb); CUDA_CHECK(cudaDeviceSynchronize()); // Create some usefull vectors std::string evolvst[4] = { "h","zs","u","v" }; @@ -1308,15 +1317,15 @@ template bool CPUGPUtest(Param XParam, Model XModel, Model XModel //GPU gradients - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.evolv.h, XModel_g.grad.dhdx, XModel_g.grad.dhdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.evolv.h, XModel_g.grad.dhdx, XModel_g.grad.dhdy); CUDA_CHECK(cudaDeviceSynchronize()); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.evolv.zs, XModel_g.grad.dzsdx, XModel_g.grad.dzsdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.evolv.zs, XModel_g.grad.dzsdx, XModel_g.grad.dzsdy); CUDA_CHECK(cudaDeviceSynchronize()); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.evolv.u, XModel_g.grad.dudx, XModel_g.grad.dudy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.evolv.u, XModel_g.grad.dudx, XModel_g.grad.dudy); CUDA_CHECK(cudaDeviceSynchronize()); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.evolv.v, XModel_g.grad.dvdx, XModel_g.grad.dvdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.evolv.v, XModel_g.grad.dvdx, XModel_g.grad.dvdy); CUDA_CHECK(cudaDeviceSynchronize()); std::string 
gradst[8] = { "dhdx","dzsdx","dudx","dvdx","dhdy","dzsdy","dudy","dvdy" }; @@ -1361,14 +1370,14 @@ template bool CPUGPUtest(Param XParam, Model XModel, Model XModel updateKurgXCPU(XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); //GPU part - updateKurgXGPU << < gridDim, blockDimKX, 0 >> > (XParam, XModel_g.blocks, XModel_g.evolv, XModel_g.grad, XModel_g.flux, XModel_g.time.dtmax, XModel_g.zb); + updateKurgXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel_g.blocks, XModel_g.evolv, XModel_g.grad, XModel_g.flux, XModel_g.time.dtmax, XModel_g.zb); CUDA_CHECK(cudaDeviceSynchronize()); // Y- direction updateKurgYCPU(XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); - updateKurgYGPU << < gridDim, blockDimKY, 0 >> > (XParam, XModel_g.blocks, XModel_g.evolv, XModel_g.grad, XModel_g.flux, XModel_g.time.dtmax, XModel_g.zb); + updateKurgYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel_g.blocks, XModel_g.evolv, XModel_g.grad, XModel_g.flux, XModel_g.time.dtmax, XModel_g.zb); CUDA_CHECK(cudaDeviceSynchronize()); CompareCPUvsGPU(XParam, XModel, XModel_g, fluxVar, false); @@ -1391,7 +1400,7 @@ template bool CPUGPUtest(Param XParam, Model XModel, Model XModel advVar.push_back(advst[nv]); } updateEVCPU(XParam, XModel.blocks, XModel.evolv, XModel.flux, XModel.adv); - updateEVGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evolv, XModel_g.flux, XModel_g.adv); + updateEVGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evolv, XModel_g.flux, XModel_g.adv); CUDA_CHECK(cudaDeviceSynchronize()); CompareCPUvsGPU(XParam, XModel, XModel_g, advVar, false); @@ -1407,7 +1416,7 @@ template bool CPUGPUtest(Param XParam, Model XModel, Model XModel evoVar.push_back(evost[nv]); } AdvkernelCPU(XParam, XModel.blocks, T(0.0005), XModel.zb, XModel.evolv, XModel.adv, XModel.evolv_o); - AdvkernelGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, T(0.0005), XModel_g.zb, XModel_g.evolv, XModel_g.adv, XModel_g.evolv_o); + AdvkernelGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, T(0.0005), XModel_g.zb, XModel_g.evolv, XModel_g.adv, XModel_g.evolv_o); CUDA_CHECK(cudaDeviceSynchronize()); CompareCPUvsGPU(XParam, XModel, XModel_g, evoVar, false); @@ -1417,7 +1426,7 @@ template bool CPUGPUtest(Param XParam, Model XModel, Model XModel bottomfrictionCPU(XParam, XModel.blocks, T(0.5), XModel.cf, XModel.evolv_o); - bottomfrictionGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, T(0.5), XModel_g.cf, XModel_g.evolv_o); + bottomfrictionGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, T(0.5), XModel_g.cf, XModel_g.evolv_o); CUDA_CHECK(cudaDeviceSynchronize()); CompareCPUvsGPU(XParam, XModel, XModel_g, evoVar, false); @@ -1466,10 +1475,10 @@ template bool CPUGPUtest(Param XParam, Model XModel, Model XModel InitArrayBUQ(XParam, XModel.blocks, T(0.0), XModel.evolv.u); InitArrayBUQ(XParam, XModel.blocks, T(0.0), XModel.evolv.v); - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, T(0.0), XModel_g.evolv.u); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, T(0.0), XModel_g.evolv.u); CUDA_CHECK(cudaDeviceSynchronize()); - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, T(0.0), XModel_g.evolv.v); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, T(0.0), XModel_g.evolv.v); CUDA_CHECK(cudaDeviceSynchronize()); Forcing XForcing; @@ -1673,6 +1682,7 
@@ template bool RiverVolumeAdapt(Param XParam, T maxslope) XParam.minlevel = 1; XParam.maxlevel = 1; XParam.initlevel = 1; + XParam.ForceMassConserve = true; UnitestA=RiverVolumeAdapt(XParam, maxslope, false, false); @@ -2211,6 +2221,7 @@ template bool RiverOnBoundary(Param XParam,T slope, int Dir, int Bound XParam.mask = 999.0; XParam.outishift = 0; XParam.outjshift = 0; + XParam.ForceMassConserve = true; XParam.outputtimestep = 10.0;// XParam.endtime; @@ -2720,6 +2731,7 @@ template bool Raintest(T zsnit, int gpu, float alpha) //Specification of the test //XParam.test = 7; XParam.rainforcing = true; + XParam.ForceMassConserve = true; // Enforce GPU/CPU XParam.GPUDEVICE = gpu; @@ -3810,7 +3822,7 @@ template int TestGradientSpeed(Param XParam, Model XModel, Model // Record the start event cudaEventRecord(startA, NULL); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.zb, XModel_g.grad.dzbdx, XModel_g.grad.dzbdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.zb, XModel_g.grad.dzbdx, XModel_g.grad.dzbdy); CUDA_CHECK(cudaDeviceSynchronize()); // Record the stop event @@ -3833,7 +3845,7 @@ template int TestGradientSpeed(Param XParam, Model XModel, Model // Record the start event cudaEventRecord(startB, NULL); - gradientSM << < gridDim, blockDim >> > (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.zb, XModel_g.grad.dzsdx, XModel_g.grad.dzsdy); + gradientSM <<< gridDim, blockDim >>> (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.zb, XModel_g.grad.dzsdx, XModel_g.grad.dzsdy); CUDA_CHECK(cudaDeviceSynchronize()); // Record the stop event @@ -3856,7 +3868,7 @@ template int TestGradientSpeed(Param XParam, Model XModel, Model // Record the start event cudaEventRecord(startC, NULL); - gradientSMC << < gridDim, blockDim >> > (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.zb, XModel_g.grad.dhdx, XModel_g.grad.dhdy); + gradientSMC <<< gridDim, blockDim >>> (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.zb, XModel_g.grad.dhdx, XModel_g.grad.dhdy); CUDA_CHECK(cudaDeviceSynchronize()); // Record the stop event @@ -4847,6 +4859,91 @@ template int TestAIObnd(Param XParam, Model XModel, Model XModel } +template __global__ void vectoroffsetGPU(int nx, T offset, T* z) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < nx) + { + z[idx] = z[idx] + offset; + } +} + +template int TestPinMem(Param XParam, Model XModel, Model XModel_g) +{ + T* zf, *zf_g, * zf_recov; + + + int nx = 32; + int ny = 1; + + int nelem = nx * ny; + + + AllocateMappedMemCPU(nx, ny, XParam.GPUDEVICE, zf); + + + AllocateCPU(nx, ny, zf_recov); + + for (int i = 0; i < nx; i++) + { + for (int j = 0; j < ny; j++) + { + zf[i + j * nx] = i + j * nx +T(0.25); + + } + } + + T checkrem = T(0.0); + + if (XParam.GPUDEVICE >= 0) + { + AllocateMappedMemGPU(nx, ny, XParam.GPUDEVICE, zf_g, zf); + + + + dim3 block(16); + dim3 grid((unsigned int)ceil(nelem / (float)block.x)); + + vectoroffsetGPU <<>> (nelem, T(1.0), zf_g); + CUDA_CHECK(cudaDeviceSynchronize()); + + CUDA_CHECK(cudaMemcpy(zf_recov, zf_g, nx * ny * sizeof(T), cudaMemcpyDeviceToHost)); + + + + + for (int i = 0; i < nx; i++) + { + for 
(int j = 0; j < ny; j++) + { + + checkrem = checkrem + abs(zf[i + j * nx] - zf_recov[i + j * nx]); + } + } + } + int modelgood = checkrem < 1.e-6f; + + if (checkrem > 1.e-6f) + { + printf("\n Test Failed error = %e \n", checkrem); + return modelgood; + } + else + { + printf("\n Test Success error = %e \n", checkrem); + } + + + + return modelgood; +} +template int TestPinMem(Param XParam, Model XModel, Model XModel_g); +template int TestPinMem(Param XParam, Model XModel, Model XModel_g); + + + + template Forcing MakValleyBathy(Param XParam, T slope, bool bottop, bool flip) { // diff --git a/src/Updateforcing.cu b/src/Updateforcing.cu index 7865c97d3..5049a9103 100755 --- a/src/Updateforcing.cu +++ b/src/Updateforcing.cu @@ -81,7 +81,7 @@ void Forcingthisstep(Param XParam, double totaltime, DynForcingP &XDynFor } - //NextHDstep << > > (XParam.Rainongrid.nx, XParam.Rainongrid.ny, Rainbef_g, Rainaft_g); + //NextHDstep <<>> (XParam.Rainongrid.nx, XParam.Rainongrid.ny, Rainbef_g, Rainaft_g); //CUDA_CHECK(cudaDeviceSynchronize()); // Read the actual file data @@ -98,7 +98,9 @@ void Forcingthisstep(Param XParam, double totaltime, DynForcingP &XDynFor // Interpolate the forcing array to this time if (XParam.GPUDEVICE >= 0) { - InterpstepGPU << > > (XDynForcing.nx, XDynForcing.ny, XDynForcing.instep - 1, float(totaltime), float(XDynForcing.dt), XDynForcing.now_g, XDynForcing.before_g, XDynForcing.after_g); + float bftime = float(XDynForcing.to+XDynForcing.dt*(XDynForcing.instep-1)); + float aftime = float(XDynForcing.to + XDynForcing.dt * (XDynForcing.instep)); + InterpstepGPU <<>> (XDynForcing.nx, XDynForcing.ny, float(totaltime), bftime,aftime, XDynForcing.now_g, XDynForcing.before_g, XDynForcing.after_g); CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaMemcpyToArray(XDynForcing.GPU.CudArr, 0, 0, XDynForcing.now_g, XDynForcing.nx * XDynForcing.ny * sizeof(float), cudaMemcpyDeviceToDevice)); @@ -119,7 +121,7 @@ void Forcingthisstep(Param XParam, double totaltime, DynForcingP &XDynFor template __host__ void AddRiverForcing(Param XParam, Loop XLoop, std::vector XRivers, Model XModel) { - dim3 gridDimRiver(XModel.bndblk.nblkriver, 1, 1); + dim3 gridDimRiver(XModel.bndblk.Riverinfo.nburmax, 1, 1); dim3 blockDim(XParam.blkwidth, XParam.blkwidth, 1); T qnow; for (int Rin = 0; Rin < XRivers.size(); Rin++) @@ -134,17 +136,30 @@ template __host__ void AddRiverForcing(Param XParam, Loop XLoop, st } qnow = T(interptime(XRivers[Rin].flowinput[bndstep].q, XRivers[Rin].flowinput[max(bndstep - 1, 0)].q, XRivers[Rin].flowinput[bndstep].time - XRivers[Rin].flowinput[max(bndstep - 1, 0)].time, XLoop.totaltime - XRivers[Rin].flowinput[max(bndstep - 1, 0)].time)); + + XModel.bndblk.Riverinfo.qnow[Rin] = qnow / XRivers[Rin].disarea; + + } - if (XParam.GPUDEVICE >= 0) + if (XParam.GPUDEVICE >= 0) + { + for (int irib = 0; irib < XModel.bndblk.Riverinfo.nribmax; irib++) { - InjectRiverGPU <<>> (XParam, XRivers[Rin], qnow, XModel.bndblk.river, XModel.blocks, XModel.adv); + //InjectRiverGPU <<>> (XParam, XRivers[Rin], qnow, XModel.bndblk.river, XModel.blocks, XModel.adv); + //CUDA_CHECK(cudaDeviceSynchronize()); + InjectManyRiversGPU <<>> (XParam, irib, XModel.bndblk.Riverinfo, XModel.blocks, XModel.adv); CUDA_CHECK(cudaDeviceSynchronize()); } - else + + } + else + { + for (int Rin = 0; Rin < XRivers.size(); Rin++) { - InjectRiverCPU(XParam, XRivers[Rin], qnow, XModel.bndblk.nblkriver, XModel.bndblk.river, XModel.blocks, XModel.adv); + InjectRiverCPU(XParam, XRivers[Rin], T(XModel.bndblk.Riverinfo.qnow[Rin]), 
XModel.bndblk.nblkriver, XModel.bndblk.river, XModel.blocks, XModel.adv); } } + } template __host__ void AddRiverForcing(Param XParam, Loop XLoop, std::vector XRivers, Model XModel); template __host__ void AddRiverForcing(Param XParam, Loop XLoop, std::vector XRivers, Model XModel); @@ -190,6 +205,59 @@ template __global__ void InjectRiverGPU(Param XParam,River XRiver, T q template __global__ void InjectRiverGPU(Param XParam, River XRiver, float qnow, int* Riverblks, BlockP XBlock, AdvanceP XAdv); template __global__ void InjectRiverGPU(Param XParam, River XRiver, double qnow, int* Riverblks, BlockP XBlock, AdvanceP XAdv); +template __global__ void InjectManyRiversGPU(Param XParam,int irib, RiverInfo XRin, BlockP XBlock, AdvanceP XAdv) +{ + int halowidth = XParam.halowidth; + int blkmemwidth = blockDim.x + halowidth * 2; + + int ix = threadIdx.x; + int iy = threadIdx.y; + int ibl = blockIdx.x; + + int indx = ibl + irib * XRin.nburmax; + + int ib,rid,i; + + T xllo, yllo, xl, yb, xr, yt, levdx; + T rxst, ryst, rxnd, rynd; + + ib = XRin.Xbidir[indx]; + if (ib > -1) + { + + i = memloc(halowidth, blkmemwidth, ix, iy, ib); + rid = XRin.Xridib[indx]; + + levdx = calcres(T(XParam.dx), XBlock.level[ib]); + + xllo = T(XParam.xo + XBlock.xo[ib]); + yllo = T(XParam.yo + XBlock.yo[ib]); + + + xl = xllo + ix * levdx - T(0.5) * levdx; + yb = yllo + iy * levdx - T(0.5) * levdx; + + xr = xllo + ix * levdx + T(0.5) * levdx; + yt = yllo + iy * levdx + T(0.5) * levdx; + + rxst = XRin.xstart[indx]; + ryst = XRin.ystart[indx]; + rxnd = XRin.xend[indx]; + rynd = XRin.yend[indx]; + + + T qnow = XRin.qnow_g[rid]; // here we use qnow_g because qnow is a CPU pointer + if (OBBdetect(xl, xr, yb, yt, rxst, rxnd, ryst, rynd)) + { + XAdv.dh[i] += qnow; //was / T(XRiver.disarea) but this is done upstream now to be consistent with GPU Many river ops + + } + + + } + +} + template __host__ void InjectRiverCPU(Param XParam, River XRiver, T qnow, int nblkriver, int* Riverblks, BlockP XBlock, AdvanceP XAdv) { unsigned int ib; @@ -232,7 +300,7 @@ template __host__ void InjectRiverCPU(Param XParam, River XRiver, T qn //if (xx >= XForcing.rivers[Rin].xstart && xx <= XForcing.rivers[Rin].xend && yy >= XForcing.rivers[Rin].ystart && yy <= XForcing.rivers[Rin].yend) if (OBBdetect(xl, xr, yb, yt, T(XRiver.xstart),T(XRiver.xend), T(XRiver.ystart), T(XRiver.yend))) { - XAdv.dh[i] += qnow / T(XRiver.disarea); + XAdv.dh[i] += qnow ; //was / T(XRiver.disarea) but this is done upstream now to be consistent with GPU Many river ops } } @@ -296,6 +364,8 @@ template __global__ void AddrainforcingImplicitGPU(Param XParam, Loop< T Rainhh; + T hi = XEv.h[i]; + T x = XParam.xo + XBlock.xo[ib] + ix * delta; T y = XParam.yo + XBlock.yo[ib] + iy * delta; if (Rain.uniform) @@ -308,13 +378,17 @@ template __global__ void AddrainforcingImplicitGPU(Param XParam, Loop< } + Rainhh = max(Rainhh / T(1000.0) / T(3600.0) * T(XLoop.dt), T(0.0)) * XBlock.activeCell[i]; // convert from mm/hrs to m/s and + //printf("%f\n", Rainhh); + T qvol = hi / (hi + Rainhh); - Rainhh = max(Rainhh / T(1000.0) / T(3600.0) * XLoop.dt,T(0.0)); // convert from mm/hrs to m/s - - - XEv.h[i] += Rainhh * XBlock.activeCell[i]; - XEv.zs[i] += Rainhh * XBlock.activeCell[i]; - + XEv.h[i] = hi + Rainhh; + XEv.zs[i] += Rainhh; + if (hi > XParam.eps) + { + //XEv.u[i] = XEv.u[i] * qvol; + //XEv.v[i] = XEv.v[i] * qvol; + } } template __global__ void AddrainforcingImplicitGPU(Param XParam, Loop XLoop, BlockP XBlock, DynForcingP Rain, EvolvingP XEv); template __global__ void 
AddrainforcingImplicitGPU(Param XParam, Loop XLoop, BlockP XBlock, DynForcingP Rain, EvolvingP XEv); @@ -387,6 +461,8 @@ template __host__ void AddrainforcingImplicitCPU(Param XParam, Loop T delta = calcres(T(XParam.dx), XBlock.level[ib]); + T hi = XEv.h[i]; + T Rainhh; T x = T(XParam.xo) + XBlock.xo[ib] + ix * delta; @@ -402,10 +478,18 @@ template __host__ void AddrainforcingImplicitCPU(Param XParam, Loop } - Rainhh = max(Rainhh / T(1000.0) / T(3600.0) * T(XLoop.dt), T(0.0)); // convert from mm/hrs to m/s + Rainhh = max(Rainhh / T(1000.0) / T(3600.0) * T(XLoop.dt), T(0.0)) * XBlock.activeCell[i]; // convert from mm/hrs to m/s and + + T qvol = hi/(hi + Rainhh); - XEv.h[i] += Rainhh * XBlock.activeCell[i]; - XEv.zs[i] += Rainhh * XBlock.activeCell[i]; + XEv.h[i] = hi + Rainhh; + XEv.zs[i] += Rainhh; + + if (hi > XParam.eps) + { + XEv.u[i] = XEv.u[i] * qvol; + XEv.v[i] = XEv.v[i] * qvol; + } } } }
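A note on the shared-memory rewrite of updateKurgYGPU, updateKurgYATMGPU and AddSlopeSourceYGPU above: every thread stages its own cell into a statically sized __shared__ tile, the iy == 0 threads also load the row just below the block (the only off-row neighbour these Y-direction stencils read), and a single __syncthreads() fences the tile before any neighbour access. Values needed from a coarser neighbouring block at a level change are still read from global memory (XEv.h, zb, Patm), since they lie outside the tile. The sketch below strips the pattern down to one field; the names and output are placeholders, not the kernels above, and it assumes halowidth >= 1 and a launch with blockDim equal to the static tile dimensions (the real kernels keep an early return as a guard against larger launches).

// One-row "bottom halo" staging pattern (illustration only).
#define STATIC_MAX_BLOCK_X 16
#define STATIC_MAX_BLOCK_Y 16
#define SHARED_MEM_HALO_WIDTH 1

template <class T>
__global__ void stageBottomHalo(int halowidth, int blkmemwidth, int* active, T* h, T* out)
{
    // Row 0 of the tile holds the iy-1 halo; row iy+1 holds the block's own row iy.
    __shared__ T s_h[STATIC_MAX_BLOCK_Y + SHARED_MEM_HALO_WIDTH][STATIC_MAX_BLOCK_X];

    int ix = threadIdx.x;
    int iy = threadIdx.y;
    int ib = active[blockIdx.x];
    int blksize = blkmemwidth * blkmemwidth;

    // Same layout as memloc(): cell (ix, iy) of block ib, offset by the halo.
    int ic   = (ix + halowidth) + (iy + halowidth) * blkmemwidth + ib * blksize;
    int ibot = (ix + halowidth) + (iy - 1 + halowidth) * blkmemwidth + ib * blksize;

    s_h[iy + SHARED_MEM_HALO_WIDTH][ix] = h[ic];   // every thread loads its own cell
    if (iy == 0)
    {
        s_h[0][ix] = h[ibot];                      // first row also fills the halo row (already populated in global memory by fillHaloGPU)
    }
    __syncthreads();                               // tile must be complete before any neighbour read

    // The iy-1 neighbour now comes from shared memory instead of a second global load.
    out[ic] = s_h[iy + SHARED_MEM_HALO_WIDTH][ix] - s_h[iy][ix];
}

As the comment in the AddSlopeSourceYGPU hunk notes, halo entries that are only ever written back (Fqvy, Sv) are not strictly needed in the tile; loading them just keeps the pattern uniform.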
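On AllocateMappedMemCPU / AllocateMappedMemGPU and the TestPinMem test: the river discharge array Riverinfo.qnow is written on the host every step and read by InjectManyRiversGPU through qnow_g, a device alias of the same physical buffer, so no per-step cudaMemcpy is required. A minimal, self-contained version of the same zero-copy idiom is sketched below; the names are hypothetical and error checking is omitted.

// Zero-copy (mapped pinned) host memory, the idiom behind Riverinfo.qnow / qnow_g (sketch only).
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define MEMORY_ALIGNMENT 4096
#define ALIGN_UP(x, size) (((size_t)(x) + ((size) - 1)) & (~((size) - 1)))

__global__ void addOffset(int n, float offset, float* z)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) { z[i] = z[i] + offset; }
}

int main()
{
    const int n = 32;
    const size_t bytes = n * sizeof(float);

    // Over-allocate so the usable pointer can be aligned to a 4 KiB page,
    // then register that region as mapped pinned memory (requires canMapHostMemory).
    float* raw = (float*)malloc(bytes + MEMORY_ALIGNMENT);
    float* h = (float*)ALIGN_UP(raw, MEMORY_ALIGNMENT);
    cudaHostRegister(h, bytes, cudaHostRegisterMapped);

    // Device-side alias of the same physical pages.
    float* d = NULL;
    cudaHostGetDevicePointer((void**)&d, (void*)h, 0);

    for (int i = 0; i < n; i++) { h[i] = (float)i + 0.25f; }  // host writes
    addOffset<<<1, 64>>>(n, 1.0f, d);                         // device updates the same buffer
    cudaDeviceSynchronize();                                  // make the update visible to the host
    printf("h[0]=%.2f h[31]=%.2f\n", h[0], h[n - 1]);         // expect 1.25 and 32.25

    cudaHostUnregister(h);
    free(raw);                                                // free the original malloc pointer, not the aligned alias
    return 0;
}

Since qnow holds a single value per river, the gain is avoiding a tiny host-to-device copy every forcing step rather than bandwidth; for large arrays, mapped access across PCIe is generally slower than staging with explicit copies, so the pattern is best kept for small, frequently refreshed buffers like this one.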
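The Forcingthisstep change passes explicit bracket times to InterpstepGPU, bftime = to + dt*(instep - 1) and aftime = to + dt*instep, instead of a step index and interval, so a forcing series whose time axis does not start at zero is bracketed correctly (previously the bracket was presumably reconstructed inside the kernel as (instep - 1)*dt and instep*dt, which only matches when to == 0). As a worked example, with to = 43200 s, dt = 3600 s and instep = 5, a model time of totaltime = 59400 s now falls between bftime = 57600 s and aftime = 61200 s, giving a linear weight (totaltime - bftime) / (aftime - bftime) = 0.5 between before_g and after_g, assuming InterpstepGPU blends the two slices linearly.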
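In the reworked AddRiverForcing, the host first fills Riverinfo.qnow[Rin] = qnow / disarea for every river, then, on the GPU path, launches InjectManyRiversGPU once per river slot irib over a grid of nburmax CUDA blocks, synchronising between launches. Inside the kernel, indx = ibl + irib * nburmax selects one entry of the padded tables: Xbidir[indx] is the mesh-block id handled by CUDA block ibl for slot irib (a value of -1 marks an empty slot and the block does nothing), Xridib[indx] is the matching river id used to read qnow_g, and xstart/xend/ystart/yend[indx] give the river footprint tested against each cell's edges with OBBdetect. For example, with nburmax = 4, CUDA block ibl = 2 in the irib = 1 launch reads entry indx = 2 + 1 * 4 = 6. Splitting the work by irib presumably guarantees that no two CUDA blocks in the same launch inject into the same cells, so the plain XAdv.dh[i] += qnow update needs no atomics; the division by disarea that InjectRiverCPU used to apply per cell now happens once on the host, which is what the "done upstream" comments refer to.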
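The implicit rain source now separates the depth update from the momentum: Rainhh (already multiplied by activeCell) is added to h and zs, and the CPU path rescales u and v by qvol = hi / (hi + Rainhh) when hi > eps, so the depth-integrated momentum is unchanged by the added volume. For example, hi = 0.10 m, u = 1.0 m/s and Rainhh = 0.01 m give qvol = 0.10 / 0.11 = 0.909, hence h = 0.11 m, u = 0.909 m/s and h*u still 0.10 m^2/s. In the GPU kernel the corresponding u and v rescaling is left commented out in this patch, so the two paths currently differ on that point.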