From 3e17570d6be9fffb45e0f7bda42ab4828d5e547f Mon Sep 17 00:00:00 2001
From: Cyprien Bosserelle
Date: Wed, 25 Sep 2024 09:56:08 +1200
Subject: [PATCH 01/36] add structure for parallel processing of rivers

---
 src/Arrays.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/Arrays.h b/src/Arrays.h
index f71e604ae..4ca02dff6 100755
--- a/src/Arrays.h
+++ b/src/Arrays.h
@@ -82,6 +82,16 @@ struct maskinfo
 };
 
+struct RiverInfo
+{
+	int nbir;
+	int nburmax; // size of (max number of) unique block with rivers
+	int nribmax; // size of (max number of) rivers in one block
+	int* Xbidir; // array of block id for each river size()
+	int* Xridib; // array of river id in each block
+
+};
+
 // outzone info used to actually write the nc files (one nc file by zone, the default zone is the full domain)
 struct outzoneB

From 5918ee60efc718ccc33663e317248e506a5509c0 Mon Sep 17 00:00:00 2001
From: Cyprien Bosserelle
Date: Wed, 25 Sep 2024 14:35:28 +1200
Subject: [PATCH 02/36] try to build new array to parallelise rivers

---
 src/Arrays.h             |  4 +-
 src/InitialConditions.cu | 87 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 86 insertions(+), 5 deletions(-)

diff --git a/src/Arrays.h b/src/Arrays.h
index 4ca02dff6..3b1b01af5 100755
--- a/src/Arrays.h
+++ b/src/Arrays.h
@@ -87,8 +87,8 @@ struct RiverInfo
 	int nbir;
 	int nburmax; // size of (max number of) unique block with rivers
 	int nribmax; // size of (max number of) rivers in one block
-	int* Xbidir; // array of block id for each river size()
-	int* Xridib; // array of river id in each block
+	int* Xbidir; // array of block id for each river size(nburmax,nribmax)
+	int* Xridib; // array of river id in each block size(nburmax,nribmax)
 
 };
 
diff --git a/src/InitialConditions.cu b/src/InitialConditions.cu
index ca65a2bc8..5c693983d 100644
--- a/src/InitialConditions.cu
+++ b/src/InitialConditions.cu
@@ -336,7 +336,7 @@ template <class T> void InitRivers(Param XParam, Forcing<float> &XForcing, Model
 			XForcing.rivers[Rin].i = idis;
 			XForcing.rivers[Rin].j = jdis;
 			XForcing.rivers[Rin].block = blockdis;
-			XForcing.rivers[Rin].disarea = dischargeArea; // That is not valid for spherical grids
+			XForcing.rivers[Rin].disarea = dischargeArea; // That is valid for spherical grids
@@ -352,8 +352,7 @@ template <class T> void InitRivers(Param XParam, Forcing<float> &XForcing, Model
 		}
 	}
 
-
-
+	
 	//Now identify sort and unique blocks where rivers are being inserted
 
 	std::vector<int> activeRiverBlk;
@@ -376,6 +375,88 @@ template <class T> void InitRivers(Param XParam, Forcing<float> &XForcing, Model
 	{
 		XModel.bndblk.river[b] = activeRiverBlk[b];
 	}
+
+	// Setup the river info
+
+	int nburmax = activeRiverBlk.size();
+	int nribmax = 0;
+	for (int b = 0; b < activeRiverBlk.size(); b++)
+	{
+		int bur = activeRiverBlk[b];
+		int nriverinblock = 0;
+		for (int Rin = 0; Rin < XForcing.rivers.size(); Rin++)
+		{
+			std::vector<int> uniqblockforriver = XForcing.rivers[Rin].block;
+
+			std::sort(uniqblockforriver.begin(), uniqblockforriver.end());
+			uniqblockforriver.erase(std::unique(uniqblockforriver.begin(), uniqblockforriver.end()), uniqblockforriver.end());
+
+			for (int bir = 0; bir < uniqblockforriver.size(); bir++)
+			{
+				if (uniqblockforriver[bir] == bur)
+				{
+					nriverinblock = nriverinblock + 1;
+				}
+			}
+
+		}
+		nribmax=max(nribmax, nriverinblock)
+	}
+
+	// Allocate XXbidir and Xridib
+	AllocateCPU(nribmax, nburmax, XModel.bndblk.XRiverinfo.Xbidir);
+	AllocateCPU(nribmax, nburmax, XModel.bndblk.XRiverinfo.Xridib);
+
+	// Fill them with a flag value
+	FillCPU(nribmax, nburmax, -1, XModel.bndblk.XRiverinfo.Xbidir);
+	FillCPU(nribmax, nburmax, -1, XModel.bndblk.XRiverinfo.Xbidir);
+
+
+	std::vector<std::vector<int>> blocksalreadyin(nribmax, vector<int>(1, 1));
+
+	for (int iblk = 0; iblk < nribmax; iblk++)
+	{
+		blocksalreadyin[iblk][0] = -1;
+	}
+
+
+
+	for (int Rin = 0; Rin < XForcing.rivers.size(); Rin++)
+	{
+		std::vector<int> uniqblockforriver = XForcing.rivers[Rin].block;
+
+		std::sort(uniqblockforriver.begin(), uniqblockforriver.end());
+		uniqblockforriver.erase(std::unique(uniqblockforriver.begin(), uniqblockforriver.end()), uniqblockforriver.end());
+
+
+
+		for (int bir = 0; bir < uniqblockforriver.size(); bir++)
+		{
+
+			for (int iribm = 0; iribm < nribmax; iribm++)
+			{
+
+				if (std::find(blocksalreadyin[iribm].begin(), blocksalreadyin[iribm].end(), uniqblockforriver[bir]) != blocksalreadyin[iribm].end())
+				{
+					//Found;
+
+					continue;
+				}
+				else
+				{
+					//not found;
+
+					// add it to the list
+					// write to the array
+					break;
+				}
+			}
+
+		}
+
+	}
+
+
 }

From 3011e4df1ab0bb4c60524464191f159062749788 Mon Sep 17 00:00:00 2001
From: Cyprien
Date: Wed, 25 Sep 2024 21:54:28 +1200
Subject: [PATCH 03/36] Fix some compile issue

---
 src/Arrays.h             |  2 +-
 src/InitialConditions.cu | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/Arrays.h b/src/Arrays.h
index 3b1b01af5..84bf0f64f 100755
--- a/src/Arrays.h
+++ b/src/Arrays.h
@@ -150,7 +150,7 @@ struct BndblockP
 	int* top;
 	int* bot;
 
-	
+	RiverInfo Riverinfo;
 
 };
 
diff --git a/src/InitialConditions.cu b/src/InitialConditions.cu
index 5c693983d..62ec56486 100644
--- a/src/InitialConditions.cu
+++ b/src/InitialConditions.cu
@@ -400,19 +400,19 @@ template <class T> void InitRivers(Param XParam, Forcing<float> &XForcing, Model
 			}
 
 		}
-		nribmax=max(nribmax, nriverinblock)
+		nribmax = max(nribmax, nriverinblock);
 	}
 
 	// Allocate XXbidir and Xridib
-	AllocateCPU(nribmax, nburmax, XModel.bndblk.XRiverinfo.Xbidir);
-	AllocateCPU(nribmax, nburmax, XModel.bndblk.XRiverinfo.Xridib);
+	AllocateCPU(nribmax, nburmax, XModel.bndblk.Riverinfo.Xbidir);
+	AllocateCPU(nribmax, nburmax, XModel.bndblk.Riverinfo.Xridib);
 
 	// Fill them with a flag value
-	FillCPU(nribmax, nburmax, -1, XModel.bndblk.XRiverinfo.Xbidir);
-	FillCPU(nribmax, nburmax, -1, XModel.bndblk.XRiverinfo.Xbidir);
+	FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.Xbidir);
+	FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.Xbidir);
 
 
-	std::vector<std::vector<int>> blocksalreadyin(nribmax, vector<int>(1, 1));
+	std::vector<std::vector<int>> blocksalreadyin(nribmax, std::vector<int>(1, 1));
 
 	for (int iblk = 0; iblk < nribmax; iblk++)
 	{

From 65c82b63f506d6fee7d7e1bdfc4c59ac5386e939 Mon Sep 17 00:00:00 2001
From: Cyprien
Date: Wed, 25 Sep 2024 22:09:42 +1200
Subject: [PATCH 04/36] Fix compile issue

---
 src/Arrays.h             |  5 ++++-
 src/InitialConditions.cu | 12 ++++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/Arrays.h b/src/Arrays.h
index 84bf0f64f..0a912b48c 100755
--- a/src/Arrays.h
+++ b/src/Arrays.h
@@ -155,7 +155,10 @@ struct BndblockP
 
 };
 
-
+struct RiverBlk
+{
+	std::vector<int> block;
+};
 
diff --git a/src/InitialConditions.cu b/src/InitialConditions.cu
index 62ec56486..9e81de592 100644
--- a/src/InitialConditions.cu
+++ b/src/InitialConditions.cu
@@ -412,11 +412,13 @@ template <class T> void InitRivers(Param XParam, Forcing<float> &XForcing, Model
 	FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.Xbidir);
 
 
-	std::vector<std::vector<int>> blocksalreadyin(nribmax, std::vector<int>(1, 1));
-
+	std::vector<RiverBlk> blocksalreadyin;
+	RiverBlk emptyvec;
 	for (int iblk = 0; iblk < nribmax; iblk++)
 	{
-		blocksalreadyin[iblk][0] = -1;
+
+		blocksalreadyin.push_back(emptyvec);
+
 	}
 
 
@@ -436,7 +438,7 @@ template <class T> void InitRivers(Param 
XParam, Forcing &XForcing, Model for (int iribm = 0; iribm < nribmax; iribm++) { - if (std::find(blocksalreadyin[iribm].begin(), blocksalreadyin[iribm].end(), uniqblockforriver[bir]) != blocksalreadyin[iribm].end()) + if (std::find(blocksalreadyin[iribm].block.begin(), blocksalreadyin[iribm].block.end(), uniqblockforriver[bir]) != blocksalreadyin[iribm].block.end()) { //Found; @@ -447,6 +449,8 @@ template void InitRivers(Param XParam, Forcing &XForcing, Model //not found; // add it to the list + blocksalreadyin[iribm].block.push_back(uniqblockforriver[bir]); + // write to the array break; } From 6c7489d457a3a782515a90f8573e6167e1f1319a Mon Sep 17 00:00:00 2001 From: Cyprien Date: Wed, 25 Sep 2024 22:18:12 +1200 Subject: [PATCH 05/36] Fix code and fill Xriver array --- src/InitialConditions.cu | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/InitialConditions.cu b/src/InitialConditions.cu index 9e81de592..1f557e0a7 100644 --- a/src/InitialConditions.cu +++ b/src/InitialConditions.cu @@ -409,7 +409,7 @@ template void InitRivers(Param XParam, Forcing &XForcing, Model // Fill them with a flag value FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.Xbidir); - FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.Xbidir); + FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.Xridib); std::vector blocksalreadyin; @@ -421,8 +421,8 @@ template void InitRivers(Param XParam, Forcing &XForcing, Model } - - + //(n, 10) + std::vector iriv(nribmax,0); for (int Rin = 0; Rin < XForcing.rivers.size(); Rin++) { std::vector uniqblockforriver = XForcing.rivers[Rin].block; @@ -448,6 +448,11 @@ template void InitRivers(Param XParam, Forcing &XForcing, Model { //not found; + XModel.bndblk.Riverinfo.Xbidir[iriv[iribm] + iribm * nburmax] = uniqblockforriver[bir]; + XModel.bndblk.Riverinfo.Xridib[iriv[iribm] + iribm * nburmax] = Rin; + + iriv[iribm] = iriv[iribm] + 1; + // add it to the list blocksalreadyin[iribm].block.push_back(uniqblockforriver[bir]); From c5c3cbeb58d2b57c3a7c5debf7168d97b06057b4 Mon Sep 17 00:00:00 2001 From: Cyprien Bosserelle Date: Thu, 3 Oct 2024 14:04:12 +1300 Subject: [PATCH 06/36] fix allocation blunder --- src/InitialConditions.cu | 11 ++++++----- src/MemManagement.cu | 3 +++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/InitialConditions.cu b/src/InitialConditions.cu index 1f557e0a7..a5b80e078 100644 --- a/src/InitialConditions.cu +++ b/src/InitialConditions.cu @@ -404,8 +404,8 @@ template void InitRivers(Param XParam, Forcing &XForcing, Model } // Allocate XXbidir and Xridib - AllocateCPU(nribmax, nburmax, XModel.bndblk.Riverinfo.Xbidir); - AllocateCPU(nribmax, nburmax, XModel.bndblk.Riverinfo.Xridib); + ReallocArray(nribmax, nburmax, XModel.bndblk.Riverinfo.Xbidir); + ReallocArray(nribmax, nburmax, XModel.bndblk.Riverinfo.Xridib); // Fill them with a flag value FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.Xbidir); @@ -440,14 +440,14 @@ template void InitRivers(Param XParam, Forcing &XForcing, Model if (std::find(blocksalreadyin[iribm].block.begin(), blocksalreadyin[iribm].block.end(), uniqblockforriver[bir]) != blocksalreadyin[iribm].block.end()) { - //Found; + //block found already listed in that line; continue; } else { //not found; - + // write to the array XModel.bndblk.Riverinfo.Xbidir[iriv[iribm] + iribm * nburmax] = uniqblockforriver[bir]; XModel.bndblk.Riverinfo.Xridib[iriv[iribm] + iribm * nburmax] = Rin; @@ -456,7 +456,8 @@ template void InitRivers(Param XParam, Forcing &XForcing, Model // add it to the list 
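					// (illustrative example with made-up ids: if river R0 feeds blocks 7 and 12,
					//  R1 feeds block 7 and R2 feeds block 3, then row 0 of Xbidir/Xridib becomes
					//  [7,12,3,-1,...]/[0,0,2,-1,...] and row 1 becomes [7,-1,...]/[1,-1,...];
					//  a row never holds the same block twice, so every river in one row can be
					//  injected by a single kernel launch without two rivers writing to one block)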
blocksalreadyin[iribm].block.push_back(uniqblockforriver[bir]); - // write to the array + + break; } } diff --git a/src/MemManagement.cu b/src/MemManagement.cu index f6415a682..88554dc18 100755 --- a/src/MemManagement.cu +++ b/src/MemManagement.cu @@ -203,6 +203,9 @@ void AllocateCPU(int nblk, int blksize, Param XParam, Model& XModel) //this will be eventually reallocated later AllocateCPU(1, 1, XModel.bndblk.river); XModel.bndblk.nblkriver = 1; + + AllocateCPU(1, 1, XModel.bndblk.Riverinfo.Xbidir); + AllocateCPU(1, 1, XModel.bndblk.Riverinfo.Xridib); } // preallocate 1 block along all bnds //this will be eventually reallocated later From e0587699c40f59c0ab05db13bc126e785433a18b Mon Sep 17 00:00:00 2001 From: Cyprien Bosserelle Date: Thu, 10 Oct 2024 17:17:02 +1300 Subject: [PATCH 07/36] Add functions for Pinned memory --- src/MemManagement.cu | 69 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/src/MemManagement.cu b/src/MemManagement.cu index 88554dc18..c3145ed80 100755 --- a/src/MemManagement.cu +++ b/src/MemManagement.cu @@ -1,6 +1,10 @@ #include "MemManagement.h" + +#define MEMORY_ALIGNMENT 4096 +#define ALIGN_UP(x,size) ( ((size_t)x+(size-1))&(~(size-1)) ) + __host__ int memloc(Param XParam, int i, int j, int ib) { return (i+XParam.halowidth) + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize; @@ -362,6 +366,71 @@ template void ReallocArray(int nblk, int blksize, Param XParam, Model(int nblk, int blksize, Param XParam, Model& XModel); + + +template void AllocateMappedMemCPU(int nx, int ny,int gpudevice, T*& z) +{ + + cudaDeviceProp deviceProp; +#if defined(__APPLE__) || defined(MACOSX) + bPinGenericMemory = false; // Generic Pinning of System Paged memory is not currently supported on Mac OSX +#else + bPinGenericMemory = true; +#endif + + // Here there should be a limit for cudar version less than 4.000 + + + if (bPinGenericMemory) + { + printf("> Using Generic System Paged Memory (malloc)\n"); + } + else + { + printf("> Using CUDA Host Allocated (cudaHostAlloc)\n"); + } + cudaGetDeviceProperties(&deviceProp, gpudevice); + + if (!deviceProp.canMapHostMemory) + { + fprintf(stderr, "Device %d does not support mapping CPU host memory!\n", gpudevice); + bPinGenericMemory = false; + } + + if (bPinGenericMemory) + { + + size_t bytes = nx * ny * sizeof(T); + + T* a_UA = (float*)malloc(bytes + MEMORY_ALIGNMENT); + + + // We need to ensure memory is aligned to 4K (so we will need to padd memory accordingly) + z = (float*)ALIGN_UP(a_UA, MEMORY_ALIGNMENT); + + + checkCudaErrors(cudaHostRegister(z, bytes, cudaHostRegisterMapped)); + + + } + else + { + + flags = cudaHostAllocMapped; + checkCudaErrors(cudaHostAlloc((void**)&z, bytes, flags)); + + + } + + +} + +template void AllocateMappedMemGPU(int nx, int ny, int gpudevice, T*& z_g, T* z) +{ + checkCudaErrors(cudaHostGetDevicePointer((void**)&z_g, (void*)z, 0)); +} + + template void AllocateGPU(int nx, int ny, T*& z_g) { CUDA_CHECK(cudaMalloc((void**)& z_g, nx * ny * sizeof(T))); From 9cb83cb1071a0bc0e7a038898d47321fb143c87e Mon Sep 17 00:00:00 2001 From: Cyprien Bosserelle Date: Thu, 10 Oct 2024 17:29:50 +1300 Subject: [PATCH 08/36] Fix Compile paged mem --- src/MemManagement.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/MemManagement.cu b/src/MemManagement.cu index c3145ed80..1dd9c00fb 100755 --- a/src/MemManagement.cu +++ b/src/MemManagement.cu @@ -370,7 +370,7 @@ template void ReallocArray(int nblk, int blksize, Param XParam, Model void 
AllocateMappedMemCPU(int nx, int ny,int gpudevice, T*& z) { - + bool bPinGenericMemory; cudaDeviceProp deviceProp; #if defined(__APPLE__) || defined(MACOSX) bPinGenericMemory = false; // Generic Pinning of System Paged memory is not currently supported on Mac OSX @@ -416,8 +416,8 @@ template void AllocateMappedMemCPU(int nx, int ny,int gpudevice, T*& z else { - flags = cudaHostAllocMapped; - checkCudaErrors(cudaHostAlloc((void**)&z, bytes, flags)); + //flags = cudaHostAllocMapped; + checkCudaErrors(cudaHostAlloc((void**)&z, bytes, cudaHostAllocMapped)); } From 1b6c49946c6659480a1c2dcea019b7160759ddd9 Mon Sep 17 00:00:00 2001 From: Cyprien Bosserelle Date: Fri, 11 Oct 2024 16:09:52 +1300 Subject: [PATCH 09/36] Make test for paged mem --- src/Arrays.h | 1 + src/MemManagement.cu | 20 +++++++---- src/MemManagement.h | 4 +++ src/Testing.cu | 82 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 7 deletions(-) diff --git a/src/Arrays.h b/src/Arrays.h index 0a912b48c..138a055e6 100755 --- a/src/Arrays.h +++ b/src/Arrays.h @@ -89,6 +89,7 @@ struct RiverInfo int nribmax; // size of (max number of) rivers in one block int* Xbidir; // array of block id for each river size(nburmax,nribmax) int* Xridib; // array of river id in each block size(nburmax,nribmax) + float* qnow; }; diff --git a/src/MemManagement.cu b/src/MemManagement.cu index 1dd9c00fb..a3ea1a49e 100755 --- a/src/MemManagement.cu +++ b/src/MemManagement.cu @@ -396,20 +396,20 @@ template void AllocateMappedMemCPU(int nx, int ny,int gpudevice, T*& z fprintf(stderr, "Device %d does not support mapping CPU host memory!\n", gpudevice); bPinGenericMemory = false; } - + size_t bytes = nx * ny * sizeof(T); if (bPinGenericMemory) { - size_t bytes = nx * ny * sizeof(T); + - T* a_UA = (float*)malloc(bytes + MEMORY_ALIGNMENT); + T* a_UA = (T*)malloc(bytes + MEMORY_ALIGNMENT); // We need to ensure memory is aligned to 4K (so we will need to padd memory accordingly) - z = (float*)ALIGN_UP(a_UA, MEMORY_ALIGNMENT); + z = (T*)ALIGN_UP(a_UA, MEMORY_ALIGNMENT); - checkCudaErrors(cudaHostRegister(z, bytes, cudaHostRegisterMapped)); + CUDA_CHECK(cudaHostRegister(z, bytes, cudaHostRegisterMapped)); } @@ -417,18 +417,24 @@ template void AllocateMappedMemCPU(int nx, int ny,int gpudevice, T*& z { //flags = cudaHostAllocMapped; - checkCudaErrors(cudaHostAlloc((void**)&z, bytes, cudaHostAllocMapped)); + CUDA_CHECK(cudaHostAlloc((void**)&z, bytes, cudaHostAllocMapped)); } } +template void AllocateMappedMemCPU(int nx, int ny, int gpudevice, int*& z); +template void AllocateMappedMemCPU(int nx, int ny, int gpudevice, float*& z); +template void AllocateMappedMemCPU(int nx, int ny, int gpudevice, double*& z); template void AllocateMappedMemGPU(int nx, int ny, int gpudevice, T*& z_g, T* z) { - checkCudaErrors(cudaHostGetDevicePointer((void**)&z_g, (void*)z, 0)); + CUDA_CHECK(cudaHostGetDevicePointer((void**)&z_g, (void*)z, 0)); } +template void AllocateMappedMemGPU(int nx, int ny, int gpudevice, int*& z_g, int* z); +template void AllocateMappedMemGPU(int nx, int ny, int gpudevice,float*& z_g, float* z); +template void AllocateMappedMemGPU(int nx, int ny, int gpudevice, double*& z_g, double* z); template void AllocateGPU(int nx, int ny, T*& z_g) diff --git a/src/MemManagement.h b/src/MemManagement.h index 9eee09f4c..ae550777f 100755 --- a/src/MemManagement.h +++ b/src/MemManagement.h @@ -25,6 +25,9 @@ template void ReallocArray(int nblk, int blksize, EvolvingP& Ev); template void ReallocArray(int nblk, int blksize, EvolvingP_M& Ev); template 
void ReallocArray(int nblk, int blksize, Param XParam, Model& XModel); +template void AllocateMappedMemCPU(int nx, int ny, int gpudevice, T*& z); + + template __host__ void FillCPU(int nx, int ny, T fillval, T*& zb); int memloc(Param XParam, int i, int j, int ib); @@ -33,5 +36,6 @@ __host__ __device__ int memloc(int halowidth, int blkmemwidth, int i, int j, int template void AllocateGPU(int nblk, int blksize, Param XParam, Model& XModel); template void AllocateGPU(int nx, int ny, T*& z_g); +template void AllocateMappedMemGPU(int nx, int ny, int gpudevice, T*& z_g, T* z); // End of global definition #endif diff --git a/src/Testing.cu b/src/Testing.cu index d793d79aa..ef6697991 100644 --- a/src/Testing.cu +++ b/src/Testing.cu @@ -315,6 +315,12 @@ template bool Testing(Param XParam, Forcing XForcing, Model result = (wallbndleft & wallbndright & wallbndbot & wallbndtop) ? "successful" : "failed"; log("\t\tAOI bnd wall test : " + result); } + + if (mytest == 993) + { + //pinned pageable Memory test + TestPinMem(XParam, XModel, XModel_g); + } if (mytest == 994) { Testzbinit(XParam, XForcing, XModel, XModel_g); @@ -4847,6 +4853,82 @@ template int TestAIObnd(Param XParam, Model XModel, Model XModel } +template __global__ void vectoroffsetGPU(int nx, T offset, T* z) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < nx) + { + z[idx] = z[idx] + offset; + } +} + +template int TestPinMem(Param XParam, Model XModel, Model XModel_g) +{ + T* zf, *zf_g, * zf_recov; + + + int nx = 32; + int ny = 1; + + int nelem = nx * ny; + + + AllocateMappedMemCPU(nx, ny, XParam.GPUDEVICE, zf); + + + AllocateCPU(nx, ny, zf_recov); + + for (int i = 0; i < nx; i++) + { + for (int j = 0; j < ny; j++) + { + zf[i + j * nx] = i + j * nx +T(0.25); + + } + } + + AllocateMappedMemGPU(nx, ny, XParam.GPUDEVICE, zf_g, zf); + + + + dim3 block(16); + dim3 grid((unsigned int)ceil(nelem / (float)block.x)); + + vectoroffsetGPU << > > (nelem, T(1.0), zf_g); + CUDA_CHECK(cudaDeviceSynchronize()); + + CUDA_CHECK(cudaMemcpy(zf_recov, zf_g, nx*ny * sizeof(T), cudaMemcpyDeviceToHost)); + + T checkrem = T(0.0); + + for (int i = 0; i < nx; i++) + { + for (int j = 0; j < ny; j++) + { + + checkrem = checkrem + abs(zf[i + j * nx] - zf_recov[i + j * nx]); + } + } + + int modelgood = checkrem < 1.e-6f; + + if (checkrem > 1.e-6f) + { + printf("\n Test Failed error = %e \n", checkrem); + } + else + { + printf("\n Test Success error = %e \n", checkrem); + } + return modelgood; +} +template int TestPinMem(Param XParam, Model XModel, Model XModel_g); +template int TestPinMem(Param XParam, Model XModel, Model XModel_g); + + + + template Forcing MakValleyBathy(Param XParam, T slope, bool bottop, bool flip) { // From 9fa3614548395daec4280e8ca0e0c36188c9735b Mon Sep 17 00:00:00 2001 From: Cyprien Bosserelle Date: Mon, 14 Oct 2024 08:48:38 +1300 Subject: [PATCH 10/36] tweak test and remove out to screen --- src/MemManagement.cu | 4 ++-- src/Testing.cu | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/MemManagement.cu b/src/MemManagement.cu index a3ea1a49e..3b8f3726d 100755 --- a/src/MemManagement.cu +++ b/src/MemManagement.cu @@ -383,11 +383,11 @@ template void AllocateMappedMemCPU(int nx, int ny,int gpudevice, T*& z if (bPinGenericMemory) { - printf("> Using Generic System Paged Memory (malloc)\n"); + //printf("> Using Generic System Paged Memory (malloc)\n"); } else { - printf("> Using CUDA Host Allocated (cudaHostAlloc)\n"); + //printf("> Using CUDA Host Allocated (cudaHostAlloc)\n"); } 
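	// query the device properties first: the generic (malloc + cudaHostRegister) path is only usable when the device reports canMapHostMemory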
cudaGetDeviceProperties(&deviceProp, gpudevice); diff --git a/src/Testing.cu b/src/Testing.cu index ef6697991..aae7de61f 100644 --- a/src/Testing.cu +++ b/src/Testing.cu @@ -4916,11 +4916,15 @@ template int TestPinMem(Param XParam, Model XModel, Model XModel if (checkrem > 1.e-6f) { printf("\n Test Failed error = %e \n", checkrem); + return modelgood; } else { printf("\n Test Success error = %e \n", checkrem); } + + + return modelgood; } template int TestPinMem(Param XParam, Model XModel, Model XModel_g); From 00f02f33571707d622b89687a922bfe6d06e9180 Mon Sep 17 00:00:00 2001 From: Cyprien Bosserelle Date: Tue, 15 Oct 2024 13:05:16 +1300 Subject: [PATCH 11/36] modify test for Pin Meme for non-GPU --- src/MemManagement.cu | 1 + src/Testing.cu | 31 ++++++++++++++++++------------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/MemManagement.cu b/src/MemManagement.cu index 3b8f3726d..0d410a772 100755 --- a/src/MemManagement.cu +++ b/src/MemManagement.cu @@ -370,6 +370,7 @@ template void ReallocArray(int nblk, int blksize, Param XParam, Model void AllocateMappedMemCPU(int nx, int ny,int gpudevice, T*& z) { + bool bPinGenericMemory; cudaDeviceProp deviceProp; #if defined(__APPLE__) || defined(MACOSX) diff --git a/src/Testing.cu b/src/Testing.cu index aae7de61f..8784b33f4 100644 --- a/src/Testing.cu +++ b/src/Testing.cu @@ -4888,29 +4888,34 @@ template int TestPinMem(Param XParam, Model XModel, Model XModel } } - AllocateMappedMemGPU(nx, ny, XParam.GPUDEVICE, zf_g, zf); - + T checkrem = T(0.0); + if (XParam.GPUDEVICE >= 0) + { + AllocateMappedMemGPU(nx, ny, XParam.GPUDEVICE, zf_g, zf); - dim3 block(16); - dim3 grid((unsigned int)ceil(nelem / (float)block.x)); - vectoroffsetGPU << > > (nelem, T(1.0), zf_g); - CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemcpy(zf_recov, zf_g, nx*ny * sizeof(T), cudaMemcpyDeviceToHost)); + dim3 block(16); + dim3 grid((unsigned int)ceil(nelem / (float)block.x)); - T checkrem = T(0.0); + vectoroffsetGPU << > > (nelem, T(1.0), zf_g); + CUDA_CHECK(cudaDeviceSynchronize()); - for (int i = 0; i < nx; i++) - { - for (int j = 0; j < ny; j++) + CUDA_CHECK(cudaMemcpy(zf_recov, zf_g, nx * ny * sizeof(T), cudaMemcpyDeviceToHost)); + + + + + for (int i = 0; i < nx; i++) { + for (int j = 0; j < ny; j++) + { - checkrem = checkrem + abs(zf[i + j * nx] - zf_recov[i + j * nx]); + checkrem = checkrem + abs(zf[i + j * nx] - zf_recov[i + j * nx]); + } } } - int modelgood = checkrem < 1.e-6f; if (checkrem > 1.e-6f) From 9c59e3ea0592eb97eece517b86aad21463b6337f Mon Sep 17 00:00:00 2001 From: Cyprien Date: Thu, 17 Oct 2024 11:58:37 +1300 Subject: [PATCH 12/36] Fix CPU only MappedMem --- src/MemManagement.cu | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/MemManagement.cu b/src/MemManagement.cu index 0d410a772..b92ae70b6 100755 --- a/src/MemManagement.cu +++ b/src/MemManagement.cu @@ -390,12 +390,15 @@ template void AllocateMappedMemCPU(int nx, int ny,int gpudevice, T*& z { //printf("> Using CUDA Host Allocated (cudaHostAlloc)\n"); } - cudaGetDeviceProperties(&deviceProp, gpudevice); - - if (!deviceProp.canMapHostMemory) + if (gpudevice >= 0) { - fprintf(stderr, "Device %d does not support mapping CPU host memory!\n", gpudevice); - bPinGenericMemory = false; + cudaGetDeviceProperties(&deviceProp, gpudevice); + + if (!deviceProp.canMapHostMemory) + { + fprintf(stderr, "Device %d does not support mapping CPU host memory!\n", gpudevice); + bPinGenericMemory = false; + } } size_t bytes = nx * ny * sizeof(T); if 
(bPinGenericMemory) @@ -409,9 +412,10 @@ template void AllocateMappedMemCPU(int nx, int ny,int gpudevice, T*& z // We need to ensure memory is aligned to 4K (so we will need to padd memory accordingly) z = (T*)ALIGN_UP(a_UA, MEMORY_ALIGNMENT); - - CUDA_CHECK(cudaHostRegister(z, bytes, cudaHostRegisterMapped)); - + if (gpudevice >= 0) + { + CUDA_CHECK(cudaHostRegister(z, bytes, cudaHostRegisterMapped)); + } } else From 236b679acf87cce73c11590a186e19aaf3142a9b Mon Sep 17 00:00:00 2001 From: Cyprien Bosserelle Date: Thu, 17 Oct 2024 12:10:24 +1300 Subject: [PATCH 13/36] add allocations of river info in GPU XModel --- src/InitialConditions.cu | 5 +++++ src/Setup_GPU.cu | 15 +++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/src/InitialConditions.cu b/src/InitialConditions.cu index a5b80e078..8cae00920 100644 --- a/src/InitialConditions.cu +++ b/src/InitialConditions.cu @@ -403,6 +403,9 @@ template void InitRivers(Param XParam, Forcing &XForcing, Model nribmax = max(nribmax, nriverinblock); } + // Allocate Qnow as pinned memory + AllocateMappedMemCPU(XForcing.rivers.size(), 1, XModel.bndblk.Riverinfo.qnow); + // Allocate XXbidir and Xridib ReallocArray(nribmax, nburmax, XModel.bndblk.Riverinfo.Xbidir); ReallocArray(nribmax, nburmax, XModel.bndblk.Riverinfo.Xridib); @@ -411,6 +414,8 @@ template void InitRivers(Param XParam, Forcing &XForcing, Model FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.Xbidir); FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.Xridib); + XModel.bndblk.Riverinfo.nribmax = nribmax; + XModel.bndblk.Riverinfo.nburmax = nburmax; std::vector blocksalreadyin; RiverBlk emptyvec; diff --git a/src/Setup_GPU.cu b/src/Setup_GPU.cu index 3a6e72405..c3c8b64d0 100755 --- a/src/Setup_GPU.cu +++ b/src/Setup_GPU.cu @@ -100,6 +100,21 @@ template void SetupGPU(Param &XParam, Model XModel,Forcing & XModel_g.bndblk.nblkriver = XModel.bndblk.nblkriver; AllocateGPU(XModel.bndblk.nblkriver, 1, XModel_g.bndblk.river); CopytoGPU(XModel.bndblk.nblkriver, 1, XModel.bndblk.river, XModel_g.bndblk.river); + + int nribmax = XModel.bndblk.Riverinfo.nribmax; + int nburmax = XModel.bndblk.Riverinfo.nburmax; + + XModel_g.bndblk.Riverinfo.nribmax = nribmax; + XModel_g.bndblk.Riverinfo.nburmax = nburmax; + + + AllocateMappedMemGPU(XForcing.rivers.size(), 1, XModel_.bndblk.Riverinfo.qnow); + + + AllocateGPU(nribmax, nburmax, XModel_g.bndblk.Riverinfo.Xbidir); + AllocateGPU(nribmax, nburmax, XModel_g.bndblk.Riverinfo.Xridib); + CopytoGPU(nribmax, nburmax, XModel.bndblk.Riverinfo.Xbidir, XModel_g.bndblk.Riverinfo.Xbidir); + CopytoGPU(nribmax, nburmax, XModel.bndblk.Riverinfo.Xridib, XModel_g.bndblk.Riverinfo.Xridib); } // Reset GPU mean and max arrays From b8e8ed9f55fd52a7fe0bf5dc5dd101309b4394f9 Mon Sep 17 00:00:00 2001 From: Cyprien Bosserelle Date: Thu, 17 Oct 2024 14:40:22 +1300 Subject: [PATCH 14/36] Add missing variable but also added template to various classes --- src/Arrays.h | 14 +++++--- src/InitialConditions.cu | 25 ++++++++++++- src/Setup_GPU.cu | 14 +++++++- src/Updateforcing.cu | 78 ++++++++++++++++++++++++++++++++++++---- 4 files changed, 119 insertions(+), 12 deletions(-) diff --git a/src/Arrays.h b/src/Arrays.h index 138a055e6..4c59b63c9 100755 --- a/src/Arrays.h +++ b/src/Arrays.h @@ -82,6 +82,7 @@ struct maskinfo }; +template struct RiverInfo { int nbir; @@ -89,7 +90,12 @@ struct RiverInfo int nribmax; // size of (max number of) rivers in one block int* Xbidir; // array of block id for each river size(nburmax,nribmax) int* Xridib; // array of river id in each block 
size(nburmax,nribmax) - float* qnow; + T* xstart; + T* xend; + T* ystart; + T *yend; + T* qnow; // qnow is a pin mapped and so both pointers are needed here + T* qnow_g; // this simplify the code later }; @@ -136,7 +142,7 @@ struct AdaptP - +template struct BndblockP { int nblkriver, nblkTs, nbndblkleft, nbndblkright, nbndblktop, nbndblkbot; @@ -151,7 +157,7 @@ struct BndblockP int* top; int* bot; - RiverInfo Riverinfo; + RiverInfo Riverinfo; }; @@ -222,7 +228,7 @@ struct Model AdaptP adapt; - BndblockP bndblk; + BndblockP bndblk; diff --git a/src/InitialConditions.cu b/src/InitialConditions.cu index 8cae00920..9aec225bf 100644 --- a/src/InitialConditions.cu +++ b/src/InitialConditions.cu @@ -404,7 +404,14 @@ template void InitRivers(Param XParam, Forcing &XForcing, Model } // Allocate Qnow as pinned memory - AllocateMappedMemCPU(XForcing.rivers.size(), 1, XModel.bndblk.Riverinfo.qnow); + AllocateMappedMemCPU(XForcing.rivers.size(), 1, XParam.GPUDEVICE,XModel.bndblk.Riverinfo.qnow); + AllocateCPU(nribmax, nburmax, XModel.bndblk.Riverinfo.xstart, XModel.bndblk.Riverinfo.xend, XModel.bndblk.Riverinfo.ystart, XModel.bndblk.Riverinfo.yend); + FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.xstart); + FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.xend); + FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.ystart); + FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.yend); + + // Allocate XXbidir and Xridib ReallocArray(nribmax, nburmax, XModel.bndblk.Riverinfo.Xbidir); @@ -470,6 +477,22 @@ template void InitRivers(Param XParam, Forcing &XForcing, Model } } + for (int iribm = 0; iribm < nribmax; iribm++) + { + for (int ibur = 0; ibur < nburmax; ibur++) + { + int indx = ibur + iribm * nburmax; + int Rin = XModel.bndblk.Riverinfo.Xridib[indx]; + if (Rin > -1) + { + XModel.bndblk.Riverinfo.xstart[indx] = XForcing.rivers[Rin].xstart; + XModel.bndblk.Riverinfo.xend[indx] = XForcing.rivers[Rin].xend; + XModel.bndblk.Riverinfo.ystart[indx] = XForcing.rivers[Rin].ystart; + XModel.bndblk.Riverinfo.yend[indx] = XForcing.rivers[Rin].yend; + } + } + } + diff --git a/src/Setup_GPU.cu b/src/Setup_GPU.cu index c3c8b64d0..902e852d1 100755 --- a/src/Setup_GPU.cu +++ b/src/Setup_GPU.cu @@ -108,13 +108,25 @@ template void SetupGPU(Param &XParam, Model XModel,Forcing & XModel_g.bndblk.Riverinfo.nburmax = nburmax; - AllocateMappedMemGPU(XForcing.rivers.size(), 1, XModel_.bndblk.Riverinfo.qnow); + AllocateMappedMemGPU(XForcing.rivers.size(), 1, XModel_g.bndblk.Riverinfo.qnow_g); + XModel_g.bndblk.Riverinfo.qnow = XModel.bndblk.Riverinfo.qnow; AllocateGPU(nribmax, nburmax, XModel_g.bndblk.Riverinfo.Xbidir); AllocateGPU(nribmax, nburmax, XModel_g.bndblk.Riverinfo.Xridib); CopytoGPU(nribmax, nburmax, XModel.bndblk.Riverinfo.Xbidir, XModel_g.bndblk.Riverinfo.Xbidir); CopytoGPU(nribmax, nburmax, XModel.bndblk.Riverinfo.Xridib, XModel_g.bndblk.Riverinfo.Xridib); + + AllocateGPU(nribmax, nburmax, XModel_g.bndblk.Riverinfo.xstart); + AllocateGPU(nribmax, nburmax, XModel_g.bndblk.Riverinfo.xend); + AllocateGPU(nribmax, nburmax, XModel_g.bndblk.Riverinfo.ystart); + AllocateGPU(nribmax, nburmax, XModel_g.bndblk.Riverinfo.yend); + + CopytoGPU(nribmax, nburmax, XModel.bndblk.Riverinfo.xstart, XModel_g.bndblk.Riverinfo.xstart); + CopytoGPU(nribmax, nburmax, XModel.bndblk.Riverinfo.xend, XModel_g.bndblk.Riverinfo.xend); + CopytoGPU(nribmax, nburmax, XModel.bndblk.Riverinfo.ystart, XModel_g.bndblk.Riverinfo.ystart); + CopytoGPU(nribmax, nburmax, XModel.bndblk.Riverinfo.yend, XModel_g.bndblk.Riverinfo.yend); + } 
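	// (the river block-id and extent arrays are plain device copies; qnow stays in mapped pinned memory,
	//  written by the CPU through qnow and read by the kernels through the qnow_g device pointer)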
 // Reset GPU mean and max arrays

diff --git a/src/Updateforcing.cu b/src/Updateforcing.cu
index 7865c97d3..09658cdef 100755
--- a/src/Updateforcing.cu
+++ b/src/Updateforcing.cu
@@ -119,7 +119,7 @@ void Forcingthisstep(Param XParam, double totaltime, DynForcingP<float> &XDynFor
 template <class T> __host__ void AddRiverForcing(Param XParam, Loop<T> XLoop, std::vector<River> XRivers, Model<T> XModel)
 {
-	dim3 gridDimRiver(XModel.bndblk.nblkriver, 1, 1);
+	dim3 gridDimRiver(XModel.bndblk.Riverinfo.nburmax, 1, 1);
 	dim3 blockDim(XParam.blkwidth, XParam.blkwidth, 1);
 	T qnow;
 	for (int Rin = 0; Rin < XRivers.size(); Rin++)
@@ -134,17 +134,30 @@ template <class T> __host__ void AddRiverForcing(Param XParam, Loop<T> XLoop, st
 		}
 		qnow = T(interptime(XRivers[Rin].flowinput[bndstep].q, XRivers[Rin].flowinput[max(bndstep - 1, 0)].q, XRivers[Rin].flowinput[bndstep].time - XRivers[Rin].flowinput[max(bndstep - 1, 0)].time, XLoop.totaltime - XRivers[Rin].flowinput[max(bndstep - 1, 0)].time));
+
+		XModel.bndblk.Riverinfo.qnow[Rin] = qnow / XRivers[Rin].disarea;
+
+	}
 
-	if (XParam.GPUDEVICE >= 0)
+	if (XParam.GPUDEVICE >= 0)
+	{
+		for (int irib = 0; irib < XModel.bndblk.Riverinfo.nribmax; irib++)
 		{
-			InjectRiverGPU <<<gridDimRiver, blockDim, 0>>> (XParam, XRivers[Rin], qnow, XModel.bndblk.river, XModel.blocks, XModel.adv);
+			//InjectRiverGPU <<<gridDimRiver, blockDim, 0>>> (XParam, XRivers[Rin], qnow, XModel.bndblk.river, XModel.blocks, XModel.adv);
+			//CUDA_CHECK(cudaDeviceSynchronize());
+			InjectManyRiversGPU << <gridDimRiver, blockDim, 0 >> > (XParam, irib, XModel.bndblk.Riverinfo, XModel.blocks, XModel.adv);
 			CUDA_CHECK(cudaDeviceSynchronize());
 		}
-	else
+
+	}
+	else
+	{
+		for (int Rin = 0; Rin < XRivers.size(); Rin++)
 		{
-			InjectRiverCPU(XParam, XRivers[Rin], qnow, XModel.bndblk.nblkriver, XModel.bndblk.river, XModel.blocks, XModel.adv);
+			InjectRiverCPU(XParam, XRivers[Rin], T(XModel.bndblk.Riverinfo.qnow[Rin]), XModel.bndblk.nblkriver, XModel.bndblk.river, XModel.blocks, XModel.adv);
 		}
 	}
+	}
 template __host__ void AddRiverForcing(Param XParam, Loop<float> XLoop, std::vector<River> XRivers, Model<float> XModel);
 template __host__ void AddRiverForcing(Param XParam, Loop<double> XLoop, std::vector<River> XRivers, Model<double> XModel);
@@ -190,6 +203,59 @@ template <class T> __global__ void InjectRiverGPU(Param XParam,River XRiver, T q
 template __global__ void InjectRiverGPU(Param XParam, River XRiver, float qnow, int* Riverblks, BlockP<float> XBlock, AdvanceP<float> XAdv);
 template __global__ void InjectRiverGPU(Param XParam, River XRiver, double qnow, int* Riverblks, BlockP<double> XBlock, AdvanceP<double> XAdv);
 
+template <class T> __global__ void InjectManyRiversGPU(Param XParam,int irib, RiverInfo<T> XRin, BlockP<T> XBlock, AdvanceP<T> XAdv)
+{
+	int halowidth = XParam.halowidth;
+	int blkmemwidth = blockDim.x + halowidth * 2;
+
+	int ix = threadIdx.x;
+	int iy = threadIdx.y;
+	int ibl = blockIdx.x;
+
+	int indx = ibl + irib * XRin.nburmax;
+
+	int ib,rid,i;
+
+	T xllo, yllo, xl, yb, xr, yt, levdx;
+	T rxst, ryst, rxnd, rynd;
+
+	ib = XRin.Xbidir[indx];
+	if (ib > -1)
+	{
+
+		i = memloc(halowidth, blkmemwidth, ix, iy, ib);
+		rid = XRin.Xridib[index];
+
+		levdx = calcres(T(XParam.dx), XBlock.level[ib]);
+
+		xllo = T(XParam.xo + XBlock.xo[ib]);
+		yllo = T(XParam.yo + XBlock.yo[ib]);
+
+
+		xl = xllo + ix * levdx - T(0.5) * levdx;
+		yb = yllo + iy * levdx - T(0.5) * levdx;
+
+		xr = xllo + ix * levdx + T(0.5) * levdx;
+		yt = yllo + iy * levdx + T(0.5) * levdx;
+
+		rxst = XRin.xstart[indx];
+		ryst = XRin.ystart[indx];
+		rxnd = XRin.xend[indx];
+		rynd = XRin.yend[indx];
+
+
+		T qnow = XRin.qnow_g[rid]; // here we use qnow_g because qnow is a CPU pointer
+		if (OBBdetect(xl, xr, yb, yt, rxst, rxnd, ryst, rynd))
+		{
+			XAdv.dh[i] += 
qnow; //was / T(XRiver.disarea) but this is done upstream now to be consistent with GPU Many river ops + + } + + + } + +} + template __host__ void InjectRiverCPU(Param XParam, River XRiver, T qnow, int nblkriver, int* Riverblks, BlockP XBlock, AdvanceP XAdv) { unsigned int ib; @@ -232,7 +298,7 @@ template __host__ void InjectRiverCPU(Param XParam, River XRiver, T qn //if (xx >= XForcing.rivers[Rin].xstart && xx <= XForcing.rivers[Rin].xend && yy >= XForcing.rivers[Rin].ystart && yy <= XForcing.rivers[Rin].yend) if (OBBdetect(xl, xr, yb, yt, T(XRiver.xstart),T(XRiver.xend), T(XRiver.ystart), T(XRiver.yend))) { - XAdv.dh[i] += qnow / T(XRiver.disarea); + XAdv.dh[i] += qnow ; //was / T(XRiver.disarea) but this is done upstream now to be consistent with GPU Many river ops } } From ef90c696a68bf821a43377e4611942c9f409e732 Mon Sep 17 00:00:00 2001 From: Cyprien Bosserelle Date: Thu, 17 Oct 2024 16:34:56 +1300 Subject: [PATCH 15/36] Fix compile issues --- src/InitialConditions.cu | 14 +++++++------- src/InitialConditions.h | 2 +- src/Setup_GPU.cu | 2 +- src/Updateforcing.cu | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/InitialConditions.cu b/src/InitialConditions.cu index 9aec225bf..a45473a69 100644 --- a/src/InitialConditions.cu +++ b/src/InitialConditions.cu @@ -216,7 +216,7 @@ void InitTSOutput(Param XParam) } } -template void FindTSoutNodes(Param& XParam, BlockP XBlock, BndblockP & bnd) +template void FindTSoutNodes(Param& XParam, BlockP XBlock, BndblockP & bnd) { int ib; T levdx,x,y,blkxmin,blkxmax,blkymin,blkymax,dxblk; @@ -263,8 +263,8 @@ template void FindTSoutNodes(Param& XParam, BlockP XBlock, Bndblock } -template void FindTSoutNodes(Param& XParam, BlockP XBlock, BndblockP& bnd); -template void FindTSoutNodes(Param& XParam, BlockP XBlock, BndblockP& bnd); +template void FindTSoutNodes(Param& XParam, BlockP XBlock, BndblockP& bnd); +template void FindTSoutNodes(Param& XParam, BlockP XBlock, BndblockP& bnd); @@ -406,10 +406,10 @@ template void InitRivers(Param XParam, Forcing &XForcing, Model // Allocate Qnow as pinned memory AllocateMappedMemCPU(XForcing.rivers.size(), 1, XParam.GPUDEVICE,XModel.bndblk.Riverinfo.qnow); AllocateCPU(nribmax, nburmax, XModel.bndblk.Riverinfo.xstart, XModel.bndblk.Riverinfo.xend, XModel.bndblk.Riverinfo.ystart, XModel.bndblk.Riverinfo.yend); - FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.xstart); - FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.xend); - FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.ystart); - FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.yend); + FillCPU(nribmax, nburmax, T(-1.0), XModel.bndblk.Riverinfo.xstart); + FillCPU(nribmax, nburmax, T(-1.0), XModel.bndblk.Riverinfo.xend); + FillCPU(nribmax, nburmax, T(-1.0), XModel.bndblk.Riverinfo.ystart); + FillCPU(nribmax, nburmax, T(-1.0), XModel.bndblk.Riverinfo.yend); diff --git a/src/InitialConditions.h b/src/InitialConditions.h index d97a4121f..61ef02f85 100755 --- a/src/InitialConditions.h +++ b/src/InitialConditions.h @@ -23,7 +23,7 @@ template void initoutput(Param &XParam, Model& XModel); void InitTSOutput(Param XParam); //template void Initbnds(Param XParam, Forcing XForcing, Model& XModel); -template void FindTSoutNodes(Param& XParam, BlockP XBlock, BndblockP& bnd); +template void FindTSoutNodes(Param& XParam, BlockP XBlock, BndblockP& bnd); template void Calcbndblks(Param& XParam, Forcing& XForcing, BlockP XBlock); template void Findbndblks(Param XParam, Model XModel, Forcing& XForcing); template void Initoutzone(Param& 
XParam, BlockP& XBlock); diff --git a/src/Setup_GPU.cu b/src/Setup_GPU.cu index 902e852d1..3e272d76a 100755 --- a/src/Setup_GPU.cu +++ b/src/Setup_GPU.cu @@ -108,7 +108,7 @@ template void SetupGPU(Param &XParam, Model XModel,Forcing & XModel_g.bndblk.Riverinfo.nburmax = nburmax; - AllocateMappedMemGPU(XForcing.rivers.size(), 1, XModel_g.bndblk.Riverinfo.qnow_g); + AllocateMappedMemGPU(XForcing.rivers.size(), 1,XParam.GPUDEVICE, XModel.bndblk.Riverinfo.qnow,XModel_g.bndblk.Riverinfo.qnow_g); XModel_g.bndblk.Riverinfo.qnow = XModel.bndblk.Riverinfo.qnow; diff --git a/src/Updateforcing.cu b/src/Updateforcing.cu index 09658cdef..cc3b2e3df 100755 --- a/src/Updateforcing.cu +++ b/src/Updateforcing.cu @@ -224,7 +224,7 @@ template __global__ void InjectManyRiversGPU(Param XParam,int irib, Ri { i = memloc(halowidth, blkmemwidth, ix, iy, ib); - rid = XRin.Xridib[index]; + rid = XRin.Xridib[indx]; levdx = calcres(T(XParam.dx), XBlock.level[ib]); From 31ab89f35e235b8a8a757f944d5d9e6ea0276353 Mon Sep 17 00:00:00 2001 From: Cyprien Bosserelle Date: Thu, 17 Oct 2024 21:00:50 +1300 Subject: [PATCH 16/36] fix map mem alloc --- src/Setup_GPU.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Setup_GPU.cu b/src/Setup_GPU.cu index 3e272d76a..421c63f3f 100755 --- a/src/Setup_GPU.cu +++ b/src/Setup_GPU.cu @@ -108,7 +108,7 @@ template void SetupGPU(Param &XParam, Model XModel,Forcing & XModel_g.bndblk.Riverinfo.nburmax = nburmax; - AllocateMappedMemGPU(XForcing.rivers.size(), 1,XParam.GPUDEVICE, XModel.bndblk.Riverinfo.qnow,XModel_g.bndblk.Riverinfo.qnow_g); + AllocateMappedMemGPU(XForcing.rivers.size(), 1,XParam.GPUDEVICE, XModel_g.bndblk.Riverinfo.qnow_g,XModel.bndblk.Riverinfo.qnow); XModel_g.bndblk.Riverinfo.qnow = XModel.bndblk.Riverinfo.qnow; From b0576b1eb1cef369afbd0cfc462cac3309c97c21 Mon Sep 17 00:00:00 2001 From: Cyprien Bosserelle Date: Fri, 18 Oct 2024 13:43:02 +1300 Subject: [PATCH 17/36] Add momentum adjustment when using rain --- src/Updateforcing.cu | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/Updateforcing.cu b/src/Updateforcing.cu index cc3b2e3df..d17c50c09 100755 --- a/src/Updateforcing.cu +++ b/src/Updateforcing.cu @@ -362,6 +362,8 @@ template __global__ void AddrainforcingImplicitGPU(Param XParam, Loop< T Rainhh; + T hi = XEv.h[i]; + T x = XParam.xo + XBlock.xo[ib] + ix * delta; T y = XParam.yo + XBlock.yo[ib] + iy * delta; if (Rain.uniform) @@ -374,12 +376,15 @@ template __global__ void AddrainforcingImplicitGPU(Param XParam, Loop< } + Rainhh = max(Rainhh / T(1000.0) / T(3600.0) * T(XLoop.dt), T(0.0)) * XBlock.activeCell[i]; // convert from mm/hrs to m/s and - Rainhh = max(Rainhh / T(1000.0) / T(3600.0) * XLoop.dt,T(0.0)); // convert from mm/hrs to m/s + T qvol = hi / (hi + Rainhh); - - XEv.h[i] += Rainhh * XBlock.activeCell[i]; - XEv.zs[i] += Rainhh * XBlock.activeCell[i]; + XEv.h[i] = hi + Rainhh; + XEv.zs[i] += Rainhh; + + XEv.u[i] = XEv.u[i] * qvol; + XEv.v[i] = XEv.v[i] * qvol; } template __global__ void AddrainforcingImplicitGPU(Param XParam, Loop XLoop, BlockP XBlock, DynForcingP Rain, EvolvingP XEv); @@ -453,6 +458,8 @@ template __host__ void AddrainforcingImplicitCPU(Param XParam, Loop T delta = calcres(T(XParam.dx), XBlock.level[ib]); + T hi = XEv.h[i]; + T Rainhh; T x = T(XParam.xo) + XBlock.xo[ib] + ix * delta; @@ -468,10 +475,15 @@ template __host__ void AddrainforcingImplicitCPU(Param XParam, Loop } - Rainhh = max(Rainhh / T(1000.0) / T(3600.0) * T(XLoop.dt), T(0.0)); // convert from mm/hrs to m/s + 
Rainhh = max(Rainhh / T(1000.0) / T(3600.0) * T(XLoop.dt), T(0.0)) * XBlock.activeCell[i]; // convert from mm/hrs to m/s and + + T qvol = hi/(hi + Rainhh); + + XEv.h[i] = hi + Rainhh; + XEv.zs[i] += Rainhh; - XEv.h[i] += Rainhh * XBlock.activeCell[i]; - XEv.zs[i] += Rainhh * XBlock.activeCell[i]; + XEv.u[i] = XEv.u[i] * qvol; + XEv.v[i] = XEv.v[i] * qvol; } } } From a957e977dcfdbeccddfdd414737a6199d6a982e3 Mon Sep 17 00:00:00 2001 From: Cyprien Bosserelle Date: Wed, 23 Oct 2024 08:56:25 +1300 Subject: [PATCH 18/36] Add limiter for flux adjustment for dry cells --- src/Updateforcing.cu | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/Updateforcing.cu b/src/Updateforcing.cu index d17c50c09..70f5b90a5 100755 --- a/src/Updateforcing.cu +++ b/src/Updateforcing.cu @@ -382,10 +382,11 @@ template __global__ void AddrainforcingImplicitGPU(Param XParam, Loop< XEv.h[i] = hi + Rainhh; XEv.zs[i] += Rainhh; - - XEv.u[i] = XEv.u[i] * qvol; - XEv.v[i] = XEv.v[i] * qvol; - + if (hi > XParam.eps) + { + XEv.u[i] = XEv.u[i] * qvol; + XEv.v[i] = XEv.v[i] * qvol; + } } template __global__ void AddrainforcingImplicitGPU(Param XParam, Loop XLoop, BlockP XBlock, DynForcingP Rain, EvolvingP XEv); template __global__ void AddrainforcingImplicitGPU(Param XParam, Loop XLoop, BlockP XBlock, DynForcingP Rain, EvolvingP XEv); @@ -482,8 +483,11 @@ template __host__ void AddrainforcingImplicitCPU(Param XParam, Loop XEv.h[i] = hi + Rainhh; XEv.zs[i] += Rainhh; - XEv.u[i] = XEv.u[i] * qvol; - XEv.v[i] = XEv.v[i] * qvol; + if (hi > XParam.eps) + { + XEv.u[i] = XEv.u[i] * qvol; + XEv.v[i] = XEv.v[i] * qvol; + } } } } From 1c7018fee30d8f50f4a113c11d37139d4f83e1e9 Mon Sep 17 00:00:00 2001 From: Cyprien Bosserelle Date: Wed, 30 Oct 2024 08:42:50 +1300 Subject: [PATCH 19/36] playing up with velocity sanity --- src/Advection.cu | 4 ++-- src/FlowGPU.cu | 4 ++-- src/Updateforcing.cu | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Advection.cu b/src/Advection.cu index 2f19765ce..0b6543067 100755 --- a/src/Advection.cu +++ b/src/Advection.cu @@ -251,11 +251,11 @@ template __global__ void AdvkernelGPU(Param XParam, BlockP XBlock, T ho, uo, vo; T dhi = XAdv.dh[i]; - T edt = dt;// dhi > T(0.0) ? dt : min(dt, hold / (T(-1.0) * dhi)); + T edt = dt;// dhi >= T(0.0) ? 
dt : min(dt, max(hold, XParam.eps) / abs(dhi)); + //ho = max(hold + edt * dhi,T(0.0)); ho = hold + edt * dhi; - if (ho > eps) { // uo = (hold * XEv.u[i] + edt * XAdv.dhu[i]) / ho; diff --git a/src/FlowGPU.cu b/src/FlowGPU.cu index 55ff88197..49e2f2473 100644 --- a/src/FlowGPU.cu +++ b/src/FlowGPU.cu @@ -260,8 +260,8 @@ template void FlowGPU(Param XParam, Loop& XLoop, Forcing XFo //============================================ // Add bottom friction - bottomfrictionGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.time.dt, XModel.cf, XModel.evolv_o); - //XiafrictionGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.time.dt, XModel.cf, XModel.evolv, XModel.evolv_o); + //bottomfrictionGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.time.dt, XModel.cf, XModel.evolv_o); + XiafrictionGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.time.dt, XModel.cf, XModel.evolv, XModel.evolv_o); CUDA_CHECK(cudaDeviceSynchronize()); diff --git a/src/Updateforcing.cu b/src/Updateforcing.cu index 70f5b90a5..cc332bf2c 100755 --- a/src/Updateforcing.cu +++ b/src/Updateforcing.cu @@ -384,8 +384,8 @@ template __global__ void AddrainforcingImplicitGPU(Param XParam, Loop< XEv.zs[i] += Rainhh; if (hi > XParam.eps) { - XEv.u[i] = XEv.u[i] * qvol; - XEv.v[i] = XEv.v[i] * qvol; + //XEv.u[i] = XEv.u[i] * qvol; + //XEv.v[i] = XEv.v[i] * qvol; } } template __global__ void AddrainforcingImplicitGPU(Param XParam, Loop XLoop, BlockP XBlock, DynForcingP Rain, EvolvingP XEv); From 56bfaacf6a086723e7e2cb3bd09db43d672f5cb9 Mon Sep 17 00:00:00 2001 From: Cyprien Date: Wed, 30 Oct 2024 10:01:49 +1300 Subject: [PATCH 20/36] Add explanation to new algo --- src/InitialConditions.cu | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/InitialConditions.cu b/src/InitialConditions.cu index a45473a69..fcbcc65dd 100644 --- a/src/InitialConditions.cu +++ b/src/InitialConditions.cu @@ -421,6 +421,14 @@ template void InitRivers(Param XParam, Forcing &XForcing, Model FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.Xbidir); FillCPU(nribmax, nburmax, -1, XModel.bndblk.Riverinfo.Xridib); + //Xbidir is an array that stores block id where n rivers apply + //along the row of Xbidir block id is unique. 
meaning that a block id with two river injections will appear on two separate rows of Xbidir
+	//The number of columns (size of row 1) in Xbidir is nburmax = length(uniq(block with river injected))
+	//
+
+	//Xridib is an array that stores the River id that is injected at the corresponding block id in Xbidir
+
+
 	XModel.bndblk.Riverinfo.nribmax = nribmax;
 	XModel.bndblk.Riverinfo.nburmax = nburmax;
 
 	std::vector<RiverBlk> blocksalreadyin;
 	RiverBlk emptyvec;
 	for (int iblk = 0; iblk < nribmax; iblk++)
 	{
 		blocksalreadyin.push_back(emptyvec);
 	}
 	//(n, 10)
+	//
 	std::vector<int> iriv(nribmax,0);
 	for (int Rin = 0; Rin < XForcing.rivers.size(); Rin++)

From 306cc1107c7e4660ff9d8a9ade098241f2acc956 Mon Sep 17 00:00:00 2001
From: Cyprien Bosserelle
Date: Thu, 9 Jan 2025 10:14:49 +1300
Subject: [PATCH 21/36] Revert experiment changes on roughness

---
 src/FlowGPU.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/FlowGPU.cu b/src/FlowGPU.cu
index 49e2f2473..55ff88197 100644
--- a/src/FlowGPU.cu
+++ b/src/FlowGPU.cu
@@ -260,8 +260,8 @@ template <class T> void FlowGPU(Param XParam, Loop<T>& XLoop, Forcing<float> XFo
 
 	//============================================
 	// Add bottom friction
-	//bottomfrictionGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.time.dt, XModel.cf, XModel.evolv_o);
-	XiafrictionGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.time.dt, XModel.cf, XModel.evolv, XModel.evolv_o);
+	bottomfrictionGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.time.dt, XModel.cf, XModel.evolv_o);
+	//XiafrictionGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.time.dt, XModel.cf, XModel.evolv, XModel.evolv_o);
 	CUDA_CHECK(cudaDeviceSynchronize());

From bda4751324c722cac02fbc8d0f4bc1da1a19a7a5 Mon Sep 17 00:00:00 2001
From: Cyprien Bosserelle
Date: Thu, 16 Jan 2025 16:56:51 +1300
Subject: [PATCH 22/36] Fix Dynamic forcing

---
 src/GridManip.cu     | 12 +++++++-----
 src/GridManip.h      |  3 ++-
 src/Read_netcdf.cu   |  1 +
 src/Updateforcing.cu |  6 ++++--
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/GridManip.cu b/src/GridManip.cu
index 050acfb3c..dc6180b92 100755
--- a/src/GridManip.cu
+++ b/src/GridManip.cu
@@ -453,7 +453,7 @@ template void InterpstepCPU(int nx, int ny, int hdstep, double to
 template void InterpstepCPU(int nx, int ny, int hdstep, double totaltime, double hddt, double*& Ux, double* Uo, double* Un);
 
-template <class T> __global__ void InterpstepGPU(int nx, int ny, int hdstp, T totaltime, T hddt, T*Ux, T* Uo, T* Un)
+template <class T> __global__ void InterpstepGPU(int nx, int ny, T totaltime, T beforetime, T aftertime, T*Ux, T* Uo, T* Un)
 {
 	unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
 	unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
@@ -464,19 +464,21 @@ template <class T> __global__ void InterpstepGPU(int nx, int ny, int hdstp, T to
 
 	__shared__ T Uxo[16][16];
 	__shared__ T Uxn[16][16];
 	// __shared__ float Ums[16];
 
-	
+	T hddt = aftertime - beforetime;
 	if (ix < nx && iy < ny)
 	{
 		Uxo[tx][ty] = Uo[ix + nx * iy]/**Ums[tx]*/;
 		Uxn[tx][ty] = Un[ix + nx * iy]/**Ums[tx]*/;
 
-		Ux[ix + nx * iy] = Uxo[tx][ty] + (totaltime - hddt * hdstp) * (Uxn[tx][ty] - Uxo[tx][ty]) / hddt;
+		Ux[ix + nx * iy] = Uxo[tx][ty] + (totaltime - beforetime) * (Uxn[tx][ty] - Uxo[tx][ty]) / (hddt);
 	}
 }
 //template <class T> __global__ void InterpstepGPU(int nx, int ny, int hdstp, T totaltime, T hddt, T* Ux, T* Uo, T* Un);
-template __global__ void InterpstepGPU(int nx, int ny, int hdstp, float totaltime, float hddt, float* Ux, float* Uo, float* Un);
-template __global__ void InterpstepGPU(int nx, int ny, int hdstp, double totaltime, double hddt, double* Ux, double* Uo, double* Un);
+template __global__ void InterpstepGPU(int nx, int ny, float totaltime, float beforetime, float aftertime, float* Ux, float* Uo, float* Un);
+template __global__ void InterpstepGPU(int nx, int ny, double totaltime, double beforetime, double aftertime, double* Ux, double* Uo, 
double* Un); +template __global__ void InterpstepGPU(int nx, int ny, float totaltime, float beforetime, float aftertime, float* Ux, float* Uo, float* Un); +template __global__ void InterpstepGPU(int nx, int ny, double totaltime, double beforetime, double aftertime, double* Ux, double* Uo, double* Un); + + template void Copy2CartCPU(int nx, int ny, T* dest, T* src) { diff --git a/src/GridManip.h b/src/GridManip.h index 4ee385ad4..0bd725fdf 100755 --- a/src/GridManip.h +++ b/src/GridManip.h @@ -23,7 +23,8 @@ template T interp2BUQ(T x, T y, F forcing); template T interp2BUQ(T x, T y, T dx, F forcing); template void InterpstepCPU(int nx, int ny, int hdstep, F totaltime, F hddt, T*& Ux, T* Uo, T* Un); -template __global__ void InterpstepGPU(int nx, int ny, int hdstp, T totaltime, T hddt, T* Ux, T* Uo, T* Un); +//template __global__ void InterpstepGPU(int nx, int ny, int hdstp, T totaltime, T hddt, T* Ux, T* Uo, T* Un); +template __global__ void InterpstepGPU(int nx, int ny, T totaltime, T beforetime, T aftertime, T* Ux, T* Uo, T* Un); template void Copy2CartCPU(int nx, int ny, T* dest, T* src); diff --git a/src/Read_netcdf.cu b/src/Read_netcdf.cu index 1f7701fb1..bec9caa46 100755 --- a/src/Read_netcdf.cu +++ b/src/Read_netcdf.cu @@ -546,6 +546,7 @@ int readnctime2(int ncid,char * timecoordname,std::string refdate,size_t nt, dou for (int it = 0; it < nt; it++) { time[it] = time[it] * fac + offset; + //printf("%f\n", time[it]); } ///status = nc_close(ncid); diff --git a/src/Updateforcing.cu b/src/Updateforcing.cu index cc332bf2c..b2a69c93c 100755 --- a/src/Updateforcing.cu +++ b/src/Updateforcing.cu @@ -98,7 +98,9 @@ void Forcingthisstep(Param XParam, double totaltime, DynForcingP &XDynFor // Interpolate the forcing array to this time if (XParam.GPUDEVICE >= 0) { - InterpstepGPU << > > (XDynForcing.nx, XDynForcing.ny, XDynForcing.instep - 1, float(totaltime), float(XDynForcing.dt), XDynForcing.now_g, XDynForcing.before_g, XDynForcing.after_g); + float bftime = float(XDynForcing.to+XDynForcing.dt*(XDynForcing.instep-1)); + float aftime = float(XDynForcing.to + XDynForcing.dt * (XDynForcing.instep)); + InterpstepGPU << > > (XDynForcing.nx, XDynForcing.ny, float(totaltime), bftime,aftime, XDynForcing.now_g, XDynForcing.before_g, XDynForcing.after_g); CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaMemcpyToArray(XDynForcing.GPU.CudArr, 0, 0, XDynForcing.now_g, XDynForcing.nx * XDynForcing.ny * sizeof(float), cudaMemcpyDeviceToDevice)); @@ -377,7 +379,7 @@ template __global__ void AddrainforcingImplicitGPU(Param XParam, Loop< Rainhh = max(Rainhh / T(1000.0) / T(3600.0) * T(XLoop.dt), T(0.0)) * XBlock.activeCell[i]; // convert from mm/hrs to m/s and - + //printf("%f\n", Rainhh); T qvol = hi / (hi + Rainhh); XEv.h[i] = hi + Rainhh; From a835af544828421247725207b515f838998a5f44 Mon Sep 17 00:00:00 2001 From: Cyprien Bosserelle Date: Fri, 31 Jan 2025 11:53:10 +1300 Subject: [PATCH 23/36] force sane velocity --- src/Advection.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Advection.cu b/src/Advection.cu index 0b6543067..f6b982da1 100755 --- a/src/Advection.cu +++ b/src/Advection.cu @@ -251,7 +251,7 @@ template __global__ void AdvkernelGPU(Param XParam, BlockP XBlock, T ho, uo, vo; T dhi = XAdv.dh[i]; - T edt = dt;// dhi >= T(0.0) ? dt : min(dt, max(hold, XParam.eps) / abs(dhi)); + T edt = dhi >= T(0.0) ? 
dt : min(dt, max(hold, XParam.eps) / abs(dhi)); //ho = max(hold + edt * dhi,T(0.0)); ho = hold + edt * dhi; @@ -307,7 +307,7 @@ template __host__ void AdvkernelCPU(Param XParam, BlockP XBlock, T dhi = XAdv.dh[i]; - T edt = dt;// dhi > T(0.0) ? dt : min(dt, hold / (T(-1.0) * dhi)); + T edt = dhi > T(0.0) ? dt : min(dt, hold / (T(-1.0) * dhi)); ho = hold + edt * dhi; From 9ba730a50be4097a583129f0dfa45d2838ee8a58 Mon Sep 17 00:00:00 2001 From: Cyprien Bosserelle Date: Tue, 4 Feb 2025 14:29:23 +1300 Subject: [PATCH 24/36] Fix zsoffset --- src/Boundary.cu | 7 +++++-- src/InitEvolv.cu | 4 ++-- src/ReadInput.cu | 6 ++++++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/Boundary.cu b/src/Boundary.cu index b49382284..24f4b595c 100644 --- a/src/Boundary.cu +++ b/src/Boundary.cu @@ -290,9 +290,9 @@ template __global__ void bndFluxGPUSide(Param XParam, bndsegmentside s } - - + zsbnd = zsbnd + XParam.zsoffset; + int inside = Inside(halowidth, blkmemwidth, side.isright, side.istop, ix, iy, ib); @@ -443,6 +443,9 @@ template void bndFluxGPUSideCPU(Param XParam, bndsegmentside side, Blo zsbnd = interp2BUQ(XParam.xo + xx, XParam.yo + yy, Zsmap); } + + zsbnd = zsbnd + XParam.zsoffset; + int i = memloc(halowidth, blkmemwidth, ix, iy, ib); int inside = Inside(halowidth, blkmemwidth, side.isright, side.istop, ix, iy, ib); diff --git a/src/InitEvolv.cu b/src/InitEvolv.cu index fe85c5a8a..3dac1b64b 100755 --- a/src/InitEvolv.cu +++ b/src/InitEvolv.cu @@ -30,7 +30,7 @@ template void initevolv(Param XParam, BlockP XBlock,Forcing hotstartsucess = readhotstartfile(XParam, XBlock, XEv, zb); //add offset if present - if (!std::isnan(XParam.zsoffset)) // apply specified zsoffset + if (T(XParam.zsoffset) != T(0.0)) // apply specified zsoffset { printf("\t\tadd offset to zs and hh... "); // @@ -104,7 +104,7 @@ template int coldstart(Param XParam, BlockP XBlock, T* zb, EvolvingP & XEv) { T zzini = std::isnan(XParam.zsinit)? T(0.0): T(XParam.zsinit); - T zzoffset = std::isnan(XParam.zsoffset) ? T(0.0) : T(XParam.zsoffset); + T zzoffset = T(XParam.zsoffset); diff --git a/src/ReadInput.cu b/src/ReadInput.cu index 8526a8fe1..70fd84e85 100644 --- a/src/ReadInput.cu +++ b/src/ReadInput.cu @@ -1135,6 +1135,12 @@ void checkparamsanity(Param& XParam, Forcing& XForcing) XParam.blkmemwidth = XParam.blkwidth + 2 * XParam.halowidth; XParam.blksize = utils::sq(XParam.blkmemwidth); + /////////////////////////////////////////// + // zsoffset + /////////////////////////////////////////// + + XParam.zsoffset = std::isnan(XParam.zsoffset) ? 0.0 : XParam.zsoffset; + /////////////////////////////////////////// // Read Bathy Information /////////////////////////////////////////// From ad71ffa7ecd9a1a38b28aab13924e2bf69c3c92b Mon Sep 17 00:00:00 2001 From: Cyprien Bosserelle Date: Wed, 12 Feb 2025 09:51:26 +1300 Subject: [PATCH 25/36] Clean <<< ...>>> --- src/Advection.cu | 6 +- src/Boundary.cu | 10 +-- src/ConserveElevation.cu | 24 +++--- src/FlowGPU.cu | 72 ++++++++-------- src/Gradients.cu | 172 +++++++++++++++++++-------------------- src/Halo.cu | 104 +++++++++++------------ src/InitialConditions.cu | 4 +- src/Mainloop.cu | 4 +- src/Meanmax.cu | 62 +++++++------- src/Testing.cu | 34 ++++---- src/Updateforcing.cu | 6 +- 11 files changed, 249 insertions(+), 249 deletions(-) diff --git a/src/Advection.cu b/src/Advection.cu index f6b982da1..c879bf636 100755 --- a/src/Advection.cu +++ b/src/Advection.cu @@ -475,7 +475,7 @@ template __host__ T CalctimestepGPU(Param XParam,Loop XLoop, BlockP //GPU Harris reduction #3. 
 // This was successfully tested with a range of grid sizes
- //reducemax3 << > >(dtmax_g, arrmax_g, nx*ny)
+ //reducemax3 <<>>(dtmax_g, arrmax_g, nx*ny)
 int maxThreads = 256;
 int threads = (s < maxThreads * 2) ? nextPow2((s + 1) / 2) : maxThreads;
@@ -485,7 +485,7 @@ template __host__ T CalctimestepGPU(Param XParam,Loop XLoop, BlockP
 dim3 gridDimLine(blocks, 1, 1);
- reducemin3 << > > (XTime.dtmax, XTime.arrmin, s);
+ reducemin3 <<>> (XTime.dtmax, XTime.arrmin, s);
 CUDA_CHECK(cudaDeviceSynchronize());
@@ -503,7 +503,7 @@ template __host__ T CalctimestepGPU(Param XParam,Loop XLoop, BlockP
 CUDA_CHECK(cudaMemcpy(XTime.dtmax, XTime.arrmin, s * sizeof(T), cudaMemcpyDeviceToDevice));
- reducemin3 << > > (XTime.dtmax, XTime.arrmin, s);
+ reducemin3 <<>> (XTime.dtmax, XTime.arrmin, s);
 CUDA_CHECK(cudaDeviceSynchronize());
 s = (s + (threads * 2 - 1)) / (threads * 2);
diff --git a/src/Boundary.cu b/src/Boundary.cu
index 24f4b595c..0e27c03d2 100644
--- a/src/Boundary.cu
+++ b/src/Boundary.cu
@@ -131,25 +131,25 @@ template void FlowbndFlux(Param XParam, double totaltime, BlockP XB
 //Left
 //template __global__ void bndFluxGPUSide(Param XParam, bndsegmentside side, BlockP XBlock, DynForcingP Atmp, DynForcingP Zsmap, bool uniform, float zsbnd, T * zs, T * h, T * un, T * ut, T * Fh, T * Fq, T * Ss)
 //bndFluxGPUSide <<< gridDimBBND, blockDim, 0 >>> (XParam, bndseg.left, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), XEv.zs, XEv.h, un, ut, Fh, Fq, S);
- bndFluxGPUSide << < gridDimBBNDLeft, blockDim, 0 >> > (XParam, bndseg.left, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.u, XEv.v, XFlux.Fhu, XFlux.Fqux, XFlux.Su);
+ bndFluxGPUSide <<< gridDimBBNDLeft, blockDim, 0 >>> (XParam, bndseg.left, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.u, XEv.v, XFlux.Fhu, XFlux.Fqux, XFlux.Su);
 CUDA_CHECK(cudaDeviceSynchronize());
 }
 //if (bndseg.right.nblk > 0)
 {
 //Right
- bndFluxGPUSide << < gridDimBBNDRight, blockDim, 0 >> > (XParam, bndseg.right, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.u, XEv.v, XFlux.Fhu, XFlux.Fqux, XFlux.Su);
+ bndFluxGPUSide <<< gridDimBBNDRight, blockDim, 0 >>> (XParam, bndseg.right, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.u, XEv.v, XFlux.Fhu, XFlux.Fqux, XFlux.Su);
 CUDA_CHECK(cudaDeviceSynchronize());
 }
 //if (bndseg.top.nblk > 0)
 {
 //top
- bndFluxGPUSide << < gridDimBBNDTop, blockDim, 0 >> > (XParam, bndseg.top, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.v, XEv.u, XFlux.Fhv, XFlux.Fqvy, XFlux.Sv);
+ bndFluxGPUSide <<< gridDimBBNDTop, blockDim, 0 >>> (XParam, bndseg.top, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.v, XEv.u, XFlux.Fhv, XFlux.Fqvy, XFlux.Sv);
 CUDA_CHECK(cudaDeviceSynchronize());
 }
 //if (bndseg.bot.nblk > 0)
 {
 //bot
- bndFluxGPUSide << < gridDimBBNDBot, blockDim, 0 >> > (XParam, bndseg.bot, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.v, XEv.u, XFlux.Fhv, XFlux.Fqvy, XFlux.Sv);
+ bndFluxGPUSide <<< gridDimBBNDBot, blockDim, 0 >>> (XParam, bndseg.bot, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.v, XEv.u, XFlux.Fhv, XFlux.Fqvy, XFlux.Sv);
 CUDA_CHECK(cudaDeviceSynchronize());
 }
 }
@@ -219,7 +219,7 @@
template void FlowbndFluxold(Param XParam, double totaltime, BlockP if (XParam.GPUDEVICE >= 0) { - //bndFluxGPU << < gridDimBBND, blockDim, 0 >> > (XParam, side, XBlock, Atmp, float(itime), XEv.zs, XEv.h, un, ut, Fh, Fq, S); + //bndFluxGPU <<< gridDimBBND, blockDim, 0 >>> (XParam, side, XBlock, Atmp, float(itime), XEv.zs, XEv.h, un, ut, Fh, Fq, S); //CUDA_CHECK(cudaDeviceSynchronize()); } else diff --git a/src/ConserveElevation.cu b/src/ConserveElevation.cu index 940630b15..a2ecd572a 100755 --- a/src/ConserveElevation.cu +++ b/src/ConserveElevation.cu @@ -31,13 +31,13 @@ template void conserveElevationGPU(Param XParam, BlockP XBlock, Evo dim3 gridDim(XParam.nblk, 1, 1); - conserveElevationLeft << > > (XParam, XBlock, XEv, zb); + conserveElevationLeft <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - conserveElevationRight << > > (XParam, XBlock, XEv, zb); + conserveElevationRight <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - conserveElevationTop << > > (XParam, XBlock, XEv, zb); + conserveElevationTop <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - conserveElevationBot << > > (XParam, XBlock, XEv, zb); + conserveElevationBot <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -329,13 +329,13 @@ template void WetDryProlongationGPU(Param XParam, BlockP XBlock, Ev //WetDryProlongationGPUBot - WetDryProlongationGPULeft << > > (XParam, XBlock, XEv, zb); + WetDryProlongationGPULeft <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetDryProlongationGPURight << > > (XParam, XBlock, XEv, zb); + WetDryProlongationGPURight <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetDryProlongationGPUTop << > > (XParam, XBlock, XEv, zb); + WetDryProlongationGPUTop <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetDryProlongationGPUBot << > > (XParam, XBlock, XEv, zb); + WetDryProlongationGPUBot <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -350,13 +350,13 @@ template void WetDryRestrictionGPU(Param XParam, BlockP XBlock, Evo //WetDryProlongationGPUBot - WetDryRestrictionGPULeft << > > (XParam, XBlock, XEv, zb); + WetDryRestrictionGPULeft <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetDryRestrictionGPURight << > > (XParam, XBlock, XEv, zb); + WetDryRestrictionGPURight <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetDryRestrictionGPUTop << > > (XParam, XBlock, XEv, zb); + WetDryRestrictionGPUTop <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetDryRestrictionGPUBot << > > (XParam, XBlock, XEv, zb); + WetDryRestrictionGPUBot <<>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); } diff --git a/src/FlowGPU.cu b/src/FlowGPU.cu index 55ff88197..ab2d03bd2 100644 --- a/src/FlowGPU.cu +++ b/src/FlowGPU.cu @@ -35,7 +35,7 @@ template void FlowGPU(Param XParam, Loop& XLoop, Forcing XFo //Calc dpdx and dpdy - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.Patm, XModel.datmpdx, XModel.datmpdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.Patm, XModel.datmpdx, XModel.datmpdy); CUDA_CHECK(cudaDeviceSynchronize()); @@ -45,7 +45,7 @@ template void FlowGPU(Param XParam, Loop& XLoop, Forcing XFo refine_linearGPU(XParam, XModel.blocks, XModel.Patm, XModel.datmpdx, 
XModel.datmpdy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.Patm, XModel.datmpdx, XModel.datmpdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.Patm, XModel.datmpdx, XModel.datmpdy); CUDA_CHECK(cudaDeviceSynchronize()); gradientHaloGPU(XParam, XModel.blocks, XModel.Patm, XModel.datmpdx, XModel.datmpdy); @@ -83,15 +83,15 @@ template void FlowGPU(Param XParam, Loop& XLoop, Forcing XFo if (XParam.engine == 1) { // X- direction - UpdateButtingerXGPU << < gridDim, blockDimKX, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); + UpdateButtingerXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); //updateKurgXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceXGPU <<< gridDim, blockDimKX, 0, XLoop.streams[0] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); CUDA_CHECK(cudaDeviceSynchronize()); // Y- direction - UpdateButtingerYGPU << < gridDim, blockDimKY, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); + UpdateButtingerYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); //updateKurgYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceYGPU <<< gridDim, blockDimKY, 0, XLoop.streams[1] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); - // //updateKurgY << < XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >> > (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); + // //updateKurgY <<< XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >>> (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -104,20 +104,20 @@ template void FlowGPU(Param XParam, Loop& XLoop, Forcing XFo // Y- direction updateKurgYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceYGPU <<< gridDim, blockDimKY, 0, XLoop.streams[1] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); - // //updateKurgY << < XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >> > (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); + // //updateKurgY <<< XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >>> (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); CUDA_CHECK(cudaDeviceSynchronize()); } else if (XParam.engine == 3) { // - updateKurgXATMGPU << < gridDim, blockDimKX, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdx); + updateKurgXATMGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdx); // //AddSlopeSourceXGPU <<< gridDim, blockDimKX, 0, XLoop.streams[0] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, 
XModel.flux, XModel.zb); CUDA_CHECK(cudaDeviceSynchronize()); // Y- direction - updateKurgYATMGPU << < gridDim, blockDimKY, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdy); + updateKurgYATMGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdy); // //AddSlopeSourceYGPU <<< gridDim, blockDimKY, 0, XLoop.streams[1] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); - // //updateKurgY << < XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >> > (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); + // //updateKurgY <<< XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >>> (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -185,12 +185,12 @@ template void FlowGPU(Param XParam, Loop& XLoop, Forcing XFo if (XParam.engine == 1) { // X- direction - UpdateButtingerXGPU << < gridDim, blockDimKX, 0 >> > (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); + UpdateButtingerXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); //updateKurgXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.zb); CUDA_CHECK(cudaDeviceSynchronize()); // Y- direction - UpdateButtingerYGPU << < gridDim, blockDimKY, 0 >> > (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); + UpdateButtingerYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); //updateKurgYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.zb); CUDA_CHECK(cudaDeviceSynchronize()); @@ -210,11 +210,11 @@ template void FlowGPU(Param XParam, Loop& XLoop, Forcing XFo { // // - updateKurgXATMGPU << < gridDim, blockDimKX, 0 >> > (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdx); + updateKurgXATMGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdx); CUDA_CHECK(cudaDeviceSynchronize()); // Y- direction - updateKurgYATMGPU << < gridDim, blockDimKY, 0 >> > (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdy); + updateKurgYATMGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv_o, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdy); CUDA_CHECK(cudaDeviceSynchronize()); } //============================================ @@ -273,19 +273,19 @@ template void FlowGPU(Param XParam, Loop& XLoop, Forcing XFo if (!XForcing.Rain.inputfile.empty()) { - AddrainforcingImplicitGPU << < gridDim, blockDim, 0 >> > (XParam,XLoop, XModel.blocks, XForcing.Rain, XModel.evolv); + 
AddrainforcingImplicitGPU <<< gridDim, blockDim, 0 >>> (XParam,XLoop, XModel.blocks, XForcing.Rain, XModel.evolv); CUDA_CHECK(cudaDeviceSynchronize()); } if (XParam.infiltration) { - AddinfiltrationImplicitGPU << < gridDim, blockDim, 0 >> > (XParam, XLoop, XModel.blocks, XModel.il, XModel.cl, XModel.evolv, XModel.hgw); + AddinfiltrationImplicitGPU <<< gridDim, blockDim, 0 >>> (XParam, XLoop, XModel.blocks, XModel.il, XModel.cl, XModel.evolv, XModel.hgw); CUDA_CHECK(cudaDeviceSynchronize()); } if (XParam.VelThreshold > 0.0) { - TheresholdVelGPU << < gridDim, blockDim, 0 >> > (XParam, XModel.blocks, XModel.evolv); + TheresholdVelGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.evolv); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -328,7 +328,7 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing if (XParam.atmpforcing) { //Update atm press forcing - AddPatmforcingGPU << < gridDim, blockDim, 0 >> > (XParam, XModel.blocks, XForcing.Atmp, XModel); + AddPatmforcingGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XForcing.Atmp, XModel); CUDA_CHECK(cudaDeviceSynchronize()); //Fill atmp halo @@ -339,7 +339,7 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing cudaStreamDestroy(atmpstreams[0]); //Calc dpdx and dpdy - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.Patm, XModel.datmpdx, XModel.datmpdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.Patm, XModel.datmpdx, XModel.datmpdy); CUDA_CHECK(cudaDeviceSynchronize()); gradientHaloGPU(XParam, XModel.blocks, XModel.Patm, XModel.datmpdx, XModel.datmpdy); // @@ -347,7 +347,7 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing refine_linearGPU(XParam, XModel.blocks, XModel.Patm, XModel.datmpdx, XModel.datmpdy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.Patm, XModel.datmpdx, XModel.datmpdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.Patm, XModel.datmpdx, XModel.datmpdy); CUDA_CHECK(cudaDeviceSynchronize()); gradientHaloGPU(XParam, XModel.blocks, XModel.Patm, XModel.datmpdx, XModel.datmpdy); @@ -367,7 +367,7 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing //============================================ // Reset DTmax - reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel.blocks.active, XLoop.hugeposval, XModel.time.dtmax); + reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel.blocks.active, XLoop.hugeposval, XModel.time.dtmax); CUDA_CHECK(cudaDeviceSynchronize()); //============================================ @@ -385,41 +385,41 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing if (XParam.engine == 1) { // X- direction - UpdateButtingerXGPU << < gridDim, blockDimKX, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); + UpdateButtingerXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); //updateKurgXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceXGPU <<< gridDim, blockDimKX, 0, XLoop.streams[0] >>> (XParam, XModel.blocks, 
XModel.evolv, XModel.grad, XModel.flux, XModel.zb); CUDA_CHECK(cudaDeviceSynchronize()); // Y- direction - UpdateButtingerYGPU << < gridDim, blockDimKY, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); + UpdateButtingerYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); //updateKurgYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceYGPU <<< gridDim, blockDimKY, 0, XLoop.streams[1] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); - // //updateKurgY << < XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >> > (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); + // //updateKurgY <<< XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >>> (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); CUDA_CHECK(cudaDeviceSynchronize()); } else if (XParam.engine == 2) { // X- direction - updateKurgXGPU << < gridDim, blockDimKX, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); + updateKurgXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceXGPU <<< gridDim, blockDimKX, 0, XLoop.streams[0] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); CUDA_CHECK(cudaDeviceSynchronize()); // Y- direction - updateKurgYGPU << < gridDim, blockDimKY, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); + updateKurgYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb); // //AddSlopeSourceYGPU <<< gridDim, blockDimKY, 0, XLoop.streams[1] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); - // //updateKurgY << < XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >> > (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); + // //updateKurgY <<< XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >>> (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); CUDA_CHECK(cudaDeviceSynchronize()); } else if (XParam.engine == 3) { // - updateKurgXATMGPU << < gridDim, blockDimKX, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdx); + updateKurgXATMGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdx); // //AddSlopeSourceXGPU <<< gridDim, blockDimKX, 0, XLoop.streams[0] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); CUDA_CHECK(cudaDeviceSynchronize()); // Y- direction - updateKurgYATMGPU << < gridDim, blockDimKY, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdy); + updateKurgYATMGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb, XModel.Patm, XModel.datmpdy); // //AddSlopeSourceYGPU <<< gridDim, blockDimKY, 0, XLoop.streams[1] >>> (XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.zb); - // 
//updateKurgY << < XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >> > (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); + // //updateKurgY <<< XLoop.gridDim, XLoop.blockDim, 0, XLoop.streams[0] >>> (XParam, XLoop.epsilon, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -437,7 +437,7 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing //============================================ // Update advection terms (dh dhu dhv) - updateEVGPU << < gridDim, blockDim, 0 >> > (XParam, XModel.blocks, XModel.evolv, XModel.flux, XModel.adv); + updateEVGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.evolv, XModel.flux, XModel.adv); CUDA_CHECK(cudaDeviceSynchronize()); //============================================ @@ -448,7 +448,7 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing //} if (!XForcing.UWind.inputfile.empty())//&& !XForcing.UWind.inputfile.empty() { - AddwindforcingGPU << < gridDim, blockDim, 0 >> > (XParam, XModel.blocks, XForcing.UWind, XForcing.VWind, XModel.adv); + AddwindforcingGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XForcing.UWind, XForcing.VWind, XModel.adv); CUDA_CHECK(cudaDeviceSynchronize()); } if (XForcing.rivers.size() > 0) @@ -458,13 +458,13 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing //============================================ //Update evolving variable by 1/2 time step - AdvkernelGPU << < gridDim, blockDim, 0 >> > (XParam, XModel.blocks, XModel.time.dt * T(0.5), XModel.zb, XModel.evolv, XModel.adv, XModel.evolv_o); + AdvkernelGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.time.dt * T(0.5), XModel.zb, XModel.evolv, XModel.adv, XModel.evolv_o); CUDA_CHECK(cudaDeviceSynchronize()); //============================================ // Add bottom friction - bottomfrictionGPU << < gridDim, blockDim, 0 >> > (XParam, XModel.blocks, XModel.time.dt, XModel.cf, XModel.evolv_o); + bottomfrictionGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.time.dt, XModel.cf, XModel.evolv_o); //XiafrictionGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.time.dt, XModel.cf, XModel.evolv, XModel.evolv_o); @@ -472,18 +472,18 @@ template void HalfStepGPU(Param XParam, Loop& XLoop, Forcing //============================================ //Copy updated evolving variable back - cleanupGPU << < gridDim, blockDim, 0 >> > (XParam, XModel.blocks, XModel.evolv_o, XModel.evolv); + cleanupGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.evolv_o, XModel.evolv); CUDA_CHECK(cudaDeviceSynchronize()); if (!XForcing.Rain.inputfile.empty()) { - AddrainforcingImplicitGPU << < gridDim, blockDim, 0 >> > (XParam, XLoop, XModel.blocks, XForcing.Rain, XModel.evolv); + AddrainforcingImplicitGPU <<< gridDim, blockDim, 0 >>> (XParam, XLoop, XModel.blocks, XForcing.Rain, XModel.evolv); CUDA_CHECK(cudaDeviceSynchronize()); } if (XParam.VelThreshold > 0.0) { - TheresholdVelGPU << < gridDim, blockDim, 0 >> > (XParam, XModel.blocks, XModel.evolv); + TheresholdVelGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel.blocks, XModel.evolv); CUDA_CHECK(cudaDeviceSynchronize()); } diff --git a/src/Gradients.cu b/src/Gradients.cu index 40610ff11..65dbce32a 100644 --- a/src/Gradients.cu +++ b/src/Gradients.cu @@ -25,20 +25,20 @@ template void gradientGPU(Param XParam, BlockPXBlock, EvolvingP dim3 gridDim(XParam.nblk, 1, 1); - //gradient << < gridDim, blockDim, 0, streams[1] >> > 
(XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx, XGrad.dhdy); + //gradient <<< gridDim, blockDim, 0, streams[1] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx, XGrad.dhdy); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[2] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); + //gradient <<< gridDim, blockDim, 0, streams[2] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[3] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx, XGrad.dudy); + //gradient <<< gridDim, blockDim, 0, streams[3] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx, XGrad.dudy); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[0] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx, XGrad.dvdy); + //gradient <<< gridDim, blockDim, 0, streams[0] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx, XGrad.dvdy); CUDA_CHECK(cudaDeviceSynchronize()); @@ -77,67 +77,67 @@ template void gradientGPU(Param XParam, BlockPXBlock, EvolvingP conserveElevationGPU(XParam, XBlock, XEv, zb); } - RecalculateZsGPU << < gridDim, blockDimfull, 0 >> > (XParam, XBlock, XEv, zb); + RecalculateZsGPU <<< gridDim, blockDimfull, 0 >>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[1] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx, XGrad.dhdy); + //gradient <<< gridDim, blockDim, 0, streams[1] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, 
XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx, XGrad.dhdy); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[2] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); + //gradient <<< gridDim, blockDim, 0, streams[2] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[3] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx, XGrad.dudy); + //gradient <<< gridDim, blockDim, 0, streams[3] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx, XGrad.dudy); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[0] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); - gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx, XGrad.dvdy); + //gradient <<< gridDim, blockDim, 0, streams[0] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); + gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx, XGrad.dvdy); CUDA_CHECK(cudaDeviceSynchronize()); /* - gradientedgeX << < gridDim, blockDimLR2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx); + gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeX << < gridDim, blockDimLR, 0 >> > (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx); + //gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeY << < gridDim, blockDimBT2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdy); + gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeY << < gridDim, blockDimBT, 0>> > (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdy); + //gradientedgeY <<< gridDim, blockDimBT, 0>>> (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, 
XEv.h, XGrad.dhdy); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeX << < gridDim, blockDimLR2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx); + gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeX << < gridDim, blockDimLR, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx); + //gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeY << < gridDim, blockDimBT2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdy); + gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeY << < gridDim, blockDimBT, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdy); + //gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdy); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeX << < gridDim, blockDimLR2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx); + gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeX << < gridDim, blockDimLR, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx); + //gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeY << < gridDim, blockDimBT2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudy); + gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeY << < gridDim, blockDimBT, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudy); + //gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudy); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeX << < gridDim, blockDimLR2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx); + gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeX << < gridDim, blockDimLR, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx); + //gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 
1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeY << < gridDim, blockDimBT2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdy); + gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeY << < gridDim, blockDimBT, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdy); + //gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdy); CUDA_CHECK(cudaDeviceSynchronize()); */ @@ -154,23 +154,23 @@ template void gradientGPU(Param XParam, BlockPXBlock, EvolvingP if (XParam.engine == 1) { // wet slope limiter - WetsloperesetXGPU << < gridDim, blockDim, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetXGPU <<< gridDim, blockDim, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetsloperesetYGPU << < gridDim, blockDim, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetYGPU <<< gridDim, blockDim, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); // ALso do the slope limiter on the halo - WetsloperesetHaloLeftGPU << < gridDim, blockDimLR, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetHaloLeftGPU <<< gridDim, blockDimLR, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetsloperesetHaloRightGPU << < gridDim, blockDimLR, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetHaloRightGPU <<< gridDim, blockDimLR, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetsloperesetHaloBotGPU << < gridDim, blockDimBT, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetHaloBotGPU <<< gridDim, blockDimBT, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetsloperesetHaloTopGPU << < gridDim, blockDimBT, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetHaloTopGPU <<< gridDim, blockDimBT, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -205,20 +205,20 @@ template void gradientGPUnew(Param XParam, BlockPXBlock, EvolvingP< CUDA_CHECK(cudaStreamCreate(&streams[i])); } - //gradient << < gridDim, blockDim, 0, streams[1] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); - gradientSMC << < gridDim, blockDim, 0, streams[0] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx, XGrad.dhdy); + //gradient <<< gridDim, blockDim, 0, streams[1] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); + gradientSMC <<< gridDim, blockDim, 0, streams[0] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx, XGrad.dhdy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[2] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); - gradientSMC << < gridDim, blockDim, 0, streams[1] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); + //gradient <<< gridDim, blockDim, 0, 
streams[2] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); + gradientSMC <<< gridDim, blockDim, 0, streams[1] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[3] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); - gradientSMC << < gridDim, blockDim, 0, streams[2] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx, XGrad.dudy); + //gradient <<< gridDim, blockDim, 0, streams[3] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); + gradientSMC <<< gridDim, blockDim, 0, streams[2] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx, XGrad.dudy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[0] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); - gradientSMC << < gridDim, blockDim, 0, streams[3] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx, XGrad.dvdy); + //gradient <<< gridDim, blockDim, 0, streams[0] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); + gradientSMC <<< gridDim, blockDim, 0, streams[3] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx, XGrad.dvdy); //CUDA_CHECK(cudaDeviceSynchronize()); @@ -259,24 +259,24 @@ template void gradientGPUnew(Param XParam, BlockPXBlock, EvolvingP< WetDryProlongationGPU(XParam, XBlock, XEv, zb); } - RecalculateZsGPU << < gridDim, blockDimfull, 0 >> > (XParam, XBlock, XEv, zb); + RecalculateZsGPU <<< gridDim, blockDimfull, 0 >>> (XParam, XBlock, XEv, zb); CUDA_CHECK(cudaDeviceSynchronize()); /* - //gradient << < gridDim, blockDim, 0, streams[1] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); - gradientSM << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); + //gradient <<< gridDim, blockDim, 0, streams[1] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); + gradientSM <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[2] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); - gradientSM << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); + //gradient <<< gridDim, blockDim, 0, streams[2] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); + gradientSM <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[3] >> > (XParam.halowidth, 
XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); - gradientSM << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); + //gradient <<< gridDim, blockDim, 0, streams[3] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); + gradientSM <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy); CUDA_CHECK(cudaDeviceSynchronize()); - //gradient << < gridDim, blockDim, 0, streams[0] >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); - gradientSM << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); + //gradient <<< gridDim, blockDim, 0, streams[0] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); + gradientSM <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy); CUDA_CHECK(cudaDeviceSynchronize()); */ /* @@ -292,44 +292,44 @@ template void gradientGPUnew(Param XParam, BlockPXBlock, EvolvingP< - gradientedgeX << < gridDim, blockDimLR2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx); + gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeX << < gridDim, blockDimLR, 0 >> > (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx); + //gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeY << < gridDim, blockDimBT2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdy); + gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeY << < gridDim, blockDimBT, 0>> > (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdy); + //gradientedgeY <<< gridDim, blockDimBT, 0>>> (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdy); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeX << < gridDim, blockDimLR2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx); + gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeX << < gridDim, blockDimLR, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx); + //gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeY << < 
gridDim, blockDimBT2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdy); + gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeY << < gridDim, blockDimBT, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdy); + //gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdy); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeX << < gridDim, blockDimLR2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx); + gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeX << < gridDim, blockDimLR, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx); + //gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeY << < gridDim, blockDimBT2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudy); + gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeY << < gridDim, blockDimBT, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudy); + //gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudy); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeX << < gridDim, blockDimLR2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx); + gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeX << < gridDim, blockDimLR, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx); + //gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx); //CUDA_CHECK(cudaDeviceSynchronize()); - gradientedgeY << < gridDim, blockDimBT2, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdy); + gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdy); //CUDA_CHECK(cudaDeviceSynchronize()); - //gradientedgeY << < gridDim, blockDimBT, 0 >> > (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdy); + //gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdy); 
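These gradient and gradientedge kernels all evaluate a slope-limited derivative; below is a compact sketch of the generalised minmod limiter that a kernel taking (theta, delta) typically applies, with theta = 1 the most dissipative and theta = 2 the least (assumed form and hypothetical name, not code from this diff):

template <class T>
__host__ __device__ T minmodslopesketch(T theta, T delta, T am, T a, T ap)
{
	// Candidate slopes: left one-sided, central, right one-sided.
	T d1 = theta * (a - am);
	T d2 = T(0.5) * (ap - am);
	T d3 = theta * (ap - a);

	// Keep the smallest-magnitude candidate only if all three agree in sign.
	T s = T(0.0);
	if (d1 > T(0.0) && d2 > T(0.0) && d3 > T(0.0))
		s = d1 < d2 ? (d1 < d3 ? d1 : d3) : (d2 < d3 ? d2 : d3);
	else if (d1 < T(0.0) && d2 < T(0.0) && d3 < T(0.0))
		s = d1 > d2 ? (d1 > d3 ? d1 : d3) : (d2 > d3 ? d2 : d3);

	return s / delta; // limited estimate of da/dx at the cell centre
}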
CUDA_CHECK(cudaDeviceSynchronize()); /* @@ -351,23 +351,23 @@ template void gradientGPUnew(Param XParam, BlockPXBlock, EvolvingP< if (XParam.engine == 1) { // wet slope limiter - WetsloperesetXGPU << < gridDim, blockDim, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetXGPU <<< gridDim, blockDim, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetsloperesetYGPU << < gridDim, blockDim, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetYGPU <<< gridDim, blockDim, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); // ALso do the slope limiter on the halo - WetsloperesetHaloLeftGPU << < gridDim, blockDimLR, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetHaloLeftGPU <<< gridDim, blockDimLR, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetsloperesetHaloRightGPU << < gridDim, blockDimLR, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetHaloRightGPU <<< gridDim, blockDimLR, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetsloperesetHaloBotGPU << < gridDim, blockDimBT, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetHaloBotGPU <<< gridDim, blockDimBT, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); - WetsloperesetHaloTopGPU << < gridDim, blockDimBT, 0 >> > (XParam, XBlock, XEv, XGrad, zb); + WetsloperesetHaloTopGPU <<< gridDim, blockDimBT, 0 >>> (XParam, XBlock, XEv, XGrad, zb); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -2358,16 +2358,16 @@ template void gradientHaloGPU(Param XParam, BlockPXBlock, T* a, T* dim3 gridDim(XParam.nblk, 1, 1); - gradientHaloLeftGPU << < gridDim, blockDimL, 0 >> > (XParam, XBlock, a, dadx, dady); + gradientHaloLeftGPU <<< gridDim, blockDimL, 0 >>> (XParam, XBlock, a, dadx, dady); CUDA_CHECK(cudaDeviceSynchronize()); - gradientHaloRightGPU << < gridDim, blockDimL, 0 >> > (XParam, XBlock, a, dadx, dady); + gradientHaloRightGPU <<< gridDim, blockDimL, 0 >>> (XParam, XBlock, a, dadx, dady); CUDA_CHECK(cudaDeviceSynchronize()); - gradientHaloBotGPU << < gridDim, blockDimB, 0 >> > (XParam, XBlock, a, dadx, dady); + gradientHaloBotGPU <<< gridDim, blockDimB, 0 >>> (XParam, XBlock, a, dadx, dady); CUDA_CHECK(cudaDeviceSynchronize()); - gradientHaloTopGPU << < gridDim, blockDimB, 0 >> > (XParam, XBlock, a, dadx, dady); + gradientHaloTopGPU <<< gridDim, blockDimB, 0 >>> (XParam, XBlock, a, dadx, dady); CUDA_CHECK(cudaDeviceSynchronize()); @@ -2389,16 +2389,16 @@ template void gradientHaloGPUnew(Param XParam, BlockPXBlock, T* a, dim3 gridDim(ceil(XParam.nblk/2), 1, 1); - gradientHaloLeftGPUnew << < gridDim, blockDimL, 0, streams[0] >> > (XParam, XBlock, a, dadx, dady); + gradientHaloLeftGPUnew <<< gridDim, blockDimL, 0, streams[0] >>> (XParam, XBlock, a, dadx, dady); - gradientHaloRightGPUnew << < gridDim, blockDimL, 0, streams[1] >> > (XParam, XBlock, a, dadx, dady); + gradientHaloRightGPUnew <<< gridDim, blockDimL, 0, streams[1] >>> (XParam, XBlock, a, dadx, dady); - gradientHaloBotGPUnew << < gridDim, blockDimB, 0, streams[2] >> > (XParam, XBlock, a, dadx, dady); + gradientHaloBotGPUnew <<< gridDim, blockDimB, 0, streams[2] >>> (XParam, XBlock, a, dadx, dady); - gradientHaloTopGPUnew << < gridDim, blockDimB, 0, streams[3] >> > (XParam, XBlock, a, dadx, dady); + gradientHaloTopGPUnew <<< gridDim, blockDimB, 0, streams[3] >>> (XParam, XBlock, a, dadx, dady); //CUDA_CHECK(cudaDeviceSynchronize()); diff --git a/src/Halo.cu b/src/Halo.cu index a867ff832..93ea8aeef 100644 --- a/src/Halo.cu +++ 
b/src/Halo.cu @@ -214,17 +214,17 @@ template void fillHaloGPU(Param XParam, BlockP XBlock, cudaStream_t dim3 blockDimHaloBT(XParam.blkwidth, 1, 1); dim3 gridDim(XParam.nblk, 1, 1); - fillLeft << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z); - //fillLeft << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z); + fillLeft <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z); + //fillLeft <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z); CUDA_CHECK(cudaDeviceSynchronize()); - fillRight << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); - //fillRight << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); + fillRight <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); + //fillRight <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z); CUDA_CHECK(cudaDeviceSynchronize()); - fillBot << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z); - //fillBot << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z); + fillBot <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z); + //fillBot <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z); CUDA_CHECK(cudaDeviceSynchronize()); - fillTop << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); - //fillTop << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); + fillTop <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); + //fillTop <<>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z); CUDA_CHECK(cudaDeviceSynchronize()); //CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -244,17 +244,17 @@ template void fillHaloGPUnew(Param XParam, BlockP XBlock, cudaStrea dim3 blockDimHaloBTx2(XParam.blkwidth, 2, 1); dim3 gridDimx2(ceil(XParam.nblk/2), 1, 1); - //fillLeftnew << > > (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z); - fillLeft << > > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z); + //fillLeftnew <<>> (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z); + fillLeft <<>> (XParam.halowidth, XBlock.active, XBlock.level, 
XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
-	//fillRightnew << <gridDim, blockDimHaloLR, 0, stream >> > (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
-	fillRight << <gridDim, blockDimHaloLR, 0, stream >> > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
+	//fillRightnew <<<gridDim, blockDimHaloLR, 0, stream>>> (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
+	fillRight <<<gridDim, blockDimHaloLR, 0, stream>>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
-	//fillBotnew << <gridDim, blockDimHaloBT, 0, stream >> > (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z);
-	fillBot << <gridDim, blockDimHaloBT, 0, stream >> > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z);
+	//fillBotnew <<<gridDim, blockDimHaloBT, 0, stream>>> (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z);
+	fillBot <<<gridDim, blockDimHaloBT, 0, stream>>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
-	//fillTopnew << <gridDim, blockDimHaloBT, 0, stream >> > (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
-	fillTop << <gridDim, blockDimHaloBT, 0, stream >> > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
+	//fillTopnew <<<gridDim, blockDimHaloBT, 0, stream>>> (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
+	fillTop <<<gridDim, blockDimHaloBT, 0, stream>>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
 	//CUDA_CHECK(cudaStreamSynchronize(stream));
@@ -350,13 +350,13 @@ template <class T> void fillHaloTopRightGPU(Param XParam, BlockP<T> XBlock, cudaStream_t stream, T* z)
 
 	dim3 gridDim(XParam.nblk, 1, 1);
 
-	//fillLeft << <gridDim, blockDimHaloLR, 0, stream >> > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, a);
-	//fillRightFlux << <gridDim, blockDimHaloLR, 0, stream >> > (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
-	HaloFluxGPULR << <gridDim, blockDimHaloLR, 0, stream >> > (XParam, XBlock, z);
+	//fillLeft <<<gridDim, blockDimHaloLR, 0, stream>>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, a);
+	//fillRightFlux <<<gridDim, blockDimHaloLR, 0, stream>>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
+	HaloFluxGPULR <<<gridDim, blockDimHaloLR, 0, stream>>> (XParam, XBlock, z);
 
-	//fillBot << <gridDim, blockDimHaloBT, 0, stream >> > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, a);
-	//fillTopFlux << <gridDim, blockDimHaloBT, 0, stream >> > (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
-	HaloFluxGPUBT << <gridDim, blockDimHaloBT, 0, stream >> > (XParam, XBlock, z);
+	//fillBot <<<gridDim, blockDimHaloBT, 0, stream>>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, a);
+	//fillTopFlux <<<gridDim, blockDimHaloBT, 0, stream>>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
+	HaloFluxGPUBT <<<gridDim, blockDimHaloBT, 0, stream>>> (XParam, XBlock, z);
 
 	CUDA_CHECK(cudaStreamSynchronize(stream));
 }
@@ -371,13 +371,13 @@ template <class T> void fillHaloLeftRightGPU(Param XParam, BlockP<T> XBlock, cudaStream_t stream, T* z)
 
 	dim3 gridDim(XParam.nblk, 1, 1);
 
-	//fillLeft << <gridDim, blockDimHaloLR, 0, stream >> > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, a);
-	//fillRightFlux << <gridDim, blockDimHaloLR, 0, stream >> > (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
-	HaloFluxGPULR << <gridDim, blockDimHaloLR, 0, stream >> > (XParam, XBlock, z);
+	//fillLeft <<<gridDim, blockDimHaloLR, 0, stream>>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, a);
+	//fillRightFlux <<<gridDim, blockDimHaloLR, 0, stream>>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
+	HaloFluxGPULR <<<gridDim, blockDimHaloLR, 0, stream>>> (XParam, XBlock, z);
 
-	//fillBot << <gridDim, blockDimHaloBT, 0, stream >> > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, a);
-	//fillTopFlux << <gridDim, blockDimHaloBT, 0, stream >> > (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
-	//HaloFluxGPUBT << <gridDim, blockDimHaloBT, 0, stream >> > (XParam, XBlock, z);
+	//fillBot <<<gridDim, blockDimHaloBT, 0, stream>>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, a);
+	//fillTopFlux <<<gridDim, blockDimHaloBT, 0, stream>>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
+	//HaloFluxGPUBT <<<gridDim, blockDimHaloBT, 0, stream>>> (XParam, XBlock, z);
 
 	//CUDA_CHECK(cudaStreamSynchronize(stream));
 }
@@ -391,7 +391,7 @@ template <class T> void fillHaloLeftRightGPUnew(Param XParam, BlockP<T> XBlock, cudaStream_t stream, T* z)
 	//dim3 blockDimHaloBT(16, 1, 1);
 	dim3 gridDim(ceil(XParam.nblk/2), 1, 1);
 
-	HaloFluxGPULRnew << <gridDim, blockDimHaloLR, 0, stream >> > (XParam, XBlock, z);
+	HaloFluxGPULRnew <<<gridDim, blockDimHaloLR, 0, stream>>> (XParam, XBlock, z);
 
 
 
@@ -407,13 +407,13 @@ template <class T> void fillHaloBotTopGPU(Param XParam, BlockP<T> XBlock, cudaStream_t stream, T* z)
 
 	dim3 gridDim(XParam.nblk, 1, 1);
 
-	//fillLeft << <gridDim, blockDimHaloLR, 0, stream >> > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, a);
-	//fillRightFlux << <gridDim, blockDimHaloLR, 0, stream >> > (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
-	//HaloFluxGPULR << <gridDim, blockDimHaloLR, 0, stream >> > (XParam, XBlock, z);
+	//fillLeft <<<gridDim, blockDimHaloLR, 0, stream>>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, a);
+	//fillRightFlux <<<gridDim, blockDimHaloLR, 0, stream>>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
+	//HaloFluxGPULR <<<gridDim, blockDimHaloLR, 0, stream>>> (XParam, XBlock, z);
 
-	//fillBot << <gridDim, blockDimHaloBT, 0, stream >> > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, a);
-	//fillTopFlux << <gridDim, blockDimHaloBT, 0, stream >> > (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
-	HaloFluxGPUBT << <gridDim, blockDimHaloBT, 0, stream >> > (XParam, XBlock, z);
+	//fillBot <<<gridDim, blockDimHaloBT, 0, stream>>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, a);
+	//fillTopFlux <<<gridDim, blockDimHaloBT, 0, stream>>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
+	HaloFluxGPUBT <<<gridDim, blockDimHaloBT, 0, stream>>> (XParam, XBlock, z);
 
 	//CUDA_CHECK(cudaStreamSynchronize(stream));
 }
@@ -428,7 +428,7 @@ template <class T> void fillHaloBotTopGPUnew(Param XParam, BlockP<T> XBlock, cudaStream_t stream, T* z)
 	dim3 gridDim(ceil(XParam.nblk/2), 1, 1);
 
 
-	HaloFluxGPUBTnew << <gridDim, blockDimHaloBT, 0, stream >> > (XParam, XBlock, z);
+	HaloFluxGPUBTnew <<<gridDim, blockDimHaloBT, 0, stream>>> (XParam, XBlock, z);
 
 }
@@ -549,15 +549,15 @@ template <class T> void fillHaloGPU(Param XParam, BlockP<T> XBlock, EvolvingP<T> Xev, T* zb)
 		WetDryRestrictionGPU(XParam, XBlock, Xev, zb);
 	}
 
-	RecalculateZsGPU << < gridDimfull, blockDimfull, 0 >> > (XParam, XBlock, Xev, zb);
+	RecalculateZsGPU <<< gridDimfull, blockDimfull, 0 >>> (XParam, XBlock, Xev, zb);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
 	//if (XBlock.mask.nblk > 0)
 	//{
-	//	maskbndGPUleft << <gridDim, blockDim, 0 >> > (XParam, XBlock, Xev, zb);
-	//	maskbndGPUtop << <gridDim, blockDim, 0 >> > (XParam, XBlock, Xev, zb);
-	//	maskbndGPUright << <gridDim, blockDim, 0 >> > (XParam, XBlock, Xev, zb);
-	//	maskbndGPUtop << <gridDim, blockDim, 0 >> > (XParam, XBlock, Xev, zb);
+	//	maskbndGPUleft <<<gridDim, blockDim, 0>>> (XParam, XBlock, Xev, zb);
+	//	maskbndGPUtop <<<gridDim, blockDim, 0>>> (XParam, XBlock, Xev, zb);
+	//	maskbndGPUright <<<gridDim, blockDim, 0>>> (XParam, XBlock, Xev, zb);
+	//	maskbndGPUtop <<<gridDim, blockDim, 0>>> (XParam, XBlock, Xev, zb);
 	//	//CUDA_CHECK(cudaDeviceSynchronize());
 	//}
@@ -716,10 +716,10 @@ template <class T> void fillHaloGPU(Param XParam, BlockP<T> XBlock, FluxP<T> Flux)
 	// Below has now moved to its own function
 	//if (XBlock.mask.nblk > 0)
 	//{
-	//	maskbndGPUFluxleft << <gridDim, blockDim, 0 >> > (XParam, XBlock, Flux);
-	//	maskbndGPUFluxtop << <gridDim, blockDim, 0 >> > (XParam, XBlock, Flux);
-	//	maskbndGPUFluxright << <gridDim, blockDim, 0 >> > (XParam, XBlock, Flux);
-	//	maskbndGPUFluxbot << <gridDim, blockDim, 0 >> > (XParam, XBlock, Flux);
+	//	maskbndGPUFluxleft <<<gridDim, blockDim, 0>>> (XParam, XBlock, Flux);
+	//	maskbndGPUFluxtop <<<gridDim, blockDim, 0>>> (XParam, XBlock, Flux);
+	//	maskbndGPUFluxright <<<gridDim, blockDim, 0>>> (XParam, XBlock, Flux);
+	//	maskbndGPUFluxbot <<<gridDim, blockDim, 0>>> (XParam, XBlock, Flux);
 	//	//CUDA_CHECK(cudaDeviceSynchronize());
 	//}
@@ -750,10 +750,10 @@ template <class T> void bndmaskGPU(Param XParam, BlockP<T> XBlock, EvolvingP<T> Xev, FluxP<T> Flux)
 	dim3 gridDim(XBlock.mask.nblk, 1, 1);
 	if (XBlock.mask.nblk > 0)
 	{
-		maskbndGPUFluxleft << <gridDim, blockDim, 0 >> > (XParam, XBlock, Xev, Flux);
-		maskbndGPUFluxtop << <gridDim, blockDim, 0 >> > (XParam, XBlock, Flux);
-		maskbndGPUFluxright << <gridDim, blockDim, 0 >> > (XParam, XBlock, Flux);
-		maskbndGPUFluxbot << <gridDim, blockDim, 0 >> > (XParam, XBlock, Flux);
+		maskbndGPUFluxleft <<<gridDim, blockDim, 0>>> (XParam, XBlock, Xev, Flux);
+		maskbndGPUFluxtop <<<gridDim, blockDim, 0>>> (XParam, XBlock, Flux);
+		maskbndGPUFluxright <<<gridDim, blockDim, 0>>> (XParam, XBlock, Flux);
+		maskbndGPUFluxbot <<<gridDim, blockDim, 0>>> (XParam, XBlock, Flux);
 		//CUDA_CHECK(cudaDeviceSynchronize());
 	}
@@ -1071,9 +1071,9 @@ template <class T> void refine_linearGPU(Param XParam, BlockP<T> XBlock, T* z, T* dzdx, T* dzdy)
 	dim3 gridDim(XParam.nblk, 1, 1);
 
 	refine_linear_LeftGPU<<<gridDim, blockDim, 0>>>(XParam, XBlock, z, dzdx, dzdy);
-	refine_linear_RightGPU << <gridDim, blockDim, 0 >> > (XParam, XBlock, z, dzdx, dzdy);
-	refine_linear_TopGPU << <gridDim, blockDim, 0 >> > (XParam, XBlock, z, dzdx, dzdy);
-	refine_linear_BotGPU << <gridDim, blockDim, 0 >> > (XParam, XBlock, z, dzdx, dzdy);
+	refine_linear_RightGPU <<<gridDim, blockDim, 0>>> (XParam, XBlock, z, dzdx, dzdy);
+	refine_linear_TopGPU <<<gridDim, blockDim, 0>>> (XParam, XBlock, z, dzdx, dzdy);
+	refine_linear_BotGPU <<<gridDim, blockDim, 0>>> (XParam, XBlock, z, dzdx, dzdy);
 
 	CUDA_CHECK(cudaDeviceSynchronize());
 }
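Note on the Halo.cu hunks above: they only normalise the spaced kernel-launch spelling "<< < ... >> >" to the single standard "<<<grid, block, sharedMem, stream>>>" token; no arguments change. A minimal, self-contained sketch of the launch pattern these functions rely on, one CUDA block per mesh block and one thread per cell (the kernel name and sizes below are hypothetical stand-ins, not BG_Flood code):

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical kernel standing in for fillLeft/fillRight/etc.:
// blockIdx.x selects the mesh block, threadIdx selects the cell within it.
__global__ void demoKernel(int blkwidth, float* z)
{
    int ib = blockIdx.x;                     // mesh-block index
    int ix = threadIdx.x, iy = threadIdx.y;  // cell within the block
    int n = ib * blkwidth * blkwidth + iy * blkwidth + ix;
    z[n] += 1.0f;
}

int main()
{
    int nblk = 4, blkwidth = 16;
    float* z;
    cudaMalloc(&z, nblk * blkwidth * blkwidth * sizeof(float));
    cudaMemset(z, 0, nblk * blkwidth * blkwidth * sizeof(float));

    dim3 blockDim(blkwidth, blkwidth, 1);
    dim3 gridDim(nblk, 1, 1);

    // Standard launch token: <<<grid, block, sharedMem, stream>>>.
    demoKernel<<<gridDim, blockDim, 0, 0>>>(blkwidth, z);
    printf("launch status: %s\n", cudaGetErrorString(cudaGetLastError()));
    cudaDeviceSynchronize();
    cudaFree(z);
    return 0;
}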
diff --git a/src/InitialConditions.cu b/src/InitialConditions.cu
index fcbcc65dd..bb77eac22 100644
--- a/src/InitialConditions.cu
+++ b/src/InitialConditions.cu
@@ -144,14 +144,14 @@ template <class T> void InitzbgradientGPU(Param XParam, Model<T> XModel)
 
 	cudaStreamDestroy(streams[0]);
 
-	gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.zb, XModel.grad.dzbdx, XModel.grad.dzbdy);
+	gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.zb, XModel.grad.dzbdx, XModel.grad.dzbdy);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
 	gradientHaloGPU(XParam, XModel.blocks, XModel.zb, XModel.grad.dzbdx, XModel.grad.dzbdy);
 
 	refine_linearGPU(XParam, XModel.blocks, XModel.zb, XModel.grad.dzbdx, XModel.grad.dzbdy);
 
-	gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.zb, XModel.grad.dzbdx, XModel.grad.dzbdy);
+	gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel.blocks.active, XModel.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel.zb, XModel.grad.dzbdx, XModel.grad.dzbdy);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
 	gradientHaloGPU(XParam, XModel.blocks, XModel.zb, XModel.grad.dzbdx, XModel.grad.dzbdy);
diff --git a/src/Mainloop.cu b/src/Mainloop.cu
index 8f5503720..e9e69dd1d 100755
--- a/src/Mainloop.cu
+++ b/src/Mainloop.cu
@@ -100,7 +100,7 @@ template <class T> void DebugLoop(Param& XParam, Forcing<float> XForcing, Model<T> XModel, Model<T> XModel_g)
 	fillHaloGPU(XParam, XModel_g.blocks, XLoop.streams[0], XModel_g.zb);
 	cudaStreamDestroy(XLoop.streams[0]);
 
-	gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.zb, XModel_g.grad.dzbdx, XModel_g.grad.dzbdy);
+	gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.zb, XModel_g.grad.dzbdx, XModel_g.grad.dzbdy);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
 	gradientHaloGPU(XParam, XModel_g.blocks, XModel_g.zb, XModel_g.grad.dzbdx, XModel_g.grad.dzbdy);
@@ -280,7 +280,7 @@ template <class T> void pointoutputstep(Param XParam, Loop<T> &XLoop, Model<T> XModel, Model<T> XModel_g)
 
 				XLoop.TSAllout[o].push_back(stepread);
 
-				storeTSout << <gridDim, blockDim, 0 >> > (XParam,(int)XParam.TSnodesout.size(), o, XLoop.nTSsteps, XParam.TSnodesout[o].block, XParam.TSnodesout[o].i, XParam.TSnodesout[o].j, XModel_g.bndblk.Tsout, XModel_g.evolv, XModel_g.TSstore);
+				storeTSout <<<gridDim, blockDim, 0>>> (XParam,(int)XParam.TSnodesout.size(), o, XLoop.nTSsteps, XParam.TSnodesout[o].block, XParam.TSnodesout[o].i, XParam.TSnodesout[o].j, XModel_g.bndblk.Tsout, XModel_g.evolv, XModel_g.TSstore);
 				CUDA_CHECK(cudaDeviceSynchronize());
 			}
 			//CUDA_CHECK(cudaDeviceSynchronize());
diff --git a/src/Meanmax.cu b/src/Meanmax.cu
index 777227126..58996dc04 100755
--- a/src/Meanmax.cu
+++ b/src/Meanmax.cu
@@ -12,11 +12,11 @@ template <class T> void Calcmeanmax(Param XParam, Loop<T>& XLoop, Model<T> XModel, Model<T> XModel_g)
 	{
 		if (XParam.GPUDEVICE >= 0)
 		{
-			addavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmean.h, XModel_g.evolv.h);
-			addavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmean.zs, XModel_g.evolv.zs);
-			addavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmean.u, XModel_g.evolv.u);
-			addavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmean.v, XModel_g.evolv.v);
-			addUandhU_GPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evolv.h, XModel_g.evolv.u, XModel_g.evolv.v, XModel_g.evmean.U, XModel_g.evmean.hU);
+			addavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmean.h, XModel_g.evolv.h);
+			addavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmean.zs, XModel_g.evolv.zs);
+			addavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmean.u, XModel_g.evolv.u);
+			addavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmean.v, XModel_g.evolv.v);
+			addUandhU_GPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evolv.h, XModel_g.evolv.u, XModel_g.evolv.v, XModel_g.evmean.U, XModel_g.evmean.hU);
 			CUDA_CHECK(cudaDeviceSynchronize());
 		}
@@ -39,12 +39,12 @@ template <class T> void Calcmeanmax(Param XParam, Loop<T>& XLoop, Model<T> XModel, Model<T> XModel_g)
 		// devide by number of steps
 		if (XParam.GPUDEVICE >= 0)
 		{
-			divavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.h);
-			divavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.zs);
-			divavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.u);
-			divavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.v);
-			divavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.U);
-			divavg_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.hU);
+			divavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.h);
+			divavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.zs);
+			divavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.u);
+			divavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.v);
+			divavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.U);
+			divavg_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, T(XLoop.nstep), XModel_g.evmean.hU);
 			CUDA_CHECK(cudaDeviceSynchronize());
 		}
 		else
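The addavg_varGPU / divavg_varGPU pairing above implements a running mean: every step adds the instantaneous field into evmean, and the accumulated sum is divided by the step count when an output is due. A host-side sketch of the same accumulate-then-normalise idea (plain C++, hypothetical data, not the model's kernels):

#include <cstdio>
#include <vector>

int main()
{
    std::vector<double> evolv = { 1.0, 2.0, 3.0 };  // instantaneous field (would change every step)
    std::vector<double> evmean(evolv.size(), 0.0);  // running sum, reset after each output
    int nstep = 0;

    for (int step = 0; step < 10; step++)
    {
        // addavg: evmean += evolv
        for (size_t i = 0; i < evolv.size(); i++) evmean[i] += evolv[i];
        nstep++;
    }
    // divavg: evmean /= nstep, then the field is written out and reset
    for (size_t i = 0; i < evmean.size(); i++) evmean[i] /= nstep;
    printf("mean of first cell: %f\n", evmean[0]);
    return 0;
}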
@@ -65,12 +65,12 @@ template <class T> void Calcmeanmax(Param XParam, Loop<T>& XLoop, Model<T> XModel, Model<T> XModel_g)
 	{
 		if (XParam.GPUDEVICE >= 0)
 		{
-			max_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmax.h, XModel_g.evolv.h);
-			max_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmax.zs, XModel_g.evolv.zs);
-			max_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmax.u, XModel_g.evolv.u);
-			max_varGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmax.v, XModel_g.evolv.v);
-			max_Norm_GPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmax.U, XModel_g.evolv.u, XModel_g.evolv.v);
-			max_hU_GPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evmax.hU, XModel_g.evolv.h, XModel_g.evolv.u, XModel_g.evolv.v);
+			max_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmax.h, XModel_g.evolv.h);
+			max_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmax.zs, XModel_g.evolv.zs);
+			max_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmax.u, XModel_g.evolv.u);
+			max_varGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmax.v, XModel_g.evolv.v);
+			max_Norm_GPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmax.U, XModel_g.evolv.u, XModel_g.evolv.v);
+			max_hU_GPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evmax.hU, XModel_g.evolv.h, XModel_g.evolv.u, XModel_g.evolv.v);
 			CUDA_CHECK(cudaDeviceSynchronize());
 		}
 		else
@@ -88,7 +88,7 @@ template <class T> void Calcmeanmax(Param XParam, Loop<T>& XLoop, Model<T> XModel, Model<T> XModel_g)
 		if (XParam.GPUDEVICE >= 0)
 		{
 			// Add value GPU
-			addwettime_GPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.wettime, XModel_g.evolv.h, T(XParam.wet_threshold), T(XLoop.dt));
+			addwettime_GPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.wettime, XModel_g.evolv.h, T(XParam.wet_threshold), T(XLoop.dt));
 		}
 		else
 		{
@@ -169,17 +169,17 @@ template <class T> void resetmaxGPU(Param XParam, Loop<T> XLoop, BlockP<T> XBlock
 	dim3 blockDim(XParam.blkwidth, XParam.blkwidth, 1);
 	dim3 gridDim(XParam.nblk, 1, 1);
 
-	reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.h);
+	reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.h);
 	CUDA_CHECK(cudaDeviceSynchronize());
-	reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.zs);
+	reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.zs);
 	CUDA_CHECK(cudaDeviceSynchronize());
-	reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.u);
+	reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.u);
 	CUDA_CHECK(cudaDeviceSynchronize());
-	reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.v);
+	reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.v);
 	CUDA_CHECK(cudaDeviceSynchronize());
-	reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.U);
+	reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.U);
 	CUDA_CHECK(cudaDeviceSynchronize());
-	reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.hU);
+	reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XLoop.hugenegval, XEv.hU);
 	CUDA_CHECK(cudaDeviceSynchronize());
 }
@@ -217,12 +217,12 @@ template <class T> void resetmeanGPU(Param XParam, Loop<T> XLoop, BlockP<T> XBlock
 	dim3 blockDim(XParam.blkwidth, XParam.blkwidth, 1);
 	dim3 gridDim(XParam.nblk, 1, 1);
 	//
-	reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, T(0.0), XEv.h);
-	reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, T(0.0), XEv.zs);
-	reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, T(0.0), XEv.u);
-	reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, T(0.0), XEv.v);
-	reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, T(0.0), XEv.U);
-	reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, T(0.0), XEv.hU);
+	reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, T(0.0), XEv.h);
+	reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, T(0.0), XEv.zs);
+	reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, T(0.0), XEv.u);
+	reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, T(0.0), XEv.v);
+	reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, T(0.0), XEv.U);
+	reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, T(0.0), XEv.hU);
 
 	CUDA_CHECK(cudaDeviceSynchronize());
@@ -244,7 +244,7 @@ template <class T> void resetvalGPU(Param XParam, BlockP<T> XBlock, T*& var, T val)
 {
 	dim3 blockDim(XParam.blkwidth, XParam.blkwidth, 1);
 	dim3 gridDim(XParam.nblk, 1, 1);
-	reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XBlock.active, val, var);
+	reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, val, var);
 	CUDA_CHECK(cudaDeviceSynchronize());
 }
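resetmaxGPU above seeds every evmax field with XLoop.hugenegval so that the first max_varGPU pass always overwrites the sentinel. The same pattern in miniature, host-side (hypothetical data):

#include <algorithm>
#include <cstdio>
#include <limits>

int main()
{
    // reset_var(..., hugenegval, ...): seed with a huge negative value
    const double hugenegval = std::numeric_limits<double>::lowest();
    double hmax = hugenegval;

    // max_var update at successive steps always beats the sentinel first time
    double hsteps[] = { 0.2, 1.7, 0.9 };
    for (double h : hsteps) hmax = std::max(hmax, h);

    printf("hmax = %f\n", hmax);  // prints 1.700000
    return 0;
}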
diff --git a/src/Testing.cu b/src/Testing.cu
index 8784b33f4..1be1048b7 100644
--- a/src/Testing.cu
+++ b/src/Testing.cu
@@ -1203,7 +1203,7 @@ template <class T> bool reductiontest(Param XParam, Model<T> XModel, Model<T> XModel_g)
 
 	if (XParam.GPUDEVICE >= 0)
 	{
-		reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, XLoop.hugeposval, XModel_g.time.dtmax);
+		reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, XLoop.hugeposval, XModel_g.time.dtmax);
 		CUDA_CHECK(cudaDeviceSynchronize());
 
 		CopytoGPU(XParam.nblkmem, XParam.blksize, XModel.time.dtmax, XModel_g.time.dtmax);
@@ -1258,7 +1258,7 @@ template <class T> bool CPUGPUtest(Param XParam, Model<T> XModel, Model<T> XModel_g)
 	InitArrayBUQ(XParam, XModel.blocks, T(0.0), XModel.evolv.v);
 
-	reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, T(0.0), XModel_g.zb);
+	reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, T(0.0), XModel_g.zb);
 	CUDA_CHECK(cudaDeviceSynchronize());
 	// Create some usefull vectors
 	std::string evolvst[4] = { "h","zs","u","v" };
@@ -1314,15 +1314,15 @@ template <class T> bool CPUGPUtest(Param XParam, Model<T> XModel, Model<T> XModel_g)
 
 	//GPU gradients
-	gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.evolv.h, XModel_g.grad.dhdx, XModel_g.grad.dhdy);
+	gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.evolv.h, XModel_g.grad.dhdx, XModel_g.grad.dhdy);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
-	gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.evolv.zs, XModel_g.grad.dzsdx, XModel_g.grad.dzsdy);
+	gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.evolv.zs, XModel_g.grad.dzsdx, XModel_g.grad.dzsdy);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
-	gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.evolv.u, XModel_g.grad.dudx, XModel_g.grad.dudy);
+	gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.evolv.u, XModel_g.grad.dudx, XModel_g.grad.dudy);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
-	gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.evolv.v, XModel_g.grad.dvdx, XModel_g.grad.dvdy);
+	gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.evolv.v, XModel_g.grad.dvdx, XModel_g.grad.dvdy);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
 	std::string gradst[8] = { "dhdx","dzsdx","dudx","dvdx","dhdy","dzsdy","dudy","dvdy" };
@@ -1367,14 +1367,14 @@ template <class T> bool CPUGPUtest(Param XParam, Model<T> XModel, Model<T> XModel_g)
 	updateKurgXCPU(XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb);
 
 	//GPU part
-	updateKurgXGPU << < gridDim, blockDimKX, 0 >> > (XParam, XModel_g.blocks, XModel_g.evolv, XModel_g.grad, XModel_g.flux, XModel_g.time.dtmax, XModel_g.zb);
+	updateKurgXGPU <<< gridDim, blockDimKX, 0 >>> (XParam, XModel_g.blocks, XModel_g.evolv, XModel_g.grad, XModel_g.flux, XModel_g.time.dtmax, XModel_g.zb);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
 
 	// Y- direction
 	updateKurgYCPU(XParam, XModel.blocks, XModel.evolv, XModel.grad, XModel.flux, XModel.time.dtmax, XModel.zb);
 
-	updateKurgYGPU << < gridDim, blockDimKY, 0 >> > (XParam, XModel_g.blocks, XModel_g.evolv, XModel_g.grad, XModel_g.flux, XModel_g.time.dtmax, XModel_g.zb);
+	updateKurgYGPU <<< gridDim, blockDimKY, 0 >>> (XParam, XModel_g.blocks, XModel_g.evolv, XModel_g.grad, XModel_g.flux, XModel_g.time.dtmax, XModel_g.zb);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
 	CompareCPUvsGPU(XParam, XModel, XModel_g, fluxVar, false);
@@ -1397,7 +1397,7 @@ template <class T> bool CPUGPUtest(Param XParam, Model<T> XModel, Model<T> XModel_g)
 		advVar.push_back(advst[nv]);
 	}
 	updateEVCPU(XParam, XModel.blocks, XModel.evolv, XModel.flux, XModel.adv);
-	updateEVGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, XModel_g.evolv, XModel_g.flux, XModel_g.adv);
+	updateEVGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, XModel_g.evolv, XModel_g.flux, XModel_g.adv);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
 	CompareCPUvsGPU(XParam, XModel, XModel_g, advVar, false);
@@ -1413,7 +1413,7 @@ template <class T> bool CPUGPUtest(Param XParam, Model<T> XModel, Model<T> XModel_g)
 		evoVar.push_back(evost[nv]);
 	}
 	AdvkernelCPU(XParam, XModel.blocks, T(0.0005), XModel.zb, XModel.evolv, XModel.adv, XModel.evolv_o);
-	AdvkernelGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, T(0.0005), XModel_g.zb, XModel_g.evolv, XModel_g.adv, XModel_g.evolv_o);
+	AdvkernelGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, T(0.0005), XModel_g.zb, XModel_g.evolv, XModel_g.adv, XModel_g.evolv_o);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
 	CompareCPUvsGPU(XParam, XModel, XModel_g, evoVar, false);
@@ -1423,7 +1423,7 @@ template <class T> bool CPUGPUtest(Param XParam, Model<T> XModel, Model<T> XModel_g)
 
 
 	bottomfrictionCPU(XParam, XModel.blocks, T(0.5), XModel.cf, XModel.evolv_o);
-	bottomfrictionGPU << < gridDim, blockDim, 0 >> > (XParam, XModel_g.blocks, T(0.5), XModel_g.cf, XModel_g.evolv_o);
+	bottomfrictionGPU <<< gridDim, blockDim, 0 >>> (XParam, XModel_g.blocks, T(0.5), XModel_g.cf, XModel_g.evolv_o);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
 	CompareCPUvsGPU(XParam, XModel, XModel_g, evoVar, false);
@@ -1472,10 +1472,10 @@ template <class T> bool CPUGPUtest(Param XParam, Model<T> XModel, Model<T> XModel_g)
 	InitArrayBUQ(XParam, XModel.blocks, T(0.0), XModel.evolv.u);
 	InitArrayBUQ(XParam, XModel.blocks, T(0.0), XModel.evolv.v);
 
-	reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, T(0.0), XModel_g.evolv.u);
+	reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, T(0.0), XModel_g.evolv.u);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
-	reset_var << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, T(0.0), XModel_g.evolv.v);
+	reset_var <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, T(0.0), XModel_g.evolv.v);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
 	Forcing<float> XForcing;
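CPUGPUtest above drives the CPU and GPU implementations with identical inputs and then calls CompareCPUvsGPU on each variable group. The comparison internals are not shown in these hunks, so the following is only the general shape of such a check, with hypothetical data and tolerance:

#include <cmath>
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    const int n = 1024;
    float* cpu = new float[n];
    for (int i = 0; i < n; i++) cpu[i] = 2.0f * i;  // stand-in for the CPU kernel result

    float* gpu_d;
    cudaMalloc(&gpu_d, n * sizeof(float));
    cudaMemcpy(gpu_d, cpu, n * sizeof(float), cudaMemcpyHostToDevice);  // stand-in for the GPU result

    // Copy the device result back and report the largest absolute difference
    float* gpu = new float[n];
    cudaMemcpy(gpu, gpu_d, n * sizeof(float), cudaMemcpyDeviceToHost);
    float maxdiff = 0.0f;
    for (int i = 0; i < n; i++) maxdiff = fmaxf(maxdiff, fabsf(cpu[i] - gpu[i]));
    printf("max |CPU - GPU| = %g\n", maxdiff);

    delete[] cpu; delete[] gpu; cudaFree(gpu_d);
    return 0;
}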
@@ -3816,7 +3816,7 @@ template <class T> int TestGradientSpeed(Param XParam, Model<T> XModel, Model<T> XModel_g)
 	// Record the start event
 	cudaEventRecord(startA, NULL);
 
-	gradient << < gridDim, blockDim, 0 >> > (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.zb, XModel_g.grad.dzbdx, XModel_g.grad.dzbdy);
+	gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.zb, XModel_g.grad.dzbdx, XModel_g.grad.dzbdy);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
 	// Record the stop event
@@ -3839,7 +3839,7 @@ template <class T> int TestGradientSpeed(Param XParam, Model<T> XModel, Model<T> XModel_g)
 	// Record the start event
 	cudaEventRecord(startB, NULL);
 
-	gradientSM << < gridDim, blockDim >> > (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.zb, XModel_g.grad.dzsdx, XModel_g.grad.dzsdy);
+	gradientSM <<< gridDim, blockDim >>> (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.zb, XModel_g.grad.dzsdx, XModel_g.grad.dzsdy);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
 	// Record the stop event
@@ -3862,7 +3862,7 @@ template <class T> int TestGradientSpeed(Param XParam, Model<T> XModel, Model<T> XModel_g)
 	// Record the start event
 	cudaEventRecord(startC, NULL);
 
-	gradientSMC << < gridDim, blockDim >> > (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.zb, XModel_g.grad.dhdx, XModel_g.grad.dhdy);
+	gradientSMC <<< gridDim, blockDim >>> (XParam.halowidth, XModel_g.blocks.active, XModel_g.blocks.level, (T)XParam.theta, (T)XParam.delta, XModel_g.zb, XModel_g.grad.dhdx, XModel_g.grad.dhdy);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
 	// Record the stop event
@@ -4899,7 +4899,7 @@ template <class T> int TestPinMem(Param XParam, Model<T> XModel, Model<T> XModel_g)
 	dim3 block(16);
 	dim3 grid((unsigned int)ceil(nelem / (float)block.x));
 
-	vectoroffsetGPU << <grid, block >> > (nelem, T(1.0), zf_g);
+	vectoroffsetGPU <<<grid, block>>> (nelem, T(1.0), zf_g);
 	CUDA_CHECK(cudaDeviceSynchronize());
 
 	CUDA_CHECK(cudaMemcpy(zf_recov, zf_g, nx * ny * sizeof(T), cudaMemcpyDeviceToHost));
diff --git a/src/Updateforcing.cu b/src/Updateforcing.cu
index b2a69c93c..5049a9103 100755
--- a/src/Updateforcing.cu
+++ b/src/Updateforcing.cu
@@ -81,7 +81,7 @@ void Forcingthisstep(Param XParam, double totaltime, DynForcingP<float> &XDynForcing)
 	}
 
-	//NextHDstep << <gridDim, blockDim, 0 >> > (XParam.Rainongrid.nx, XParam.Rainongrid.ny, Rainbef_g, Rainaft_g);
+	//NextHDstep <<<gridDim, blockDim, 0>>> (XParam.Rainongrid.nx, XParam.Rainongrid.ny, Rainbef_g, Rainaft_g);
 	//CUDA_CHECK(cudaDeviceSynchronize());
 
 	// Read the actual file data
@@ -100,7 +100,7 @@ void Forcingthisstep(Param XParam, double totaltime, DynForcingP<float> &XDynForcing)
 	{
 		float bftime = float(XDynForcing.to+XDynForcing.dt*(XDynForcing.instep-1));
 		float aftime = float(XDynForcing.to + XDynForcing.dt * (XDynForcing.instep));
-		InterpstepGPU << <gridDim, blockDim, 0 >> > (XDynForcing.nx, XDynForcing.ny, float(totaltime), bftime,aftime, XDynForcing.now_g, XDynForcing.before_g, XDynForcing.after_g);
+		InterpstepGPU <<<gridDim, blockDim, 0>>> (XDynForcing.nx, XDynForcing.ny, float(totaltime), bftime,aftime, XDynForcing.now_g, XDynForcing.before_g, XDynForcing.after_g);
 		CUDA_CHECK(cudaDeviceSynchronize());
 
 		CUDA_CHECK(cudaMemcpyToArray(XDynForcing.GPU.CudArr, 0, 0, XDynForcing.now_g, XDynForcing.nx * XDynForcing.ny * sizeof(float), cudaMemcpyDeviceToDevice));
@@ -147,7 +147,7 @@ template <class T> __host__ void AddRiverForcing(Param XParam, Loop<T> XLoop, std::vector<River> XRivers, Model<T> XModel)
 		{
 			//InjectRiverGPU <<<gridDim, blockDim, 0>>> (XParam, XRivers[Rin], qnow, XModel.bndblk.river, XModel.blocks, XModel.adv);
 			//CUDA_CHECK(cudaDeviceSynchronize());
-			InjectManyRiversGPU << <gridDim, blockDim, 0 >> > (XParam, irib, XModel.bndblk.Riverinfo, XModel.blocks, XModel.adv);
+			InjectManyRiversGPU <<<gridDim, blockDim, 0>>> (XParam, irib, XModel.bndblk.Riverinfo, XModel.blocks, XModel.adv);
 			CUDA_CHECK(cudaDeviceSynchronize());
 		}
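The TestGradientSpeed hunks above time each gradient variant between a pair of CUDA events, exactly the standard event-timing pattern. A self-contained sketch of that pattern (the kernel is a stand-in):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void busyKernel(float* x) { x[threadIdx.x] *= 2.0f; }

int main()
{
    float* x;
    cudaMalloc(&x, 256 * sizeof(float));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, NULL);   // as in cudaEventRecord(startA, NULL) above
    busyKernel<<<1, 256>>>(x);
    cudaEventRecord(stop, NULL);
    cudaEventSynchronize(stop);     // wait for the stop event to complete

    float msec = 0.0f;
    cudaEventElapsedTime(&msec, start, stop);
    printf("kernel took %f ms\n", msec);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(x);
    return 0;
}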
From db30cd0fd197f26a825362425e110934d767e451 Mon Sep 17 00:00:00 2001
From: Cyprien Bosserelle
Date: Fri, 7 Mar 2025 12:32:36 +1300
Subject: [PATCH 26/36] Adding record of timestart

---
 src/Boundary.cu  | 2 +-
 src/Param.h      | 1 +
 src/ReadInput.cu | 4 +++-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/Boundary.cu b/src/Boundary.cu
index 0e27c03d2..83c4a04f0 100644
--- a/src/Boundary.cu
+++ b/src/Boundary.cu
@@ -112,7 +112,7 @@ template <class T> void FlowbndFlux(Param XParam, double totaltime, BlockP<T> XBlock
 
 		if (XParam.bndtaper > 0.0)
 		{
-			taper = min(totaltime / XParam.bndtaper, 1.0);
+			taper = min((totaltime - XParam.inittime) / XParam.bndtaper, 1.0);
 		}
 	}
 	else
diff --git a/src/Param.h b/src/Param.h
index 40b8d11fa..6b27181f5 100644
--- a/src/Param.h
+++ b/src/Param.h
@@ -84,6 +84,7 @@ class Param {
 	double outputtimestep = 0.0; //Number of seconds between netCDF outputs, 0.0 for none
 	double endtime = std::numeric_limits<double>::max(); // Total runtime in s, will be calculated based on bnd input as min(length of the shortest time series, user defined) and should be shorter than any time-varying forcing
 	double totaltime = 0.0; // Total simulation time in s
+	double inittime = 0.0; // Initial model time. At the start of a simulation inittime==totaltime
 	double dtinit = -1; // Maximum initial time steps in s (should be positive, advice 0.1 if dry domain initialement)
 	double dtmin = 0.0005; //Minimum accepted time steps in s (a lower value will be concidered a crash of the code, and stop the run)
 	double bndrelaxtime = 3600.0; // Realxation time for absorbing boundary
diff --git a/src/ReadInput.cu b/src/ReadInput.cu
index 70fd84e85..00fb8ad20 100644
--- a/src/ReadInput.cu
+++ b/src/ReadInput.cu
@@ -1342,8 +1342,10 @@ void checkparamsanity(Param& XParam, Forcing<float>& XForcing)
 		XParam.reftime = "2000-01-01T00:00:00";
 	}
 
+	XParam.inittime = XParam.totaltime;
+
 	log("Reference time: " + XParam.reftime);
-	log("Model Initial time: " + std::to_string(XParam.totaltime));
+	log("Model Initial time: " + std::to_string(XParam.totaltime) + " ; ");
 	log("Model end time: " + std::to_string(XParam.endtime));
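The Boundary.cu change above makes the boundary taper ramp start at the model's initial time rather than at absolute time zero, so a hot-started run (where totaltime is already nonzero at startup) still ramps its boundary fluxes in. A sketch of the fixed ramp with hypothetical numbers:

#include <algorithm>
#include <cstdio>

// taper ramps linearly from 0 at inittime to 1 after bndtaper seconds
double taperfactor(double totaltime, double inittime, double bndtaper)
{
    if (bndtaper <= 0.0) return 1.0;
    return std::min((totaltime - inittime) / bndtaper, 1.0);
}

int main()
{
    double inittime = 86400.0;  // hot start one day into the record
    for (double t : { 86400.0, 86700.0, 87300.0 })
        printf("t=%.0f  taper=%.2f\n", t, taperfactor(t, inittime, 600.0));
    // prints 0.00, 0.50, 1.00; before the fix the first value would already be 1.0
    return 0;
}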
From 8aa4c770b1ef5ad7520906d14c43c85c22d04550 Mon Sep 17 00:00:00 2001
From: Cyprien Bosserelle
Date: Mon, 10 Mar 2025 15:14:25 +1300
Subject: [PATCH 27/36] Update Makefile

---
 Makefile | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index f2d8b80e7..41390fc3f 100755
--- a/Makefile
+++ b/Makefile
@@ -198,7 +198,7 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
 
 SAMPLE_ENABLED := 1
 
-ALL_LDFLAGS := -lnetcdf -I
+ALL_LDFLAGS := -I
 ALL_LDFLAGS += $(ALL_CCFLAGS)
 ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
 ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
@@ -207,10 +207,14 @@ ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
 INCLUDES := -I/usr/includes
 LIBRARIES :=
 
+# Add NetCDF include library
+INCLUDES += $(shell nc-config --cflags)
+ALL_LDFLAGS += $(shell nc-config --libs)
+
 ################################################################################
 
 # Gencode arguments
-SMS ?= 35 50 52 60
+SMS ?= 52 60 75
 #SMS ?= 20 30 35
 ifeq ($(SMS),)
 $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
@@ -318,7 +322,7 @@ ReadInput.o:./src/ReadInput.cu
 
 Read_netcdf.o:./src/Read_netcdf.cu
 	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
-	
+
 Reimann.o:./src/Reimann.cu
 	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
@@ -338,8 +342,7 @@ Write_netcdf.o:./src/Write_netcdf.cu
 	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
 
 Write_txtlog.o:./src/Write_txtlog.cpp
-	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
-	
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
 
 Spherical.o:./src/Spherical.cu
 	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

From 50c7238ce88d54052f506522b77adde863e85057 Mon Sep 17 00:00:00 2001
From: Cyprien Bosserelle
Date: Tue, 11 Mar 2025 17:01:53 +1300
Subject: [PATCH 28/36] add bnd filter and relax time to input param

---
 src/ReadInput.cu | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/ReadInput.cu b/src/ReadInput.cu
index 00fb8ad20..50e98b26c 100644
--- a/src/ReadInput.cu
+++ b/src/ReadInput.cu
@@ -336,6 +336,21 @@ Param readparamstr(std::string line, Param param)
 
 	}
 
+	parameterstr = "bndrelaxtime";
+	parametervalue = findparameter(parameterstr, line);
+	if (!parametervalue.empty())
+	{
+		param.bndrelaxtime = std::stod(parametervalue);
+
+	}
+
+	parameterstr = "bndfiltertime";
+	parametervalue = findparameter(parameterstr, line);
+	if (!parametervalue.empty())
+	{
+		param.bndfiltertime = std::stod(parametervalue);
+
+	}
+
 	parameterstr = "CFL";
 	parametervalue = findparameter(parameterstr, line);
 	if (!parametervalue.empty())

From bf99d3612811d63172e1e5cb109c9dc9be62d354 Mon Sep 17 00:00:00 2001
From: Cyprien
Date: Tue, 1 Apr 2025 21:13:27 +1300
Subject: [PATCH 29/36] coding fix for Hipcc

---
 src/AdaptCriteria.h | 2 +-
 src/General.h       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/AdaptCriteria.h b/src/AdaptCriteria.h
index 320b25892..82c4a8db1 100755
--- a/src/AdaptCriteria.h
+++ b/src/AdaptCriteria.h
@@ -14,7 +14,7 @@ template <class T> int inrangecriteria(Param XParam, T zmin, T zmax, T* z, BlockP<T> XBlock, bool* refine, bool* coarsen);
 template <class T> int Thresholdcriteria(Param XParam, T threshold, T* z, BlockP<T> XBlock, bool* refine, bool* coarsen);
 template <class T> int AdaptCriteria(Param XParam, Forcing<float> XForcing, Model<T> XModel);
-
+template <class T> int targetlevelcriteria(Param XParam, StaticForcingP<float> targetlevelmap, BlockP<T> XBlock, bool* refine, bool* coarsen);
diff --git a/src/General.h b/src/General.h
index 4c5aed70c..a7d851878 100755
--- a/src/General.h
+++ b/src/General.h
@@ -3,7 +3,7 @@
 #define GENERAL_H
 
 // includes, system
-#define pi 3.14159265
+constexpr auto pi=3.14159265
 
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"

From 852ccd5bb42d9a7eaa2b5d3ebe0590fdd682e653 Mon Sep 17 00:00:00 2001
From: Cyprien
Date: Tue, 1 Apr 2025 21:45:48 +1300
Subject: [PATCH 30/36] Update General.h

---
 src/General.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/General.h b/src/General.h
index a7d851878..23849bb9b 100755
--- a/src/General.h
+++ b/src/General.h
@@ -3,7 +3,7 @@
 #define GENERAL_H
 
 // includes, system
-constexpr auto pi=3.14159265
+constexpr auto pi = 3.14159265;
 
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"

From b0376847391f9056c49097ca3dc7b923c064855c Mon Sep 17 00:00:00 2001
From: Cyprien
Date: Tue, 1 Apr 2025 21:46:19 +1300
Subject: [PATCH 31/36] Update General.h

---
 src/General.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/General.h b/src/General.h
index 23849bb9b..b327535e7 100755
--- a/src/General.h
+++ b/src/General.h
@@ -6,7 +6,7 @@ constexpr auto pi = 3.14159265;
 
 #include "cuda_runtime.h"
-#include "device_launch_parameters.h"
+
 
 #include <stdio.h>
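Patches 29 and 30 above replace the pi macro with a typed constant, which is the hipcc-friendly fix: a #define substitutes the token textually everywhere, including into code where "pi" is a local name, while constexpr gives a scoped, typed constant. A tiny illustration (not BG_Flood code):

#include <cstdio>

// With "#define pi 3.14159265", any later variable or parameter named pi
// would be silently rewritten by the preprocessor; constexpr avoids that.
constexpr auto pi = 3.14159265;

double circumference(double r) { return 2.0 * pi * r; }

int main()
{
    printf("C(1) = %f\n", circumference(1.0));
    return 0;
}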
From 854b4bbb08cb35a08df59b84c69e0dd0aa2c8df1 Mon Sep 17 00:00:00 2001
From: Cyprien
Date: Tue, 1 Apr 2025 22:18:14 +1300
Subject: [PATCH 32/36] Update Boundary.cu

---
 src/Boundary.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Boundary.cu b/src/Boundary.cu
index 83c4a04f0..0fbf59302 100644
--- a/src/Boundary.cu
+++ b/src/Boundary.cu
@@ -1,4 +1,5 @@
-#include "Boundary.h"
+
+#include "Boundary.h"
 
 
 template <class T> void Flowbnd(Param XParam, Loop<T> &XLoop, BlockP<T> XBlock, bndparam side, DynForcingP<float> Atmp, EvolvingP<T> XEv)

From 2646cdf0a315f0f8a8806c935ae824cf2127feba Mon Sep 17 00:00:00 2001
From: Cyprien
Date: Wed, 2 Apr 2025 20:40:36 +1300
Subject: [PATCH 33/36] Update Boundary.cu

---
 src/Boundary.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Boundary.cu b/src/Boundary.cu
index 0fbf59302..f8f732643 100644
--- a/src/Boundary.cu
+++ b/src/Boundary.cu
@@ -1,4 +1,3 @@
-
 #include "Boundary.h"
 
 

From c2ff93cb10d94543d88c84e073828e575657561e Mon Sep 17 00:00:00 2001
From: Cyprien
Date: Wed, 2 Apr 2025 20:50:20 +1300
Subject: [PATCH 34/36] Update Boundary.h

---
 src/Boundary.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Boundary.h b/src/Boundary.h
index 15af48b52..2f7db0f82 100755
--- a/src/Boundary.h
+++ b/src/Boundary.h
@@ -30,7 +30,7 @@ template <class T> void FlowbndFlux(Param XParam, double totaltime, BlockP<T> XBlock
 
 template <class T> __global__ void bndFluxGPUSide(Param XParam, bndsegmentside side, BlockP<T> XBlock, DynForcingP<float> Atmp, DynForcingP<float> Zsmap, bool uniform, int type, float zsbnd, T taper, T* zs, T* h, T* un, T* ut, T* Fh, T* Fq, T* Ss);
-
+template <class T> void bndFluxGPUSideCPU(Param XParam, bndsegmentside side, BlockP<T> XBlock, DynForcingP<float> Atmp, DynForcingP<float> Zsmap, bool uniform, int type, float zsbnd, T taper, T* zs, T* h, T* un, T* ut, T* Fh, T* Fq, T* Ss);
 template <class T> __global__ void bndGPU(Param XParam, bndparam side, BlockP<T> XBlock, DynForcingP<float> Atmp, float itime, T* zs, T* h, T* un, T* ut);
 template <class T> __host__ void bndCPU(Param XParam, bndparam side, BlockP<T> XBlock, std::vector<double> zsbndvec, std::vector<double> uubndvec, std::vector<double> vvbndvec, DynForcingP<float> Atmp, T* zs, T* h, T* un, T* ut);

From 999a1a25c2ec3bd1527d3d4e6a369ab78fc55c25 Mon Sep 17 00:00:00 2001
From: Cyprien
Date: Wed, 2 Apr 2025 21:01:27 +1300
Subject: [PATCH 35/36] Update ConserveElevation.cu

---
 src/ConserveElevation.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ConserveElevation.cu b/src/ConserveElevation.cu
index a2ecd572a..b3194fc6a 100755
--- a/src/ConserveElevation.cu
+++ b/src/ConserveElevation.cu
@@ -1,4 +1,5 @@
-#include "ConserveElevation.h"
+
+#include "ConserveElevation.h"
 
 
 template <class T> void conserveElevation(Param XParam, BlockP<T> XBlock, EvolvingP<T> XEv, T* zb)
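Patch 34 above declares bndFluxGPUSideCPU, a host twin of the __global__ bndFluxGPUSide kernel, continuing the codebase's pattern of keeping a CPU counterpart for each GPU kernel. One common way to keep such twins in sync, shown only as a sketch (the helper below is hypothetical; BG_Flood may well keep the two implementations separate), is a __host__ __device__ helper called from both:

#include <cstdio>
#include <cuda_runtime.h>

// Shared per-cell logic, compiled for both host and device
__host__ __device__ float cellflux(float h, float un) { return h * un; }

__global__ void fluxGPU(int n, const float* h, const float* un, float* F)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) F[i] = cellflux(h[i], un[i]);
}

void fluxCPU(int n, const float* h, const float* un, float* F)
{
    for (int i = 0; i < n; i++) F[i] = cellflux(h[i], un[i]);
}

int main()
{
    const int n = 4;
    float h[n] = { 1, 2, 3, 4 }, un[n] = { 0.5f, 0.5f, 0.5f, 0.5f }, F[n];
    fluxCPU(n, h, un, F);
    printf("F[3] = %f\n", F[3]);
    return 0;
}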
From 723121595c698164c2e540bfbae219b59fc5485f Mon Sep 17 00:00:00 2001
From: Cyprien
Date: Wed, 2 Apr 2025 21:11:32 +1300
Subject: [PATCH 36/36] Fix encoding to remove BOM

---
 src/AdaptCriteria.cu | 1 -
 src/Adaptation.cu    | 2 --
 src/BG_Flood.cu      | 4 ++--
 src/Halo.cu          | 3 ++-
 src/Testing.cu       | 1 -
 5 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/AdaptCriteria.cu b/src/AdaptCriteria.cu
index 851c5697f..d82868a75 100755
--- a/src/AdaptCriteria.cu
+++ b/src/AdaptCriteria.cu
@@ -1,4 +1,3 @@
-
 #include "AdaptCriteria.h"
 
 
diff --git a/src/Adaptation.cu b/src/Adaptation.cu
index dfe53668c..009691d50 100755
--- a/src/Adaptation.cu
+++ b/src/Adaptation.cu
@@ -1,5 +1,3 @@
-
-
 #include "Adaptation.h"
 
 
diff --git a/src/BG_Flood.cu b/src/BG_Flood.cu
index bb10c63fe..975e9f97d 100755
--- a/src/BG_Flood.cu
+++ b/src/BG_Flood.cu
@@ -1,4 +1,4 @@
-//////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////
 // BG_Flood Main function //
 // Copyright (C) 2018 Bosserelle //
 // This code contains an adaptation of the St Venant equation from Basilisk //
@@ -20,7 +20,7 @@
 // along with this program. If not, see <http://www.gnu.org/licenses/>. //
 //////////////////////////////////////////////////////////////////////////////////
 
-// includes, system
+// includes
 
 #include "BG_Flood.h"
diff --git a/src/Halo.cu b/src/Halo.cu
index 93ea8aeef..760d46149 100644
--- a/src/Halo.cu
+++ b/src/Halo.cu
@@ -1,4 +1,5 @@
-#include "Halo.h"
+
+#include "Halo.h"
 
 /*! \fn void void fillHaloD(Param XParam, int ib, BlockP<T> XBlock, T* z)
diff --git a/src/Testing.cu b/src/Testing.cu
index 1be1048b7..4986654fb 100644
--- a/src/Testing.cu
+++ b/src/Testing.cu
@@ -1,4 +1,3 @@
-
 #include "Testing.h"
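The final patch strips the UTF-8 byte-order mark (bytes 0xEF 0xBB 0xBF) from the start of the listed sources; a leading BOM can confuse some compilers and toolchains. A small, hypothetical utility doing the same thing:

#include <cstdio>
#include <fstream>
#include <iterator>
#include <string>

// Rewrite a file without a leading UTF-8 BOM, if one is present.
int main(int argc, char** argv)
{
    if (argc < 2) { printf("usage: %s file\n", argv[0]); return 1; }

    std::ifstream in(argv[1], std::ios::binary);
    std::string data((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
    in.close();

    if (data.size() >= 3 && (unsigned char)data[0] == 0xEF &&
        (unsigned char)data[1] == 0xBB && (unsigned char)data[2] == 0xBF)
    {
        std::ofstream out(argv[1], std::ios::binary);
        out.write(data.data() + 3, data.size() - 3);  // drop the 3 BOM bytes
        printf("BOM removed from %s\n", argv[1]);
    }
    return 0;
}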