9 changes: 8 additions & 1 deletion .github/workflows/main.yml
@@ -105,6 +105,13 @@ jobs:
- name: running test 14, Test on AOI implementation aa wall bnb
run: |
sed -i 's/test = 13/test = 14/' BG_param.txt
./BG_Flood
./BG_Flood
- name: running test 15, Test on flexible time input
run: |
sed -i 's/test = 14/test = 15/' BG_param.txt
./BG_Flood
./BG_Flood BG_param_test15.txt
## var=$(ncdump -v h_P0 Test15_zoom2.nc)
## if: echo ${{ ${#var} == 7 }}


13 changes: 8 additions & 5 deletions Makefile
@@ -198,7 +198,7 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

SAMPLE_ENABLED := 1

ALL_LDFLAGS := -lnetcdf -I
ALL_LDFLAGS := -I
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
@@ -207,10 +207,14 @@ ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
INCLUDES := -I/usr/includes
LIBRARIES :=

# Add NetCDF include and library flags
INCLUDES += $(shell nc-config --cflags)
ALL_LDFLAGS += $(shell nc-config --libs)

################################################################################

# Gencode arguments
SMS ?= 35 50 52 60
SMS ?= 50 52 60 75
#SMS ?= 20 30 35
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
@@ -318,7 +322,7 @@ ReadInput.o:./src/ReadInput.cu

Read_netcdf.o:./src/Read_netcdf.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

Reimann.o:./src/Reimann.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

@@ -338,8 +342,7 @@ Write_netcdf.o:./src/Write_netcdf.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

Write_txtlog.o:./src/Write_txtlog.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

Spherical.o:./src/Spherical.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
44 changes: 44 additions & 0 deletions doc/paper.md
@@ -0,0 +1,44 @@
title: 'BG_Flood: Adaptive GPU-capable hydrodynamics model for flood and inundation'
tags:
- C++
- CUDA
- flood
- inundation
- tsunami
- storm-surge
- adaptive mesh refinement
- GPU

authors:
- name: Cyprien Bosserelle
corresponding: true # (This is how to denote the corresponding author)
orcid: 0000-0000-0000-0000
equal-contrib: true
affiliation: 1
- name: Alice Harang
equal-contrib: true # (This is how you can denote equal contributions between multiple authors)
orcid: 0000-0000-0000-0000
affiliation: 1
- name: Emily Lane
affiliation: 1
affiliations:
- name: Earth Sciences New Zealand
index: 1


date: 1 July 2025
bibliography: paper.bib

# Summary
Flood hazard assessment and forecasting often require physics-based simulations. These simulations are typically completed with hydrodynamics models that use high resolution to capture small landscape and flow features (e.g. small drains, hydraulic jumps) while also covering the large-scale domain where the hazard forms and/or amplifies (e.g. river catchment, continental shelf). This is only achievable with unstructured and adaptive meshes. Unfortunately, most available open-source codes only provide unstructured meshes that rely heavily on user input to generate a suitable mesh, and these models generally offer little flexibility for revising the mesh generation. Most available open-source models also do not offer GPU-enabled code that is well optimised, and for the few that can offer

Enabling GPU execution and semi-automatic mesh refinement is critical for rapidly developing flood assessments without compromising the simulated physics.

# Statement of need
BG_Flood is a numerical model for simulating shallow water hydrodynamics on the GPU using an adaptive-mesh-refinement-type grid. The model was designed with the goal of simulating inundation (river, storm surge, or tsunami). The model uses a Block Uniform Quadtree approach that runs on the GPU, with the adaptive mesh being generated at the start of the simulation.

The core SWE engine and adaptivity are inspired by and adapted from the Saint-Venant solver in Basilisk (Popinet XXXX), and the CUDA GPU memory model is inspired by the work of Vacondio et al. (2017). The rest of the implementation



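As a hedged illustration of the Block Uniform Quadtree approach mentioned above: the domain is tiled with fixed-size blocks that are internally uniform but each carry their own refinement level, so GPU threads can map one-to-one to cells. The sketch below is an assumption about the general layout, not BG_Flood's actual `BlockP` structure; the names and block width are illustrative.

```cpp
#include <vector>

constexpr int blkwidth = 16; // cells per block side (assumed value)

// One quadtree leaf: a uniform tile of blkwidth x blkwidth cells.
struct Block
{
    int level;         // refinement level; cell size = dx0 / (1 << level)
    double xo, yo;     // block origin in world coordinates
    int neighbours[4]; // left, right, bottom, top block ids
};

// Block-uniform quadtree mesh: per-block metadata plus flat state arrays,
// blkwidth * blkwidth entries per block, laid out contiguously for the GPU.
struct BUQMesh
{
    std::vector<Block> blocks;
    std::vector<double> h, zs, u, v;
};
```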

14 changes: 7 additions & 7 deletions src/Advection.cu
@@ -251,11 +251,11 @@ template <class T> __global__ void AdvkernelGPU(Param XParam, BlockP<T> XBlock,
T ho, uo, vo;
T dhi = XAdv.dh[i];

T edt = dt;// dhi > T(0.0) ? dt : min(dt, hold / (T(-1.0) * dhi));

T edt = XParam.ForceMassConserve ? dt : dhi >= T(0.0) ? dt : min(dt, max(hold, XParam.eps) / abs(dhi));

//ho = max(hold + edt * dhi,T(0.0));
ho = hold + edt * dhi;


if (ho > eps) {
//
uo = (hold * XEv.u[i] + edt * XAdv.dhu[i]) / ho;
@@ -307,7 +307,7 @@ template <class T> __host__ void AdvkernelCPU(Param XParam, BlockP<T> XBlock, T

dhi = XAdv.dh[i];

T edt = dt;// dhi > T(0.0) ? dt : min(dt, hold / (T(-1.0) * dhi));
T edt = XParam.ForceMassConserve ? dt : dhi >= T(0.0) ? dt : min(dt, max(hold, XParam.eps) / abs(dhi));

ho = hold + edt * dhi;

@@ -475,7 +475,7 @@ template <class T> __host__ T CalctimestepGPU(Param XParam,Loop<T> XLoop, BlockP

//GPU Harris reduction #3. 8.3x reduction #0 Note #7 if a lot faster
// This was successfully tested with a range of grid size
//reducemax3 << <gridDimLine, blockDimLine, 64*sizeof(float) >> >(dtmax_g, arrmax_g, nx*ny)
//reducemax3 <<<gridDimLine, blockDimLine, 64*sizeof(float) >>>(dtmax_g, arrmax_g, nx*ny)

int maxThreads = 256;
int threads = (s < maxThreads * 2) ? nextPow2((s + 1) / 2) : maxThreads;
@@ -485,7 +485,7 @@
dim3 gridDimLine(blocks, 1, 1);


reducemin3 << <gridDimLine, blockDimLine, smemSize >> > (XTime.dtmax, XTime.arrmin, s);
reducemin3 <<<gridDimLine, blockDimLine, smemSize >>> (XTime.dtmax, XTime.arrmin, s);
CUDA_CHECK(cudaDeviceSynchronize());


@@ -503,7 +503,7 @@ template <class T> __host__ T CalctimestepGPU(Param XParam,Loop<T> XLoop, BlockP

CUDA_CHECK(cudaMemcpy(XTime.dtmax, XTime.arrmin, s * sizeof(T), cudaMemcpyDeviceToDevice));

reducemin3 << <gridDimLineS, blockDimLineS, smemSize >> > (XTime.dtmax, XTime.arrmin, s);
reducemin3 <<<gridDimLineS, blockDimLineS, smemSize >>> (XTime.dtmax, XTime.arrmin, s);
CUDA_CHECK(cudaDeviceSynchronize());

s = (s + (threads * 2 - 1)) / (threads * 2);
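The new `edt` expression in AdvkernelGPU/AdvkernelCPU above limits the effective timestep in drying cells so that the updated depth cannot go negative, unless mass conservation is forced. A minimal standalone sketch of that limiter logic (the numeric values below are illustrative assumptions, not values from the model):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

// Limit the effective timestep so h_new = h_old + edt * dh stays >= 0 in
// cells losing water (dh < 0), mirroring the diff's nested ternary.
double effective_dt(bool forceMassConserve, double dt, double hold,
                    double dh, double eps)
{
    if (forceMassConserve || dh >= 0.0) return dt;
    return std::min(dt, std::max(hold, eps) / std::fabs(dh));
}

int main()
{
    // Drying cell: hold = 0.1, dh = -0.5, dt = 1.0 -> edt = 0.2,
    // so h_new = 0.1 + 0.2 * (-0.5) = 0 exactly (no negative depth).
    double edt = effective_dt(false, 1.0, 0.1, -0.5, 1e-6);
    std::printf("edt = %g, h_new = %g\n", edt, 0.1 + edt * (-0.5));
    return 0;
}
```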
34 changes: 29 additions & 5 deletions src/Arrays.h
@@ -82,6 +82,23 @@ struct maskinfo

};

template <class T>
struct RiverInfo
{
int nbir;
int nburmax; // size of (max number of) unique blocks with rivers
int nribmax; // size of (max number of) rivers in one block
int* Xbidir; // array of block id for each river, size (nburmax, nribmax)
int* Xridib; // array of river id in each block, size (nburmax, nribmax)
T* xstart;
T* xend;
T* ystart;
T* yend;
T* qnow; // qnow is pinned/mapped memory, so both host and device pointers are needed here
T* qnow_g; // keeping both pointers here simplifies the code later

};
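The comment on `qnow` / `qnow_g` refers to pinned (page-locked), mapped host memory that exposes both a host and a device pointer to the same allocation. A sketch of how such a pair could be set up (illustrative only; the function name is hypothetical and BG_Flood's actual allocation code is not shown in this diff):

```cpp
#include <cuda_runtime.h>

// Allocate a pinned, mapped host buffer and fetch its device alias, so the
// host can update river discharge (qnow) and kernels can read it (qnow_g)
// without an explicit cudaMemcpy each step.
template <class T>
void AllocatePinnedPair(int n, T*& qnow, T*& qnow_g)
{
    cudaHostAlloc((void**)&qnow, n * sizeof(T), cudaHostAllocMapped);
    cudaHostGetDevicePointer((void**)&qnow_g, (void*)qnow, 0);
}
```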


// outzone info used to actually write the nc files (one nc file per zone; the default zone is the full domain)
struct outzoneB
@@ -92,6 +109,8 @@
std::string outname; // name for the output file (one for each zone)
int maxlevel; // maximum level in the zone
int minlevel; //minimum level in the zone
std::vector<double> OutputT; // Output times for this zone
int index_next_OutputT = 0; // Index of the next output time
};
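`OutputT` and `index_next_OutputT` give each output zone its own list of output times. Below is an assumed usage pattern for how the index might be consumed in the time loop, not the model's exact code; `OutputT` is assumed sorted ascending:

```cpp
#include <vector>

// Returns true when the zone's next scheduled output time has been reached,
// advancing the per-zone index so each scheduled time fires exactly once.
bool OutputDue(const std::vector<double>& OutputT,
               int& index_next_OutputT, double totaltime)
{
    if (index_next_OutputT >= (int)OutputT.size())
        return false; // no scheduled outputs remain for this zone
    if (totaltime >= OutputT[index_next_OutputT])
    {
        ++index_next_OutputT;
        return true; // caller writes this zone's nc file now
    }
    return false;
}
```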


@@ -125,7 +144,7 @@ struct AdaptP




template <class T>
struct BndblockP
{
int nblkriver, nblkTs, nbndblkleft, nbndblkright, nbndblktop, nbndblkbot;
@@ -140,12 +159,15 @@
int* top;
int* bot;


RiverInfo<T> Riverinfo;


};


struct RiverBlk
{
std::vector<int> block;
};



@@ -192,7 +214,7 @@ struct Model
std::map<std::string, std::string> Outvarlongname;
std::map<std::string, std::string> Outvarstdname;
std::map<std::string, std::string> Outvarunits;

std::vector<double> OutputT;

//other output
//std::vector< std::vector< Pointout > > TSallout;
@@ -208,7 +230,7 @@

AdaptP adapt;

BndblockP bndblk;
BndblockP<T> bndblk;



@@ -229,6 +251,8 @@ struct Loop
int nstep = 0;
//useful for calculating avg timestep
int nstepout = 0;
// Needed to identify next output time
int indNextoutputtime = 0;

// usefull for Time series output
int nTSsteps = 0;
69 changes: 58 additions & 11 deletions src/Boundary.cu
@@ -95,6 +95,11 @@ template <class T> void FlowbndFlux(Param XParam, double totaltime, BlockP<T> XB
T taper=T(1.0);
if (bndseg.on)
{
if (XParam.bndtaper > 0.0)
{
taper = min((totaltime - XParam.inittime) / XParam.bndtaper, 1.0);
}

if (bndseg.uniform)
{
int SLstepinbnd = 1;
@@ -110,10 +115,7 @@
zsbnd = interptime(bndseg.data[SLstepinbnd].wspeed, bndseg.data[SLstepinbnd - 1].wspeed, bndseg.data[SLstepinbnd].time - bndseg.data[SLstepinbnd - 1].time, totaltime - bndseg.data[SLstepinbnd - 1].time);


if (XParam.bndtaper > 0.0)
{
taper = min(totaltime / XParam.bndtaper, 1.0);
}

}
else
{
@@ -131,25 +133,25 @@
//Left
//template <class T> __global__ void bndFluxGPUSide(Param XParam, bndsegmentside side, BlockP<T> XBlock, DynForcingP<float> Atmp, DynForcingP<float> Zsmap, bool uniform, float zsbnd, T * zs, T * h, T * un, T * ut, T * Fh, T * Fq, T * Ss)
//bndFluxGPUSide <<< gridDimBBND, blockDim, 0 >>> (XParam, bndseg.left, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), XEv.zs, XEv.h, un, ut, Fh, Fq, S);
bndFluxGPUSide << < gridDimBBNDLeft, blockDim, 0 >> > (XParam, bndseg.left, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.u, XEv.v, XFlux.Fhu, XFlux.Fqux, XFlux.Su);
bndFluxGPUSide <<< gridDimBBNDLeft, blockDim, 0 >>> (XParam, bndseg.left, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.u, XEv.v, XFlux.Fhu, XFlux.Fqux, XFlux.Su);
CUDA_CHECK(cudaDeviceSynchronize());
}
//if (bndseg.right.nblk > 0)
{
//Right
bndFluxGPUSide << < gridDimBBNDRight, blockDim, 0 >> > (XParam, bndseg.right, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.u, XEv.v, XFlux.Fhu, XFlux.Fqux, XFlux.Su);
bndFluxGPUSide <<< gridDimBBNDRight, blockDim, 0 >>> (XParam, bndseg.right, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.u, XEv.v, XFlux.Fhu, XFlux.Fqux, XFlux.Su);
CUDA_CHECK(cudaDeviceSynchronize());
}
//if (bndseg.top.nblk > 0)
{
//top
bndFluxGPUSide << < gridDimBBNDTop, blockDim, 0 >> > (XParam, bndseg.top, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.v, XEv.u, XFlux.Fhv, XFlux.Fqvy, XFlux.Sv);
bndFluxGPUSide <<< gridDimBBNDTop, blockDim, 0 >>> (XParam, bndseg.top, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.v, XEv.u, XFlux.Fhv, XFlux.Fqvy, XFlux.Sv);
CUDA_CHECK(cudaDeviceSynchronize());
}
//if (bndseg.bot.nblk > 0)
{
//bot
bndFluxGPUSide << < gridDimBBNDBot, blockDim, 0 >> > (XParam, bndseg.bot, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.v, XEv.u, XFlux.Fhv, XFlux.Fqvy, XFlux.Sv);
bndFluxGPUSide <<< gridDimBBNDBot, blockDim, 0 >>> (XParam, bndseg.bot, XBlock, Atmp, bndseg.WLmap, bndseg.uniform, bndseg.type, float(zsbnd), taper, XEv.zs, XEv.h, XEv.v, XEv.u, XFlux.Fhv, XFlux.Fqvy, XFlux.Sv);
CUDA_CHECK(cudaDeviceSynchronize());
}
}
@@ -219,7 +221,7 @@ template <class T> void FlowbndFluxold(Param XParam, double totaltime, BlockP<T>

if (XParam.GPUDEVICE >= 0)
{
//bndFluxGPU << < gridDimBBND, blockDim, 0 >> > (XParam, side, XBlock, Atmp, float(itime), XEv.zs, XEv.h, un, ut, Fh, Fq, S);
//bndFluxGPU <<< gridDimBBND, blockDim, 0 >>> (XParam, side, XBlock, Atmp, float(itime), XEv.zs, XEv.h, un, ut, Fh, Fq, S);
//CUDA_CHECK(cudaDeviceSynchronize());
}
else
@@ -290,9 +292,9 @@ template <class T> __global__ void bndFluxGPUSide(Param XParam, bndsegmentside s
}





zsbnd = zsbnd + XParam.zsoffset;


int inside = Inside(halowidth, blkmemwidth, side.isright, side.istop, ix, iy, ib);

@@ -343,6 +345,19 @@
noslipbndQ(F, G, S);//noslipbndQ(T & F, T & G, T & S) F = T(0.0); S = G;

}
else if (type == 2)
{
if (h[i] > XParam.eps || zsX > zsi)
{
//
Dirichlet1Q(T(XParam.g), sign, zsX, zsinside, hinside, uninside, F);
}
else
{
noslipbndQ(F, G, S);
qmean = T(0.0);
}
}
else if (type == 3)
{
if (h[i] > XParam.eps || zsX > zsi )
@@ -443,6 +458,9 @@ template <class T> void bndFluxGPUSideCPU(Param XParam, bndsegmentside side, Blo
zsbnd = interp2BUQ(XParam.xo + xx, XParam.yo + yy, Zsmap);
}


zsbnd = zsbnd + XParam.zsoffset;


int i = memloc(halowidth, blkmemwidth, ix, iy, ib);
int inside = Inside(halowidth, blkmemwidth, side.isright, side.istop, ix, iy, ib);
Expand Down Expand Up @@ -504,6 +522,21 @@ template <class T> void bndFluxGPUSideCPU(Param XParam, bndsegmentside side, Blo
}
side.qmean[iq] = qmean;
}
else if (type == 2)
{
if (h[i] > XParam.eps || zsX > zsi)
{
//ABS1DQ(T(XParam.g), sign, factime, facrel, zsi, zsX, zsinside, h[i], qmean, F, G, S);
//qmean = T(0.0);
Dirichlet1Q(T(XParam.g), sign, zsX, zsinside, hinside, uninside, F);
}
else
{
noslipbndQ(F, G, S);
qmean = T(0.0);
}
side.qmean[iq] = qmean;
}


// write the results
@@ -1568,6 +1601,20 @@ template <class T> __device__ __host__ void Dirichlet1D(T g, T sign, T zsbnd, T
h = hinside;
}

template <class T> __device__ __host__ void Dirichlet1Q(T g, T sign, T zsbnd, T zsinside, T hinside, T uninside, T& q)
{
// Is this even the right formulation?
// I don't really like this formulation: while a bit less dissipative than ABS1D with unbnd = 0 (but worse if forcing uninside with 0), it is very reflective and not stable
T zbinside = zsinside - hinside;
T un = sign * T(2.0) * (sqrt(g * max(hinside, T(0.0))) - sqrt(g * max(zsbnd - zbinside, T(0.0)))) + uninside;
T ut = T(0.0);
//zs = zsinside;
//ut[i] = ut[inside];
//h = hinside;

q = un * hinside;
}
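The expression for `un` above appears to match the outgoing Riemann invariant of the one-dimensional shallow water equations; the following reading is an assumption based on the code, not stated in the source:

```latex
% Riemann invariants of the 1-D SWE: w_\pm = u \pm 2\sqrt{gh}.
% Matching the invariant leaving the domain to the prescribed boundary level
% (with h_bnd = zsbnd - zb_inside) gives the boundary velocity and flux:
\begin{align}
  u_\mathrm{bnd} &= u_\mathrm{inside}
    \pm 2\left(\sqrt{g\,h_\mathrm{inside}} - \sqrt{g\,(z_{s,\mathrm{bnd}} - z_{b,\mathrm{inside}})}\right), \\
  q &= u_\mathrm{bnd}\, h_\mathrm{inside}.
\end{align}
```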



/*
2 changes: 2 additions & 0 deletions src/Boundary.h
@@ -42,6 +42,8 @@ template <class T> __device__ __host__ void noslipbndQ(T& F, T& G, T& S);
template <class T> __device__ __host__ void ABS1D(T g, T sign, T zsbnd, T zsinside, T hinside, T utbnd,T unbnd, T& un, T& ut, T& zs, T& h);
template <class T> __device__ __host__ void ABS1DQ(T g, T sign, T factime, T facrel, T zs, T zsbnd, T zsinside, T h, T& qmean, T& q, T& G, T& S);
template <class T> __device__ __host__ void Dirichlet1D(T g, T sign, T zsbnd, T zsinside, T hinside, T uninside, T& un, T& ut, T& zs, T& h);
template <class T> __device__ __host__ void Dirichlet1Q(T g, T sign, T zsbnd, T zsinside, T hinside, T uninside, T& q);


// End of global definition
#endif