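fix preload sizes: treat the kernel support as given in input space, scale only the output tile extent by inExtent/outExtent, and sort the growth axes by preload size instead of output size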

devsh · devsh · commit 9a233c723f98 · 2024-11-12T16:04:10.000+01:00
diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h
@@ -159,7 +159,7 @@ class CComputeBlit : public core::IReferenceCounted
 			return computePerWorkGroup(sharedMemorySize,minSupport,maxSupport,type,inExtent,outExtent,halfPrecision);
 		}
 		NBL_API2 static hlsl::blit::SPerWorkgroup computePerWorkGroup(
-			const uint16_t sharedMemorySize, const hlsl::float32_t3 minSupportInOutput, const hlsl::float32_t3 maxSupportInOutput, const IGPUImage::E_TYPE type,
+			const uint16_t sharedMemorySize, const hlsl::float32_t3 minSupportInInput, const hlsl::float32_t3 maxSupportInInput, const IGPUImage::E_TYPE type,
 			const hlsl::uint16_t3 inExtent, const hlsl::uint16_t3 outExtent, const bool halfPrecision=false
 		);
 
diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp
@@ -117,7 +117,7 @@ struct ConstevalParameters
 }
 
 SPerWorkgroup CComputeBlit::computePerWorkGroup(
-	const uint16_t sharedMemorySize, const float32_t3 minSupportInOutput, const float32_t3 maxSupportInOutput, const IGPUImage::E_TYPE type,
+	const uint16_t sharedMemorySize, const float32_t3 minSupportInInput, const float32_t3 maxSupportInInput, const IGPUImage::E_TYPE type,
 	const uint16_t3 inExtent, const uint16_t3 outExtent, const bool halfPrecision
 )
 {
@@ -126,16 +126,16 @@ SPerWorkgroup CComputeBlit::computePerWorkGroup(
 
 	const auto Dims = static_cast<uint8_t>(type)+1;
 	const auto scale = float32_t3(inExtent)/float32_t3(outExtent);
-	const auto supportWidthInOutput = maxSupportInOutput-minSupportInOutput;
+	const auto supportWidthInInput = maxSupportInInput-minSupportInInput;
 
 	IGPUImage::E_TYPE minDimAxes[3] = { IGPUImage::ET_1D, IGPUImage::ET_2D, IGPUImage::ET_3D };
 	using namespace nbl::hlsl;
 	for (uint16_t3 output(1,1,1); true;)
 	{
 		// now try and grow our support
-		const auto combinedSupportInOutput = supportWidthInOutput+float32_t3(output-uint16_t3(1,1,1));
+		const auto combinedSupportInInput = supportWidthInInput+float32_t3(output-uint16_t3(1,1,1))*scale;
 		// note that its not ceil on purpose
-		uint32_t3 preload = uint32_t3(hlsl::floor(combinedSupportInOutput*scale))+uint32_t3(1,1,1);
+		uint32_t3 preload = uint32_t3(hlsl::floor(combinedSupportInInput))+uint32_t3(1,1,1);
 		// Set the unused dimensions to 1 to avoid weird behaviours with scaled kernels
 		for (auto a=Dims; a<3; a++)
 			preload[a] = 1;
@@ -162,9 +162,9 @@ SPerWorkgroup CComputeBlit::computePerWorkGroup(
 		
 		// we want to fix the dimension that's the smallest, so that we increase the volume of the support by a smallest increment and stay close to a cube shape
 		{
-			std::sort(minDimAxes,minDimAxes+Dims,[output](const IGPUImage::E_TYPE a, const IGPUImage::E_TYPE b)->bool
+			std::sort(minDimAxes,minDimAxes+Dims,[preload](const IGPUImage::E_TYPE a, const IGPUImage::E_TYPE b)->bool
 				{
-					return output[a]<output[b];
+					return preload[a]<preload[b];
 				}
 			);
 			// grow along smallest axis, but skip if already grown to output size

Original file line number	Diff line number	Diff line change
`@@ -159,7 +159,7 @@ class CComputeBlit : public core::IReferenceCounted`
`159`	`159`	`return computePerWorkGroup(sharedMemorySize,minSupport,maxSupport,type,inExtent,outExtent,halfPrecision);`
`160`	`160`	`}`
`161`	`161`	`NBL_API2 static hlsl::blit::SPerWorkgroup computePerWorkGroup(`
`162`		`- const uint16_t sharedMemorySize, const hlsl::float32_t3 minSupportInOutput, const hlsl::float32_t3 maxSupportInOutput, const IGPUImage::E_TYPE type,`
	`162`	`+ const uint16_t sharedMemorySize, const hlsl::float32_t3 minSupportInInput, const hlsl::float32_t3 maxSupportInInput, const IGPUImage::E_TYPE type,`
`163`	`163`	`const hlsl::uint16_t3 inExtent, const hlsl::uint16_t3 outExtent, const bool halfPrecision=false`
`164`	`164`	`);`
`165`	`165`
Original file line number	Diff line number	Diff line change
`@@ -117,7 +117,7 @@ struct ConstevalParameters`
`117`	`117`	`}`
`118`	`118`
`119`	`119`	`SPerWorkgroup CComputeBlit::computePerWorkGroup(`
`120`		`- const uint16_t sharedMemorySize, const float32_t3 minSupportInOutput, const float32_t3 maxSupportInOutput, const IGPUImage::E_TYPE type,`
	`120`	`+ const uint16_t sharedMemorySize, const float32_t3 minSupportInInput, const float32_t3 maxSupportInInput, const IGPUImage::E_TYPE type,`
`121`	`121`	`const uint16_t3 inExtent, const uint16_t3 outExtent, const bool halfPrecision`
`122`	`122`	`)`
`123`	`123`	`{`
`@@ -126,16 +126,16 @@ SPerWorkgroup CComputeBlit::computePerWorkGroup(`
`126`	`126`
`127`	`127`	`const auto Dims = static_cast<uint8_t>(type)+1;`
`128`	`128`	`const auto scale = float32_t3(inExtent)/float32_t3(outExtent);`
`129`		`- const auto supportWidthInOutput = maxSupportInOutput-minSupportInOutput;`
	`129`	`+ const auto supportWidthInInput = maxSupportInInput-minSupportInInput;`
`130`	`130`
`131`	`131`	`IGPUImage::E_TYPE minDimAxes[3] = { IGPUImage::ET_1D, IGPUImage::ET_2D, IGPUImage::ET_3D };`
`132`	`132`	`using namespace nbl::hlsl;`
`133`	`133`	`for (uint16_t3 output(1,1,1); true;)`
`134`	`134`	`{`
`135`	`135`	`// now try and grow our support`
`136`		`- const auto combinedSupportInOutput = supportWidthInOutput+float32_t3(output-uint16_t3(1,1,1));`
	`136`	`+ const auto combinedSupportInInput = supportWidthInInput+float32_t3(output-uint16_t3(1,1,1))*scale;`
`137`	`137`	`// note that its not ceil on purpose`
`138`		`- uint32_t3 preload = uint32_t3(hlsl::floor(combinedSupportInOutput*scale))+uint32_t3(1,1,1);`
	`138`	`+ uint32_t3 preload = uint32_t3(hlsl::floor(combinedSupportInInput))+uint32_t3(1,1,1);`
`139`	`139`	`// Set the unused dimensions to 1 to avoid weird behaviours with scaled kernels`
`140`	`140`	`for (auto a=Dims; a<3; a++)`
`141`	`141`	`preload[a] = 1;`
`@@ -162,9 +162,9 @@ SPerWorkgroup CComputeBlit::computePerWorkGroup(`
`162`	`162`
`163`	`163`	`// we want to fix the dimension that's the smallest, so that we increase the volume of the support by a smallest increment and stay close to a cube shape`
`164`	`164`	`{`
`165`		`- std::sort(minDimAxes,minDimAxes+Dims,[output](const IGPUImage::E_TYPE a, const IGPUImage::E_TYPE b)->bool`
	`165`	`+ std::sort(minDimAxes,minDimAxes+Dims,[preload](const IGPUImage::E_TYPE a, const IGPUImage::E_TYPE b)->bool`
`166`	`166`	`{`
`167`		`- return output[a]<output[b];`
	`167`	`+ return preload[a]<preload[b];`
`168`	`168`	`}`
`169`	`169`	`);`
`170`	`170`	`// grow along smallest axis, but skip if already grown to output size`