Skip to content

Commit 50fed1d

Browse files
committed
Merge pull request opencv#19115 from alalek:dnn_ocl_conv_fp16_consistency
2 parents a9edcc1 + c240355 commit 50fed1d

File tree

6 files changed

+77
-61
lines changed

6 files changed

+77
-61
lines changed

modules/core/src/convert.dispatch.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int
138138
sdepth == CV_32F ? "half" : "float",
139139
rowsPerWI,
140140
sdepth == CV_32F ? " -D FLOAT_TO_HALF " : "");
141-
ocl::Kernel k("convertFp16", ocl::core::halfconvert_oclsrc, build_opt);
141+
ocl::Kernel k(sdepth == CV_32F ? "convertFp16_FP32_to_FP16" : "convertFp16_FP16_to_FP32", ocl::core::halfconvert_oclsrc, build_opt);
142142
if (k.empty())
143143
return false;
144144

modules/core/src/opencl/halfconvert.cl

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,17 @@
4747
#endif
4848
#endif
4949

50-
__kernel void convertFp16(__global const uchar * srcptr, int src_step, int src_offset,
51-
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols)
50+
__kernel void
51+
#ifdef FLOAT_TO_HALF
52+
convertFp16_FP32_to_FP16
53+
#else
54+
convertFp16_FP16_to_FP32
55+
#endif
56+
(
57+
__global const uchar * srcptr, int src_step, int src_offset,
58+
__global uchar * dstptr, int dst_step, int dst_offset,
59+
int dst_rows, int dst_cols
60+
)
5261
{
5362
int x = get_global_id(0);
5463
int y0 = get_global_id(1) * rowsPerWI;

modules/dnn/src/layers/convolution_layer.cpp

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1461,16 +1461,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
14611461
umat_blobs.resize(n);
14621462
for (size_t i = 0; i < n; i++)
14631463
{
1464-
if (use_half)
1465-
{
1466-
Mat matFP32;
1467-
convertFp16(inputs[i + 1], matFP32);
1468-
matFP32.copyTo(umat_blobs[i]);
1469-
}
1470-
else
1471-
{
1472-
inputs[i + 1].copyTo(umat_blobs[i]);
1473-
}
1464+
inputs[i + 1].copyTo(umat_blobs[i]);
14741465
}
14751466
inputs.resize(1);
14761467
}
@@ -1481,7 +1472,10 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
14811472
umat_blobs.resize(n);
14821473
for (size_t i = 0; i < n; i++)
14831474
{
1484-
blobs[i].copyTo(umat_blobs[i]);
1475+
if (use_half)
1476+
convertFp16(blobs[i], umat_blobs[i]);
1477+
else
1478+
blobs[i].copyTo(umat_blobs[i]);
14851479
}
14861480
}
14871481

@@ -1537,14 +1531,20 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
15371531

15381532
if (fusedWeights)
15391533
{
1540-
weightsMat.copyTo(umat_blobs[0]);
1534+
if (use_half)
1535+
convertFp16(weightsMat, umat_blobs[0]);
1536+
else
1537+
weightsMat.copyTo(umat_blobs[0]);
15411538
fusedWeights = false;
15421539
}
15431540
if (fusedBias)
15441541
{
15451542
if ( umat_blobs.size() < 2 )
15461543
umat_blobs.resize(2);
1547-
umat_blobs[1] = UMat(biasvec, true);
1544+
if (use_half)
1545+
convertFp16(Mat(biasvec, true), umat_blobs[1]);
1546+
else
1547+
Mat(biasvec, true).copyTo(umat_blobs[1]);
15481548
convolutionOp->setBias(true);
15491549
fusedBias = false;
15501550
}

modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -274,8 +274,6 @@ class OCL4DNNConvSpatial
274274
int32_t group_;
275275
bool bias_term_;
276276
UMat swizzled_weights_umat;
277-
UMat weights_half;
278-
UMat bias_half;
279277
UMat bottom_data2_;
280278

281279
int32_t bottom_index_;

modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp

Lines changed: 46 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -588,16 +588,16 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
588588
fused_eltwise_ = false;
589589
}
590590

591-
if (use_half_ && bias_half.empty() && !bias.empty())
592-
convertFp16(bias, bias_half);
591+
if (use_half_ && !bias.empty())
592+
CV_CheckTypeEQ(bias.type(), CV_16SC1, "");
593593

594-
if (use_half_ && weights_half.empty())
595-
convertFp16(weight, weights_half);
594+
if (use_half_)
595+
CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
596596

597-
prepareKernel(bottom, top, weight, (use_half_) ? bias_half : bias, numImages);
597+
prepareKernel(bottom, top, weight, bias, numImages);
598598
if (bestKernelConfig.empty())
599599
return false;
600-
return convolve(bottom, top, weight, (use_half_) ? bias_half : bias, numImages, bestKernelConfig);
600+
return convolve(bottom, top, weight, bias, numImages, bestKernelConfig);
601601
}
602602

603603
template<typename Dtype>
@@ -744,29 +744,26 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
744744
kernel_h_ * (int)alignSize(kernel_w_, 2),
745745
(use_half_) ? CV_16SC1 : CV_32FC1);
746746

747-
UMat swizzled_weights_tmp;
748-
if (use_half_)
749-
swizzled_weights_tmp.create(shape(swizzled_weights_umat), CV_32F);
750-
751747
if (!interleave) {
752-
cl_uint argIdx = 0;
753748
int32_t channels = channels_ / group_;
754749

755-
ocl::Kernel oclk_copy_weight(CL_KERNEL_SELECT("copyWeightsSwizzled"),
756-
cv::ocl::dnn::conv_spatial_helper_oclsrc);
750+
ocl::Kernel oclk_copy_weight(
751+
use_half_ ? "copyWeightsSwizzled_half" : "copyWeightsSwizzled_float",
752+
cv::ocl::dnn::conv_spatial_helper_oclsrc,
753+
use_half_ ? "-DHALF_SUPPORT=1 -DDtype=half" : "-DDtype=float"
754+
);
757755
if (oclk_copy_weight.empty())
758756
return false;
759757

760-
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
761-
if (use_half_)
762-
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_tmp));
763-
else
764-
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat));
765-
oclk_copy_weight.set(argIdx++, kernel_w_);
766-
oclk_copy_weight.set(argIdx++, kernel_h_);
767-
oclk_copy_weight.set(argIdx++, channels);
768-
oclk_copy_weight.set(argIdx++, num_output_);
769-
oclk_copy_weight.set(argIdx++, swizzled_factor);
758+
oclk_copy_weight.args(
759+
ocl::KernelArg::PtrReadOnly(weight),
760+
ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat),
761+
kernel_w_,
762+
kernel_h_,
763+
channels,
764+
num_output_,
765+
swizzled_factor
766+
);
770767

771768
size_t global_work_size_copy[3] = {
772769
(size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
@@ -778,40 +775,53 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
778775
}
779776
} else {
780777
// assumption: kernel dimension is 2
781-
Mat weightMat = weight.getMat(ACCESS_READ);
782-
Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
778+
Mat weightMat;
783779
Mat swizzledWeightMat;
780+
UMat weight_tmp; // FP32 in half mode, TODO implement FP16 repack
784781
if (use_half_)
785-
swizzledWeightMat = swizzled_weights_tmp.getMat(ACCESS_WRITE);
782+
{
783+
CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
784+
convertFp16(weight, weight_tmp);
785+
weightMat = weight_tmp.getMat(ACCESS_READ);
786+
swizzledWeightMat.create(shape(swizzled_weights_umat), CV_32F);
787+
}
786788
else
789+
{
790+
weightMat = weight.getMat(ACCESS_READ);
787791
swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE);
792+
}
793+
794+
CV_CheckTypeEQ(weightMat.type(), CV_32FC1, "");
795+
Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
788796
Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr<float>();
789797

790798
int interleavedRows = (kernel_w_ / 2) * 2;
791799
int nonInterleavedRows = kernel_w_ % 2;
792800
int blockWidth = swizzled_factor; // should equal to simd size.
793801
int rowAlignment = 32;
794802
size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof(Dtype);
795-
Dtype * tmpSwizzledWeight = reinterpret_cast<Dtype*>(malloc(interleaved_filter_size));
796-
CHECK_EQ(tmpSwizzledWeight != NULL, true) << "Failed to allocate temporary swizzled weight";
803+
cv::AutoBuffer<Dtype, 0> tmpSwizzledWeight(interleaved_filter_size);
797804
for (int od = 0; od < M_; od++)
798805
for (int id = 0; id < channels_; id++)
799806
for (int r = 0; r < kernel_h_; r++)
800807
for (int c = 0; c < kernel_w_; c++)
801808
tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] =
802809
cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c];
810+
803811
interleaveMatrix(cpu_swizzled_weight,
804-
tmpSwizzledWeight,
812+
tmpSwizzledWeight.data(),
805813
kernel_w_ * kernel_h_ * channels_, M_,
806814
interleavedRows,
807815
nonInterleavedRows,
808816
blockWidth,
809817
rowAlignment);
810-
free(tmpSwizzledWeight);
811-
}
812818

813-
if (use_half_)
814-
convertFp16(swizzled_weights_tmp, swizzled_weights_umat);
819+
// unmap OpenCL buffers
820+
weightMat.release();
821+
822+
if (use_half_)
823+
convertFp16(swizzledWeightMat, swizzled_weights_umat);
824+
}
815825

816826
return true;
817827
}
@@ -1104,10 +1114,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
11041114
cl_uint argIdx = 0;
11051115
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
11061116
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
1107-
if (use_half_)
1108-
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
1109-
else
1110-
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
1117+
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
11111118
if (bias_term_)
11121119
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
11131120
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
@@ -1148,10 +1155,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
11481155
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
11491156
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
11501157
kernel.set(argIdx++, image_offset);
1151-
if (use_half_)
1152-
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
1153-
else
1154-
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
1158+
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
11551159
kernel.set(argIdx++, kernel_offset);
11561160
if (bias_term_)
11571161
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
@@ -1956,7 +1960,7 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
19561960

19571961
UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);
19581962

1959-
calculateBenchmark(bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages);
1963+
calculateBenchmark(bottom, benchData, weight, bias, numImages);
19601964

19611965
if (run_auto_tuning_ || force_auto_tuning_)
19621966
{

modules/dnn/src/opencl/conv_spatial_helper.cl

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,14 @@
3939
//
4040
//M*/
4141

42+
#ifdef HALF_SUPPORT
43+
#ifdef cl_khr_fp16
44+
#pragma OPENCL EXTENSION cl_khr_fp16:enable
45+
#endif
46+
#endif
47+
4248
#define CONCAT(A,B) A##_##B
4349
#define TEMPLATE(name,type) CONCAT(name,type)
44-
#define Dtype float
4550

4651
__kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
4752
(__global Dtype* weightIn,

0 commit comments

Comments (0)