@@ -588,16 +588,16 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
588
588
fused_eltwise_ = false ;
589
589
}
590
590
591
- if (use_half_ && bias_half. empty () && !bias.empty ())
592
- convertFp16 (bias, bias_half );
591
+ if (use_half_ && !bias.empty ())
592
+ CV_CheckTypeEQ (bias. type (), CV_16SC1, " " );
593
593
594
- if (use_half_ && weights_half. empty () )
595
- convertFp16 (weight, weights_half );
594
+ if (use_half_)
595
+ CV_CheckTypeEQ (weight. type (), CV_16SC1, " " );
596
596
597
- prepareKernel (bottom, top, weight, (use_half_) ? bias_half : bias, numImages);
597
+ prepareKernel (bottom, top, weight, bias, numImages);
598
598
if (bestKernelConfig.empty ())
599
599
return false ;
600
- return convolve (bottom, top, weight, (use_half_) ? bias_half : bias, numImages, bestKernelConfig);
600
+ return convolve (bottom, top, weight, bias, numImages, bestKernelConfig);
601
601
}
602
602
603
603
template <typename Dtype>
@@ -744,29 +744,26 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
744
744
kernel_h_ * (int )alignSize (kernel_w_, 2 ),
745
745
(use_half_) ? CV_16SC1 : CV_32FC1);
746
746
747
- UMat swizzled_weights_tmp;
748
- if (use_half_)
749
- swizzled_weights_tmp.create (shape (swizzled_weights_umat), CV_32F);
750
-
751
747
if (!interleave) {
752
- cl_uint argIdx = 0 ;
753
748
int32_t channels = channels_ / group_;
754
749
755
- ocl::Kernel oclk_copy_weight (CL_KERNEL_SELECT (" copyWeightsSwizzled" ),
756
- cv::ocl::dnn::conv_spatial_helper_oclsrc);
750
+ ocl::Kernel oclk_copy_weight (
751
+ use_half_ ? " copyWeightsSwizzled_half" : " copyWeightsSwizzled_float" ,
752
+ cv::ocl::dnn::conv_spatial_helper_oclsrc,
753
+ use_half_ ? " -DHALF_SUPPORT=1 -DDtype=half" : " -DDtype=float"
754
+ );
757
755
if (oclk_copy_weight.empty ())
758
756
return false ;
759
757
760
- oclk_copy_weight.set (argIdx++, ocl::KernelArg::PtrReadOnly (weight));
761
- if (use_half_)
762
- oclk_copy_weight.set (argIdx++, ocl::KernelArg::PtrWriteOnly (swizzled_weights_tmp));
763
- else
764
- oclk_copy_weight.set (argIdx++, ocl::KernelArg::PtrWriteOnly (swizzled_weights_umat));
765
- oclk_copy_weight.set (argIdx++, kernel_w_);
766
- oclk_copy_weight.set (argIdx++, kernel_h_);
767
- oclk_copy_weight.set (argIdx++, channels);
768
- oclk_copy_weight.set (argIdx++, num_output_);
769
- oclk_copy_weight.set (argIdx++, swizzled_factor);
758
+ oclk_copy_weight.args (
759
+ ocl::KernelArg::PtrReadOnly (weight),
760
+ ocl::KernelArg::PtrWriteOnly (swizzled_weights_umat),
761
+ kernel_w_,
762
+ kernel_h_,
763
+ channels,
764
+ num_output_,
765
+ swizzled_factor
766
+ );
770
767
771
768
size_t global_work_size_copy[3 ] = {
772
769
(size_t ) (alignSize (num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1 , 1 };
@@ -778,40 +775,53 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
778
775
}
779
776
} else {
780
777
// assumption: kernel dimension is 2
781
- Mat weightMat = weight.getMat (ACCESS_READ);
782
- Dtype* cpu_weight = (Dtype *)weightMat.ptr <float >();
778
+ Mat weightMat;
783
779
Mat swizzledWeightMat;
780
+ UMat weight_tmp; // FP32 in half mode, TODO implement FP16 repack
784
781
if (use_half_)
785
- swizzledWeightMat = swizzled_weights_tmp.getMat (ACCESS_WRITE);
782
+ {
783
+ CV_CheckTypeEQ (weight.type (), CV_16SC1, " " );
784
+ convertFp16 (weight, weight_tmp);
785
+ weightMat = weight_tmp.getMat (ACCESS_READ);
786
+ swizzledWeightMat.create (shape (swizzled_weights_umat), CV_32F);
787
+ }
786
788
else
789
+ {
790
+ weightMat = weight.getMat (ACCESS_READ);
787
791
swizzledWeightMat = swizzled_weights_umat.getMat (ACCESS_WRITE);
792
+ }
793
+
794
+ CV_CheckTypeEQ (weightMat.type (), CV_32FC1, " " );
795
+ Dtype* cpu_weight = (Dtype *)weightMat.ptr <float >();
788
796
Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr <float >();
789
797
790
798
int interleavedRows = (kernel_w_ / 2 ) * 2 ;
791
799
int nonInterleavedRows = kernel_w_ % 2 ;
792
800
int blockWidth = swizzled_factor; // should equal to simd size.
793
801
int rowAlignment = 32 ;
794
802
size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof (Dtype);
795
- Dtype * tmpSwizzledWeight = reinterpret_cast <Dtype*>(malloc (interleaved_filter_size));
796
- CHECK_EQ (tmpSwizzledWeight != NULL , true ) << " Failed to allocate temporary swizzled weight" ;
803
+ cv::AutoBuffer<Dtype, 0 > tmpSwizzledWeight (interleaved_filter_size);
797
804
for (int od = 0 ; od < M_; od++)
798
805
for (int id = 0 ; id < channels_; id++)
799
806
for (int r = 0 ; r < kernel_h_; r++)
800
807
for (int c = 0 ; c < kernel_w_; c++)
801
808
tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] =
802
809
cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c];
810
+
803
811
interleaveMatrix (cpu_swizzled_weight,
804
- tmpSwizzledWeight,
812
+ tmpSwizzledWeight. data () ,
805
813
kernel_w_ * kernel_h_ * channels_, M_,
806
814
interleavedRows,
807
815
nonInterleavedRows,
808
816
blockWidth,
809
817
rowAlignment);
810
- free (tmpSwizzledWeight);
811
- }
812
818
813
- if (use_half_)
814
- convertFp16 (swizzled_weights_tmp, swizzled_weights_umat);
819
+ // unmap OpenCL buffers
820
+ weightMat.release ();
821
+
822
+ if (use_half_)
823
+ convertFp16 (swizzledWeightMat, swizzled_weights_umat);
824
+ }
815
825
816
826
return true ;
817
827
}
@@ -1104,10 +1114,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
1104
1114
cl_uint argIdx = 0 ;
1105
1115
setFusionArg (fused_activ_, fused_eltwise_, kernel, argIdx);
1106
1116
kernel.set (argIdx++, ocl::KernelArg::PtrReadOnly (bottom));
1107
- if (use_half_)
1108
- kernel.set (argIdx++, ocl::KernelArg::PtrReadOnly (weights_half));
1109
- else
1110
- kernel.set (argIdx++, ocl::KernelArg::PtrReadOnly (weight));
1117
+ kernel.set (argIdx++, ocl::KernelArg::PtrReadOnly (weight));
1111
1118
if (bias_term_)
1112
1119
kernel.set (argIdx++, ocl::KernelArg::PtrReadOnly (bias));
1113
1120
kernel.set (argIdx++, ocl::KernelArg::PtrWriteOnly (top));
@@ -1148,10 +1155,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
1148
1155
setFusionArg (fused_activ_, fused_eltwise_, kernel, argIdx);
1149
1156
kernel.set (argIdx++, ocl::KernelArg::PtrReadOnly (bottom));
1150
1157
kernel.set (argIdx++, image_offset);
1151
- if (use_half_)
1152
- kernel.set (argIdx++, ocl::KernelArg::PtrReadOnly (weights_half));
1153
- else
1154
- kernel.set (argIdx++, ocl::KernelArg::PtrReadOnly (weight));
1158
+ kernel.set (argIdx++, ocl::KernelArg::PtrReadOnly (weight));
1155
1159
kernel.set (argIdx++, kernel_offset);
1156
1160
if (bias_term_)
1157
1161
kernel.set (argIdx++, ocl::KernelArg::PtrReadOnly (bias));
@@ -1956,7 +1960,7 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
1956
1960
1957
1961
UMat benchData (1 , numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);
1958
1962
1959
- calculateBenchmark (bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages);
1963
+ calculateBenchmark (bottom, benchData, weight, bias, numImages);
1960
1964
1961
1965
if (run_auto_tuning_ || force_auto_tuning_)
1962
1966
{
0 commit comments