@@ -150,7 +150,7 @@ void findScaleSpaceExtrema(
150
150
151
151
void calcSIFTDescriptor (
152
152
const Mat& img, Point2f ptf, float ori, float scl,
153
- int d, int n, float * dst
153
+ int d, int n, Mat& dst, int row
154
154
);
155
155
156
156
@@ -555,7 +555,7 @@ void findScaleSpaceExtrema(
555
555
556
556
void calcSIFTDescriptor (
557
557
const Mat& img, Point2f ptf, float ori, float scl,
558
- int d, int n, float * dst
558
+ int d, int n, Mat& dstMat, int row
559
559
)
560
560
{
561
561
CV_TRACE_FUNCTION ();
@@ -575,9 +575,18 @@ void calcSIFTDescriptor(
575
575
int i, j, k, len = (radius*2 +1 )*(radius*2 +1 ), histlen = (d+2 )*(d+2 )*(n+2 );
576
576
int rows = img.rows , cols = img.cols ;
577
577
578
- AutoBuffer<float > buf (len*6 + histlen);
579
- float *X = buf.data (), *Y = X + len, *Mag = Y, *Ori = Mag + len, *W = Ori + len;
580
- float *RBin = W + len, *CBin = RBin + len, *hist = CBin + len;
578
+ cv::utils::BufferArea area;
579
+ float *X = 0 , *Y = 0 , *Mag, *Ori = 0 , *W = 0 , *RBin = 0 , *CBin = 0 , *hist = 0 , *rawDst = 0 ;
580
+ area.allocate (X, len, CV_SIMD_WIDTH);
581
+ area.allocate (Y, len, CV_SIMD_WIDTH);
582
+ area.allocate (Ori, len, CV_SIMD_WIDTH);
583
+ area.allocate (W, len, CV_SIMD_WIDTH);
584
+ area.allocate (RBin, len, CV_SIMD_WIDTH);
585
+ area.allocate (CBin, len, CV_SIMD_WIDTH);
586
+ area.allocate (hist, histlen, CV_SIMD_WIDTH);
587
+ area.allocate (rawDst, len, CV_SIMD_WIDTH);
588
+ area.commit ();
589
+ Mag = Y;
581
590
582
591
for ( i = 0 ; i < d+2 ; i++ )
583
592
{
@@ -628,10 +637,10 @@ void calcSIFTDescriptor(
628
637
const v_int32 __n_plus_2 = vx_setall_s32 (n+2 );
629
638
for ( ; k <= len - vecsize; k += vecsize )
630
639
{
631
- v_float32 rbin = vx_load (RBin + k);
632
- v_float32 cbin = vx_load (CBin + k);
633
- v_float32 obin = (vx_load (Ori + k) - __ori) * __bins_per_rad;
634
- v_float32 mag = vx_load (Mag + k) * vx_load (W + k);
640
+ v_float32 rbin = vx_load_aligned (RBin + k);
641
+ v_float32 cbin = vx_load_aligned (CBin + k);
642
+ v_float32 obin = (vx_load_aligned (Ori + k) - __ori) * __bins_per_rad;
643
+ v_float32 mag = vx_load_aligned (Mag + k) * vx_load_aligned (W + k);
635
644
636
645
v_int32 r0 = v_floor (rbin);
637
646
v_int32 c0 = v_floor (cbin);
@@ -723,7 +732,7 @@ void calcSIFTDescriptor(
723
732
hist[idx] += hist[idx+n];
724
733
hist[idx+1 ] += hist[idx+n+1 ];
725
734
for ( k = 0 ; k < n; k++ )
726
- dst [(i*d + j)*n + k] = hist[idx+k];
735
+ rawDst [(i*d + j)*n + k] = hist[idx+k];
727
736
}
728
737
// copy histogram to the descriptor,
729
738
// apply hysteresis thresholding
@@ -735,17 +744,17 @@ void calcSIFTDescriptor(
735
744
#if CV_SIMD
736
745
{
737
746
v_float32 __nrm2 = vx_setzero_f32 ();
738
- v_float32 __dst ;
747
+ v_float32 __rawDst ;
739
748
for ( ; k <= len - v_float32::nlanes; k += v_float32::nlanes )
740
749
{
741
- __dst = vx_load (dst + k);
742
- __nrm2 = v_fma (__dst, __dst , __nrm2);
750
+ __rawDst = vx_load_aligned (rawDst + k);
751
+ __nrm2 = v_fma (__rawDst, __rawDst , __nrm2);
743
752
}
744
753
nrm2 = (float )v_reduce_sum (__nrm2);
745
754
}
746
755
#endif
747
756
for ( ; k < len; k++ )
748
- nrm2 += dst [k]*dst [k];
757
+ nrm2 += rawDst [k]*rawDst [k];
749
758
750
759
float thr = std::sqrt (nrm2)*SIFT_DESCR_MAG_THR;
751
760
@@ -760,9 +769,9 @@ void calcSIFTDescriptor(
760
769
__m256 __thr = _mm256_set1_ps(thr);
761
770
for( ; i <= len - 8; i += 8 )
762
771
{
763
- __dst = _mm256_loadu_ps(&dst [i]);
772
+ __dst = _mm256_loadu_ps(&rawDst [i]);
764
773
__dst = _mm256_min_ps(__dst, __thr);
765
- _mm256_storeu_ps(&dst [i], __dst);
774
+ _mm256_storeu_ps(&rawDst [i], __dst);
766
775
#if CV_FMA3
767
776
__nrm2 = _mm256_fmadd_ps(__dst, __dst, __nrm2);
768
777
#else
@@ -776,44 +785,78 @@ void calcSIFTDescriptor(
776
785
#endif
777
786
for ( ; i < len; i++ )
778
787
{
779
- float val = std::min (dst [i], thr);
780
- dst [i] = val;
788
+ float val = std::min (rawDst [i], thr);
789
+ rawDst [i] = val;
781
790
nrm2 += val*val;
782
791
}
783
792
nrm2 = SIFT_INT_DESCR_FCTR/std::max (std::sqrt (nrm2), FLT_EPSILON);
784
793
785
794
#if 1
786
795
k = 0 ;
796
+ if ( dstMat.type () == CV_32F )
797
+ {
798
+ float * dst = dstMat.ptr <float >(row);
787
799
#if CV_SIMD
800
+ v_float32 __dst;
801
+ v_float32 __min = vx_setzero_f32 ();
802
+ v_float32 __max = vx_setall_f32 (255 .0f ); // max of uchar
803
+ v_float32 __nrm2 = vx_setall_f32 (nrm2);
804
+ for ( k = 0 ; k <= len - v_float32::nlanes; k += v_float32::nlanes )
788
805
{
789
- v_float32 __dst;
790
- v_float32 __min = vx_setzero_f32 ();
791
- v_float32 __max = vx_setall_f32 (255 .0f ); // max of uchar
792
- v_float32 __nrm2 = vx_setall_f32 (nrm2);
793
- for ( k = 0 ; k <= len - v_float32::nlanes; k += v_float32::nlanes )
794
- {
795
- __dst = vx_load (dst + k);
796
- __dst = v_min (v_max (v_cvt_f32 (v_round (__dst * __nrm2)), __min), __max);
797
- v_store (dst + k, __dst);
798
- }
806
+ __dst = vx_load_aligned (rawDst + k);
807
+ __dst = v_min (v_max (v_cvt_f32 (v_round (__dst * __nrm2)), __min), __max);
808
+ v_store (dst + k, __dst);
799
809
}
800
810
#endif
801
811
for ( ; k < len; k++ )
802
812
{
803
- dst[k] = saturate_cast<uchar>(dst [k]*nrm2);
813
+ dst[k] = saturate_cast<uchar>(rawDst [k]*nrm2);
804
814
}
815
+ }
816
+ else // CV_8U
817
+ {
818
+ uint8_t * dst = dstMat.ptr <uint8_t >(row);
819
+ #if CV_SIMD
820
+ v_float32 __dst0, __dst1;
821
+ v_uint16 __pack01;
822
+ v_float32 __nrm2 = vx_setall_f32 (nrm2);
823
+ for ( k = 0 ; k <= len - v_float32::nlanes * 2 ; k += v_float32::nlanes * 2 )
824
+ {
825
+ __dst0 = vx_load_aligned (rawDst + k);
826
+ __dst1 = vx_load_aligned (rawDst + k + v_float32::nlanes);
827
+
828
+ __pack01 = v_pack_u (v_round (__dst0 * __nrm2), v_round (__dst1 * __nrm2));
829
+ v_pack_store (dst + k, __pack01);
830
+ }
831
+ #endif
832
+ for ( ; k < len; k++ )
833
+ {
834
+ dst[k] = saturate_cast<uchar>(rawDst[k]*nrm2);
835
+ }
836
+ }
805
837
#else
838
+ float* dst = dstMat.ptr<float>(row);
806
839
float nrm1 = 0;
807
840
for( k = 0; k < len; k++ )
808
841
{
809
- dst [k] *= nrm2;
810
- nrm1 += dst [k];
842
+ rawDst [k] *= nrm2;
843
+ nrm1 += rawDst [k];
811
844
}
812
845
nrm1 = 1.f/std::max(nrm1, FLT_EPSILON);
846
+ if( dstMat.type() == CV_32F )
847
+ {
813
848
for( k = 0; k < len; k++ )
814
849
{
815
- dst[k] = std::sqrt(dst [k] * nrm1);//saturate_cast<uchar>(std::sqrt(dst[k] * nrm1)*SIFT_INT_DESCR_FCTR );
850
+ dst[k] = std::sqrt(rawDst [k] * nrm1);
816
851
}
852
+ }
853
+ else // CV_8U
854
+ {
855
+ for( k = 0; k < len; k++ )
856
+ {
857
+ dst[k] = saturate_cast<uchar>(std::sqrt(rawDst[k] * nrm1)*SIFT_INT_DESCR_FCTR);
858
+ }
859
+ }
817
860
#endif
818
861
}
819
862
0 commit comments