Skip to content

Commit e5d2642

Browse files
committed
Merge pull request opencv#19015 from alalek:dnn_use_fma
2 parents a9f4f8d + 00f36a3 commit e5d2642

File tree

2 files changed: 32 additions and 27 deletions.

modules/dnn/src/layers/convolution_layer.cpp

Lines changed: 24 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -1370,15 +1370,15 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
13701370
v_float32x4 r2 = v_load_aligned(rptr + vsz_a*2);
13711371
v_float32x4 r3 = v_load_aligned(rptr + vsz_a*3);
13721372

1373-
vs00 += w0*r0;
1374-
vs01 += w0*r1;
1375-
vs02 += w0*r2;
1376-
vs03 += w0*r3;
1377-
1378-
vs10 += w1*r0;
1379-
vs11 += w1*r1;
1380-
vs12 += w1*r2;
1381-
vs13 += w1*r3;
1373+
vs00 = v_fma(w0, r0, vs00);
1374+
vs01 = v_fma(w0, r1, vs01);
1375+
vs02 = v_fma(w0, r2, vs02);
1376+
vs03 = v_fma(w0, r3, vs03);
1377+
1378+
vs10 = v_fma(w1, r0, vs10);
1379+
vs11 = v_fma(w1, r1, vs11);
1380+
vs12 = v_fma(w1, r2, vs12);
1381+
vs13 = v_fma(w1, r3, vs13);
13821382
}
13831383
s0 += v_reduce_sum4(vs00, vs01, vs02, vs03);
13841384
s1 += v_reduce_sum4(vs10, vs11, vs12, vs13);
@@ -2035,29 +2035,32 @@ class DeConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
20352035

20362036
for( ; n <= nmax - 4; n += 4 )
20372037
{
2038+
v_float32x4 d0 = v_load(dst0 + n);
2039+
v_float32x4 d1 = v_load(dst1 + n);
20382040
v_float32x4 b0 = v_load(bptr0 + n);
20392041
v_float32x4 b1 = v_load(bptr1 + n);
20402042
v_float32x4 b2 = v_load(bptr2 + n);
20412043
v_float32x4 b3 = v_load(bptr3 + n);
2042-
v_float32x4 d0 = v_load(dst0 + n);
2043-
v_float32x4 d1 = v_load(dst1 + n);
2044-
d0 += b0*a00;
2045-
d1 += b0*a01;
2046-
d0 += b1*a10;
2047-
d1 += b1*a11;
2048-
d0 += b2*a20;
2049-
d1 += b2*a21;
2050-
d0 += b3*a30;
2051-
d1 += b3*a31;
2044+
// TODO try to improve pipeline width
2045+
d0 = v_fma(b0, a00, d0);
2046+
d1 = v_fma(b0, a01, d1);
2047+
d0 = v_fma(b1, a10, d0);
2048+
d1 = v_fma(b1, a11, d1);
2049+
d0 = v_fma(b2, a20, d0);
2050+
d1 = v_fma(b2, a21, d1);
2051+
d0 = v_fma(b3, a30, d0);
2052+
d1 = v_fma(b3, a31, d1);
20522053
v_store(dst0 + n, d0);
20532054
v_store(dst1 + n, d1);
20542055
}
20552056
#endif
20562057

20572058
for( ; n < nmax; n++ )
20582059
{
2059-
float b0 = bptr0[n], b1 = bptr1[n];
2060-
float b2 = bptr2[n], b3 = bptr3[n];
2060+
float b0 = bptr0[n];
2061+
float b1 = bptr1[n];
2062+
float b2 = bptr2[n];
2063+
float b3 = bptr3[n];
20612064
float d0 = dst0[n] + alpha00*b0 + alpha10*b1 + alpha20*b2 + alpha30*b3;
20622065
float d1 = dst1[n] + alpha01*b0 + alpha11*b1 + alpha21*b2 + alpha31*b3;
20632066
dst0[n] = d0;

modules/dnn/src/layers/fully_connected_layer.cpp

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -241,16 +241,18 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
241241
#if CV_SIMD128
242242
for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
243243
{
244-
v_float32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f);
245-
v_float32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f);
244+
v_float32x4 vs0 = v_setall_f32(0.f);
245+
v_float32x4 vs1 = v_setall_f32(0.f);
246+
v_float32x4 vs2 = v_setall_f32(0.f);
247+
v_float32x4 vs3 = v_setall_f32(0.f);
246248

247249
for( k = 0; k < vecsize; k += 4 )
248250
{
249251
v_float32x4 v = v_load_aligned(sptr + k);
250-
vs0 += v*v_load_aligned(wptr + k);
251-
vs1 += v*v_load_aligned(wptr + wstep + k);
252-
vs2 += v*v_load_aligned(wptr + wstep*2 + k);
253-
vs3 += v*v_load_aligned(wptr + wstep*3 + k);
252+
vs0 = v_fma(v, v_load_aligned(wptr + k), vs0);
253+
vs1 = v_fma(v, v_load_aligned(wptr + wstep + k), vs1);
254+
vs2 = v_fma(v, v_load_aligned(wptr + wstep*2 + k), vs2);
255+
vs3 = v_fma(v, v_load_aligned(wptr + wstep*3 + k), vs3);
254256
}
255257

256258
v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);

0 commit comments

Comments
 (0)