@@ -1370,15 +1370,15 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
1370
1370
v_float32x4 r2 = v_load_aligned (rptr + vsz_a*2 );
1371
1371
v_float32x4 r3 = v_load_aligned (rptr + vsz_a*3 );
1372
1372
1373
- vs00 += w0*r0 ;
1374
- vs01 += w0*r1 ;
1375
- vs02 += w0*r2 ;
1376
- vs03 += w0*r3 ;
1377
-
1378
- vs10 += w1*r0 ;
1379
- vs11 += w1*r1 ;
1380
- vs12 += w1*r2 ;
1381
- vs13 += w1*r3 ;
1373
+ vs00 = v_fma (w0, r0, vs00) ;
1374
+ vs01 = v_fma (w0, r1, vs01) ;
1375
+ vs02 = v_fma (w0, r2, vs02) ;
1376
+ vs03 = v_fma (w0, r3, vs03) ;
1377
+
1378
+ vs10 = v_fma (w1, r0, vs10) ;
1379
+ vs11 = v_fma (w1, r1, vs11) ;
1380
+ vs12 = v_fma (w1, r2, vs12) ;
1381
+ vs13 = v_fma (w1, r3, vs13) ;
1382
1382
}
1383
1383
s0 += v_reduce_sum4 (vs00, vs01, vs02, vs03);
1384
1384
s1 += v_reduce_sum4 (vs10, vs11, vs12, vs13);
@@ -2035,29 +2035,32 @@ class DeConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
2035
2035
2036
2036
for ( ; n <= nmax - 4 ; n += 4 )
2037
2037
{
2038
+ v_float32x4 d0 = v_load (dst0 + n);
2039
+ v_float32x4 d1 = v_load (dst1 + n);
2038
2040
v_float32x4 b0 = v_load (bptr0 + n);
2039
2041
v_float32x4 b1 = v_load (bptr1 + n);
2040
2042
v_float32x4 b2 = v_load (bptr2 + n);
2041
2043
v_float32x4 b3 = v_load (bptr3 + n);
2042
- v_float32x4 d0 = v_load (dst0 + n);
2043
- v_float32x4 d1 = v_load (dst1 + n);
2044
- d0 += b0*a00;
2045
- d1 += b0*a01;
2046
- d0 += b1*a10;
2047
- d1 += b1*a11;
2048
- d0 += b2*a20;
2049
- d1 += b2*a21;
2050
- d0 += b3*a30;
2051
- d1 += b3*a31;
2044
+ // TODO: improve pipeline utilization - d0 and d1 are each updated by a serial chain of four dependent v_fma calls
2045
+ d0 = v_fma (b0, a00, d0);
2046
+ d1 = v_fma (b0, a01, d1);
2047
+ d0 = v_fma (b1, a10, d0);
2048
+ d1 = v_fma (b1, a11, d1);
2049
+ d0 = v_fma (b2, a20, d0);
2050
+ d1 = v_fma (b2, a21, d1);
2051
+ d0 = v_fma (b3, a30, d0);
2052
+ d1 = v_fma (b3, a31, d1);
2052
2053
v_store (dst0 + n, d0);
2053
2054
v_store (dst1 + n, d1);
2054
2055
}
2055
2056
#endif
2056
2057
2057
2058
for ( ; n < nmax; n++ )
2058
2059
{
2059
- float b0 = bptr0[n], b1 = bptr1[n];
2060
- float b2 = bptr2[n], b3 = bptr3[n];
2060
+ float b0 = bptr0[n];
2061
+ float b1 = bptr1[n];
2062
+ float b2 = bptr2[n];
2063
+ float b3 = bptr3[n];
2061
2064
float d0 = dst0[n] + alpha00*b0 + alpha10*b1 + alpha20*b2 + alpha30*b3;
2062
2065
float d1 = dst1[n] + alpha01*b0 + alpha11*b1 + alpha21*b2 + alpha31*b3;
2063
2066
dst0[n] = d0;
0 commit comments