Skip to content

Commit ef665fb

Browse files
Inhibit vectorization of short loops
In the code that mirrors the independent harmonics up to the Nyquist frequency, the short loop that computes the location of each harmonic and of its dependent counterpart was being vectorized by ICC, with detrimental effects on performance for FFT dimensions smaller than the vector register width. Use `#pragma novector` for now. This improves the performance of mkl_fft.fft2 on an (8K, 8K) array of doubles, making ICC-compiled code 18% faster than GCC-compiled code, rather than 2.5X slower.
1 parent d54fef7 commit ef665fb

File tree

1 file changed

+26
-4
lines changed

1 file changed

+26
-4
lines changed

mkl_fft/src/mklfft.c.src

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,9 @@ int @name@_mkl_@mode@_in(PyArrayObject* x_inout, npy_intp n, int axis)
560560
while(!MultiIter_Done(mit)) {
561561
char *tmp;
562562

563+
#if defined(__ICC) || defined(__INTEL_COMPILER)
564+
#pragma novector
565+
#endif
563566
for(tmp = (char *) x_data, i = 0; i < x_rank; i++)
564567
tmp += x_strides[i] * MultiIter_IndexElem(mit, i);
565568

@@ -747,6 +750,9 @@ int @REALIN@_@COMPLEXOUT@_mkl_@mode@_out(
747750
while(!MultiIter_Done(mit)) {
748751
char *tmp1, *tmp2;
749752

753+
#if defined(__ICC) || defined(__INTEL_COMPILER)
754+
#pragma novector
755+
#endif
750756
for(tmp1 = (char *) xin_data,
751757
tmp2 = (char *) xout_data,
752758
i = 0; i < xin_rank; i++) {
@@ -808,6 +814,9 @@ int @REALIN@_@COMPLEXOUT@_mkl_@mode@_out(
808814
@MKL_OUT_TYPE@ *dest, *src;
809815
/* npy_intp k_last = MultiIter_IndexElem(mit, axis); */
810816

817+
#if defined(__ICC) || defined(__INTEL_COMPILER)
818+
#pragma novector
819+
#endif
811820
for(tmp1 = (char *) xout_data, tmp2 = (char *) xout_data,
812821
i = 0; i < xout_rank; i++) {
813822
npy_intp si = xout_strides[i],
@@ -997,6 +1006,9 @@ int @COMPLEXIN@_@COMPLEXOUT@_mkl_@mode@_out(
9971006
while(!MultiIter_Done(mit)) {
9981007
char *tmp1, *tmp2;
9991008

1009+
#if defined(__ICC) || defined(__INTEL_COMPILER)
1010+
#pragma novector
1011+
#endif
10001012
for(tmp1 = (char *) xin_data,
10011013
tmp2 = (char *) xout_data,
10021014
i = 0; i < xin_rank; i++) {
@@ -1135,6 +1147,9 @@ int @name@_mkl_@mode@_in(PyArrayObject* x_inout, npy_intp n, int axis)
11351147
while(!MultiIter_Done(mit)) {
11361148
char *tmp;
11371149

1150+
#if defined(__ICC) || defined(__INTEL_COMPILER)
1151+
#pragma novector
1152+
#endif
11381153
for(tmp = (char *) x_data, i = 0; i < x_rank; i++)
11391154
tmp += x_strides[i] * MultiIter_IndexElem(mit, i);
11401155

@@ -1297,6 +1312,9 @@ int
12971312
while(!MultiIter_Done(mit)) {
12981313
char *tmp1, *tmp2;
12991314

1315+
#if defined(__ICC) || defined(__INTEL_COMPILER)
1316+
#pragma novector
1317+
#endif
13001318
for(tmp1 = (char *) xin_data,
13011319
tmp2 = (char *) xout_data,
13021320
i = 0; i < xin_rank; i++) {
@@ -1475,6 +1493,9 @@ int @name@_@name@_mkl_@mode@_out(
14751493
while(!MultiIter_Done(mit)) {
14761494
char *tmp1, *tmp2;
14771495

1496+
#if defined(__ICC) || defined(__INTEL_COMPILER)
1497+
#pragma novector
1498+
#endif
14781499
for(tmp1 = (char *) xin_data,
14791500
tmp2 = (char *) xout_data,
14801501
i = 0; i < xin_rank; i++) {
@@ -1872,6 +1893,9 @@ int
18721893
char *tmp1, *tmp2;
18731894
@MKL_OUT_TYPE@ *dest, *src;
18741895

1896+
#if defined(__ICC) || defined(__INTEL_COMPILER)
1897+
#pragma novector
1898+
#endif
18751899
for(tmp1 = (char *) xout_data, tmp2 = (char *) xout_data,
18761900
i = 0; i < xout_rank; i++) {
18771901
npy_intp si = xout_strides[i],
@@ -1890,12 +1914,10 @@ int
18901914
tmp1 += si * dest_ki;
18911915
tmp2 += si * src_ki;
18921916
}
1917+
18931918
dest = (@MKL_OUT_TYPE@*) tmp1;
18941919
src = (@MKL_OUT_TYPE@*) tmp2;
1895-
1896-
/* is there a nicer way to complex conjugate ? */
1897-
dest->real = src->real;
1898-
dest->imag = -src->imag;
1920+
SET_CONJ(dest, src);
18991921

19001922
if (multi_iter_next(&mit))
19011923
break;

0 commit comments

Comments
 (0)