27
27
#endif
28
28
29
29
// Scalar horizontal max across four lanes.
30
- float hmax (__m128 m)
31
- {
32
- float f[4 ];
33
- _mm_storeu_ps (f, m);
34
- return fmax (fmax (f[0 ], f[1 ]), fmax (f[2 ], f[3 ]));
30
+ float hmax (__m128 m) {
31
+ float f[4 ];
32
+ _mm_storeu_ps (f, m);
33
+ return fmax (fmax (f[0 ], f[1 ]), fmax (f[2 ], f[3 ]));
35
34
}
36
35
37
36
#include <string.h>
#include " ../tick.h"
@@ -45,33 +44,33 @@ const char *chartName = "";
45
44
// Stores x in the global chartName, which END prints as the "chart" field. The expansion already ends in ';'.
#define SETCHART (x ) chartName = (x);
46
45
47
46
#define START () \
48
- do { \
49
- tick_t start = tick ();
47
+ do { \
48
+ tick_t start = tick ();
50
49
51
50
bool comma=false ;
52
51
#define END (result, name ) \
53
- tick_t end = tick (); \
54
- tick_t ticks = end - start; \
55
- scalarTotalTicks += scalarTicks; \
56
- simdTotalTicks += ticks; \
57
- double nsecs = (double )ticks * 1000.0 * 1000.0 * 1000.0 / ticks_per_sec () / N; \
58
- printf (" %s{ \" chart\" : \" %s\" , \" category\" : \" %s\" , \" scalar\" : %f, \" simd\" : %f }\n " , comma?" ," :" " , chartName, name, scalarTime, nsecs); \
59
- comma = true ; \
60
- printf (" %s" , (result) != 0 ? " Error!" : " " ); \
61
- } while (0 )
52
+ tick_t end = tick (); \
53
+ tick_t ticks = end - start; \
54
+ scalarTotalTicks += scalarTicks; \
55
+ simdTotalTicks += ticks; \
56
+ double nsecs = (double )ticks * 1000.0 * 1000.0 * 1000.0 / ticks_per_sec () / N; \
57
+ printf (" %s{ \" chart\" : \" %s\" , \" category\" : \" %s\" , \" scalar\" : %f, \" simd\" : %f }\n " , comma?" ," :" " , chartName, name, scalarTime, nsecs); \
58
+ comma = true ; \
59
+ printf (" %s" , (result) != 0 ? " Error!" : " " ); \
60
+ } while (0 )
62
61
63
62
#define ENDSCALAR (result, name ) \
64
- tick_t end = tick(); \
65
- scalarTicks = end - start; \
66
- scalarTime = (double )scalarTicks * 1000.0 * 1000.0 * 1000.0 / ticks_per_sec() / N; \
67
- printf (" %s" , (result) != 0 ? "Error!" : ""); \
68
- } while (0 )
63
+ tick_t end = tick(); \
64
+ scalarTicks = end - start; \
65
+ scalarTime = (double )scalarTicks * 1000.0 * 1000.0 * 1000.0 / ticks_per_sec() / N; \
66
+ printf (" %s" , (result) != 0 ? "Error!" : ""); \
67
+ } while (0 )
69
68
70
69
void Print (__m128 m)
71
70
{
72
- float val[4 ];
73
- _mm_storeu_ps (val, m);
74
- fprintf (stderr, " [%g, %g, %g, %g]\n " , val[3 ], val[2 ], val[1 ], val[0 ]);
71
+ float val[4 ];
72
+ _mm_storeu_ps (val, m);
73
+ fprintf (stderr, " [%g, %g, %g, %g]\n " , val[3 ], val[2 ], val[1 ], val[0 ]);
75
74
}
76
75
77
76
// Returns true on every call, but derives the answer from a run-time clock
// query so the compiler cannot treat the result as a known constant.
bool always_true () {
  const bool clock_nonzero = time (NULL) != 0;
  return clock_nonzero;
}
@@ -90,14 +89,13 @@ float NOINLINE *alloc_float_buffer() { return always_true() ? (float*)aligned_al
90
89
// Returns a 16-byte-aligned buffer with room for N+16 doubles. The allocation
// is routed through always_true() (and the function is marked NOINLINE),
// presumably so the optimizer cannot reason about the returned pointer.
// The aligned_alloc result is handed back unchecked; a null pointer is
// returned on the branch where always_true() is false.
double NOINLINE *alloc_double_buffer () {
  if (!always_true ())
    return 0;
  return (double *)aligned_alloc (16, (N + 16) * sizeof (double));
}
91
90
92
91
template <typename T>
93
- T checksum_dst (T *dst)
94
- {
95
- if (always_true ()) return 0 .f ;
96
- else
97
- {
98
- T s = 0 .f ; for (int i = 0 ; i < N; ++i) s += dst[i];
99
- return s;
100
- }
92
+ T checksum_dst (T *dst) {
93
+ if (always_true ()) {
94
+ return 0 .f ;
95
+ } else {
96
+ T s = 0 .f ; for (int i = 0 ; i < N; ++i) s += dst[i];
97
+ return s;
98
+ }
101
99
}
102
100
103
101
// Returns the raw bit pattern of a float as a 32-bit unsigned integer.
// The bytes are copied with memcpy instead of dereferencing a casted pointer:
// reading a float object through a uint32_t lvalue breaks the strict-aliasing
// rules and is undefined behavior, whereas memcpy is well defined.
uint32_t fcastu (float f) {
  uint32_t bits;
  memcpy (&bits, &f, sizeof bits);
  return bits;
}
@@ -106,115 +104,114 @@ float ucastf(uint32_t t) { return *(float*)&t; }
106
104
// Reinterprets a 64-bit unsigned integer bit pattern as a double.
// The bytes are copied with memcpy instead of dereferencing a casted pointer:
// reading a uint64_t object through a double lvalue breaks the strict-aliasing
// rules and is undefined behavior, whereas memcpy is well defined.
double ucastd (uint64_t t) {
  double value;
  memcpy (&value, &t, sizeof value);
  return value;
}
107
105
108
106
#define LOAD_STORE_F (msg, load_instr, load_offset, store_instr, store_ptr_type, store_offset, num_elems_stride ) \
109
- START (); \
110
- for (int i = 0 ; i < N; i += num_elems_stride) \
111
- store_instr ((store_ptr_type)dst_flt+store_offset+i, load_instr(src_flt+load_offset+i)); \
112
- END (checksum_dst(dst_flt), msg);
107
+ START (); \
108
+ for (int i = 0 ; i < N; i += num_elems_stride) \
109
+ store_instr ((store_ptr_type)dst_flt+store_offset+i, load_instr(src_flt+load_offset+i)); \
110
+ END (checksum_dst(dst_flt), msg);
113
111
114
112
#define LOAD_STORE_D (msg, load_instr, load_offset, store_instr, store_ptr_type, store_offset, num_elems_stride ) \
115
- START (); \
116
- for (int i = 0 ; i < N; i += num_elems_stride) \
117
- store_instr ((store_ptr_type)dst_dbl+store_offset+i, load_instr(src_dbl+load_offset+i)); \
118
- END (checksum_dst(dst_dbl), msg);
113
+ START (); \
114
+ for (int i = 0 ; i < N; i += num_elems_stride) \
115
+ store_instr ((store_ptr_type)dst_dbl+store_offset+i, load_instr(src_dbl+load_offset+i)); \
116
+ END (checksum_dst(dst_dbl), msg);
119
117
120
118
#define LOAD_STORE_I (msg, load_instr, load_offset, store_instr, store_offset, num_elems_stride ) \
121
- START (); \
122
- for (int i = 0 ; i < N; i += num_elems_stride) \
123
- store_instr ((__m128i*)(dst_int+store_offset+i), load_instr((__m128i*)(src_int+load_offset+i))); \
124
- END (checksum_dst(dst_int), msg);
119
+ START (); \
120
+ for (int i = 0 ; i < N; i += num_elems_stride) \
121
+ store_instr ((__m128i*)(dst_int+store_offset+i), load_instr((__m128i*)(src_int+load_offset+i))); \
122
+ END (checksum_dst(dst_int), msg);
125
123
126
124
// load M64*, store M128
127
125
#define LOAD_STORE_M64 (msg, reg, load_instr, load_ptr_type, load_offset, store_instr, store_ptr_type, store_offset, num_elems_stride ) \
128
- START (); \
129
- for (int i = 0 ; i < N; i += num_elems_stride) \
130
- store_instr ((store_ptr_type)dst_flt+store_offset+i, load_instr(reg, (load_ptr_type)(src_flt+load_offset+i))); \
131
- END (checksum_dst(dst_flt), msg);
126
+ START (); \
127
+ for (int i = 0 ; i < N; i += num_elems_stride) \
128
+ store_instr ((store_ptr_type)dst_flt+store_offset+i, load_instr(reg, (load_ptr_type)(src_flt+load_offset+i))); \
129
+ END (checksum_dst(dst_flt), msg);
132
130
133
131
#define LOAD_STORE_64_F (msg, load_instr, load_offset, store_instr, store_offset, num_elems_stride ) \
134
- START (); \
135
- for (int i = 0 ; i < N; i += num_elems_stride) \
136
- store_instr ((__m64*)(dst_flt+store_offset+i), load_instr(src_flt+load_offset+i)); \
137
- END (checksum_dst(dst_flt), msg);
132
+ START (); \
133
+ for (int i = 0 ; i < N; i += num_elems_stride) \
134
+ store_instr ((__m64*)(dst_flt+store_offset+i), load_instr(src_flt+load_offset+i)); \
135
+ END (checksum_dst(dst_flt), msg);
138
136
139
137
#define LOAD_STORE_64_D (msg, load_instr, load_offset, store_instr, store_offset, num_elems_stride ) \
140
- START (); \
141
- for (int i = 0 ; i < N; i += num_elems_stride) \
142
- store_instr ((__m64*)(dst_dbl+store_offset+i), load_instr(src_dbl+load_offset+i)); \
143
- END (checksum_dst(dst_dbl), msg);
138
+ START (); \
139
+ for (int i = 0 ; i < N; i += num_elems_stride) \
140
+ store_instr ((__m64*)(dst_dbl+store_offset+i), load_instr(src_dbl+load_offset+i)); \
141
+ END (checksum_dst(dst_dbl), msg);
144
142
145
143
#define SET_STORE_F (msg, set_instr ) \
146
- START (); \
147
- for (int i = 0 ; i < N; i += 4 ) \
148
- _mm_store_ps (dst_flt+i, set_instr); \
149
- END (checksum_dst(dst_flt), msg);
144
+ START (); \
145
+ for (int i = 0 ; i < N; i += 4 ) \
146
+ _mm_store_ps (dst_flt+i, set_instr); \
147
+ END (checksum_dst(dst_flt), msg);
150
148
151
149
#define SET_STORE_D (msg, set_instr ) \
152
- START (); \
153
- for (int i = 0 ; i < N; i += 4 ) \
154
- _mm_store_pd (dst_dbl+i, set_instr); \
155
- END (checksum_dst(dst_dbl), msg);
150
+ START (); \
151
+ for (int i = 0 ; i < N; i += 4 ) \
152
+ _mm_store_pd (dst_dbl+i, set_instr); \
153
+ END (checksum_dst(dst_dbl), msg);
156
154
157
155
#define UNARYOP_F_F (msg, instr, op0 ) \
158
- START (); \
159
- __m128 o = op0; \
160
- for (int i = 0 ; i < N; i += 4 ) \
161
- o = instr(o); \
162
- _mm_store_ps (dst_flt, o); \
163
- END (checksum_dst(dst_flt), msg);
156
+ START (); \
157
+ __m128 o = op0; \
158
+ for (int i = 0 ; i < N; i += 4 ) \
159
+ o = instr(o); \
160
+ _mm_store_ps (dst_flt, o); \
161
+ END (checksum_dst(dst_flt), msg);
164
162
165
163
#define UNARYOP_I_I (msg, instr, op0 ) \
166
- START (); \
167
- __m128 o = op0; \
168
- for (int i = 0 ; i < N; i += 4 ) \
169
- o = instr(o); \
170
- _mm_store_si128 ((__m128i*)dst_int, o); \
171
- END (checksum_dst(dst_int), msg);
164
+ START (); \
165
+ __m128 o = op0; \
166
+ for (int i = 0 ; i < N; i += 4 ) \
167
+ o = instr(o); \
168
+ _mm_store_si128 ((__m128i*)dst_int, o); \
169
+ END (checksum_dst(dst_int), msg);
172
170
173
171
#define UNARYOP_i_F (msg, instr ) \
174
- START (); \
175
- for (int i = 0 ; i < N; i += 4 ) \
176
- dst_int_scalar += instr; \
177
- END (dst_int_scalar, msg);
172
+ START (); \
173
+ for (int i = 0 ; i < N; i += 4 ) \
174
+ dst_int_scalar += instr; \
175
+ END (dst_int_scalar, msg);
178
176
179
177
#define UNARYOP_D_D (msg, instr, op0 ) \
180
- START (); \
181
- __m128d o = op0; \
182
- for (int i = 0 ; i < N; i += 2 ) \
183
- o = instr(o); \
184
- _mm_store_pd (dst_dbl, o); \
185
- END (checksum_dst(dst_dbl), msg);
178
+ START (); \
179
+ __m128d o = op0; \
180
+ for (int i = 0 ; i < N; i += 2 ) \
181
+ o = instr(o); \
182
+ _mm_store_pd (dst_dbl, o); \
183
+ END (checksum_dst(dst_dbl), msg);
186
184
187
185
#define BINARYOP_F_FF (msg, instr, op0, op1 ) \
188
- START (); \
189
- __m128 o0 = op0; \
190
- __m128 o1 = op1; \
191
- for (int i = 0 ; i < N; i += 4 ) \
192
- o0 = instr(o0, o1); \
193
- _mm_store_ps (dst_flt, o0); \
194
- END (checksum_dst(dst_flt), msg);
186
+ START (); \
187
+ __m128 o0 = op0; \
188
+ __m128 o1 = op1; \
189
+ for (int i = 0 ; i < N; i += 4 ) \
190
+ o0 = instr(o0, o1); \
191
+ _mm_store_ps (dst_flt, o0); \
192
+ END (checksum_dst(dst_flt), msg);
195
193
196
194
#define BINARYOP_I_II (msg, instr, op0, op1 ) \
197
- START (); \
198
- __m128 o0 = op0; \
199
- __m128 o1 = op1; \
200
- for (int i = 0 ; i < N; i += 4 ) \
201
- o0 = instr(o0, o1); \
202
- _mm_store_si128 ((__m128i*)dst_int, o0); \
203
- END (checksum_dst(dst_int), msg);
195
+ START (); \
196
+ __m128 o0 = op0; \
197
+ __m128 o1 = op1; \
198
+ for (int i = 0 ; i < N; i += 4 ) \
199
+ o0 = instr(o0, o1); \
200
+ _mm_store_si128 ((__m128i*)dst_int, o0); \
201
+ END (checksum_dst(dst_int), msg);
204
202
205
203
#define BINARYOP_D_DD (msg, instr, op0, op1 ) \
206
- START (); \
207
- __m128d o0 = op0; \
208
- __m128d o1 = op1; \
209
- for (int i = 0 ; i < N; i += 2 ) \
210
- o0 = instr(o0, o1); \
211
- _mm_store_pd (dst_dbl, o0); \
212
- END (checksum_dst(dst_dbl), msg);
204
+ START (); \
205
+ __m128d o0 = op0; \
206
+ __m128d o1 = op1; \
207
+ for (int i = 0 ; i < N; i += 2 ) \
208
+ o0 = instr(o0, o1); \
209
+ _mm_store_pd (dst_dbl, o0); \
210
+ END (checksum_dst(dst_dbl), msg);
213
211
214
212
// Larger of a and b; yields b whenever (a) >= (b) is false. The selected argument is evaluated twice, so avoid side effects.
#define Max (a,b ) ((a) >= (b) ? (a) : (b))
215
213
// Smaller of a and b; yields b whenever (a) <= (b) is false. The selected argument is evaluated twice, so avoid side effects.
#define Min (a,b ) ((a) <= (b) ? (a) : (b))
216
214
217
- static INLINE int Isnan (float __f)
218
- {
215
+ static INLINE int Isnan (float __f) {
219
216
return (*(unsigned int *)&__f << 1 ) > 0xFF000000u ;
220
217
}
0 commit comments