Skip to content

Commit 31ff24a

Browse files
committed
cleaning up accumulate example
1 parent 9011d00 commit 31ff24a

File tree

3 files changed

+86
-268
lines changed

3 files changed

+86
-268
lines changed

Vectorisation/VecX/accumulate_transform.h

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -562,15 +562,13 @@ typename InstructionTraits<INS_VEC>::FloatType ApplyTransformAccumulate2UR_X(con
562562
for (; i <= (rhsSZ - step); i += step)
563563
{
564564
RHS1.load_a(pRhs1 + i);
565-
RES = operAcc(RES, operTransform(RHS1) );
566-
567565
RHS2.load_a(pRhs1 + i + width);
568-
RES1 = operAcc(RES1, operTransform(RHS2) );
569-
570566
RHS3.load_a(pRhs1 + i + width * 2);
571-
RES2 = operAcc(RES2, operTransform(RHS3) );
572-
573567
RHS4.load_a(pRhs1 + i + width * 3);
568+
569+
RES = operAcc(RES, operTransform(RHS1) );
570+
RES1 = operAcc(RES1, operTransform(RHS2) );
571+
RES2 = operAcc(RES2, operTransform(RHS3) );
574572
RES3 = operAcc(RES3, operTransform(RHS4));
575573

576574
}

accumulateExample/AVX512Dance.cpp

Lines changed: 44 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,7 @@
1515

1616
#include "../Vectorisation/VecX/dr3.h"
1717

18-
//pick an instruction set for intrinsics by selecting a name space
19-
20-
//using namespace DRC::VecDb;
21-
//using namespace DRC::VecD2D; //sse2 double
22-
//using namespace DRC::VecD4D; //avx2 double
23-
//using namespace DRC::VecF8F; // avx2 float
24-
//using namespace DRC::VecD8D; //avx512 double
25-
//using namespace DRC::VecF16F; //avx512 float
18+
2619
const double billion = 1000000000.0;
2720

2821

@@ -34,6 +27,7 @@ struct RunResults
3427
{
3528
Mapped_Performance_Results m_raw_results;
3629
Calc_Values m_calc_results;
30+
double time;
3731
};
3832

3933
class TimerGuard
@@ -55,12 +49,7 @@ class TimerGuard
5549

5650
auto getRandomShuffledVectorxxx(int SZ, int instance_number = 0)
5751
{
58-
//using FloatType = double; // typename InstructionTraits<VecXX::INS>::FloatType;
59-
60-
6152
static std::map<int, std::vector<double> > vectors;
62-
63-
6453
int key = 10 * SZ + instance_number;
6554
//store vectors with key 10 times size and add on 0-9 integer for instance of different random vector
6655

@@ -109,157 +98,96 @@ auto runFunctionOverDifferentSize = [](int testRepeats, int vec_start_size, int
10998

11099
void doAVXMax512Dance()
111100
{
112-
/*
113-
const long TEST_LOOP_SZ = 1000;
114-
const int repeatRuns = 20;
115-
const int vectorStepSize = 200;
116-
const int maxVectorSize = 20000;
117-
const int minVectorSize = 400;
118-
*/
119-
120-
//const double sleepTime = 10000.;/// 10 seconds
121-
122101

123102
const int maxVectorSize = 4400;
124103
const int minVectorSize = 3800;
125104
const long TEST_LOOP_SZ = 100000;
126105
const int vectorStepSize = 8;
127106
const int repeatRuns = 10;
128107

129-
//auto zero = 0.0;// InstructionTraits<VecXX::INS>::nullValue;
130-
131108
getRandomShuffledVectorxxx(-1); // reset random input vectors
132109

133-
/*
134-
auto accumulate_run = [&](int VEC_SZ, long TEST_LOOP_SZ)
135-
{
136-
double time = 0.;
137-
volatile double res = 0.;
138-
//auto v = getRandomShuffledVector(SZ); // std stl vector double or float
139-
auto v1 = getRandomShuffledVector(VEC_SZ, 0);
140110

141-
{
142-
//warm up
143-
for (long l = 0; l < 100; l++)
144-
{
145-
res = *std::max_element(v1.begin(), v1.end());
146-
}
147-
148-
TimerGuard timer(time);
149-
{
150-
for (long l = 0; l < TEST_LOOP_SZ; l++)
151-
{
152-
res = *std::max_element(v1.begin(), v1.end());
153-
}
154-
}
155-
}
156-
return std::make_pair(res, numOps(TEST_LOOP_SZ, VEC_SZ) / time);
157-
};
158-
159-
*/
160-
161-
auto DR3_accumulate = [&](int SZ, long TEST_LOOP_SZ)
111+
//avx512 lambda
112+
auto DR3_avx512 = [&](int SZ, long TEST_LOOP_SZ)
162113
{
163114
using namespace DRC::VecD8D;
164115

165116
double time = 0.;
166117
volatile double res = 0.;
167118

168-
// generic lambda for max either calling a max instruction or doing a selection with iff
169-
// auto mxDbl = [](auto lhs, auto rhs) { return max(lhs, rhs); };
170-
auto mxDbl = [](auto lhs, auto rhs) { return iff(lhs > rhs, lhs, rhs); }; //using iff fastest
119+
auto mxDbl = [](auto lhs, auto rhs) { return iff(lhs > rhs, lhs, rhs); };
171120

172-
auto v1 = getRandomShuffledVectorxxx(SZ, 0); // std stl vector double or float
121+
auto v1 = getRandomShuffledVectorxxx(SZ, 0);
173122
VecXX vec(v1);
174-
{
123+
175124

176-
//warm up
177-
for (long l = 0; l < 100; l++)
178-
{
179-
res = reduce(vec, mxDbl);
180-
}
181-
182-
TimerGuard timer(time);
183-
{
184-
for (long l = 0; l < TEST_LOOP_SZ; l++)
185-
{
186-
res = reduce(vec, mxDbl);
187-
}
188-
}
125+
for (long l = 0; l < TEST_LOOP_SZ; l++)
126+
{
127+
res = reduce(vec, mxDbl);
189128
}
190-
//return std::make_pair(res, numOps(TEST_LOOP_SZ, SZ) / time);
191-
129+
130+
return std::make_pair(res, time);
192131
};
193132

194133

195-
auto DR3_accumulate2 = [&](int SZ, long TEST_LOOP_SZ)
134+
auto DR3_avx2 = [&](int SZ, long TEST_LOOP_SZ)
196135
{
197136
using namespace DRC::VecD4D;
198137

199138
double time = 0.;
200139
volatile double res = 0.;
201140

202-
// generic lambda for max either calling a max instruction or doing a selection with iff
203-
// auto mxDbl = [](auto lhs, auto rhs) { return max(lhs, rhs); };
204-
auto mxDbl = [](auto lhs, auto rhs) { return iff(lhs > rhs, lhs, rhs); }; //using iff fastest
141+
auto mxDbl = [](auto lhs, auto rhs) { return iff(lhs > rhs, lhs, rhs); };
205142

206-
auto v1 = getRandomShuffledVectorxxx(SZ, 0); // std stl vector double or float
143+
auto v1 = getRandomShuffledVectorxxx(SZ, 0);
207144
VecXX vec(v1);
145+
146+
147+
for (long l = 0; l < TEST_LOOP_SZ; l++)
208148
{
209-
210-
//warm up
211-
for (long l = 0; l < 100; l++)
212-
{
213-
res = reduce(vec, mxDbl);
214-
}
215-
216-
TimerGuard timer(time);
217-
{
218-
for (long l = 0; l < TEST_LOOP_SZ; l++)
219-
{
220-
res = reduce(vec, mxDbl);
221-
}
222-
}
149+
res = reduce(vec, mxDbl);
223150
}
224-
//return std::make_pair(res, numOps(TEST_LOOP_SZ, SZ) / time);
151+
152+
153+
return std::make_pair(res, time);
225154

226155
};
227156

228157

158+
using namespace std::chrono_literals;
229159

230160
for (;;)
231161
{
232162

233163
double time = 0.0;
164+
165+
//AVX512
166+
for (int K = 0; K < 4; K++)
234167
{
235-
236-
for (int K = 0; K < 4; K++)
168+
time = 0.;
169+
std::cout << "AVX 512 " << K + 1 << "of 4 " << std::endl;
237170
{
238-
time = 0.;
239-
std::cout << "AVX 512 " << K + 1 << "of 4" << std::endl;
240-
auto dr3_raw_results = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, DR3_accumulate, TEST_LOOP_SZ);
241-
std::cout << "AVX 512 " << K + 1 << "of 4" << time << "seconds now sleep" << std::endl;
242-
243-
using namespace std::chrono_literals;
244-
std::this_thread::sleep_for(15000ms);
245-
171+
TimerGuard timer(time);
172+
auto dr3_raw_results = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, DR3_avx512, TEST_LOOP_SZ);
246173
}
174+
std::cout << "AVX 512 " << K + 1 << "of 4 " << time << " seconds now sleep" << std::endl;
175+
std::this_thread::sleep_for(15000ms);
247176
}
248-
using namespace std::chrono_literals;
177+
249178
std::this_thread::sleep_for(15000ms);
250-
251-
{
252-
253-
for (int K = 0; K < 4; K++)
254-
{
255-
time = 0.;
256-
std::cout << "AVX 2 " << K + 1 << "of 4" << std::endl;
257-
auto dr3_raw_results = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, DR3_accumulate2, TEST_LOOP_SZ);
258-
std::cout << "AVX 2 " << K + 1 << "of 4" << time << "seconds now sleep" << std::endl;
259-
using namespace std::chrono_literals;
260-
std::this_thread::sleep_for(15000ms);
261-
179+
180+
//AVX2
181+
for (int K = 0; K < 4; K++)
182+
{
183+
time = 0.;
184+
std::cout << "AVX 2 " << K + 1 << "of 4 " << std::endl;
185+
{ TimerGuard timer(time);
186+
auto dr3_raw_results = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, DR3_avx2, TEST_LOOP_SZ);
262187
}
188+
std::cout << "AVX 2 " << K + 1 << "of 4 " << time << " seconds now sleep" << std::endl;
189+
std::this_thread::sleep_for(15000ms);
190+
263191
}
264192

265193
}

0 commit comments

Comments
 (0)