@@ -208,6 +208,9 @@ std::vector<size_t> parseGpus() {
208
208
LOG (GSTREAM) << line; \
209
209
}
210
210
211
+ constexpr size_t GeneticTunerHarness::kEarlyPruneFactor ;
212
+ constexpr size_t GeneticTunerHarness::kCatastrophicPerfFactor ;
213
+
211
214
// This function is run on a single pre-determined GPU, in a single thread
212
215
// It takes the input/output DLTensor objects that reside on that GPU
213
216
//
@@ -222,7 +225,7 @@ bool GeneticTunerHarness::warmupOrPrune(
222
225
const std::vector<DLTensor*>& outputs,
223
226
const std::vector<const DLTensor*>& inputs,
224
227
size_t handle,
225
- size_t bestTimeSoFar) {
228
+ Duration bestTimeSoFar) {
226
229
// Pruning based on number of threads: if you don't hit at least k warps
227
230
// (default k = 8; 256 total threads, controlled by
228
231
// FLAGS_tuner_min_launch_total_threads) then it's likely the kernel is not
@@ -276,10 +279,8 @@ bool GeneticTunerHarness::warmupOrPrune(
276
279
}
277
280
278
281
// 1.b.
279
- constexpr size_t kCatastrophicPerfFactor = 100 ;
280
- if (bestTimeSoFar < std::numeric_limits<size_t >::max () and
281
- prof >= std::chrono::microseconds (
282
- (kCatastrophicPerfFactor * bestTimeSoFar))) {
282
+ if (bestTimeSoFar < Duration::max () and
283
+ prof >= kCatastrophicPerfFactor * bestTimeSoFar) {
283
284
return true ;
284
285
}
285
286
@@ -291,8 +292,8 @@ bool GeneticTunerHarness::warmupOrPrune(
291
292
// 2. After reasonable warmup, look at the performance and prune with
292
293
// kEarlyPruneFactor
293
294
prof = engine.run (handle, inputs, outputs, true );
294
- if (bestTimeSoFar < std::numeric_limits< size_t > ::max () and
295
- prof >= std::chrono::microseconds (( kEarlyPruneFactor * bestTimeSoFar)) ) {
295
+ if (bestTimeSoFar < Duration ::max () and
296
+ prof >= kEarlyPruneFactor * bestTimeSoFar) {
296
297
return true ;
297
298
}
298
299
@@ -394,7 +395,7 @@ void GeneticTunerHarness::doGpuWork(
394
395
395
396
std::vector<Duration> runtimes;
396
397
try {
397
- size_t bestTimeSoFar;
398
+ Duration bestTimeSoFar;
398
399
{
399
400
std::lock_guard<std::mutex> lock (bestTimeMtx_);
400
401
bestTimeSoFar = bestTime_;
@@ -451,8 +452,8 @@ void GeneticTunerHarness::doGpuWork(
451
452
// Save best time under lock
452
453
{
453
454
std::lock_guard<std::mutex> lock (bestTimeMtx_);
454
- if (prof_us < bestTime_) {
455
- bestTime_ = prof_us ;
455
+ if (prof < bestTime_) {
456
+ bestTime_ = prof ;
456
457
bestCudaMappingOptions_ = options;
457
458
}
458
459
}
0 commit comments