@@ -395,7 +395,7 @@ struct KernelLauncher {
395
395
} // namespace fbgemm_gpu::utils
396
396
397
397
// //////////////////////////////////////////////////////////////////////////////
398
- // Macro create a compile-time concatenation of __TEMPLATE_SOURCE_FILE__ and
398
+ // Macro to create a compile-time concatenation of __TEMPLATE_SOURCE_FILE__ and
399
399
// __FILE__
400
400
//
401
401
// This is used for reporting the template filename to Torch DSA. Runtime
@@ -412,25 +412,8 @@ struct KernelLauncher {
412
412
#endif
413
413
414
414
// //////////////////////////////////////////////////////////////////////////////
415
- // General Kernel Launch Macros for FBGEMM GPU Kernels
416
- //
417
- // This macro is used to launch GPU kernels in FBGEMM GPU codebase. It runs a
418
- // set of constraint checks on kernel parameters and and tensor arguments, and
419
- // throws descriptive errors on constraint failures.
420
- //
421
- // NOTES:
422
- //
423
- // - Since the code is wrapped inside an immediately-invoked lambda,
424
- // source_location::current() will point to the function where the macro is
425
- // called.
426
- //
427
- // - The constexpr decltype(KERNEL) declaration is added to enable for better
428
- // compilation error messages upon template argument and function overload
429
- // mismatches.
430
- //
431
- // - The macro expression is wrapped inside a parenthesis to avoid commas from
432
- // interfering with preoprocessing when this macro is invoked inside another
433
- // macro.
415
+ // Macro to define _FKL_TFILE_ to be __TEMPLATE_SOURCE_FILE__ if it is defined,
416
+ // else empty string
434
417
// //////////////////////////////////////////////////////////////////////////////
435
418
436
419
#ifdef __TEMPLATE_SOURCE_FILE__
@@ -439,12 +422,31 @@ struct KernelLauncher {
439
422
#define _FKL_TFILE_ " "
440
423
#endif
441
424
425
+ // //////////////////////////////////////////////////////////////////////////////
426
+ // Enable Kernel Barrier Isolation
427
+ //
428
+ // When this flag is defined, the kernel's execution is isolated from other GPU
429
+ // processes that might otherwise have been running concurrently. This acts as
430
+ // a performance profiling tool used in conjunction with trace inspection to
431
+ // determine whether a kernel's regression might be due to other GPU processes
432
+ // competing for memory bandwidth that is causing the kernel slowdown, which can
433
+ // be especially relevant when data accessed by the kernel is in UVM.
434
+ // //////////////////////////////////////////////////////////////////////////////
435
+
442
436
#ifdef FBGEMM_GPU_ISOLATE_KERNEL_LAUNCH
443
437
#define _FKL_BLOCKING_ true
444
438
#else
445
439
#define _FKL_BLOCKING_ false
446
440
#endif
447
441
442
+ // //////////////////////////////////////////////////////////////////////////////
443
+ // Enable Tensor Value Checks
444
+ //
445
+ // When defined, tensors that are passed into the kernel launcher via TA_B() or
446
+ // PTA_B() will be checked for NaN and Inf values. This is an expensive check
447
+ // and is meant to be used for debugging.
448
+ // //////////////////////////////////////////////////////////////////////////////
449
+
448
450
#ifdef FBGEMM_GPU_TENSORCHECK
449
451
#define _FKL_TENSORCHECK_ true
450
452
#else
@@ -473,8 +475,22 @@ struct KernelLauncher {
473
475
// //////////////////////////////////////////////////////////////////////////////
474
476
// Kernel Launcher Macros for FBGEMM GPU Kernels
475
477
//
476
- // This macro simplifies the kernel launch process by wrapping the kernel
477
- // launches into simple-to-use macros.
478
+ // This macro simplifies the construction and execution of KernelLauncher
479
+ // instances by wrapping the kernel launches into simple-to-use macros.
480
+ //
481
+ // NOTES:
482
+ //
483
+ // - Since the code is wrapped inside an immediately-invoked lambda,
484
+ // source_location::current() will point to the function where the macro is
485
+ // called.
486
+ //
487
+ // - The constexpr decltype(KERNEL) declaration is added to enable better
488
+ // compilation error messages upon template argument and function overload
489
+ // mismatches.
490
+ //
491
+ // - The macro expression is wrapped inside parentheses to avoid commas from
492
+ // interfering with preprocessing when this macro is invoked inside another
493
+ // macro.
478
494
// //////////////////////////////////////////////////////////////////////////////
479
495
480
496
#define FBGEMM_LAUNCH_KERNEL (KERNEL, GRID, BLOCK, SMEM, STREAM, ...) \
0 commit comments