From e3533ffea3e24ab0c253753d9ecfd7a2e6461e7c Mon Sep 17 00:00:00 2001 From: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> Date: Fri, 13 Jun 2025 09:44:04 +0800 Subject: [PATCH] update moe fp4 cubins and runner update test_moe fp4 cases Signed-off-by: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> --- .../batchedGemm/KernelRunner.cpp | 144 +- .../batchedGemm/KernelRunner.h | 9 +- .../trtllmGen_bmm_export/BatchedGemmEnums.h | 67 + .../BatchedGemmInterface.h | 49 +- .../trtllmGen_bmm_export/BatchedGemmOptions.h | 144 +- .../batchedGemm/trtllmGen_bmm_export/Enums.h | 56 + .../GemmGatedActOptions.h | 30 +- .../trtllmGen_bmm_export/GemmOptions.h | 287 +- .../trtllmGen_bmm_export/KernelMetaInfo.h | 3266 +++++++++++------ .../trtllmGen_bmm_export/KernelParams.h | 422 ++- .../trtllmGen_bmm_export/KernelTraits.h | 60 +- .../trtllmGen_bmm_export/TmaDescriptor.h | 128 +- .../trtllmGen_bmm_export/config.json | 215 +- ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 + ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 + ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + 
...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 + ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 + ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 + ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 + ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 + ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 + ..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 + ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 + ..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 + ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 + ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 + ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 + ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 + ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 + ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 + ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 + ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 + ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 + 
...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 + ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 + ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 + ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 + ..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 + ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 + ..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 + ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 + ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 + ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 + ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 + ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 + ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 + ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 + ..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 + ..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 + tests/unittest/_torch/thop/test_moe.py | 10 +- 88 files changed, 3650 insertions(+), 1459 deletions(-) create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmEnums.h create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp index 6f6e186d71..de5f1f650d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp @@ -27,6 +27,138 @@ namespace kernels { using namespace batchedGemm::batchedGemm; +using namespace batchedGemm::gemm; +using namespace batchedGemm::trtllm::gen; + +std::vector prioritizePredefinedConfigs(int m, int n, int k, std::vector const& sortedIndices, + batchedGemm::batchedGemm::BatchedGemmConfig const* configs) +{ + + // Function to bubble up the pre-determined config. + auto bubbleUpConfig = [&configs](std::vector const& sortedIndices, auto&& pred) -> std::vector + { + std::vector prioritizedIndices_; + // Copy matching configs to new vector + std::copy_if(sortedIndices.begin(), sortedIndices.end(), std::back_inserter(prioritizedIndices_), + [&configs, &pred](int idx) + { + BatchedGemmConfig const& config = configs[idx]; + return (pred(config)); + }); + // Copy the rest of the configs to new vector, if not already copied + std::copy_if(sortedIndices.begin(), sortedIndices.end(), std::back_inserter(prioritizedIndices_), + [&prioritizedIndices_](int idx) { + return std::find(prioritizedIndices_.begin(), prioritizedIndices_.end(), idx) + == prioritizedIndices_.end(); + }); + return prioritizedIndices_; + }; + + // Init empty vector + std::vector prioritizedIndices; + + // + // Qwen3 + // + + // Qwen3_235B_TP1_EP8_MoE_FC1 m=3072 k=4096 + if (n /* out_dim */ == 3072 && k /* in_dim */ == 4096) + { + auto pred = [](BatchedGemmConfig const& config) + { + 
BatchedGemmOptions const& options = config.mOptions; + return options.mNumStages == 4 && options.mNumStagesMma == 1 && options.mTileK == 512 + && options.mTileScheduler == TileScheduler::Static; + }; + prioritizedIndices = bubbleUpConfig(sortedIndices, pred); + } + // Qwen3_235B_TP1_EP8_MoE_FC2 m=4096 k=1536 + else if (n /* out_dim */ == 4096 && k /* in_dim */ == 1536) + { + auto pred = [](BatchedGemmConfig const& config) + { + BatchedGemmOptions const& options = config.mOptions; + return options.mNumStages == 4 && options.mNumStagesMma == 1 && options.mTileK == 512 + && options.mTileScheduler == TileScheduler::Static; + }; + prioritizedIndices = bubbleUpConfig(sortedIndices, pred); + } + // Qwen3_235B_TP2_EP4_MoE_FC1 m=1536 k=4096 + else if (n /* out_dim */ == 1536 && k /* in_dim */ == 4096) + { + auto pred = [](BatchedGemmConfig const& config) + { + BatchedGemmOptions const& options = config.mOptions; + return options.mNumStages == 4 && options.mNumStagesMma == 1 && options.mTileK == 512 + && options.mTileScheduler == TileScheduler::Static; + }; + prioritizedIndices = bubbleUpConfig(sortedIndices, pred); + } + // Qwen3_235B_TP2_EP4_MoE_FC2 m=4096 k=768 + else if (n /* out_dim */ == 4096 && k /* in_dim */ == 768) + { + auto pred = [](BatchedGemmConfig const& config) + { + BatchedGemmOptions const& options = config.mOptions; + return options.mNumStages == 4 && options.mNumStagesMma == 2 && options.mTileK == 512 + && options.mTileScheduler == TileScheduler::Persistent; + }; + prioritizedIndices = bubbleUpConfig(sortedIndices, pred); + } + // Qwen3_235B_TP4_EP2_MoE_FC1 m=768 k=4096 + else if (n /* out_dim */ == 768 && k /* in_dim */ == 4096) + { + auto pred = [](BatchedGemmConfig const& config) + { + BatchedGemmOptions const& options = config.mOptions; + return options.mNumStages == 4 && options.mNumStagesMma == 1 && options.mTileK == 512 + && options.mTileScheduler == TileScheduler::Static; + }; + prioritizedIndices = bubbleUpConfig(sortedIndices, pred); + } + // 
Qwen3_235B_TP4_EP2_MoE_FC2 m=4096 k=384 + else if (n /* out_dim */ == 4096 && k /* in_dim */ == 384) + { + auto pred = [](BatchedGemmConfig const& config) + { + BatchedGemmOptions const& options = config.mOptions; + return options.mNumStages == 4 && options.mNumStagesMma == 2 && options.mTileK == 512 + && options.mTileScheduler == TileScheduler::Persistent; + }; + prioritizedIndices = bubbleUpConfig(sortedIndices, pred); + } + // Qwen3_235B_TP8_EP1_MoE_FC1 m=384 k=4096 + else if (n /* out_dim */ == 384 && k /* in_dim */ == 4096) + { + auto pred = [](BatchedGemmConfig const& config) + { + BatchedGemmOptions const& options = config.mOptions; + return options.mNumStages == 4 && options.mNumStagesMma == 1 && options.mTileK == 512 + && options.mTileScheduler == TileScheduler::Static; + }; + prioritizedIndices = bubbleUpConfig(sortedIndices, pred); + } + // Qwen3_235B_TP8_EP1_MoE_FC2 m=4096 k=192 + else if (n /* out_dim */ == 4096 && k /* in_dim */ == 192) + { + auto pred = [](BatchedGemmConfig const& config) + { + BatchedGemmOptions const& options = config.mOptions; + return options.mNumStages == 4 && options.mNumStagesMma == 2 && options.mTileK == 256 + && options.mTileScheduler == TileScheduler::Persistent; + }; + prioritizedIndices = bubbleUpConfig(sortedIndices, pred); + } + // + // Fall back + // + else + { + prioritizedIndices = sortedIndices; + } + + return prioritizedIndices; +} TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(TrtllmGenBatchedGemmRunnerOptions const& options_) : mOptions(options_) @@ -44,7 +176,8 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(TrtllmGenBatchedGemmRunne // When we include low-latency kernels we can set transposeMmaOutput via constructor if (options.mDtypeA == mOptions.eltType && options.mDtypeC == mOptions.outputType && options.mUseDeepSeekFp8 == mOptions.deepSeekFp8 - && options.mTransposeMmaOutput == mOptions.transposeMmaOutput && options.mRouteAct == mOptions.routeAct + && options.mTransposeMmaOutput == 
mOptions.transposeMmaOutput + && (!doesRouteImplUseNoRoute(options.mRouteImpl)) == mOptions.routeAct && options.mFusedAct == mOptions.fusedAct && options.mIsStaticBatch == mOptions.staticBatch && tileSize == mOptions.tileSize) { @@ -227,9 +360,9 @@ std::vector TrtllmGenBatchedGemmRunner::getValidConfigIndices(int32_t m gemmData.mProblemDimensions.mWorldSize = 1; gemmData.mProblemDimensions.mMaxNumCtasInTokenDim = maxNumCtasInBatchDim; // Sort configs by options - std::vector sortedIndices = mPassingConfigIndices; + std::vector sortedIndices = mPassingConfigIndices; std::sort(sortedIndices.begin(), sortedIndices.end(), - [&configs](int32_t idx0, int32_t idx1) + [&configs](int64_t idx0, int64_t idx1) { auto const& optionsA = configs[idx0].mOptions; auto const& optionsB = configs[idx1].mOptions; @@ -247,7 +380,7 @@ std::vector TrtllmGenBatchedGemmRunner::getValidConfigIndices(int32_t m } // Then by tile scheduler (persistent scheduler is better for FC2 in MoE) - if (!optionsA.mRouteAct) + if (doesRouteImplUseNoRoute(optionsA.mRouteImpl)) { return optionsA.mTileScheduler == batchedGemm::gemm::TileScheduler::Persistent; } @@ -255,8 +388,9 @@ std::vector TrtllmGenBatchedGemmRunner::getValidConfigIndices(int32_t m return optionsA.mTileM > optionsB.mTileM; }); + std::vector prioritizedIndices = prioritizePredefinedConfigs(m, n, k, sortedIndices, configs); std::vector validConfigIndices; - for (auto const& configIndex : sortedIndices) + for (auto const& configIndex : prioritizedIndices) { auto const& config = configs[configIndex]; auto isValidConfig = bmm.isValidConfig(config, gemmData); diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h index 0584bc0bb4..7fe892511b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h @@ -50,6 +50,7 @@ class TrtllmGenBatchedGemmRunner 
std::vector const& batchedTokens, int32_t numTokens, int32_t numBatches, int32_t maxNumCtasInBatchDim, std::optional configIndex = std::nullopt); + // Generic GEMM interface void run(int32_t m, int32_t n, int32_t k, std::vector const& batchedTokens, int32_t numTokens, int32_t numBatches, int32_t maxNumCtasInBatchDim, void const* a, void const* sfA, void const* b, void const* sfB, void const* perTokensSfA, void const* perTokensSfB, float const* scaleC, @@ -57,16 +58,18 @@ class TrtllmGenBatchedGemmRunner int32_t const* ctaIdxXyToBatchIdx, int32_t const* ctaIdxXyToMnLimit, int32_t const* numNonExitingCtas, void* workspace, CUstream stream, int device, std::optional configIndex = std::nullopt); + // NVFP4 per-block scaling GEMM void run(int32_t m, int32_t n, int32_t k, std::vector const& batchedTokens, void const* a, void const* sfA, void const* b, void const* sfB, void* c, void* outSfC, void* workspace, CUstream stream, int device, std::optional configIndex = std::nullopt); + // FP8 per-tensor scaling GEMM void run(int32_t m, int32_t n, int32_t k, std::vector const& batchedTokens, void const* a, void const* b, float const* scaleC, float const* scaleGateC, void* c, void* workspace, CUstream stream, int device, std::optional configIndex = std::nullopt); // Get the list of configs that passed the validation based on the constructor options - [[nodiscard]] std::vector getPassingConfigIndices() const + [[nodiscard]] std::vector getPassingConfigIndices() const { return mPassingConfigIndices; } @@ -88,8 +91,8 @@ class TrtllmGenBatchedGemmRunner private: TrtllmGenBatchedGemmRunnerOptions mOptions; - std::vector mPassingConfigIndices; - std::optional mSelectedConfigIndex; + std::vector mPassingConfigIndices; + std::optional mSelectedConfigIndex; }; } // namespace kernels } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmEnums.h 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmEnums.h new file mode 100644 index 0000000000..ffea777396 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmEnums.h @@ -0,0 +1,67 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & + * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace batchedGemm +{ + +namespace batchedGemm +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +enum class RouteImpl +{ + // No Routing + NoRoute = 0, + // Use LDGSTS to do the routing + Ldgsts = 1, + // Use UTMALDG.GATHER4 to do the routing + Tma = 2 +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline bool doesRouteImplUseNoRoute(RouteImpl mode) +{ + return (mode == RouteImpl::NoRoute); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline bool doesRouteImplUseLdgsts(RouteImpl mode) +{ + return (mode == RouteImpl::Ldgsts); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline bool doesRouteImplUseTma(RouteImpl mode) +{ + return (mode == RouteImpl::Tma); +} + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace batchedGemm + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace batchedGemm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h index e2de17c6de..4251a333e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h @@ -96,12 +96,16 @@ struct BatchedGemmData // Logical strides are [K, 1]. // // If batchN: - // If transposeMatrixA is false + // If layoutA is MatrixLayout::MajorK // Logical shape is [B, divUpMul(M, tileM), K]. // Logical strides are [divUpMul(M, tileM) * K, K, 1]. - // If transposeMatrixA is true + // If layoutA is MatrixLayout::MajorMn // Logical shape is [B, K, divUpMul(M, tileM)]. // Logical strides are [K * divUpMul(M, tileM), divUpMul(M, tileM), 1]. + // If layoutA is MatrixLayout::BlockMajorK + // Logical shape is [B, K / blockK, divUpMul(M, tileM), blockK]. + // Logical strides are [K * divUpMul(M, tileM), divUpMul(M, tileM) * blockK, blockK, 1]. + // where blockK is 128B. void const* mPtrA{nullptr}; // The block scaling factors to dequantize A. @@ -154,12 +158,16 @@ struct BatchedGemmData // Logical strides are [K, 1]. // // If batchM: - // If transposeMatrixB is true + // If layoutB is MatrixLayout::MajorK // Logical shape is [B, divUpMul(N, tileN), K]. // Logical strides are [divUpMul(N, tileN) * K, K, 1]. - // If transposeMatrixB is false + // If layoutB is MatrixLayout::MajorMn // Logical shape is [B, K, divUpMul(N, tileN)]. // Logical strides are [K * divUpMul(N, tileN), divUpMul(N, tileN), 1]. 
+ // If layoutB is MatrixLayout::BlockMajorK + // Logical shape is [B, K / blockK, divUpMul(N, tileN), blockK]. + // Logical strides are [K * divUpMul(N, tileN), divUpMul(N, tileN) * blockK, blockK, 1]. + // where blockK is 128B. void const* mPtrB{nullptr}; // The scaling factors to dequantize B. @@ -210,6 +218,21 @@ struct BatchedGemmData // Logical shape is [sum(divUpMul(N[bi], tileN) for bi in B)] void const* mPtrPerTokenSfB{nullptr}; + // The bias applied after the GEMM and before the activation function. + // The bias is applied before applying the global scaling factor. I.e. + // C = act(A * B + bias') * scaleC + // scaleC = dequantA * dequantB * quantC + // Thus, bias' = bias / (dequantA * dequantB), where bias is the original bias. + // + // If batchM, BiasType must be N, and bias shape is [B, N]. + // The bias is broadcast along the M dimension. + // + // If batchN, BiasType must be M, and bias shape is [B, M]. + // The bias is broadcast along the N dimension. + // + // The dtype is float32. + void const* mPtrBias{nullptr}; + // The output tensor scaling factor for MxFp{4,8}, Fp8 and NvFp4 quantization. // TensorRT-LLM API requires a scaling factor on the device. // Shape is [B]. @@ -220,6 +243,12 @@ struct BatchedGemmData // Shape is [B]. float const* mPtrScaleGate{nullptr}; + // The alpha and beta for SwiGlu. + // gatedActivation <- (x0 + beta) * sigmoid(alpha * x1) + // Shape is [B] + float const* mPtrSwiGluAlpha{nullptr}; + float const* mPtrSwiGluBeta{nullptr}; + // Param is used when the kernel is configured with -routeAct true. // The inputs are not padded, but the outputs are padded to divUpMul(M[bi], tileM) for batchM or // divUpMul(N[bi], tileN) for batchN. 
@@ -609,11 +638,13 @@ int32_t BatchedGemmInterface::run(BatchedGemmConfig const& config, void* workspa batchedGemmData.mInputBuffers.mPtrB, batchedGemmData.mOutputBuffers.mPtrC, batchedGemmData.mInputBuffers.mPtrSfA, batchedGemmData.mInputBuffers.mPtrSfB, batchedGemmData.mInputBuffers.mPtrPerTokenSfA, batchedGemmData.mInputBuffers.mPtrPerTokenSfB, - batchedGemmData.mOutputBuffers.mPtrSfC, batchedGemmData.mInputBuffers.mPtrScaleC, - batchedGemmData.mInputBuffers.mPtrScaleGate, batchedGemmData.mInputBuffers.mPtrRouteMap, dPtrRowMax, - dPtrRowMaxBars, batchedGemmData.mInputBuffers.mPtrNumNonExitingCtas, - batchedGemmData.mInputBuffers.mPtrTotalNumPaddedTokens, batchedGemmData.mInputBuffers.mPtrCtaIdxXyToBatchIdx, - batchedGemmData.mInputBuffers.mPtrCtaIdxXyToMnLimit, maxNumCtasInBatchDim); + batchedGemmData.mInputBuffers.mPtrBias, batchedGemmData.mOutputBuffers.mPtrSfC, + batchedGemmData.mInputBuffers.mPtrScaleC, batchedGemmData.mInputBuffers.mPtrScaleGate, + batchedGemmData.mInputBuffers.mPtrSwiGluAlpha, batchedGemmData.mInputBuffers.mPtrSwiGluBeta, + batchedGemmData.mInputBuffers.mPtrRouteMap, dPtrRowMax, dPtrRowMaxBars, + batchedGemmData.mInputBuffers.mPtrNumNonExitingCtas, batchedGemmData.mInputBuffers.mPtrTotalNumPaddedTokens, + batchedGemmData.mInputBuffers.mPtrCtaIdxXyToBatchIdx, batchedGemmData.mInputBuffers.mPtrCtaIdxXyToMnLimit, + maxNumCtasInBatchDim); // The size of the grid. 
std::vector grid{numCtaX, numCtaY, numCtaZ}; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h index f59cdd1fff..8baf99a9f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h @@ -16,6 +16,7 @@ */ #pragma once +#include "BatchedGemmEnums.h" #include "GemmGatedActOptions.h" #include "GemmOptions.h" @@ -81,33 +82,36 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions BatchedGemmOptions() = default; // FIXME We create explicit constructor with all options to WAR stubgen issue in TRT-LLM. - BatchedGemmOptions(gemm::AllReduceAlgo allReduceAlgo, int clusterDimX, int clusterDimY, int clusterDimZ, - tg::Dtype dtypeAcc, tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, bool enablesEarlyExit, - bool enablesDelayedEarlyExit, bool enablesGlobalPtxKnobs, int epilogueLdtmDps, int epilogueLdtmBits, - int epilogueTileM, int epilogueTileN, bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, - bool gridWaitForPrimaryEarlyExit, bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, - bool hoistMmaTaskTryWaits, int k, gemm::KernelTraits kernelTraits, int m, int mmaK, tg::MmaKind mmaKind, - int mmaM, int mmaN, bool mockAllReduce, int n, int numSlicesForSplitK, int numSlicesForSliceK, int numStages, - int numStagesMma, int numStagesMmaWithinWorkTile, int numStagesMmaAcrossWorkTile, int numStagesWorkId, - bool outputDebugTensors, bool useShuffledMatrixA, bool sliceK, gemm::SplitK splitK, bool transposeMatrixA, - bool transposeMatrixB, bool transposeMmaOutput, int tileM, int tileN, int tileK, bool useUnrollLoop2xForMma, - bool useCustomMmaSchedule, bool useHoistTryWaitForCustomMmaSchedule, bool useDeepSeekFp8, bool usePerTokenSfA, 
- bool usePerTokenSfB, bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, tg::SfLayout sfLayoutA, - tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC, gemm::TileScheduler tileScheduler, - gemmGatedAct::ActType actType, std::vector batchedM, std::vector batchedN, BatchMode batchMode, - int numBatches, bool isStaticBatch, int numTokens, bool routeAct, bool gridWaitForPrimaryRouting, bool fusedAct, - int numRegsPerThreadNonEpilogueWarp, int numRegsPerThreadEpilogueWarp) + BatchedGemmOptions(gemm::AllReduceAlgo allReduceAlgo, gemm::BiasType biasType, int blockK, int clusterDimX, + int clusterDimY, int clusterDimZ, tg::Dtype dtypeAcc, tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, + tg::Dtype dtypeMmaA, tg::Dtype dtypeMmaB, bool enablesEarlyExit, bool enablesDelayedEarlyExit, + bool enablesGlobalPtxKnobs, int epilogueLdtmDps, int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, + bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, bool gridWaitForPrimaryEarlyExit, + bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, + gemm::KernelTraits kernelTraits, gemm::MatrixLayout layoutA, gemm::MatrixLayout layoutB, int m, int mmaK, + tg::MmaKind mmaKind, int mmaM, int mmaN, bool mockAllReduce, int n, int numSlicesForSplitK, + int numSlicesForSliceK, int numStages, int numStagesMma, int numStagesMmaWithinWorkTile, + int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, bool useShuffledMatrixA, + bool sliceK, gemm::SplitK splitK, bool transposeMmaOutput, int tileM, int tileN, int tileK, + bool useUnrollLoop2xForMma, bool useCustomMmaSchedule, bool useHoistTryWaitForCustomMmaSchedule, + bool useDeepSeekFp8, bool usePerTokenSfA, bool usePerTokenSfB, bool useTmaStore, bool useTwoTmaLoadWarps, + bool useTwoMmaWarps, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC, + int32_t sfReshapeFactor, gemm::TileScheduler tileScheduler, gemmGatedAct::ActType 
actType, + std::vector batchedM, std::vector batchedN, BatchMode batchMode, int numBatches, bool isStaticBatch, + int numTokens, RouteImpl routeImpl, bool gridWaitForPrimaryRouting, bool fusedAct, + int numRegsPerThreadNonEpilogueWarp, int numRegsPerThreadEpilogueWarp, int numRegsCastAWarps) : gemmGatedAct::GemmGatedActOptions( - gemm::GemmOptions(allReduceAlgo, clusterDimX, clusterDimY, clusterDimZ, dtypeAcc, dtypeA, dtypeB, dtypeC, - enablesEarlyExit, enablesDelayedEarlyExit, enablesGlobalPtxKnobs, epilogueLdtmDps, epilogueLdtmBits, - epilogueTileM, epilogueTileN, gridTriggerSecondaryA, gridTriggerSecondaryB, gridWaitForPrimaryEarlyExit, - gridWaitForPrimaryA, gridWaitForPrimaryB, hoistLoadTaskInit, hoistMmaTaskTryWaits, k, kernelTraits, m, - mmaK, mmaKind, mmaM, mmaN, mockAllReduce, n, numSlicesForSplitK, numSlicesForSliceK, numStages, - numStagesMma, numStagesMmaWithinWorkTile, numStagesMmaAcrossWorkTile, numStagesWorkId, - outputDebugTensors, useShuffledMatrixA, sliceK, splitK, transposeMatrixA, transposeMatrixB, - transposeMmaOutput, tileM, tileN, tileK, useUnrollLoop2xForMma, useCustomMmaSchedule, - useHoistTryWaitForCustomMmaSchedule, useDeepSeekFp8, usePerTokenSfA, usePerTokenSfB, useTmaStore, - useTwoTmaLoadWarps, useTwoMmaWarps, sfLayoutA, sfLayoutB, sfLayoutC, tileScheduler), + gemm::GemmOptions(allReduceAlgo, biasType, blockK, clusterDimX, clusterDimY, clusterDimZ, dtypeAcc, dtypeA, + dtypeB, dtypeC, dtypeMmaA, dtypeMmaB, enablesEarlyExit, enablesDelayedEarlyExit, enablesGlobalPtxKnobs, + epilogueLdtmDps, epilogueLdtmBits, epilogueTileM, epilogueTileN, gridTriggerSecondaryA, + gridTriggerSecondaryB, gridWaitForPrimaryEarlyExit, gridWaitForPrimaryA, gridWaitForPrimaryB, + hoistLoadTaskInit, hoistMmaTaskTryWaits, k, kernelTraits, layoutA, layoutB, m, mmaK, mmaKind, mmaM, + mmaN, mockAllReduce, n, numSlicesForSplitK, numSlicesForSliceK, numStages, numStagesMma, + numStagesMmaWithinWorkTile, numStagesMmaAcrossWorkTile, numStagesWorkId, outputDebugTensors, 
+ useShuffledMatrixA, sliceK, splitK, transposeMmaOutput, tileM, tileN, tileK, useUnrollLoop2xForMma, + useCustomMmaSchedule, useHoistTryWaitForCustomMmaSchedule, useDeepSeekFp8, usePerTokenSfA, + usePerTokenSfB, useTmaStore, useTwoTmaLoadWarps, useTwoMmaWarps, sfLayoutA, sfLayoutB, sfLayoutC, + sfReshapeFactor, tileScheduler), actType) , mBatchedM(batchedM) , mBatchedN(batchedN) @@ -115,11 +119,12 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions , mNumBatches(numBatches) , mIsStaticBatch(isStaticBatch) , mNumTokens(numTokens) - , mRouteAct(routeAct) + , mRouteImpl(routeImpl) , mGridWaitForPrimaryRouting(gridWaitForPrimaryRouting) , mFusedAct(fusedAct) , mNumRegsPerThreadNonEpilogueWarp(numRegsPerThreadNonEpilogueWarp) , mNumRegsPerThreadEpilogueWarp(numRegsPerThreadEpilogueWarp) + , mNumRegsCastAWarps(numRegsCastAWarps) { } @@ -137,7 +142,7 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions // Total number of tokens. int mNumTokens{32}; // Whether load the input tokens and do routing. - bool mRouteAct{false}; + RouteImpl mRouteImpl{RouteImpl::NoRoute}; // Whether the loads that load from ptrRouteMap, ptrTotalNumPaddedTokens, // ptrCtaIdxXyToBatchIdx, etc.. should wait on a grid dependency. bool mGridWaitForPrimaryRouting{true}; @@ -149,6 +154,8 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions int mNumRegsPerThreadNonEpilogueWarp{0}; // Number of registers per thread for epilogue warps int mNumRegsPerThreadEpilogueWarp{0}; + // Number of registers for the cast A warps. 
+ int mNumRegsCastAWarps{0}; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -196,19 +203,15 @@ bool checkAndUpdateBatchedGemmOptions(BatchedGemmOptions& options, bool isBlackw if (batchM) { TLLM_CHECK_ERROR(options.mN > 0 && options.mK > 0, "N and K must be larger than 0"); - TLLM_CHECK_ERROR(options.mN >= options.mTileN && options.mK >= options.mTileK, - "N and K must be equal or larger than TileN and TileK respectively."); - TLLM_CHECK_ERROR(options.mN % options.mTileN == 0 && options.mK % options.mTileK == 0, - "N and K must be divisible by TileN and TileK respectively."); + TLLM_CHECK_ERROR(options.mN >= options.mTileN, "N must be equal or larger than TileN."); + TLLM_CHECK_ERROR(options.mN % options.mTileN == 0, "N must be divisible by TileN."); TLLM_CHECK_ERROR(!options.mTransposeMmaOutput, "When batchM the MMA output has to be in row-major."); } else { TLLM_CHECK_ERROR(options.mM > 0 && options.mK > 0, "M and K must be larger than 0"); - TLLM_CHECK_ERROR(options.mM >= options.mTileM && options.mK >= options.mTileK, - "N and K must be equal or larger than tileN and tileK respectively."); - TLLM_CHECK_ERROR(options.mM % options.mTileM == 0 && options.mK % options.mTileK == 0, - "M and K must be divisible by TileM and TileK respectively."); + TLLM_CHECK_ERROR(options.mM >= options.mTileM, "N must be equal or larger than tileN."); + TLLM_CHECK_ERROR(options.mM % options.mTileM == 0, "M must be divisible by TileM."); TLLM_CHECK_ERROR(options.mTransposeMmaOutput, "When batchN the MMA output has to be in column-major."); } } @@ -240,26 +243,82 @@ bool checkAndUpdateBatchedGemmOptions(BatchedGemmOptions& options, bool isBlackw { if (options.mDtypeA == tg::Dtype::MxE2m1 && options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) { - TLLM_CHECK_ERROR(!options.mRouteAct, "RouteAct is not supported with dtypeA = MxE2m1 and MxFp8Fp6Fp4."); + TLLM_CHECK_ERROR(doesRouteImplUseNoRoute(options.mRouteImpl), + "RouteAct is not 
supported with dtypeA = MxE2m1 and MxFp8Fp6Fp4."); } } else { if (options.mDtypeB == tg::Dtype::MxE2m1 && options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) { - TLLM_CHECK_ERROR(!options.mRouteAct, "RouteAct is not supported with dtypeB = MxE2m1 and MxFp8Fp6Fp4."); + TLLM_CHECK_ERROR(doesRouteImplUseNoRoute(options.mRouteImpl), + "RouteAct is not supported with dtypeB = MxE2m1 and MxFp8Fp6Fp4."); } } TLLM_CHECK_ERROR(options.mUseTmaStore, "Only TMA store is supported."); if (batchM) { - TLLM_CHECK_ERROR(!options.mTransposeMatrixA, "Activations must be in k-major format"); + TLLM_CHECK_ERROR(options.mLayoutA == gemm::MatrixLayout::MajorK, "Activations must be in k-major format"); } else { - TLLM_CHECK_ERROR(options.mTransposeMatrixB, "Activations must be in k-major format"); + TLLM_CHECK_ERROR(options.mLayoutB == gemm::MatrixLayout::MajorK, "Activations must be in k-major format"); } + + if (tg::mmaKindIsBlockFmt(options.mMmaKind) && !options.mUseDeepSeekFp8) + { + if (!doesRouteImplUseNoRoute(options.mRouteImpl)) + { + if (batchM) + { + TLLM_CHECK_ERROR( + options.mSfLayoutA == tg::SfLayout::Linear, "Tokens need use SF linear layout when being routed"); + } + else + { + TLLM_CHECK_ERROR( + options.mSfLayoutB == tg::SfLayout::Linear, "Tokens need use SF linear layout when being routed"); + } + } + + if (doesRouteImplUseTma(options.mRouteImpl)) + { + TLLM_CHECK_ERROR(!batchM, "UTMALDG.GATHER4 only supported for batch N."); + + if (tg::mmaKindIsBlockFmt(options.mMmaKind)) + { + auto dtypeRoute = batchM ? 
options.mDtypeA : options.mDtypeB; + TLLM_CHECK_ERROR(options.mTileK % tg::dtypeNumEltsPerSf(dtypeRoute) == 0, + "tileK needs to be a multiple of 16 * tg::dtypeNumEltsPerSf(dtypeA)."); + TLLM_CHECK_ERROR(options.mTileK % (tg::dtypeNumEltsPerSf(dtypeRoute) * 16) == 0, + "tileK needs to be a multiple of 16 * tg::dtypeNumEltsPerSf(dtypeA)."); + } + } + + if (!batchM || doesRouteImplUseNoRoute(options.mRouteImpl)) + { + TLLM_CHECK_ERROR(options.mSfLayoutA == tg::SfLayout::R128c4, + "options.mSfLayoutA has to be tg::SfLayout::R128c4 when not being routed"); + } + } + + if (!gemm::isBiasTypeNone(options.mBiasType)) + { + TLLM_CHECK_ERROR( + (gemm::isBiasTypeN(options.mBiasType) && options.mBatchMode == BatchedGemmOptions::BatchMode::BatchM) + || (gemm::isBiasTypeM(options.mBiasType) + && options.mBatchMode == BatchedGemmOptions::BatchMode::BatchN), + "BatchedGemm supports only per channel bias."); + } + + // We do not handle the case where K is not a multiple of TileK. + // TMA based load handles the case transparently. 
+ if (doesRouteImplUseLdgsts(options.mRouteImpl)) + { + TLLM_CHECK_ERROR(options.mK % options.mTileK == 0, "K must be a multiple of TileK"); + } + return isValid; } @@ -301,11 +360,12 @@ inline std::string dumpOptions(BatchedGemmOptions const& options) ss << "mNumBatches=" << options.mNumBatches << "," << std::endl; ss << "mIsStaticBatch=" << options.mIsStaticBatch << "," << std::endl; ss << "mNumTokens=" << options.mNumTokens << "," << std::endl; - ss << "mRouteAct=" << options.mRouteAct << "," << std::endl; + ss << "mRouteImpl=batchedGemm::RouteImpl(" << static_cast(options.mRouteImpl) << ")," << std::endl; ss << "mGridWaitForPrimaryRouting=" << options.mGridWaitForPrimaryRouting << "," << std::endl; ss << "mFusedAct=" << options.mFusedAct << "," << std::endl; ss << "mNumRegsPerThreadNonEpilogueWarp=" << options.mNumRegsPerThreadNonEpilogueWarp << "," << std::endl; - ss << "mNumRegsPerThreadEpilogueWarp=" << options.mNumRegsPerThreadEpilogueWarp << std::endl; + ss << "mNumRegsPerThreadEpilogueWarp=" << options.mNumRegsPerThreadEpilogueWarp << "," << std::endl; + ss << "mNumRegsCastAWarps=" << options.mNumRegsCastAWarps << std::endl; return ss.str(); } diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h index 57cbc93982..8b517526a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h @@ -39,6 +39,31 @@ enum class AllReduceAlgo : uint32_t //////////////////////////////////////////////////////////////////////////////////////////////////// +enum class MatrixLayout +{ + // K-major layout (default). [Mn, K] + MajorK = 0, + // M-major for A and N-major for B. [K, Mn] + MajorMn, + // Layout is blocked along the K dimension as seen in the diagram below. 
[K / blockK, Mn, blockK] + // where blockK is fixed at 128B + // + // ├────────────── K ──────────────┤ + // ┬ ┬ ├──── K block ───┤ + // │ │ │ 0 1 2 3 ║ 32 33 34 35 │ + // │ CTA0 │ 4 5 6 7 ║ 36 37 38 39 │ + // │ │ │ 8 9 10 11 ║ 40 41 42 43 │ + // │ ┴ │ 12 13 14 15 ║ 44 45 46 47 │ + // M ┬ ├────────────────║────────────────┤ + // │ │ │ 16 17 18 19 ║ 48 49 50 51 │ + // │ CTA1 │ 20 21 22 23 ║ 52 53 54 55 │ + // │ │ │ 24 25 26 27 ║ 56 57 58 59 │ + // ┴ ┴ │ 28 29 30 31 ║ 60 61 62 63 │ + BlockMajorK +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + enum class SplitK : uint32_t { // No split-k is needed. I.e. mNumSlicesForSplitK == 1. @@ -54,6 +79,20 @@ enum class SplitK : uint32_t //////////////////////////////////////////////////////////////////////////////////////////////////// +enum class BiasType : uint32_t +{ + // No bias. + None = 0, + // One bias value per row M of the output tensor. + M = 1, + // One bias value per column N of the output tensor. + N = 2, + // One bias value for each element of the output tensor. + Mn = 3, +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + enum class TileScheduler { // Static scheduler (Non-persistent). @@ -80,6 +119,23 @@ SPLIT_K_FUNCTION(Dsmem) //////////////////////////////////////////////////////////////////////////////////////////////////// +// Helper functions to check the Bias type. 
+ +#define BIAS_TYPE_FUNCTION(Mode) \ + inline bool isBiasType##Mode(BiasType type) \ + { \ + return (type == BiasType::Mode); \ + } + +BIAS_TYPE_FUNCTION(None) +BIAS_TYPE_FUNCTION(N) +BIAS_TYPE_FUNCTION(M) +BIAS_TYPE_FUNCTION(Mn) + +#undef BIAS_TYPE_FUNCTION + +//////////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace gemm } // namespace batchedGemm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h index 890cb5fc54..aef650139b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h @@ -59,7 +59,17 @@ namespace tg = trtllm::gen; enum class ActType { // silu(x) = x * sigmoid(x) = x * (1 / (1 + e^(-x))) - Silu = 0 + // For ActType == Silu, + // gatedAct = scaleC * x0 * silu(x1 * scaleGate), + // where x0 and x1 are the raw numbers from Gemm, while scaleC and scaleGate are input scales. + Silu = 0, + // For ActType == SwiGlu, ideally we would like to have something like + // gatedAct = scaleC * (x0 * scaleAb + beta) * sigmoid(alpha * x1 * scaleGate). + // But for now, we use the simplified version + // gatedAct = scaleC' * (x0 + beta') * sigmoid(alpha * x1 * scaleGate), + // where x0 and x1 are the raw numbers from Gemm, while scaleC and scaleGate are input scales, + // beta' = beta / scaleAb, scaleC' = scaleC * scaleAb. 
+ SwiGlu }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -73,11 +83,24 @@ enum class ActType } TLLM_ACT_TYPE_FUNCTION(Silu) +TLLM_ACT_TYPE_FUNCTION(SwiGlu) #undef TLLM_ACT_TYPE_FUNCTION //////////////////////////////////////////////////////////////////////////////////////////////////// +inline std::string getActTypeName(ActType type) +{ + switch (type) + { + case ActType::Silu: return "Silu"; + case ActType::SwiGlu: return "SwiGlu"; + default: return "Unknown type"; + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + struct GemmGatedActOptions : public gemm::GemmOptions { GemmGatedActOptions() = default; @@ -141,6 +164,11 @@ inline bool checkAndUpdateGemmGatedActOptions( TLLM_CHECK_ERROR(doesSplitKUseDsmem(options.mSplitK), "Split-k GMEM and GemmGatedAct are not supported yet."); } + if (gemm::isBiasTypeMn(options.mBiasType)) + { + TLLM_CHECK_ERROR(options.mTransposeMmaOutput, "Bias type Mn is not supported with not transpose mma output."); + } + return true; } diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h index feb5e71969..32678fb6d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h @@ -91,20 +91,23 @@ struct GemmOptions GemmOptions() = default; - GemmOptions(AllReduceAlgo allReduceAlgo, int clusterDimX, int clusterDimY, int clusterDimZ, tg::Dtype dtypeAcc, - tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, bool enablesEarlyExit, bool enablesDelayedEarlyExit, - bool enablesGlobalPtxKnobs, int epilogueLdtmDps, int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, - bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, bool 
gridWaitForPrimaryEarlyExit, - bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, - KernelTraits kernelTraits, int m, int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, bool mockAllReduce, int n, - int numSlicesForSplitK, int numSlicesForSliceK, int numStages, int numStagesMma, int numStagesMmaWithinWorkTile, - int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, bool useShuffledMatrixA, - bool sliceK, SplitK splitK, bool transposeMatrixA, bool transposeMatrixB, bool transposeMmaOutput, int tileM, - int tileN, int tileK, bool useUnrollLoop2xForMma, bool useCustomMmaSchedule, - bool useHoistTryWaitForCustomMmaSchedule, bool useDeepSeekFp8, bool usePerTokenSfA, bool usePerTokenSfB, - bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, - tg::SfLayout sfLayoutC, TileScheduler tileScheduler) + GemmOptions(AllReduceAlgo allReduceAlgo, BiasType biasType, int blockK, int clusterDimX, int clusterDimY, + int clusterDimZ, tg::Dtype dtypeAcc, tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeMmaA, + tg::Dtype dtypeMmaB, bool enablesEarlyExit, bool enablesDelayedEarlyExit, bool enablesGlobalPtxKnobs, + int epilogueLdtmDps, int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, bool gridTriggerSecondaryA, + bool gridTriggerSecondaryB, bool gridWaitForPrimaryEarlyExit, bool gridWaitForPrimaryA, + bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, KernelTraits kernelTraits, + MatrixLayout layoutA, MatrixLayout layoutB, int m, int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, + bool mockAllReduce, int n, int numSlicesForSplitK, int numSlicesForSliceK, int numStages, int numStagesMma, + int numStagesMmaWithinWorkTile, int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, + bool useShuffledMatrixA, bool sliceK, SplitK splitK, bool transposeMmaOutput, int tileM, 
int tileN, int tileK, + bool useUnrollLoop2xForMma, bool useCustomMmaSchedule, bool useHoistTryWaitForCustomMmaSchedule, + bool useDeepSeekFp8, bool usePerTokenSfA, bool usePerTokenSfB, bool useTmaStore, bool useTwoTmaLoadWarps, + bool useTwoMmaWarps, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC, + int sfReshapeFactor, TileScheduler tileScheduler) : mAllReduceAlgo{allReduceAlgo} + , mBiasType{biasType} + , mBlockK(blockK) , mClusterDimX{clusterDimX} , mClusterDimY{clusterDimY} , mClusterDimZ{clusterDimZ} @@ -112,6 +115,8 @@ struct GemmOptions , mDtypeA{dtypeA} , mDtypeB{dtypeB} , mDtypeC{dtypeC} + , mDtypeMmaA{dtypeMmaA} + , mDtypeMmaB{dtypeMmaB} , mEnablesEarlyExit{enablesEarlyExit} , mEnablesDelayedEarlyExit{enablesDelayedEarlyExit} , mEnablesGlobalPtxKnobs{enablesGlobalPtxKnobs} @@ -128,6 +133,8 @@ struct GemmOptions , mHoistMmaTaskTryWaits{hoistMmaTaskTryWaits} , mK{k} , mKernelTraits{kernelTraits} + , mLayoutA{layoutA} + , mLayoutB{layoutB} , mM{m} , mMmaK{mmaK} , mMmaKind{mmaKind} @@ -146,8 +153,6 @@ struct GemmOptions , mUseShuffledMatrixA{useShuffledMatrixA} , mSliceK{sliceK} , mSplitK{splitK} - , mTransposeMatrixA{transposeMatrixA} - , mTransposeMatrixB{transposeMatrixB} , mTransposeMmaOutput{transposeMmaOutput} , mTileM{tileM} , mTileN{tileN} @@ -164,13 +169,17 @@ struct GemmOptions , mSfLayoutA{sfLayoutA} , mSfLayoutB{sfLayoutB} , mSfLayoutC{sfLayoutC} + , mSfReshapeFactor{sfReshapeFactor} , mTileScheduler{tileScheduler} { } // The all-reduce algorithm. AllReduceAlgo mAllReduceAlgo{AllReduceAlgo::None}; - + // The type of bias. + BiasType mBiasType{BiasType::None}; + // Block size in the K dimension + int mBlockK{-1}; // Cluster size in X dim. int mClusterDimX{1}; // Cluster size in Y dim. @@ -185,6 +194,10 @@ struct GemmOptions tg::Dtype mDtypeB{tg::Dtype::Void}; // Data type of the outputs. tg::Dtype mDtypeC{tg::Dtype::Void}; + // Data type of the A matrix for the MMA, if different from the input type. 
+ tg::Dtype mDtypeMmaA{tg::Dtype::Void}; + // Data type of the B matrix for the MMA, if different from the input type. + tg::Dtype mDtypeMmaB{tg::Dtype::Void}; // Whether to enable early exit. bool mEnablesEarlyExit{false}; // Whether to enable delayed early exit to overlap @@ -225,6 +238,10 @@ struct GemmOptions int mK{16 * 16}; // Traits of the kernel. KernelTraits mKernelTraits{}; + // Layout of A matrix + MatrixLayout mLayoutA{MatrixLayout::MajorK}; + // Layout of B matrix + MatrixLayout mLayoutB{MatrixLayout::MajorK}; // The M dimension of GEMM. int mM{128 * 2}; // Size of the MMA instruction in the K dimension. @@ -265,10 +282,6 @@ struct GemmOptions bool mSliceK{false}; // The location of the exchange for split-K (it's None when split-K is disabled). SplitK mSplitK{SplitK::None}; - // Is A matrix in a transposed layout? M major if true, K major otherwise - bool mTransposeMatrixA{false}; - // Is B matrix in a transposed layout? K major if true, N major otherwise - bool mTransposeMatrixB{true}; // Save output of MMA in M-major format. bool mTransposeMmaOutput{false}; // M tile dimension of GEMM. @@ -303,6 +316,12 @@ struct GemmOptions tg::SfLayout mSfLayoutB{tg::SfLayout::R128c4}; // Scale factors layout for C. tg::SfLayout mSfLayoutC{tg::SfLayout::R128c4}; + // Number of "repeats", i.e. reshaping factor, to fold hidden dimension into SfBlock dimension. + // As result, the hidden dimension of the SF tensor must be a multiple of NumRepeats * + // numEltsPerSf * 4. This reduces the problem shape space that the kernel is able to run. + // But it reduces the number of L2 requests under the hood and potentially improves perf. + // Applies to layout 8x4 only. + int mSfReshapeFactor{1}; // Tile scheduler type. 
TileScheduler mTileScheduler{TileScheduler::Static}; }; @@ -373,6 +392,10 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mAllReduceAlgo=" << "gemm::AllReduceAlgo(" << static_cast(options.mAllReduceAlgo) << ")" << "," << std::endl; + ss << "mBiasType=" + << "gemm::BiasType(" << static_cast(options.mBiasType) << ")" + << "," << std::endl; + ss << "mBlockK=" << options.mBlockK << "," << std::endl; ss << "mClusterDimX=" << options.mClusterDimX << "," << std::endl; ss << "mClusterDimY=" << options.mClusterDimY << "," << std::endl; ss << "mClusterDimZ=" << options.mClusterDimZ << "," << std::endl; @@ -388,6 +411,12 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mDtypeC=" << "trtllm::gen::Dtype(" << static_cast(options.mDtypeC) << ")" << "," << std::endl; + ss << "mDtypeMmaA=" + << "trtllm::gen::Dtype(" << static_cast(options.mDtypeMmaA) << ")" + << "," << std::endl; + ss << "mDtypeMmaB=" + << "trtllm::gen::Dtype(" << static_cast(options.mDtypeMmaB) << ")" + << "," << std::endl; ss << "mEnablesEarlyExit=" << options.mEnablesEarlyExit << "," << std::endl; ss << "mEnablesDelayedEarlyExit=" << options.mEnablesDelayedEarlyExit << "," << std::endl; ss << "mEnablesGlobalPtxKnobs=" << options.mEnablesGlobalPtxKnobs << "," << std::endl; @@ -405,6 +434,10 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mK=" << options.mK << "," << std::endl; ss << "mKernelTraits={}" << "," << std::endl; + ss << "mLayoutA=gemm::MatrixLayout(" << static_cast(options.mLayoutA) << ")" + << "," << std::endl; + ss << "mLayoutB=gemm::MatrixLayout(" << static_cast(options.mLayoutB) << ")" + << "," << std::endl; ss << "mM=" << options.mM << "," << std::endl; ss << "mMmaK=" << options.mMmaK << "," << std::endl; ss << "mMmaKind=" @@ -427,8 +460,6 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mSplitK=" << "gemm::SplitK(" << static_cast(options.mSplitK) << ")" << "," << std::endl; - ss << "mTransposeMatrixA=" << 
options.mTransposeMatrixA << "," << std::endl; - ss << "mTransposeMatrixB=" << options.mTransposeMatrixB << "," << std::endl; ss << "mTransposeMmaOutput=" << options.mTransposeMmaOutput << "," << std::endl; ss << "mTileM=" << options.mTileM << "," << std::endl; ss << "mTileN=" << options.mTileN << "," << std::endl; @@ -451,6 +482,7 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mSfLayoutC=" << "trtllm::gen::SfLayout(" << static_cast(options.mSfLayoutC) << ")" << "," << std::endl; + ss << "mSfReshapeFactor=" << options.mSfReshapeFactor << "," << std::endl; ss << "mTileScheduler=" << "gemm::TileScheduler(" << static_cast(options.mTileScheduler) << ")" << std::endl; return ss.str(); @@ -502,39 +534,89 @@ inline bool checkAndUpdateGemmOptions( } } + // If not specified, use the input dtypes as MMA dtypes (no cast required). + if (options.mDtypeMmaA == tg::Dtype::Void) + { + if (updateOptions) + { + options.mDtypeMmaA = options.mDtypeA; + } + else + { + return false; + } + } + if (options.mDtypeMmaB == tg::Dtype::Void) + { + if (updateOptions) + { + options.mDtypeMmaB = options.mDtypeB; + } + else + { + return false; + } + } + + // Check that the A cast is supported. + // Currently, we only support {MxFp4, NvFp4} -> Bf16. + TLLM_CHECK_ERROR((options.mDtypeA == options.mDtypeMmaA) + || ((options.mDtypeA == tg::Dtype::MxE2m1 || options.mDtypeA == tg::Dtype::E2m1) + && options.mDtypeMmaA == tg::Dtype::Bfloat16), + "Unsupported cast: ", tg::dtypeToString(options.mDtypeA), " -> ", tg::dtypeToString(options.mDtypeMmaA)); + // Casting B is currently not supported. + // Note: This is because we currently write the output of the cast directly to TMEM. + // In the future, we can support this through shared memory. 
+ TLLM_CHECK_ERROR( + (options.mDtypeB == options.mDtypeMmaB), "Casting B is not supported: dtypeMmaB must be the same as dtypeB."); + + if (options.mDtypeA != options.mDtypeMmaA) + { + TLLM_CHECK_ERROR(options.mTileM == 128, "TileM must be 128 when casting the input matrix A before the MMA."); + } + // FIXME: We do not support different dtypes for A and B when not on Blackwell. if (!isBlackwell) { - TLLM_CHECK_ERROR(options.mDtypeA == options.mDtypeB, "For non-Blackwell, A and B must have the same dtype."); + TLLM_CHECK_ERROR( + options.mDtypeMmaA == options.mDtypeMmaB, "For non-Blackwell, A and B must have the same dtype."); } // Check that the different dtypes for A and B are supported by the tensor core // kind::f8f6f4 - if (options.mDtypeA == tg::Dtype::E4m3 || options.mDtypeA == tg::Dtype::E2m1) + if (options.mDtypeMmaA == tg::Dtype::E4m3 || options.mDtypeMmaA == tg::Dtype::E2m1) { - TLLM_CHECK_ERROR(options.mDtypeB == tg::Dtype::E4m3 || options.mDtypeB == tg::Dtype::E2m1, - "For E4m3/E2m1 A, B must also be E4m3/E2m1."); + TLLM_CHECK_ERROR(options.mDtypeMmaB == tg::Dtype::E4m3 || options.mDtypeMmaB == tg::Dtype::E2m1, + "For dtypeMmaA = E4m3/E2m1 A, dtypeMmaB must also be E4m3/E2m1."); } // kind::mxf8f6f4 - if (options.mDtypeA == tg::Dtype::MxE4m3 || options.mDtypeA == tg::Dtype::MxE2m1) + if (options.mDtypeMmaA == tg::Dtype::MxE4m3 || options.mDtypeMmaA == tg::Dtype::MxE2m1) { - TLLM_CHECK_ERROR(options.mDtypeB == tg::Dtype::MxE4m3 || options.mDtypeB == tg::Dtype::MxE2m1, - "For dtypeA = MxE4m3 or MxE2m1, dtypeB must also be MxE4m3 or MxE2m1."); + TLLM_CHECK_ERROR(options.mDtypeMmaB == tg::Dtype::MxE4m3 || options.mDtypeMmaB == tg::Dtype::MxE2m1, + "For dtypeMmaA = MxE4m3 or MxE2m1, dtypeMmaB must also be MxE4m3 or MxE2m1."); } - if (options.mDtypeB == tg::Dtype::MxE4m3 || options.mDtypeB == tg::Dtype::MxE2m1) + if (options.mDtypeMmaB == tg::Dtype::MxE4m3 || options.mDtypeMmaB == tg::Dtype::MxE2m1) { - TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::MxE4m3 || 
options.mDtypeA == tg::Dtype::MxE2m1, - "For dtypeB = MxE4m3 or MxE2m1, dtypeA must also be MxE4m3 or MxE2m1."); + TLLM_CHECK_ERROR(options.mDtypeMmaA == tg::Dtype::MxE4m3 || options.mDtypeMmaA == tg::Dtype::MxE2m1, + "For dtypeMmaB = MxE4m3 or MxE2m1, dtypeMmaA must also be MxE4m3 or MxE2m1."); } // kind::f16 - if (options.mDtypeA == tg::Dtype::Fp16 || options.mDtypeA == tg::Dtype::Bfloat16) + if (options.mDtypeMmaA == tg::Dtype::Fp16 || options.mDtypeMmaA == tg::Dtype::Bfloat16) { - TLLM_CHECK_ERROR(options.mDtypeB == options.mDtypeA, "For Fp16/Bfloat16 A, B must be the same type as A."); + TLLM_CHECK_ERROR(options.mDtypeMmaB == options.mDtypeMmaA, + "For dtypeMmaA = Fp16/Bfloat16, dtypeMmaB must be the same as dtypeMmaA."); } - // When different dtype are used for A and B, we must use different tile to do the loading. + // When one of the inputs needs to be cast, we must use two load warps. + if ((options.mDtypeMmaA != options.mDtypeA || options.mDtypeMmaB != options.mDtypeB) + && !options.mUseTwoTmaLoadWarps) + { + TLLM_LOG_WARNING("Two TMA load warps must be enabled if any of the inputs needs to be cast."); + } + + // When different dtypes are used for A and B, we must use different tiles to do the loading. // It is not strictly required, but current implementation of SmemAb requires that. 
if (options.mDtypeA != options.mDtypeB) { @@ -547,7 +629,7 @@ inline bool checkAndUpdateGemmOptions( { if (updateOptions) { - options.mMmaKind = dtypeGetMmaKind(options.mDtypeA, options.mDtypeB); + options.mMmaKind = dtypeGetMmaKind(options.mDtypeMmaA, options.mDtypeMmaB); } else { @@ -555,11 +637,6 @@ inline bool checkAndUpdateGemmOptions( } } - if (options.mMmaKind == tg::MmaKind::Fp16) - { - TLLM_CHECK_ERROR(options.mDtypeA == options.mDtypeB, "For Fp16 MMA, A and B must have the same dtype."); - } - if ((options.mMmaKind == tg::MmaKind::Fp8Fp6Fp4 || options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) && options.mMmaK != 32) { @@ -626,7 +703,8 @@ inline bool checkAndUpdateGemmOptions( { TLLM_CHECK_ERROR(isBlackwell, "Block scaling is only supported on Blackwell"); - TLLM_CHECK_ERROR(options.mSfLayoutB == tg::SfLayout::R128c4 || options.mSfLayoutB == tg::SfLayout::R8c4, + TLLM_CHECK_ERROR(options.mSfLayoutB == tg::SfLayout::R128c4 || options.mSfLayoutB == tg::SfLayout::R8c4 + || options.mSfLayoutB == tg::SfLayout::Linear, "Only the 128x4 and 8x4 SF layouts are supported for B, got ", tg::sfLayoutToString(options.mSfLayoutB)); int const mmaK = (options.mMmaKind == tg::MmaKind::MxFp4NvFp4) ? 64 : 32; @@ -653,11 +731,16 @@ inline bool checkAndUpdateGemmOptions( // The MMA N may only be smaller than 64 if it is equal to the tile N. 
TLLM_CHECK_ERROR(options.mMmaN >= 64 || options.mMmaN == options.mTileN, "MmaN (", options.mMmaN, ") must be >= 64 or equal to TileN (", options.mTileN, ")"); - + } + if (tg::dtypeIsBlockFmt(options.mDtypeA)) + { int numEltsPerSfA = tg::dtypeNumEltsPerSf(options.mDtypeA); - int numEltsPerSfB = tg::dtypeNumEltsPerSf(options.mDtypeB); TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfA) == 0, "TileK (", options.mTileK, ") must be a multiple of ", (4 * numEltsPerSfA), " for typeA ", gemm::toString(options.mDtypeA)); + } + if (tg::dtypeIsBlockFmt(options.mDtypeB)) + { + int numEltsPerSfB = tg::dtypeNumEltsPerSf(options.mDtypeB); TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfB) == 0, "TileK (", options.mTileK, ") must be a multiple of ", (4 * numEltsPerSfB), " for typeB ", gemm::toString(options.mDtypeB)); } @@ -753,7 +836,6 @@ inline bool checkAndUpdateGemmOptions( TLLM_CHECK_ERROR(options.mM > 0 && options.mN > 0 && options.mK > 0, "M, N and K must be larger than 0"); TLLM_CHECK_ERROR(options.mNumSlicesForSplitK > 0, "Split K must be larger than 0."); - TLLM_CHECK_ERROR(options.mK % options.mTileK == 0, "K must be a multiple of TileK"); if (options.mUseShuffledMatrixA) { @@ -911,6 +993,11 @@ inline bool checkAndUpdateGemmOptions( { TLLM_CHECK_ERROR( options.mNumStagesMmaWithinWorkTile == 1, "Non-DeepSeekFp8 requires numStagesMmaWithinWorkTile == 1"); + if (options.mNumStagesMma > 1) + { + TLLM_CHECK_ERROR(options.mTileScheduler == TileScheduler::Persistent, + "Non-DeepSeekFp8 requires persistent scheduler when using numStagesMma >1"); + } } if (options.mUseDeepSeekFp8) { @@ -923,6 +1010,7 @@ inline bool checkAndUpdateGemmOptions( // Check that TileK = 128 for correct scaling of every 128 channels. TLLM_CHECK_ERROR(options.mTileK == 128, "Tile-K must be equal to 128 for DeepSeek Fp8"); + TLLM_CHECK_ERROR(options.mK % options.mTileK == 0, "K must be a multiple of TileK"); // Tile sizes of the output hidden dimension. 
auto hiddenDimPerOutputTile = options.mTransposeMmaOutput ? options.mTileM : options.mTileN; auto hiddenDimPerEpilogueTile = options.mTransposeMmaOutput ? options.mEpilogueTileM : options.mEpilogueTileN; @@ -997,14 +1085,22 @@ inline bool checkAndUpdateGemmOptions( if (options.mUseUnrollLoop2xForMma) { - bool notSupported = (options.mK / options.mNumSlicesForSplitK) % (options.mTileK * 2) != 0; - // Check that the 2*TileK is a multiple of MmaK when UnrollLoop2x is enabled. - // This is to avoid deadlock when mma runs even-numbered loop while the other warps run - // odd-numbered loop. + // Number of iterations in K dimension after padding. + // Note the perCtaK in each CTA in the splitK group are padded to the same number of iterations. + // E.g., K = 512, TileK = 128, numSlicesForSplitK = 3. Then the padded K is + // + // ceil(512 / (128*3)) * (128*3) = 768 + // + int paddedK = divUpMul(options.mK, options.mTileK * options.mNumSlicesForSplitK); + // Check that the padded K (K rounded to next multiple of tileK) is a multiple of 2*TileK when + // UnrollLoop2x is enabled. This is to avoid deadlock when mma runs even-numbered loop while the + // other warps run odd-numbered loop. + // + bool notSupported = (paddedK / options.mNumSlicesForSplitK) % (options.mTileK * 2) != 0; if (notSupported) { TLLM_LOG_WARNING("Size K / splitK must be a multiple of TileK * 2. Found TileK=", options.mTileK, - " and K=", options.mK, " and numSlicesForSplitK=", options.mNumSlicesForSplitK, + " and K=", options.mK, " (paddedK=", paddedK, ") and numSlicesForSplitK=", options.mNumSlicesForSplitK, ". Disabling unrollLoop2xForMma."); if (updateOptions) { @@ -1059,43 +1155,108 @@ inline bool checkAndUpdateGemmOptions( // // Kernel 1: ----PREEXIT-----------FLUSH // Kernel 2: -------PREEXIT----ACQBULK---FLUSH - // Kernel 3: Warp 0: ---- (!) Output of 1,2 is not yet visible ----------------------- - // Warp 1: ---- (!) 
We normally assume that 1 is visible is not yet visible- - // Warp 2: -------------------ACQBULK-- Kernel 1,2 output visible ---------- + // Kernel 3: Warp 0: ---- (!) Output of 1,2 is not yet visible + // ----------------------- + // Warp 1: ---- (!) We normally assume that 1 is visible is not yet + // visible- Warp 2: -------------------ACQBULK-- Kernel 1,2 output visible + // ---------- TLLM_CHECK_ERROR((options.mGridWaitForPrimaryA || !options.mGridTriggerSecondaryA), "A: If a task triggers a secondary kernel, it must also wait for primary kernel."); TLLM_CHECK_ERROR((options.mGridWaitForPrimaryB || !options.mGridTriggerSecondaryB), "B: If a task triggers a secondary kernel, it must also wait for primary kernel."); + if (options.mUsePerTokenSfA || options.mUsePerTokenSfB) + { + // Checks applicable to both MetaFP8 and RoutingScalesOnInput + TLLM_CHECK_ERROR(!options.mUseDeepSeekFp8, "DeepSeek FP8 and per-token scaling are not compatible"); + TLLM_CHECK_ERROR(isBlackwell, "Per-token scaling is not supported for Hopper"); + if (options.mUsePerTokenSfA && options.mUsePerTokenSfB) + { + // MetaFP8 case + TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::E4m3 && options.mDtypeB == tg::Dtype::E4m3, + "A and B dtype must be E4m3 for Meta Fp8. 
Found dtypeA=", tg::dtypeToString(options.mDtypeA), + " dtypeB=", tg::dtypeToString(options.mDtypeB)); + } + else + { + // RoutingScalesOnInput case + TLLM_CHECK_ERROR((options.mUsePerTokenSfA && !options.mTransposeMmaOutput) + || (options.mUsePerTokenSfB && options.mTransposeMmaOutput), + "In RoutingScalesOnInput mode, perToken scales must be used on activations"); + } + } + // The generation should support non K-major layouts for both A and B; however, it is unclear if // there is a use-case - TLLM_CHECK_ERROR(!options.mTransposeMatrixA || options.mTransposeMatrixB, - "TransposeA true and TransposeB false is not supported"); + TLLM_CHECK_ERROR((options.mLayoutA == MatrixLayout::MajorK) || (options.mLayoutB == MatrixLayout::MajorK), + "At least one matrix must be in k-major layout"); // Some features are currently only support when both matrices are in K-major format - if (options.mTransposeMatrixA || !options.mTransposeMatrixB) + if (options.mLayoutA != MatrixLayout::MajorK || options.mLayoutB != MatrixLayout::MajorK) { TLLM_CHECK_ERROR(isBlackwell, "Non K-major layouts are only supported on Blackwell"); TLLM_CHECK_ERROR(options.mSplitK == SplitK::None, "Non K-major layouts do not support split K"); } - if (options.mTransposeMatrixA) + if (options.mLayoutA == MatrixLayout::MajorMn) { TLLM_CHECK_ERROR(tg::dtypeGetNumBits(options.mDtypeA) >= 8, "Subbyte types only support K major layout"); } - if (!options.mTransposeMatrixB) + if (options.mLayoutB == MatrixLayout::MajorMn) { TLLM_CHECK_ERROR(tg::dtypeGetNumBits(options.mDtypeB) >= 8, "Subbyte types only support K major layout"); } + if ((options.mLayoutA == MatrixLayout::BlockMajorK) || (options.mLayoutB == MatrixLayout::BlockMajorK)) + { + bool const isBlockA = options.mLayoutA == MatrixLayout::BlockMajorK; + + // Block K size must be 128B. 
+ // TODO Leaving this as an option for now in case we want to experiment with other block sizes + // As the user is not expected to set this, do not fail if updateOptions is false + int32_t const elemSizeInBits + = (isBlockA) ? tg::dtypeGetNumBits(options.mDtypeA) : tg::dtypeGetNumBits(options.mDtypeB); + int32_t const elemsIn128B = 128 * 8 /* Bits in byte */ / elemSizeInBits; + + if (options.mBlockK != elemsIn128B) + { + if (updateOptions) + { + options.mBlockK = elemsIn128B; + } + else + { + return false; + } + } + + if (options.mBlockK > options.mTileK) + { + TLLM_CHECK_ERROR(options.mBlockK % options.mTileK == 0, + "If block size is greater than tile size, block size must be a multiple of tile size"); + } + else if (options.mBlockK < options.mTileK) + { + TLLM_CHECK_ERROR(options.mTileK % options.mBlockK == 0, + "If tile size is greater than block size, tile size must be a multiple of block size"); + } + } + + if (!isBiasTypeNone(options.mBiasType)) + { + TLLM_CHECK_ERROR(!isBiasTypeMn(options.mBiasType), "BiasType::Mn is not supported"); + TLLM_CHECK_ERROR(!options.mUseDeepSeekFp8, "Bias is not supported for DeepSeek Fp8"); + TLLM_CHECK_ERROR(!(options.mUsePerTokenSfA && options.mUsePerTokenSfB), "Bias is not supported for Meta Fp8"); + } + if (updateOptions) { // Init kernel traits. 
options.mKernelTraits = KernelTraits(options.mDtypeA, options.mDtypeB, options.mDtypeC, options.mDtypeAcc, - options.mMmaKind, options.mTileM, options.mTileN, options.mTileK, options.mEpilogueTileM, - options.mEpilogueTileN, options.mNumStages, options.mNumStagesMma, options.mNumSlicesForSplitK, - options.mNumSlicesForSliceK, options.mSplitK, options.mUseTmaStore, options.mTransposeMmaOutput, - options.mAllReduceAlgo, options.mTileScheduler == TileScheduler::Persistent, options.mUseDeepSeekFp8, - options.mUsePerTokenSfA, options.mUsePerTokenSfB); + options.mDtypeMmaA, options.mDtypeMmaB, options.mMmaKind, options.mTileM, options.mTileN, options.mTileK, + options.mEpilogueTileM, options.mEpilogueTileN, options.mNumStages, options.mNumStagesMma, + options.mNumSlicesForSplitK, options.mNumSlicesForSliceK, options.mSplitK, options.mUseTmaStore, + options.mTransposeMmaOutput, options.mAllReduceAlgo, options.mTileScheduler == TileScheduler::Persistent, + options.mUseDeepSeekFp8, options.mUsePerTokenSfA, options.mUsePerTokenSfB, options.mBiasType); } return true; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h index 61b166a2f3..307f50fbc8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h @@ -28,169 +28,187 @@ namespace kernels { // clang-format off -#define TLLM_GEN_COMMIT "2b209663" +#define TLLM_GEN_COMMIT "e3f037b7-dirty" #define TLLM_GEN_EXPORT_VERSION "6.0.3.0.2.1" -static constexpr size_t tllmGenBatchedGemmListLen = 74; +static constexpr size_t tllmGenBatchedGemmListLen = 82; #ifndef EXCLUDE_SM_100 -extern unsigned char BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin[]; -extern 
unsigned char BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char 
BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Fp16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Fp16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128u2_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128u2_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128u2_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin[]; -extern unsigned char 
BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128u2_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128u2_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128u2_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128u2_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128u2_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin[]; -extern unsigned char 
BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x128x128_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin[]; -extern unsigned char 
BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x128x128u2_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x16x128_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x16x128u2_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x32x128_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x32x128u2_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x64x128_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x64x128u2_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin[]; -extern unsigned char 
BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_perTokenSfB_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_perTokenSfB_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin[]; -extern 
unsigned char BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_MxE4m3_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a_cubin[]; -extern unsigned char BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_MxE4m3_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin[]; +extern unsigned char 
BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char 
BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char 
BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin[]; +extern unsigned char 
BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char 
BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin[]; +extern unsigned char 
BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin[]; +extern unsigned char 
BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin[]; +extern unsigned char 
BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin[]; +extern unsigned char BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin[]; #endif // EXCLUDE_SM_100 #ifndef EXCLUDE_SM_100 -extern unsigned int BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len; -extern 
unsigned int BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int 
BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Fp16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Fp16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128u2_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128u2_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128u2_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128u2_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len; -extern unsigned int 
BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128u2_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128u2_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128u2_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128u2_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len; -extern unsigned int 
BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x128x128_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x128x128u2_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x16x128_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len; -extern unsigned int 
BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x16x128u2_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x32x128_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x32x128u2_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x64x128_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x64x128u2_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_perTokenSfB_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int 
BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_perTokenSfB_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len; -extern unsigned int 
BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_MxE4m3_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a_cubin_len; -extern unsigned int BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_MxE4m3_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int 
BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len; +extern unsigned int 
BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len; +extern unsigned int 
BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin_len; +extern unsigned int 
BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len; +extern unsigned int 
BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len; +extern unsigned int 
BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len; +extern unsigned int 
BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len; +extern unsigned int 
BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len; +extern unsigned int BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len; #endif // EXCLUDE_SM_100 static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { #ifndef EXCLUDE_SM_100 -{BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len, 126976, "BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len, 126976, "BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -198,6 +216,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -214,6 +234,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -224,16 +246,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* 
mTileN */ 8 @@ -250,6 +270,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -258,13 +279,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len, 126976, "BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len, 126976, "BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* 
mClusterDimZ */ 1 @@ -272,6 +296,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -288,6 +314,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -306,8 +334,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -324,6 +350,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -332,13 +359,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin, 
BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len, 126976, "BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len, 88064, "BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -346,6 +376,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -362,6 +394,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -371,7 +405,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 4 , /* 
mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -380,13 +414,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -398,21 +430,25 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 +, /* mNumBatches */ 128 , /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mNumTokens */ 32 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len, 126976, "BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len, 126976, "BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -420,6 +456,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -436,6 +474,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -446,16 +486,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 
, /* mTileN */ 8 @@ -472,7 +510,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -480,13 +519,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len, 166912, "BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len, 126976, "BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , 
/* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -494,6 +536,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -510,6 +554,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -519,7 +565,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -528,13 +574,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -546,7 +590,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ 
gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -554,13 +599,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len, 166912, "BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len, 166912, "BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -568,6 +616,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ 
trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -584,6 +634,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -594,16 +646,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 4 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -620,7 +670,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -628,13 +679,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, 
-{BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin, BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 166912, "BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len, 166912, "BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -642,7 +696,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 0 +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 @@ -658,6 +714,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -676,8 +734,6 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -694,21 +750,25 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteAct */ 0 -, /* mGridWaitForPrimaryRouting */ 1 +, /* mNumBatches */ 128 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 32 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len, 166912, "BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin, BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len, 166912, 
"BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -716,7 +776,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 @@ -732,6 +794,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -742,21 +806,19 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 4 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -768,21 +830,25 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 1 +, /* mNumTokens */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len, 166912, "BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len, 166912, "BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY 
*/ 1 , /* mClusterDimZ */ 1 @@ -790,6 +856,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -806,6 +874,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -824,13 +894,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -842,6 +910,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -850,13 +919,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* 
mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin, BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 166912, "BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len, 166912, "BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -864,80 +936,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ 
trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteAct */ 0 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 - }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a_cubin_len, 165888, "BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* 
mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -954,6 +954,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -964,16 +966,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 4 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -990,29 +990,35 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 +, /* mNumBatches */ 128 , /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteAct */ 1 -, /* 
mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 +, /* mNumTokens */ 32 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin, BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 165888, "BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len, 166912, "BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 0 +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 @@ -1028,6 +1034,8 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -1038,16 +1046,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 4 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -1064,28 +1070,34 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteAct */ 0 -, /* mGridWaitForPrimaryRouting */ 1 +, /* mNumBatches */ 128 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 32 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, 
-{BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a_cubin_len, 165888, "BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len, 166912, "BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1102,6 +1114,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -1120,8 +1134,6 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -1138,6 +1150,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -1146,94 +1159,25 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 - }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin, BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 165888, "BatchedGemmKernel_BatchN_E2m1E2m1Fp32_E2m1_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* 
mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteAct */ 0 -, /* mGridWaitForPrimaryRouting */ 1 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Fp16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin, 
BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Fp16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 166912, "BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Fp16_Tile128x8x512_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin, BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len, 166912, "BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1250,6 +1194,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -1268,13 +1214,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* 
mTileN */ 8 , /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -1286,6 +1230,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -1294,21 +1239,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 1 , /* mNumTokens */ 0 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Fp16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin, BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Fp16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 166912, "BatchedGemmKernel_BatchN_E2m1E2m1Fp32_Fp16_Tile128x8x512u2_EpiTile128x8_Mma128x8x64_Cluster1x1x1_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len, 166912, "BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a", 480, { /* 
mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mEnablesEarlyExit */ 0 +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 @@ -1324,6 +1274,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -1334,16 +1286,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 4 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -1360,21 +1310,25 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN 
*/ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteAct */ 0 -, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len, 175104, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len, 175104, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1382,6 +1336,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ 
trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1398,6 +1354,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1416,8 +1374,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 @@ -1434,6 +1390,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -1442,13 +1399,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len, 175104, 
"BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len, 175104, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1456,6 +1416,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1472,6 +1434,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1490,8 +1454,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 @@ -1508,6 +1470,7 
@@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -1516,13 +1479,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128u2_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128u2_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len, 175104, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128u2_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len, 175104, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ 
-1530,6 +1496,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1546,6 +1514,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1564,8 +1534,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 @@ -1582,6 +1550,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -1590,13 +1559,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128u2_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin, 
BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128u2_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len, 175104, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x128x128u2_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len, 175104, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1604,6 +1576,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1620,6 +1594,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1638,8 +1614,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK 
*/ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 @@ -1656,6 +1630,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -1664,13 +1639,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len, 84992, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len, 84992, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a", 384, { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1678,6 +1656,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1694,6 +1674,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1712,8 +1694,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 16 @@ -1730,6 +1710,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -1738,13 +1719,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, 
-{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len, 84992, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len, 84992, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1752,6 +1736,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1768,6 +1754,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ 
-1786,8 +1774,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 16 @@ -1804,6 +1790,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -1812,13 +1799,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128u2_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128u2_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len, 84992, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128u2_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len, 84992, 
"BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1826,6 +1816,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1842,6 +1834,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1860,8 +1854,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 16 @@ -1878,6 +1870,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -1886,13 +1879,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* 
mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128u2_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128u2_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len, 84992, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x16x128u2_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len, 84992, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1900,6 +1896,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1916,6 +1914,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1934,8 +1934,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 16 @@ -1952,6 +1950,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -1960,13 +1959,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len, 97280, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin, 
BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len, 97280, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1974,6 +1976,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1990,6 +1994,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2008,8 +2014,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 32 @@ -2026,6 +2030,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -2034,13 +2039,16 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len, 97280, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len, 97280, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2048,6 +2056,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit 
*/ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2064,6 +2074,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2082,8 +2094,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 32 @@ -2100,6 +2110,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -2108,13 +2119,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128u2_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128u2_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len, 97280, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128u2_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len, 97280, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2122,6 +2136,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2138,6 +2154,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2156,8 +2174,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 32 @@ -2174,6 +2190,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* 
mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -2182,13 +2199,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128u2_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128u2_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len, 97280, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x32x128u2_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len, 97280, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2196,6 +2216,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC 
*/ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2212,6 +2234,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2230,8 +2254,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 32 @@ -2248,6 +2270,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -2256,13 +2279,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len, 123904, 
"BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len, 123904, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2270,6 +2296,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2286,6 +2314,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2304,8 +2334,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 64 @@ -2322,6 +2350,7 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -2330,13 +2359,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len, 123904, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len, 123904, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2344,6 
+2376,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2360,6 +2394,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2378,8 +2414,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 64 @@ -2396,6 +2430,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -2404,13 +2439,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128u2_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin, 
BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128u2_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len, 123904, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128u2_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len, 123904, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2418,6 +2456,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2434,6 +2474,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2452,8 +2494,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, 
/* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 64 @@ -2470,6 +2510,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -2478,13 +2519,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128u2_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128u2_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len, 123904, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x64x128u2_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len, 123904, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a", 416, { /* 
mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2492,6 +2536,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2508,6 +2554,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2526,8 +2574,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 64 @@ -2544,6 +2590,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -2552,13 +2599,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a 
}, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len, 149504, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len, 149504, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2566,6 +2616,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2582,6 +2634,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2592,16 +2646,14 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 8 -, /* mNumStagesMma */ 4 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -2618,6 +2670,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -2626,13 +2679,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len, 149504, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin_len, 61440, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2640,81 +2696,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 
128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 - }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin_len, 61440, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 0 +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 @@ -2730,6 +2714,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 
, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2748,8 +2734,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -2766,6 +2750,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -2774,13 +2759,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 1 , /* mNumTokens */ 0 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a_cubin_len, 149504, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin, 
BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len, 149504, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2788,6 +2776,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2804,6 +2794,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2822,13 +2814,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 @@ -2840,7 +2830,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, 
/* mTileScheduler */ gemm::TileScheduler(0) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -2848,13 +2839,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a_cubin_len, 149504, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_persistent_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin_len, 149504, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2862,6 +2856,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* 
mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2878,6 +2874,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2888,16 +2886,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 8 -, /* mNumStagesMma */ 4 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -2914,7 +2910,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -2922,13 +2919,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* 
mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin_len, 61440, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin_len, 61440, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2936,6 +2936,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2952,6 +2954,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2970,8 +2974,6 @@ static 
const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -2988,6 +2990,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -2996,13 +2999,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 1 , /* mNumTokens */ 0 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len, 217088, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin_len, 149504, 
"BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -3010,12 +3016,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 +, /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3026,34 +3034,34 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 +, /* mMmaM */ 64 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStages */ 8 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 +, /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 512 -, /* 
mUseUnrollLoop2xForMma */ 0 +, /* mTileK */ 128 +, /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseTmaStore */ 1 @@ -3062,7 +3070,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -3070,13 +3079,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len, 217088, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a", 256, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len, 
217088, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -3084,6 +3096,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3100,6 +3114,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -3110,16 +3126,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 3 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -3136,7 +3150,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mSfReshapeFactor */ 1 +, /* 
mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -3144,13 +3159,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 217088, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len, 217088, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -3158,6 +3176,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* 
mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3174,6 +3194,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -3192,8 +3214,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -3210,6 +3230,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -3218,13 +3239,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 1 , /* mNumTokens */ 0 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len, 217088, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len, 217088, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a", 256, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -3232,6 +3256,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3248,6 +3274,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -3266,13 +3294,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -3284,7 +3310,8 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -3292,13 +3319,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len, 217088, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a", 256, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len, 217088, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ 
-3306,6 +3336,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3322,6 +3354,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -3332,16 +3366,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 3 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -3358,7 +3390,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -3366,13 +3399,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 +, /* mRouteImpl */ 
batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 217088, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin, BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len, 217088, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -3380,6 +3416,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3396,6 +3434,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ 
gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -3414,8 +3454,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -3432,6 +3470,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -3440,64 +3479,69 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 1 , /* mNumTokens */ 0 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x128x128_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x128x128_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len, 158720, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x128x128_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin, 
BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len, 217088, "BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a", 256, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 128 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 128 +, /* mMmaM */ 128 +, /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* 
mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseTmaStore */ 1 @@ -3506,7 +3550,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -3514,72 +3559,78 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x128x128u2_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x128x128u2_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len, 158720, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x128x128u2_EpiTile64x128_Mma64x128x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a", 416, 
{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len, 153600, "BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 128 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 128 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 , /* mMockAllReduce */ 
0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -3588,73 +3639,79 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x16x128_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin, 
BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x16x128_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len, 82944, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x16x128_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len, 153600, "BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* 
mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 , /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -3662,72 +3719,78 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* 
mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x16x128u2_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x16x128u2_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len, 82944, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x16x128u2_EpiTile64x16_Mma64x16x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len, 153600, "BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* 
mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 , /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* 
mBatchedM */ {} @@ -3736,161 +3799,1056 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x32x128_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x32x128_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len, 93184, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x32x128_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len, 153600, "BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ 
trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ 
gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len, 225280, "BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* 
mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len, 225280, 
"BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 
+, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin_len, 225280, "BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* 
mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, 
+{BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin, BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin_len, 225280, "BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mUseShuffledMatrixA 
*/ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin, BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len, 165888, "BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ 
trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl 
*/ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin, BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len, 165888, "BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* 
mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 1 +, /* mNumTokens */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin, BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len, 165888, "BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc 
*/ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* 
mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin, BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len, 165888, "BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* 
mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 128 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 32 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin, BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len, 165888, 
"BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, 
/* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 128 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 32 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin, BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len, 165888, "BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* 
mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 128 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 32 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, 
+{BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin, BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len, 165888, "BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mUseShuffledMatrixA */ 1 
, /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 +, /* mNumBatches */ 128 , /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteAct */ 1 +, /* mNumTokens */ 32 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) , /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x32x128u2_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x32x128u2_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len, 93184, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x32x128u2_EpiTile64x32_Mma64x32x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin, BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len, 165888, "BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* 
mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 , /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteAct */ 1 +, /* mIsStaticBatch */ 1 +, /* mNumTokens */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x64x128_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin, 
BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x64x128_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len, 115712, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x64x128_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin, BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len, 158720, "BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -3898,13 +4856,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 128 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -3914,11 +4874,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ 
gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 64 -, /* mMmaN */ 64 +, /* mMmaN */ 128 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 @@ -3932,11 +4894,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 128 , /* mTileK */ 128 , /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 @@ -3950,6 +4910,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -3958,13 +4919,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 1 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x64x128u2_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x64x128u2_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len, 115712, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x64x128u2_EpiTile64x64_Mma64x64x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin, BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len, 158720, "BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -3972,13 +4936,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 128 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -3988,11 +4954,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 64 -, /* mMmaN */ 64 +, /* mMmaN */ 128 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 @@ -4006,11 +4974,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ 
gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 128 , /* mTileK */ 128 , /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 @@ -4024,6 +4990,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -4032,13 +4999,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 1 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len, 149504, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin, BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len, 82944, 
"BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -4046,45 +5016,47 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 16 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 +, /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 64 -, /* mMmaN */ 8 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 4 , /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN 
*/ 16 , /* mTileK */ 128 , /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 @@ -4098,6 +5070,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -4106,13 +5079,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 1 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin_len, 61440, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin, BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len, 82944, "BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* 
mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -4120,47 +5096,49 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 16 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 +, /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 64 -, /* mMmaN */ 8 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 4 , /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 16 , /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 @@ -4172,21 +5150,25 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteAct */ 0 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a_cubin_len, 149504, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_InplaceRoute_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin, BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len, 93184, "BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* 
mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -4194,47 +5176,49 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 +, /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 64 -, /* mMmaN */ 8 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 4 , /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 @@ -4246,6 +5230,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -4254,13 +5239,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 1 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin_len, 61440, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin, BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len, 93184, "BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -4268,45 +5256,47 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ 
trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 +, /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 64 -, /* mMmaN */ 8 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 4 , /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileK */ 128 , /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 @@ -4320,21 +5310,25 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ 
gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteAct */ 0 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_perTokenSfB_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_perTokenSfB_dynamic_sm100a_cubin_len, 216064, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_perTokenSfB_dynamic_sm100a", 256, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin, BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len, 115712, "BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -4342,58 +5336,61 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC 
*/ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 64 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 +, /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 64 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 +, /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 +, /* mTileN */ 64 +, /* mTileK */ 128 , /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 +, /* mUsePerTokenSfB */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* 
mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -4402,13 +5399,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 1 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) , /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 216064, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin, BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len, 115712, "BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -4416,50 +5416,52 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ 
trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 64 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 +, /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 64 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 +, /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileN */ 64 +, /* mTileK */ 128 +, /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseTmaStore */ 1 @@ -4468,21 +5470,25 @@ static 
const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteAct */ 0 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_perTokenSfB_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_perTokenSfB_dynamic_sm100a_cubin_len, 216064, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_perTokenSfB_dynamic_sm100a", 256, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin, BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len, 149504, "BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, 
/* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -4490,12 +5496,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 +, /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4506,42 +5514,43 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 +, /* mMmaM */ 64 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStages */ 8 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 +, /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileK */ 128 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 +, /* 
mUsePerTokenSfB */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -4550,13 +5559,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 1 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) , /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 216064, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_E4m3_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin, BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin_len, 61440, "BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -4564,12 +5576,14 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 +, /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4580,34 +5594,34 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 +, /* mMmaM */ 64 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 +, /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileK */ 128 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseTmaStore */ 1 @@ -4616,6 +5630,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ 
trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -4624,21 +5639,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 1 , /* mNumTokens */ 0 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin_len, 61440, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x128_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin, BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin_len, 149504, "BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* 
mEnablesEarlyExit */ 0 +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 @@ -4654,6 +5674,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -4663,7 +5685,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 +, /* mNumStages */ 8 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 2 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -4672,13 +5694,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 @@ -4690,28 +5710,34 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteAct */ 0 +, /* mIsStaticBatch */ 0 +, /* 
mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a_cubin_len, 61440, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x128u2_EpiTile64x8_Mma64x8x32_Cluster1x1x1_transposeMmaOutput_DsFp8_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin, BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin_len, 61440, "BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -4728,6 +5754,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, 
/* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -4746,8 +5774,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -4764,6 +5790,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -4772,21 +5799,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 1 , /* mNumTokens */ 0 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 217088, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin, 
BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len, 216064, "BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a", 256, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mEnablesEarlyExit */ 0 +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 @@ -4802,6 +5834,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -4820,8 +5854,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -4838,28 +5870,34 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* 
mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteAct */ 0 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) , /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin, BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 217088, "BatchedGemmKernel_BatchN_E4m3E4m3Fp32_Fp16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin, BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len, 216064, "BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -4876,6 
+5914,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -4894,13 +5934,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -4912,6 +5950,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -4920,20 +5959,25 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 1 , /* mNumTokens */ 0 -, /* mRouteAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len, 153600, 
"BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin, BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len, 216064, "BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a", 256, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -4950,31 +5994,31 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 3 +, /* 
mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -4984,8 +6028,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -4994,21 +6039,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len, 153600, 
"BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin, BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len, 216064, "BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 @@ -5024,31 +6074,31 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 
1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -5058,31 +6108,37 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 1 +, /* mNumTokens */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len, 153600, 
"BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin, BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len, 166912, "BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 @@ -5098,9 +6154,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 @@ -5108,21 +6166,19 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , 
/* mNumStages */ 4 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -5134,29 +6190,35 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 1 +, /* mNumTokens */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len, 153600, 
"BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x256u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin, BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len, 166912, "BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 @@ -5172,9 +6234,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 @@ -5182,20 +6246,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* 
mNumSlicesForSliceK */ 1 , /* mNumStages */ 4 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 @@ -5208,34 +6270,40 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 1 +, /* mNumTokens */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len, 225280, "BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a", 448, { 
/* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin, BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin_len, 61440, "BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 +, /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5246,10 +6314,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -5257,59 +6327,63 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mNumSlicesForSliceK */ 1 , /* mNumStages */ 3 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 +, /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 512 +, /* mTileK */ 128 , /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 1 +, /* mNumTokens */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin, 
BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len, 225280, "BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin, BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin_len, 61440, "BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 +, /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5320,10 +6394,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ 
gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -5331,54 +6407,58 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 3 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 +, /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mTileK */ 128 +, /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 1 +, /* mNumTokens */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 
, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a_cubin_len, 225280, "BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_dynamic_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin, BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len, 217088, "BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 @@ -5394,9 +6474,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* 
mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 @@ -5404,21 +6486,19 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 3 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -5428,31 +6508,37 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 1 +, /* mNumTokens */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , 
/* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a_cubin_len, 225280, "BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_Bfloat16_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_persistent_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin, BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin_len, 217088, "BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 @@ -5468,9 +6554,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits 
*/ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 @@ -5478,16 +6566,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 3 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -5502,23 +6588,27 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 1 +, /* mNumTokens */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, 
-{BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_MxE4m3_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_MxE4m3_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a_cubin_len, 224256, "BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_MxE4m3_Tile128x8x512_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin, BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len, 224256, "BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -5526,6 +6616,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826828) , /* mDtypeB */ trtllm::gen::Dtype(17827853) , /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -5542,6 +6634,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ 
trtllm::gen::MmaKind(5) @@ -5560,8 +6654,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -5576,8 +6668,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -5586,13 +6679,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 1 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 1 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, -{BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_MxE4m3_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a_cubin, BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_MxE4m3_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a_cubin_len, 224256, "BatchedGemmKernel_BatchN_MxE2m1MxE4m3Fp32_MxE4m3_Tile128x8x512u2_EpiTile128x8_Mma128x8x32_Cluster1x1x1_transposeMmaOutput_InplaceRoute_GatedAct_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin, 
BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin_len, 224256, "BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -5600,6 +6696,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826828) , /* mDtypeB */ trtllm::gen::Dtype(17827853) , /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -5616,6 +6714,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(5) @@ -5634,8 +6734,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -5650,8 +6748,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* 
mActType */ gemmGatedAct::ActType(0) , /* mBatchedM */ {} @@ -5660,11 +6759,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumBatches */ 2 , /* mIsStaticBatch */ 0 , /* mNumTokens */ 2 -, /* mRouteAct */ 1 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) , /* mGridWaitForPrimaryRouting */ 1 , /* mFusedAct */ 1 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 }, gemm::SmVersion::Sm100a }, #endif // EXCLUDE_SM_100 }; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h index 77dd7e43d7..6fb72cf40d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h @@ -19,6 +19,7 @@ #include "trtllm/gen/CommonUtils.h" #include "trtllm/gen/SfLayoutDecl.h" +#include "BatchedGemmEnums.h" #include "Enums.h" #include "TmaDescriptor.h" @@ -65,16 +66,22 @@ struct KernelParams // Tile box strides are [tileK, 1]. // // If batchN: - // If transposeMatrixA is false + // If layoutA is MatrixLayout::MajorK // Logical shape is [B, divUpMul(M, tileM), K]. // Logical strides are [divUpMul(M, tileM) * K, K, 1]. // Tile box shape is [1, tileM, tileK]. // Tile box strides are [0, tileK, 1]. - // If transposeMatrixA is true + // If layoutA is MatrixLayout::MajorMn // Logical shape is [B, K, divUpMul(M, tileM)]. // Logical strides are [K * divUpMul(M, tileM), divUpMul(M, tileM), 1]. // Tile box shape is [1, tileK, tileM]. // Tile box strides are [0, tileM, 1]. + // If layoutA is MatrixLayout::BlockMajorK + // Logical shape is [B, K / blockK, divUpMul(M, tileM), blockK]. + // Logical strides are [K * divUpMul(M, tileM), divUpMul(M, tileM) * blockK, blockK, 1].
+ // Tile box shape is [1, tileK / min(blockK, tileK), tileM, min(blockK, tileK)]. + // Tile box strides are [0, tileM * min(blockK, tileK), min(blockK, tileK), 1]. + // where blockK is 128B. // // Dtype is set from options.mDtypeA. CUtensorMap tmaA[1]; @@ -84,16 +91,22 @@ struct KernelParams // makeTmaShapeStrideAbc. // // If batchM: - // If transposeMatrixB is true + // If layoutB is MatrixLayout::MajorK // Logical shape is [B, divUpMul(N, tileN), K]. // Logical strides are [divUpMul(N, tileN) * K, K, 1]. // Tile box shape is [1, tileN, tileK]. // Tile box strides are [0, tileK, 1]. - // If transposeMatrixB is false + // If layoutB is MatrixLayout::MajorMn // Logical shape is [B, K, divUpMul(N, tileN)]. // Logical strides are [K * divUpMul(N, tileN), divUpMul(N, tileN), 1]. // Tile box shape is [1, tileK, tileN]. // Tile box strides are [0, tileN, 1]. + // If layoutB is MatrixLayout::BlockMajorK + // Logical shape is [B, K / blockK, divUpMul(N, tileN), blockK]. + // Logical strides are [K * divUpMul(N, tileN), divUpMul(N, tileN) * blockK, blockK, 1]. + // Tile box shape is [1, tileK / min(blockK, tileK), tileN, min(blockK, tileK)]. + // Tile box strides are [0, tileN * min(blockK, tileK), min(blockK, tileK), 1]. + // where blockK is 128B. // // If batchN: // Logical shape is [sum(divUpMul(N[bi], tileN) for bi in B), K]. @@ -212,6 +225,11 @@ struct KernelParams // Shape is [B]. One scaling factor per tensor in batch. float const* ptrScaleGate; + // The alpha and beta for SwiGlu. + // Shape is [B]. One alpha and one beta per tensor in batch. + float const* ptrSwiGluAlpha; + float const* ptrSwiGluBeta; + // The K dimension. It is the hidden dimension of the input matrices. int32_t k; @@ -294,6 +312,21 @@ struct KernelParams // Logical shape is [sum(divUpMul(N[bi], tileN) for bi in B)] void const* ptrPerTokenSfB; + // The bias applied after the GEMM and before the activation function. + // The bias is applied before applying the global scaling factor. I.e. 
+ // C = act(A * B + bias') * scaleC + // scaleC = dequantA * dequantB * quantC + // Thus, the bias' = bias / (dequantA * dequantB), where the bias is the original bias. + // + // If batchM, BiasType must be N, and bias shape is [B, N]. + // The bias is broadcasted along the M dimension. + // + // If batchN, BiasType must be M, and bias shape is [B, M]. + // The bias is broadcasted along the N dimension. + // + // The dtype is float32. + void const* ptrBias{nullptr}; + // The output block scaling factors for C. // // If MxFp{4,8} and NvFp4 formats are used, @@ -366,6 +399,9 @@ struct KernelParams // Total number of unpadded inputs int32_t numTokens; + // Total number of batches + int32_t numBatches; + ////////////////////////////////////////////////////////////////////////////////////////////////// // // Batching information parameters. @@ -478,29 +514,44 @@ struct KernelParams }; // Create the TMA shape/stride for A/B/C. - static auto makeTmaShapeStrideAbc(bool const transposeMatrixA, bool const transposeMatrixB, - bool const transposeMmaOutput, bool const useFusedAct, int numBatches, int mM, int mN, int mK, - MatrixType matrixType) + template + static auto makeTmaShapeStrideAbc( + GemmOptions const& options, int mM, int mN, int mK, int tileM, int tileN, int tileK, MatrixType matrixType) { // Weights matrix is A if we transpose the output of MMA (to have it M-major). // Otherwise, it is B, when the output of MMA is K-major. - bool const isWeights = (matrixType == MatrixType::MatrixA && transposeMmaOutput) - || (matrixType == MatrixType::MatrixB && !transposeMmaOutput); + bool const isWeights = (matrixType == MatrixType::MatrixA && options.mTransposeMmaOutput) + || (matrixType == MatrixType::MatrixB && !options.mTransposeMmaOutput); + // The outer dimension. auto numTokens = (matrixType == MatrixType::MatrixA || matrixType == MatrixType::MatrixC) ? mM : mN; + // The outer dimension tile size. + auto tileNumTokens = (matrixType == MatrixType::MatrixC) ?
options.mEpilogueTileM + : (matrixType == MatrixType::MatrixA) ? tileM + : tileN; + // The inner dimension. auto hiddenSize = (matrixType == MatrixType::MatrixC) ? mN : mK; - if (matrixType == MatrixType::MatrixC && transposeMmaOutput) + // The inner dimension tile size. + auto tileHiddenSize = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileN : tileK; + + // Swap matrix C sizes if output is transpose + if (matrixType == MatrixType::MatrixC && options.mTransposeMmaOutput) { numTokens = mN; hiddenSize = mM; + tileNumTokens = options.mEpilogueTileN; + tileHiddenSize = options.mEpilogueTileM; } // For a fused activation kernel, the hidden size of output is halved. TODO: That's true for // gated activations but not regular activations. - if (useFusedAct) + if (options.mFusedAct) { if (matrixType == MatrixType::MatrixC) + { hiddenSize /= 2; + tileHiddenSize /= 2; + } } // The cute tensor shape for A/B: (numTokens, hiddenSize). @@ -511,7 +562,7 @@ struct KernelParams // Ativations matrix is 2D (sum(divUpMul(M[bi], tileM) for bi in B), K). if (isWeights) { - shape.push_back(static_cast(numBatches)); + shape.push_back(static_cast(options.mNumBatches)); } // Assemble the stride (strideTokens, 1). @@ -522,20 +573,41 @@ struct KernelParams stride.push_back(static_cast(hiddenSize * numTokens)); } - // Apply transpose if necessary - if ((matrixType == MatrixType::MatrixA && transposeMatrixA) - || (matrixType == MatrixType::MatrixB && !transposeMatrixB)) + // Assemble the box shape + std::vector tileShape = {tileHiddenSize, tileNumTokens}; + + // Alternate layouts do not apply to matrixC + if (matrixType != MatrixType::MatrixC) { - std::swap(shape[0], shape[1]); - stride[1] = numTokens; + gemm::MatrixLayout layout = (matrixType == MatrixType::MatrixA) ? 
options.mLayoutA : options.mLayoutB; + // Note, only the weights support non MajorK layouts + if (layout == gemm::MatrixLayout::MajorMn) + { + // Apply transpose if necessary + std::swap(shape[0], shape[1]); + stride[1] = numTokens; + std::swap(tileShape[0], tileShape[1]); + } + else if (layout == gemm::MatrixLayout::BlockMajorK) + { + // Set shapes based on blocking layout + shape = {static_cast(options.mBlockK), static_cast(numTokens), + static_cast(mK / options.mBlockK), static_cast(options.mNumBatches)}; + stride = {1, static_cast(options.mBlockK), static_cast(numTokens * options.mBlockK), + static_cast(hiddenSize * numTokens)}; + + // If blockK > tileK, then the inner most box size will be based on the tile + int32_t const tileBlockK = std::min(options.mBlockK, tileHiddenSize); + tileShape = {tileBlockK, tileNumTokens, tileHiddenSize / tileBlockK}; + } } - return std::make_tuple(shape, stride); + return std::make_tuple(shape, stride, tileShape); } // Create the TMA shape/stride for A/B block scaling factors. static auto makeTmaShapeStrideSfAb(int mM, int mN, int mK, MatrixType matrixType, int tileM, int tileN, int tileK, - tg::Dtype dtypeElt, tg::SfLayout layout) + tg::Dtype dtypeElt, tg::SfLayout layout, int sfReshapeFactor) { // The outer dimension. @@ -585,15 +657,36 @@ struct KernelParams { // The scaling factor tensor packs 8x4 tiles into contiguous 32B blocks. // - // As the inner dimension (k) is required to be a multiple of the tile size, we - // can reshape to use fewer read requests, if the tile dimensions allow. - // I.e., let's define repeats = min(hiddenSizePerTile / numEltsPerSf / 4, 8) + // As the inner dimension (k) is often a multiple of the tile size, we can reshape to use + // fewer read requests, if the tile dimensions allow. It does not reduce the number of + // instructions. 
+ // + // I.e., let's define r = min(⌈hiddenSizePerTile / (numEltsPerSf * 4)⌉, 8) + // + // The "logical" tensor is: [outer, inner / numEltsPerSf] + // The 8x4 SF layout is: [⌈outer / 8⌉, inner / (4 * numEltsPerSf), 32] + // The TMA tensor shape is: [⌈outer / 8⌉, inner / (4 * numEltsPerSf * r), r * 32] // - // The "logical" tensor is: [outer, inner / numEltsPerSf] - // The 8x4 SF layout is: [outer / 8, inner / numEltsPerSf / 4, 32] - // The TMA tensor shape is: [outer / 8, inner / numEltsPerSf / 4 / repeats, repeats * 32] + // The caveat of NumRepeats>1 is we must pad the hidden dimension of SF to multiples of + // NumRepeats * numEltsPerSf * 4. + + // The factor must be positive (repeats is used as a divisor below) and a power of 2, i.e. (r & (r - 1)) == 0. + int const r = sfReshapeFactor; + if (r <= 0 || (r & (r - 1)) != 0) + { + throw std::runtime_error( + "mSfReshapeFactor must be positive and a power of 2. Found " + std::to_string(r)); + } - int const repeats = std::min(ceilDiv(hiddenSizePerTile, numEltsPerSf * 4), 8); + // Sanitize number of repeats so it doesn't exceed the dimension. + int const repeats = std::min(ceilDiv(hiddenSizePerTile, numEltsPerSf * 4), r); + + // Detect if the input hidden size K is a multiple of the repeats.
+ if (ceilDiv(hiddenSize, numEltsPerSf * 4) % repeats != 0) + { + throw std::runtime_error("SF hiddenSize K (" + std::to_string(ceilDiv(hiddenSize, numEltsPerSf * 4)) + + ") must be a multiple of repeats (" + std::to_string(repeats) + ")"); + } auto shape = std::vector{static_cast(repeats * 32), static_cast(ceilDiv(hiddenSize, numEltsPerSf * 4 * repeats)), @@ -618,18 +711,14 @@ struct KernelParams return std::make_tuple(std::vector{}, std::vector{}, std::vector{}); } - static KernelParams setKernelParams(int32_t const numBatches, int32_t const numTokens, bool const batchM, - int32_t const m, int32_t const n, int32_t const k, std::vector const& batchedM, - std::vector const& batchedN, int32_t const tileM, int32_t const tileN, int32_t const tileK, - int32_t const epilogueTileM, int32_t const epilogueTileN, bool const useDeepSeekFp8, bool const useTmaStore, - bool const transposeMatrixA, bool const transposeMatrixB, bool const transposeMmaOutput, tg::SfLayout sfLayoutB, - bool const useFusedAct, tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::MmaKind mmaKind, - void const* ptrA, void const* ptrB, void* ptrC, void const* dSfA, void const* dSfB, void const* ptrPerTokenSfA, - void const* ptrPerTokenSfB, void* dSfC, float const* ptrScaleC, float const* ptrScaleGate, - int32_t const* ptrRouteMap, float* rowMax, uint32_t* rowMaxBars, bool isStaticBatch = true, - int32_t const* ptrNumNonExitingCtas = nullptr, int32_t const* ptrTotalNumPaddedTokens = nullptr, - int32_t const* ptrCtaIdxXyToBatchIdx = nullptr, int32_t const* ptrCtaIdxXyToMnLimit = nullptr, - int32_t const maxNumCtas = MaxNumCtas) + template + static KernelParams setKernelParams(GemmOptions_ const& options, bool const batchM, void const* ptrA, + void const* ptrB, void* ptrC, void const* dSfA, void const* dSfB, void const* ptrPerTokenSfA, + void const* ptrPerTokenSfB, void const* ptrBias, void* dSfC, float const* ptrScaleC, float const* ptrScaleGate, + float const* ptrSwiGluAlpha, float const* 
ptrSwiGluBeta, int32_t const* routeMap, float* rowMax, + uint32_t* rowMaxBars, int32_t const* ptrNumNonExitingCtas = nullptr, + int32_t const* ptrTotalNumPaddedTokens = nullptr, int32_t const* ptrCtaIdxXyToBatchIdx = nullptr, + int32_t const* ptrCtaIdxXyToMnLimit = nullptr, int32_t const maxNumCtas = MaxNumCtas) { static_assert(sizeof(KernelParams) <= 32 * 1024, "sizeof(KernelParams) has to be less or equal than 32KB"); @@ -637,31 +726,34 @@ struct KernelParams // Create the return struct. KernelParams params; - if (numBatches > KernelParams::MaxBatchSize) + if (options.mNumBatches > KernelParams::MaxBatchSize) { throw std::runtime_error("GEMM batch limit reached."); } - params.ptrRouteMap = ptrRouteMap; - params.numTokens = numTokens; + params.ptrRouteMap = routeMap; + params.numTokens = options.mNumTokens; params.ptrScaleC = ptrScaleC; params.ptrScaleGate = ptrScaleGate; + params.ptrSwiGluAlpha = ptrSwiGluAlpha; + params.ptrSwiGluBeta = ptrSwiGluBeta; + int32_t ctaOffset = 0; // Compute totalNumPaddedTokens, ctaIdxXyToBatchIdx and ctaIdxXyToMnLimit if the batch dims are // known at kernel launch time. Otherwise, these parameters are defined in the device buffers: // ptrTotalNumPaddedTokens, ptrCtaIdxXyToBatchIdx and ptrCtaIdxXyToMnLimit respectively. - if (isStaticBatch) + if (options.mIsStaticBatch) { params.totalNumPaddedTokens = 0; - for (int b = 0; b < numBatches; b++) + for (int b = 0; b < options.mNumBatches; b++) { - int mM = batchM ? batchedM[b] : n; - int mN = batchM ? m : batchedN[b]; + int mM = batchM ? options.mBatchedM[b] : options.mN; + int mN = batchM ? options.mM : options.mBatchedN[b]; // Skip Tma descriptor creation if expert isn't used if (mM == 0 || mN == 0) @@ -670,9 +762,10 @@ struct KernelParams } // The number of CTAs. - int32_t numCtas = batchM ? (mM + tileM - 1) / tileM : (mN + tileN - 1) / tileN; + int32_t numCtas + = batchM ? 
(mM + options.mTileM - 1) / options.mTileM : (mN + options.mTileN - 1) / options.mTileN; // The size of the tile. - int32_t tile = batchM ? tileM : tileN; + int32_t tile = batchM ? options.mTileM : options.mTileN; // The problem size. int32_t mn = batchM ? mM : mN; int32_t tokensPerTile = mn; @@ -704,15 +797,15 @@ struct KernelParams ctaOffset = maxNumCtas; } - if (useDeepSeekFp8 && dtypeC == tg::Dtype::E4m3) + if (options.mUseDeepSeekFp8 && options.mDtypeC == tg::Dtype::E4m3) { params.ptrDqSfsC = reinterpret_cast(dSfC); } params.ptrA = ptrA; params.ptrB = ptrB; - params.strideInBytesA = k * tg::dtypeGetNumBits(dtypeA) / 8; - params.strideInBytesB = k * tg::dtypeGetNumBits(dtypeB) / 8; + params.strideInBytesA = options.mK * tg::dtypeGetNumBits(options.mDtypeA) / 8; + params.strideInBytesB = options.mK * tg::dtypeGetNumBits(options.mDtypeB) / 8; params.ptrSfA = dSfA; params.ptrSfB = dSfB; @@ -721,75 +814,96 @@ struct KernelParams if (!batchM) { // A is the expert - if (0 != m % tileM) + if (0 != options.mM % options.mTileM) { throw std::runtime_error("0 == mM %% tileM"); } - params.tileStridePerBatch = m / tileM; - params.nm = m; + params.tileStridePerBatch = options.mM / options.mTileM; + params.nm = options.mM; // Shape/stride for gmem tensor A. - auto [shapeA, strideA] = makeTmaShapeStrideAbc(transposeMatrixA, transposeMatrixB, transposeMmaOutput, - useFusedAct, numBatches, m, n, k, MatrixType::MatrixA); + auto [shapeA, strideA, tileShapeA] = makeTmaShapeStrideAbc(options, options.mM, options.mN, options.mK, + options.mTileM, options.mTileN, options.mTileK, MatrixType::MatrixA); // Build tma descriptor for A. - params.tmaA[0] = gemm::buildNdTmaDescriptor(dtypeA, mmaKind, shapeA, strideA, - transposeMatrixA ? tileK : tileM, transposeMatrixA ? tileM : tileK, const_cast(ptrA)); + params.tmaA[0] = gemm::buildNdTmaDescriptor( + options.mDtypeA, options.mMmaKind, shapeA, strideA, tileShapeA, const_cast(ptrA)); // The input is padded: // [act0, padding, padding, ... 
TileN size .., act1, padding, padding, ...] - auto const inputNumTokens = ctaOffset * tileN; - // B is the activation - // Shape/stride for gmem tensor B. - auto [shapeB, strideB] = makeTmaShapeStrideAbc(transposeMatrixA, transposeMatrixB, transposeMmaOutput, - useFusedAct, numBatches, m, inputNumTokens, k, MatrixType::MatrixB); - // Build tma descriptor for B. - params.tmaB[0] = gemm::buildNdTmaDescriptor(dtypeB, mmaKind, shapeB, strideB, - !transposeMatrixB ? tileK : tileN, !transposeMatrixB ? tileN : tileK, const_cast(ptrB)); + auto const inputNumTokens = ctaOffset * options.mTileN; + + if (!batchedGemm::doesRouteImplUseLdgsts(options.mRouteImpl)) + { + bool useRouteAct = batchedGemm::doesRouteImplUseTma(options.mRouteImpl); + // B is the activation + // Shape/stride for gmem tensor B. + auto [shapeB, strideB, tileShapeB] = makeTmaShapeStrideAbc(options, options.mM, + useRouteAct ? options.mNumTokens : inputNumTokens, options.mK, options.mTileM, + (useRouteAct ? 1 : options.mTileN), options.mTileK, MatrixType::MatrixB); + // Build tma descriptor for B. + params.tmaB[0] = gemm::buildNdTmaDescriptor( + options.mDtypeB, options.mMmaKind, shapeB, strideB, tileShapeB, const_cast(ptrB)); + } - if (dtypeA == tg::Dtype::E2m1 || dtypeA == tg::Dtype::MxE4m3 || dtypeA == tg::Dtype::MxE2m1) + if (options.mDtypeA == tg::Dtype::E2m1 || options.mDtypeA == tg::Dtype::MxE4m3 + || options.mDtypeA == tg::Dtype::MxE2m1) { - tg::Dtype const dTypeSf = (dtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; + tg::Dtype const dTypeSf = (options.mDtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; // Build TMA descriptor for gmem A block scaling factors. 
- auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideSfAb( - m * numBatches, n, k, MatrixType::MatrixA, tileM, tileN, tileK, dtypeA, tg::SfLayout::R128c4); + auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideSfAb(options.mM * options.mNumBatches, + options.mN, options.mK, MatrixType::MatrixA, options.mTileM, options.mTileN, options.mTileK, + options.mDtypeA, tg::SfLayout::R128c4, options.mSfReshapeFactor); params.tmaSfA[0] = gemm::buildSfTmaDescriptor(dTypeSf, shapeSfA, strideSfA, tileShapesSfA, const_cast(dSfA)); } - if (dtypeB == tg::Dtype::E2m1 || dtypeB == tg::Dtype::MxE4m3 || dtypeB == tg::Dtype::MxE2m1) + if (options.mDtypeB == tg::Dtype::E2m1 || options.mDtypeB == tg::Dtype::MxE4m3 + || options.mDtypeB == tg::Dtype::MxE2m1) { - tg::Dtype const dTypeSf = (dtypeB == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; + tg::Dtype const dTypeSf = (options.mDtypeB == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; - // The input is padded: - // [act0, padding, padding, ... TileN size .., act1, padding, padding, ...] - auto const inputNumTokensSfB = ctaOffset * tileN; + if (batchedGemm::doesRouteImplUseTma(options.mRouteImpl)) + { - // Build TMA descriptor for gmem B block scaling factors. - auto [shapeSfB, strideSfB, tileShapesSfB] = makeTmaShapeStrideSfAb( - m, inputNumTokensSfB, k, MatrixType::MatrixB, tileM, tileN, tileK, dtypeB, sfLayoutB); - params.tmaSfB[0] - = gemm::buildSfTmaDescriptor(dTypeSf, shapeSfB, strideSfB, tileShapesSfB, const_cast(dSfB)); + // The input is NOT padded: + // [act0, act1, act2, ...] + + // Build TMA descriptor for gmem B block scaling factors. 
+ int32_t const numEltsPerSf = tg::dtypeNumEltsPerSf(options.mDtypeB); + auto [shapeSfB, strideSfB, tileShapesSfB] + = makeTmaShapeStrideAbc(options, options.mM, options.mNumTokens, options.mK / numEltsPerSf, + options.mTileM, 1 /* tileN */, options.mTileK / numEltsPerSf, MatrixType::MatrixB); + params.tmaSfB[0] = gemm::buildNdTmaDescriptor(dTypeSf, options.mMmaKind, shapeSfB, strideSfB, + tileShapesSfB, const_cast(dSfB), + /*doSwizzle*/ true); + } + else if (batchedGemm::doesRouteImplUseNoRoute(options.mRouteImpl)) + { + + // The input is padded: + // [act0, padding, padding, ... TileN size .., act1, padding, padding, ...] + + auto const inputNumTokensSfB = ctaOffset * options.mTileN; + + // Build TMA descriptor for gmem B block scaling factors. + auto [shapeSfB, strideSfB, tileShapesSfB] = makeTmaShapeStrideSfAb(options.mM, inputNumTokensSfB, + options.mK, MatrixType::MatrixB, options.mTileM, options.mTileN, options.mTileK, + options.mDtypeB, options.mSfLayoutB, options.mSfReshapeFactor); + params.tmaSfB[0] = gemm::buildSfTmaDescriptor( + dTypeSf, shapeSfB, strideSfB, tileShapesSfB, const_cast(dSfB)); + } } // C is the output activation - if (useTmaStore) + if (options.mUseTmaStore) { // Shape/stride for gmem tensor C. - auto [shapeC, strideC] = makeTmaShapeStrideAbc(transposeMatrixA, transposeMatrixB, transposeMmaOutput, - useFusedAct, numBatches, m, ctaOffset * tileN, k, MatrixType::MatrixC); - - // Swap M and N tiles for the M-major epilogue. - auto outputTileM = transposeMmaOutput ? epilogueTileN : epilogueTileM; - auto outputTileN = transposeMmaOutput ? epilogueTileM : epilogueTileN; - - if (useFusedAct) - { - // for a fused activation kernel, output tile `N` is halved - outputTileN /= 2; - } + auto [shapeC, strideC, tileShapeC] + = makeTmaShapeStrideAbc(options, options.mM, ctaOffset * options.mTileN, options.mK, options.mTileM, + options.mTileN, options.mTileK, MatrixType::MatrixC); // Build tma descriptor for C. 
- params.tmaC[0] = gemm::buildNdTmaDescriptor( - dtypeC, tg::MmaKind::Auto, shapeC, strideC, outputTileM, outputTileN, ptrC); + params.tmaC[0] + = gemm::buildNdTmaDescriptor(options.mDtypeC, tg::MmaKind::Auto, shapeC, strideC, tileShapeC, ptrC); } else { @@ -799,75 +913,76 @@ struct KernelParams else { // B is the expert - if (0 != n % tileN) + if (0 != options.mN % options.mTileN) { throw std::runtime_error("0 == mN %% tileN"); } - params.tileStridePerBatch = n / tileN; - params.nm = n; + params.tileStridePerBatch = options.mN / options.mTileN; + params.nm = options.mN; // Shape/stride for gmem tensor B. - auto [shapeB, strideB] = makeTmaShapeStrideAbc(transposeMatrixA, transposeMatrixB, transposeMmaOutput, - useFusedAct, numBatches, m, n, k, MatrixType::MatrixB); + auto [shapeB, strideB, tileShapeB] = makeTmaShapeStrideAbc(options, options.mM, options.mN, options.mK, + options.mTileM, options.mTileN, options.mTileK, MatrixType::MatrixB); // Build tma descriptor for B. - params.tmaB[0] = gemm::buildNdTmaDescriptor(dtypeB, mmaKind, shapeB, strideB, - !transposeMatrixB ? tileK : tileN, !transposeMatrixB ? tileN : tileK, const_cast(ptrB)); + params.tmaB[0] = gemm::buildNdTmaDescriptor( + options.mDtypeB, options.mMmaKind, shapeB, strideB, tileShapeB, const_cast(ptrB)); - // A is the activation - // Shape/stride for gmem tensor A. - // The input is padded: - // [act0, padding, padding, ... tileM size .., act1, padding, padding, ...] - auto const inputNumTokens = ctaOffset * tileM; - auto [shapeA, strideA] = makeTmaShapeStrideAbc(transposeMatrixA, transposeMatrixB, transposeMmaOutput, - useFusedAct, numBatches, inputNumTokens, n, k, MatrixType::MatrixA); - // Build tma descriptor for A. - params.tmaA[0] = gemm::buildNdTmaDescriptor(dtypeA, mmaKind, shapeA, strideA, - transposeMatrixA ? tileK : tileM, transposeMatrixA ? 
tileM : tileK, const_cast(ptrA)); - - if (dtypeA == tg::Dtype::E2m1 || dtypeA == tg::Dtype::MxE4m3 || dtypeA == tg::Dtype::MxE2m1) + if (options.mRouteImpl == batchedGemm::RouteImpl::NoRoute) { - tg::Dtype const dTypeSf = (dtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; - + // A is the activation + // Shape/stride for gmem tensor A. // The input is padded: // [act0, padding, padding, ... tileM size .., act1, padding, padding, ...] - auto const inputNumTokensSfA = ctaOffset * tileM; + auto const inputNumTokens = ctaOffset * options.mTileM; + auto [shapeA, strideA, tileShapeA] = makeTmaShapeStrideAbc(options, inputNumTokens, options.mN, + options.mK, options.mTileM, options.mTileN, options.mTileK, MatrixType::MatrixA); + // Build tma descriptor for A. + params.tmaA[0] = gemm::buildNdTmaDescriptor( + options.mDtypeA, options.mMmaKind, shapeA, strideA, tileShapeA, const_cast(ptrA)); + } - // Build TMA descriptor for gmem A block scaling factors. - auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideSfAb( - inputNumTokensSfA, n, k, MatrixType::MatrixA, tileM, tileN, tileK, dtypeA, tg::SfLayout::R128c4); - params.tmaSfA[0] - = gemm::buildSfTmaDescriptor(dTypeSf, shapeSfA, strideSfA, tileShapesSfA, const_cast(dSfA)); + if (options.mDtypeA == tg::Dtype::E2m1 || options.mDtypeA == tg::Dtype::MxE4m3 + || options.mDtypeA == tg::Dtype::MxE2m1) + { + tg::Dtype const dTypeSf = (options.mDtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; + + if (options.mRouteImpl == batchedGemm::RouteImpl::NoRoute) + { + + // The input is padded: + // [act0, padding, padding, ... tileM size .., act1, padding, padding, ...] + auto const inputNumTokensSfA = ctaOffset * options.mTileM; + + // Build TMA descriptor for gmem A block scaling factors. 
+ auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideSfAb(inputNumTokensSfA, options.mN, + options.mK, MatrixType::MatrixA, options.mTileM, options.mTileN, options.mTileK, + options.mDtypeA, tg::SfLayout::R128c4, options.mSfReshapeFactor); + params.tmaSfA[0] = gemm::buildSfTmaDescriptor( + dTypeSf, shapeSfA, strideSfA, tileShapesSfA, const_cast(dSfA)); + } } - if (dtypeB == tg::Dtype::E2m1 || dtypeB == tg::Dtype::MxE4m3 || dtypeB == tg::Dtype::MxE2m1) + if (options.mDtypeB == tg::Dtype::E2m1 || options.mDtypeB == tg::Dtype::MxE4m3 + || options.mDtypeB == tg::Dtype::MxE2m1) { - tg::Dtype const dTypeSf = (dtypeB == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; + tg::Dtype const dTypeSf = (options.mDtypeB == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; // Build TMA descriptor for gmem B block scaling factors. - auto [shapeSfB, strideSfB, tileShapesSfB] = makeTmaShapeStrideSfAb( - m, n * numBatches, k, MatrixType::MatrixB, tileM, tileN, tileK, dtypeB, sfLayoutB); + auto [shapeSfB, strideSfB, tileShapesSfB] = makeTmaShapeStrideSfAb(options.mM, + options.mN * options.mNumBatches, options.mK, MatrixType::MatrixB, options.mTileM, options.mTileN, + options.mTileK, options.mDtypeB, options.mSfLayoutB, options.mSfReshapeFactor); params.tmaSfB[0] = gemm::buildSfTmaDescriptor(dTypeSf, shapeSfB, strideSfB, tileShapesSfB, const_cast(dSfB)); } // C is the output activation - if (useTmaStore) + if (options.mUseTmaStore) { // Shape/stride for gmem tensor C. - auto [shapeC, strideC] = makeTmaShapeStrideAbc(transposeMatrixA, transposeMatrixB, transposeMmaOutput, - useFusedAct, numBatches, ctaOffset * tileM, n, k, MatrixType::MatrixC); - - // Swap M and N tiles for the M-major epilogue. - auto outputTileM = transposeMmaOutput ? epilogueTileN : epilogueTileM; - auto outputTileN = transposeMmaOutput ? 
epilogueTileM : epilogueTileN; - - if (useFusedAct) - { - // for a fused activation kernel, output tile `N` is halved - outputTileN /= 2; - } + auto [shapeC, strideC, tileShapeC] = makeTmaShapeStrideAbc(options, ctaOffset * options.mTileM, + options.mN, options.mK, options.mTileM, options.mTileN, options.mTileK, MatrixType::MatrixC); // Build tma descriptor for C. - params.tmaC[0] = gemm::buildNdTmaDescriptor( - dtypeC, tg::MmaKind::Auto, shapeC, strideC, outputTileM, outputTileN, ptrC); + params.tmaC[0] + = gemm::buildNdTmaDescriptor(options.mDtypeC, tg::MmaKind::Auto, shapeC, strideC, tileShapeC, ptrC); } else { @@ -875,7 +990,8 @@ struct KernelParams } } - params.k = k; + params.k = options.mK; + params.numBatches = options.mNumBatches; params.rank = 0; params.tpGrpSize = 1; @@ -888,32 +1004,10 @@ struct KernelParams // Set the per-token scale factors for MetaFP8 or scale inputs params.ptrPerTokenSfA = ptrPerTokenSfA; params.ptrPerTokenSfB = ptrPerTokenSfB; + params.ptrBias = ptrBias; return params; } - - //////////////////////////////////////////////////////////////////////////////////////////////////// - - template - static KernelParams setKernelParams(GemmOptions_ const& options, bool const batchM, void const* ptrA, - void const* ptrB, void* ptrC, void const* dSfA, void const* dSfB, void const* ptrPerTokenSfA, - void const* ptrPerTokenSfB, void* dSfC, float const* ptrScaleC, float const* ptrScaleGate, - int32_t const* routeMap, float* rowMax, uint32_t* rowMaxBars, int32_t const* ptrNumNonExitingCtas = nullptr, - int32_t const* ptrTotalNumPaddedTokens = nullptr, int32_t const* ptrCtaIdxXyToBatchIdx = nullptr, - int32_t const* ptrCtaIdxXyToMnLimit = nullptr, int32_t const maxNumCtas = MaxNumCtas) - { - - bool const useFusedAct = options.mFusedAct; - - return setKernelParams(options.mNumBatches, options.mNumTokens, batchM, options.mM, options.mN, options.mK, - options.mBatchedM, options.mBatchedN, options.mTileM, options.mTileN, options.mTileK, - 
options.mEpilogueTileM, options.mEpilogueTileN, options.mUseDeepSeekFp8, options.mUseTmaStore, - options.mTransposeMatrixA, options.mTransposeMatrixB, options.mTransposeMmaOutput, options.mSfLayoutB, - useFusedAct, options.mDtypeA, options.mDtypeB, options.mDtypeC, options.mMmaKind, ptrA, ptrB, ptrC, dSfA, - dSfB, ptrPerTokenSfA, ptrPerTokenSfB, dSfC, ptrScaleC, ptrScaleGate, routeMap, rowMax, rowMaxBars, - options.mIsStaticBatch, ptrNumNonExitingCtas, ptrTotalNumPaddedTokens, ptrCtaIdxXyToBatchIdx, - ptrCtaIdxXyToMnLimit, maxNumCtas); - } #endif }; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h index c46578e4b4..73ce5ce1ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h @@ -162,11 +162,12 @@ class KernelTraits KernelTraits() {} // The constructor. 
- KernelTraits(tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeAcc, tg::MmaKind mmaKind, - int32_t tileM, int32_t tileN, int32_t tileK, int32_t epilogueTileM, int32_t epilogueTileN, int32_t numStages, - int32_t numStagesMma, int32_t numSlicesForSplitK, int32_t numSlicesForSliceK, SplitK splitK, bool useTmaStore, - bool transposeMmaOutput, AllReduceAlgo allReduceAlgo, bool usePersistentScheduler, bool useDeepSeekFp8, - bool usePerTokenSfA, bool usePerTokenSfB) + KernelTraits(tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeAcc, tg::Dtype dtypeMmaA, + tg::Dtype dtypeMmaB, tg::MmaKind mmaKind, int32_t tileM, int32_t tileN, int32_t tileK, int32_t epilogueTileM, + int32_t epilogueTileN, int32_t numStages, int32_t numStagesMma, int32_t numSlicesForSplitK, + int32_t numSlicesForSliceK, SplitK splitK, bool useTmaStore, bool transposeMmaOutput, + AllReduceAlgo allReduceAlgo, bool usePersistentScheduler, bool useDeepSeekFp8, bool usePerTokenSfA, + bool usePerTokenSfB, BiasType biasType) : mMmaKind{mmaKind} { // @@ -181,16 +182,17 @@ class KernelTraits // [rowMax ] (16B aligned) (if needed) // [sliceK ] (16B aligned) (if needed) // [per-token SF ] (16B aligned) (if needed) + // [bias ] (16B aligned) (if needed) // // SMEM for smemA and smemB might be repurposed and used for gmemC0 and gmemC1: // // [..smemA..][..smemB..][..smemBShuffle..] - // [..gmemC0..][..gmemC1..][..rowMax..][..sliceK..] + // [..gmemC0..][..gmemC1..][..rowMax..][..sliceK..][..per-token SF..][..bias..] 
// if (mMmaKind == tg::MmaKind::Auto) { - mMmaKind = dtypeGetMmaKind(dtypeA, dtypeB); + mMmaKind = dtypeGetMmaKind(dtypeMmaA, dtypeMmaB); } std::vector> numBytesAndAlignmentPerSmemChunk; @@ -344,6 +346,29 @@ class KernelTraits firstChunkReuseSmem.emplace_back(false); } + // Bias + { + int32_t numBytesSmemBias = 0; + if (isBiasTypeN(biasType)) + { + numBytesSmemBias = tileN * sizeof(float); + } + else if (isBiasTypeM(biasType)) + { + numBytesSmemBias = tileM * sizeof(float); + } + else if (isBiasTypeMn(biasType)) + { + numBytesSmemBias = tileM * tileN * sizeof(float); + } + // Number of bytes alignment for bias + auto const numBytesAlignmentBias = 16; + // Add info. + smemChunkNames.emplace_back("smemBias"); + numBytesAndAlignmentPerSmemChunk.emplace_back(std::make_pair(numBytesSmemBias, numBytesAlignmentBias)); + firstChunkReuseSmem.emplace_back(false); + } + // Per-block absolute maximum for multi-warp reduction. { // Number of bytes: number of epilogue warps * number of tile columns. @@ -401,10 +426,12 @@ class KernelTraits // Matrix A { + // We use TMEM for A if we use slice-K or if we need to cast A. + bool const useTmemA = (numSlicesForSliceK > 1) || (dtypeMmaA != dtypeA); // Number of columns for A. - auto const numTmemColsA = numSlicesForSliceK > 1 ? numStages * tileK - / (numSlicesForSliceK * tg::dtypeGetNumBits(tg::Dtype::UInt32) / tg::dtypeGetNumBits(dtypeA)) - : 0; + auto const numTmemColsA = useTmemA ? numStages * tileK + / (numSlicesForSliceK * tg::dtypeGetNumBits(tg::Dtype::UInt32) / tg::dtypeGetNumBits(dtypeMmaA)) + : 0; // Number of columns for A alignment. auto const numColsAlignmentA = 4; // No need to reuse TMEM. @@ -418,7 +445,7 @@ class KernelTraits // Sf A { - bool const useBlockScalingA = tg::dtypeIsBlockFmt(dtypeA); + bool const useBlockScalingA = tg::dtypeIsBlockFmt(dtypeMmaA); // Number of columns for scaling factors of A. auto const numTmemColsSfA = useBlockScalingA ? 
((tileK / 64) * 2 * tg::ceilDiv(tileM, 64)) * numStages : 0; @@ -435,7 +462,7 @@ class KernelTraits // Sf B { - bool const useBlockScalingB = tg::dtypeIsBlockFmt(dtypeB); + bool const useBlockScalingB = tg::dtypeIsBlockFmt(dtypeMmaB); // Number of columns for scaling factors of B. auto const numTmemColsSfB = useBlockScalingB ? ((tileK / 64) * 2 * tg::ceilDiv(tileN, 64)) * numStages : 0; @@ -541,13 +568,20 @@ inline int32_t getSmemOffsetPerTokenSf(KernelTraits traits) //////////////////////////////////////////////////////////////////////////////////////////////////// -inline int32_t getSmemOffsetBlockAmax(KernelTraits traits) +inline int32_t getSmemOffsetBias(KernelTraits traits) { return traits.mSmemAllocatorHelper.getChunkOffset(8); } //////////////////////////////////////////////////////////////////////////////////////////////////// +inline int32_t getSmemOffsetBlockAmax(KernelTraits traits) +{ + return traits.mSmemAllocatorHelper.getChunkOffset(9); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0) { // Be conscious that the index (3 + resIdx) should match the index in getSmemOffsetGmemC(). 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h index 0b10632f5d..9ca92f887d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h @@ -41,14 +41,14 @@ namespace tg = trtllm::gen; #ifdef TLLM_ENABLE_CUDA inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, std::vector const& shapes, - std::vector const& strides, int32_t tileSizeMn, int32_t tileSizeK, void* gmemAddr, bool doSwizzle = true) + std::vector const& strides, std::vector const& tileShapes, void* gmemAddr, bool doSwizzle = true) { // The multiplication factor of the data padding in SMEM. int32_t padMultiplier = 1; CUtensorMap desc{}; // The data type. CUtensorMapDataType tmaDataFormat{CU_TENSOR_MAP_DATA_TYPE_FLOAT32}; - if (dtype == tg::Dtype::E4m3 || dtype == tg::Dtype::MxE4m3) + if (dtype == tg::Dtype::E4m3 || dtype == tg::Dtype::MxE4m3 || dtype == tg::Dtype::UE8m0) { tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_UINT8; } @@ -71,15 +71,11 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st padMultiplier = 2; tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B; } - else if (mmaKind == tg::MmaKind::MxFp4NvFp4 || mmaKind == tg::MmaKind::Auto) - { - tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B; - } else { - std::cerr << "Invalid dtype / mmaKind combination " << tg::dtypeToString(dtype) << "/" - << tg::mmaKindToString(mmaKind) << std::endl; - assert(false); + // Note: this is used with the MMA kind MxFp4NvFp4 and also when casting to a higher-precision + // type such as Bfloat16 before the MMA. 
+ tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B; } } else if (dtype == tg::Dtype::Fp32) @@ -94,24 +90,29 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st // The swizzle type. CUtensorMapSwizzle swizzleType{CU_TENSOR_MAP_SWIZZLE_NONE}; - int32_t tileKSizeInBytes = (tileSizeK * tg::dtypeGetNumBits(dtype) * padMultiplier) / /* bits */ 8; + int32_t fastestDimTileSizeBytes = (tileShapes[0] * tg::dtypeGetNumBits(dtype) * padMultiplier) / /* bits */ 8; if (doSwizzle) { - if ((tileKSizeInBytes % 128) == 0) + if ((fastestDimTileSizeBytes % 128) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_128B; } - else if ((tileKSizeInBytes % 64) == 0) + else if ((fastestDimTileSizeBytes % 64) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_64B; } - else if ((tileKSizeInBytes % 32) == 0) + else if ((fastestDimTileSizeBytes % 32) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_32B; } + else if ((fastestDimTileSizeBytes % 16) == 0 && (dtype == tg::Dtype::UE8m0 || dtype == tg::Dtype::E4m3)) + { + swizzleType = CU_TENSOR_MAP_SWIZZLE_NONE; + } else { - std::cerr << "buildNdTmaDescriptor: unexpected tileKSizeInBytes " << tileKSizeInBytes << std::endl; + std::cerr << "buildNdTmaDescriptor: unexpected fastestDimTileSizeBytes " << fastestDimTileSizeBytes + << std::endl; assert(false); } } @@ -121,8 +122,9 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st // Check shape must be in range [1, 2^32] int32_t dim = shapes.size(); - // Expect 2 dimensions. - assert(dim == 2 || dim == 3); + // Expect 2 dimensions for regular gemm, 3 dimensions for batched gemm or blocked layout, and 4 + // dimensions for batched gemm with blocked layout. + assert(dim == 2 || dim == 3 || dim == 4); // Check shape range. for (int32_t ii = 0; ii < dim; ++ii) { @@ -147,59 +149,72 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st // The number of elements in 128B. 
auto const numEltsIn128B = numEltsPerUInt32 /*4B*/ * 32; // The number of tile K hidden size (per token) in each block of shared memory. - auto const numEltsInClampedTileKSize = std::min(numEltsIn128B, tileSizeK); + auto const numEltsInClampedFastestTileSize = std::min(numEltsIn128B, tileShapes[0]); - // Build tile shapes. - std::vector tileShapes(dim, 1); - tileShapes[0] = numEltsInClampedTileKSize; // tileSizeK - tileShapes[1] = tileSizeMn; // tileSizeMn + // Build box dim array. If tileShapes is smaller than dim, just fill with 1s. + assert(static_cast(tileShapes.size()) <= dim); + std::vector boxDim(dim, 1); + boxDim[0] = numEltsInClampedFastestTileSize; + for (size_t ii = 1; ii < tileShapes.size(); ++ii) + { + if (tileShapes[ii] > 256) + { + std::cerr << "buildNdTmaDescriptor: boxDim too large " << tileShapes[ii] << std::endl; + assert(false); + } + else + { + boxDim[ii] = tileShapes[ii]; + } + } // Set tile strides to 1; std::vector tileStrides(dim, 1); // Build the descriptor. CUresult result = cuTensorMapEncodeTiled(&desc, tmaDataFormat, - /*tensorRank=*/dim, gmemAddr, shapes.data(), stridesInBytes.data(), tileShapes.data(), tileStrides.data(), + /*tensorRank=*/dim, gmemAddr, shapes.data(), stridesInBytes.data(), boxDim.data(), tileStrides.data(), /*interleave=*/CU_TENSOR_MAP_INTERLEAVE_NONE, swizzleType, /*l2Promotion=*/CU_TENSOR_MAP_L2_PROMOTION_L2_128B, /*oobFill=*/CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); if (result != CUDA_SUCCESS) { - std::cerr << "Error: Failed to initialize the TMA descriptor " << result << std::endl; + std::stringstream ss; + ss << "Error: Failed to initialize the TMA descriptor " << result << std::endl; - std::cerr << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr - << std::endl; + ss << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl; - std::cerr << "Shape: "; + ss << "Shape: "; for (int ii = 0; ii < dim; ++ii) { - std::cerr << shapes[ii] << " 
"; + ss << shapes[ii] << " "; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "Stride: "; + ss << "Stride: "; for (int ii = 0; ii < dim - 1; ++ii) { - std::cerr << stridesInBytes[ii] << " "; + ss << stridesInBytes[ii] << " "; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "tileShapes: "; + ss << "tileShapes: "; for (int ii = 0; ii < dim; ++ii) { - std::cerr << tileShapes[ii] << " "; + ss << boxDim[ii] << " "; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "tileStrides: "; + ss << "tileStrides: "; for (int ii = 0; ii < dim; ++ii) { - std::cerr << tileStrides[ii] << " "; + ss << tileStrides[ii] << " "; } - std::cerr << std::endl; - std::cerr << "swizzleType: " << int(swizzleType) << std::endl; - assert(false); + ss << std::endl; + ss << "swizzleType: " << int(swizzleType) << std::endl; + ss << "(in " << __FILE__ << ":" << __LINE__ << ")" << std::endl; + throw std::runtime_error(ss.str()); } return desc; @@ -267,41 +282,42 @@ inline CUtensorMap buildSfTmaDescriptor(tg::Dtype dtype, std::vector c if (result != CUDA_SUCCESS) { - std::cerr << "Error: Failed to initialize the TMA descriptor for SF " << result << std::endl; + std::stringstream ss; + ss << "Error: Failed to initialize the TMA descriptor for SF " << result << std::endl; - std::cerr << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr - << std::endl; + ss << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl; - std::cerr << "shape:"; + ss << "shape:"; for (uint32_t shape_i : shapes) { - std::cerr << " " << shape_i; + ss << " " << shape_i; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "stridesInBytes:"; + ss << "stridesInBytes:"; for (uint32_t stride_i : stridesInBytes) { - std::cerr << " " << stride_i; + ss << " " << stride_i; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "tileShapes:"; + ss << "tileShapes:"; for (uint32_t tileShape_i : tileShapes) 
{ - std::cerr << " " << tileShape_i; + ss << " " << tileShape_i; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "tileStrides:"; + ss << "tileStrides:"; for (uint32_t tileStride_i : tileStrides) { - std::cerr << " " << tileStride_i; + ss << " " << tileStride_i; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "swizzleType: " << int(swizzleType) << std::endl; - assert(false); + ss << "swizzleType: " << int(swizzleType) << std::endl; + ss << "(in " << __FILE__ << ":" << __LINE__ << ")" << std::endl; + throw std::runtime_error(ss.str()); } return desc; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json index cf40d3e452..8f553bab03 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json @@ -180,6 +180,7 @@ "dtypeC": ["bf16", "fp16", "e2m1"], "listN": "8,8", "numExperts": 2, + "numStagesMma": 1, "tileScheduler": "static" }, { @@ -191,6 +192,7 @@ "dtypeC": ["bf16", "fp16", "e4m3"], "listN": "8,8", "numExperts": 2, + "numStagesMma": 1, "tileScheduler": "static" }, { @@ -202,6 +204,7 @@ "dtypeC": ["bf16", "fp16", "e4m3"], "listN": "8,8", "numExperts": 2, + "numStagesMma": 2, "tileScheduler": "static" }, { @@ -209,15 +212,17 @@ "_template": "BatchedGemmFp4LowLatency", "routeAct": true, "fusedAct": true, + "sfLayoutB": "linear", "useRoutingScalesOnInput": false, "useUnrollLoop2xForMma": [true, false], "dtypeC": "e2m1", "numTokens": 2, "numExperts": 2, + "numStagesMma": 1, "tileScheduler": "static" }, { - "_comment": "DS_FP4_FC2", + "_comment": "DS_FP4_FC2_persistSched", "_template": "BatchedGemmFp4LowLatency", "routeAct": false, "fusedAct": false, @@ -227,10 +232,23 @@ "numTokens": 2, "numExperts": 2, "numStagesMma": 2, - "tileScheduler": ["persistent", "static"] + 
"tileScheduler": "persistent" + }, + { + "_comment": "DS_FP4_FC2_staticSched", + "_template": "BatchedGemmFp4LowLatency", + "routeAct": false, + "fusedAct": false, + "useRoutingScalesOnInput": false, + "useUnrollLoop2xForMma": [true, false], + "dtypeC": "bf16", + "numTokens": 2, + "numExperts": 2, + "numStagesMma": 1, + "tileScheduler": "static" }, { - "_comment": "DS_FP4_TP8_EP1_FC2", + "_comment": "DS_FP4_TP8_EP1_FC2_persistSched", "_template": "BatchedGemmFp4LowLatency", "routeAct": false, "fusedAct": false, @@ -242,7 +260,22 @@ "numStagesMma": 2, "tileK": 256, "numStages": 6, - "tileScheduler": ["persistent", "static"] + "tileScheduler": "persistent" + }, + { + "_comment": "DS_FP4_TP8_EP1_FC2_staticSched", + "_template": "BatchedGemmFp4LowLatency", + "routeAct": false, + "fusedAct": false, + "useRoutingScalesOnInput": false, + "useUnrollLoop2xForMma": [true, false], + "dtypeC": "bf16", + "numTokens": 2, + "numExperts": 2, + "numStagesMma": 1, + "tileK": 256, + "numStages": 6, + "tileScheduler": "static" }, { "_comment": "DS_FP8_FC1", @@ -259,7 +292,7 @@ "tileScheduler": "static" }, { - "_comment": "DS_FP8_FC2", + "_comment": "DS_FP8_FC2_persistent", "_template": "BatchedGemmDeepSeekFp8LowLatency", "routeAct": false, "fusedAct": false, @@ -270,7 +303,21 @@ "numExperts": 2, "numStages": 8, "numStagesMma": 4, - "tileScheduler": ["persistent", "static"] + "tileScheduler": "persistent" + }, + { + "_comment": "DS_FP8_FC2_static", + "_template": "BatchedGemmDeepSeekFp8LowLatency", + "routeAct": false, + "fusedAct": false, + "useRoutingScalesOnInput": false, + "useUnrollLoop2xForMma": [true, false], + "dtypeC": "bf16", + "numTokens": 2, + "numExperts": 2, + "numStages": 8, + "numStagesMma": 2, + "tileScheduler": "static" }, { "_comment": "DS_FP8_FC1_16_THROUGHPUT", @@ -407,7 +454,7 @@ "tileScheduler": "static" }, { - "_comment": "Llama4_FP8_FC2", + "_comment": "Llama4_FP8_FC2_persistent", "_template": "BatchedGemmPerTensorScalingFp8LowLatency", "routeAct": false, 
"fusedAct": false, @@ -418,22 +465,38 @@ "numExperts": 2, "numStages": 3, "numStagesMma": 2, - "tileScheduler": ["persistent", "static"] + "tileScheduler": "persistent" + }, + { + "_comment": "Llama4_FP8_FC2_static", + "_template": "BatchedGemmPerTensorScalingFp8LowLatency", + "routeAct": false, + "fusedAct": false, + "useRoutingScalesOnInput": false, + "useUnrollLoop2xForMma": [true, false], + "dtypeC": "bf16", + "numTokens": 2, + "numExperts": 2, + "numStages": 3, + "numStagesMma": 1, + "tileScheduler": "static" }, { "_comment": "MxE2m1_FC1", "_template": "BatchedGemmMxE2m1MxE4m3LowLatency", "routeAct": true, "fusedAct": true, + "sfLayoutB": "linear", "useRoutingScalesOnInput": false, "useUnrollLoop2xForMma": [true, false], "dtypeC": "mxe4m3", "numTokens": 2, "numExperts": 2, + "numStagesMma": 1, "tileScheduler": "static" }, { - "_comment": "MxE2m1_FC2", + "_comment": "MxE2m1_FC2_persistent", "_template": "BatchedGemmMxE2m1MxE4m3LowLatency", "routeAct": false, "fusedAct": false, @@ -443,10 +506,23 @@ "numTokens": 2, "numExperts": 2, "numStagesMma": 2, - "tileScheduler": ["persistent", "static"] + "tileScheduler": "persistent" + }, + { + "_comment": "MxE2m1_FC2_static", + "_template": "BatchedGemmMxE2m1MxE4m3LowLatency", + "routeAct": false, + "fusedAct": false, + "useRoutingScalesOnInput": false, + "useUnrollLoop2xForMma": [true, false], + "dtypeC": "bf16", + "numTokens": 2, + "numExperts": 2, + "numStagesMma": 1, + "tileScheduler": "static" }, { - "_comment": "MxE2m1_TP8_EP1_FC2", + "_comment": "MxE2m1_TP8_EP1_FC2_persistent", "_template": "BatchedGemmMxE2m1MxE4m3LowLatency", "routeAct": false, "fusedAct": false, @@ -458,7 +534,122 @@ "numStagesMma": 2, "tileK": 256, "numStages": 4, - "tileScheduler": ["persistent", "static"] + "tileScheduler": "persistent" + }, + { + "_comment": "MxE2m1_TP8_EP1_FC2_static", + "_template": "BatchedGemmMxE2m1MxE4m3LowLatency", + "routeAct": false, + "fusedAct": false, + "useRoutingScalesOnInput": false, + "useUnrollLoop2xForMma": 
[true, false], + "dtypeC": "bf16", + "numTokens": 2, + "numExperts": 2, + "numStagesMma": 1, + "tileK": 256, + "numStages": 4, + "tileScheduler": "static" + }, + { + "_comment": "Qwen3_TP1_EP8_FC1", + "_template": "BatchedGemmFp4LowLatency", + "routeAct": true, + "fusedAct": true, + "sfLayoutB": "linear", + "dtypeC": "e2m1", + "numStages": 4, + "numStagesMma": 1, + "tileK": 512, + "tileScheduler": "static", + "useUnrollLoop2xForMma": [true] + }, + { + "_comment": "Qwen3_TP1_EP8_FC2", + "_template": "BatchedGemmFp4LowLatency", + "routeAct": false, + "fusedAct": false, + "dtypeC": "bf16", + "numStages": 4, + "numStagesMma": 1, + "tileK": 512, + "tileScheduler": "static", + "useUnrollLoop2xForMma": [false] + }, + { + "_comment": "Qwen3_TP2_EP4_FC1", + "_template": "BatchedGemmFp4LowLatency", + "routeAct": true, + "fusedAct": true, + "sfLayoutB": "linear", + "dtypeC": "e2m1", + "numStages": 4, + "numStagesMma": 1, + "tileK": 512, + "tileScheduler": "static", + "useUnrollLoop2xForMma": [true] + }, + { + "_comment": "Qwen3_TP2_EP4_FC2", + "_template": "BatchedGemmFp4LowLatency", + "routeAct": false, + "fusedAct": false, + "dtypeC": "bf16", + "numStages": 4, + "numStagesMma": 2, + "tileK": 512, + "tileScheduler": "persistent", + "useUnrollLoop2xForMma": [false] + }, + { + "_comment": "Qwen3_TP4_EP2_FC1", + "_template": "BatchedGemmFp4LowLatency", + "routeAct": true, + "fusedAct": true, + "sfLayoutB": "linear", + "dtypeC": "e2m1", + "numStages": 4, + "numStagesMma": 1, + "tileK": 512, + "tileScheduler": "static", + "useUnrollLoop2xForMma": [true] + }, + { + "_comment": "Qwen3_TP4_EP2_FC2", + "_template": "BatchedGemmFp4LowLatency", + "routeAct": false, + "fusedAct": false, + "dtypeC": "bf16", + "numStages": 4, + "numStagesMma": 2, + "tileK": 512, + "tileScheduler": "persistent", + "useUnrollLoop2xForMma": [false] + }, + { + "_comment": "Qwen3_TP8_EP1_FC1", + "_template": "BatchedGemmFp4LowLatency", + "routeAct": true, + "fusedAct": true, + "sfLayoutB": "linear", + 
"dtypeC": "e2m1", + "numStages": 4, + "numStagesMma": 1, + "tileK": 512, + "tileScheduler": "static", + "useUnrollLoop2xForMma": [true] + }, + { + "_comment": "Qwen3_TP8_EP1_FC2", + "_template": "BatchedGemmFp4LowLatency", + "routeAct": false, + "fusedAct": false, + "dtypeC": "bf16", + "numStages": 4, + "numStagesMma": 2, + "tileK": 256, + "tileScheduler": "persistent", + "useUnrollLoop2xForMma": [false] } ] } diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..f80b9172e6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:025a21a07a797de485c2c86bfc5dbbe1fb5c494466346a207e7fc8ac266f60b8 +size 583355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..de97381a21 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d0f7bcd89357e66b7c75ec67ccb481de87d8c73898ee8f81672b0eb35830769 +size 691616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..ed326410aa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c73f79940b8691ecc3a14ce254bd64afced1824a79ac96f2e305dedfebe8f7b5 +size 606299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..5064016786 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7d783dc7ece8cea35d9bf83a7c8eea3986d9f29c3cd67389433d2357ade2d74 +size 729162 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..20888a1d49 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4a8773b5462296ac6b1fb65c34e5fec54be4dd41e3b52c50e463c436f616d12 +size 552325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..a609d1ffe2 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afb26c01c09a14d5fd5243d018164e3cb47f6acb26b380810bd24ea20b4deef3 +size 549397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..6c75e50a77 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19f7b7a44abef9ab58ead44ea82570c2669c41b2bf9dc97300a1cdc732aff6dd +size 688754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..0811ac534f --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:532b00782f06b21405eae6c207cb539494b552d7db05bcecd96bf872d4a586fd +size 578475 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..024e1e7caf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd8cf63739a7968321556b7980ce56577f9692921a7f62043295be7e0f9f19eb +size 575499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..dcb2ed07e4 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cc777b009a1fe7fcf4f5c2b794c0eaca2943e8f190c94fc4d3bb30c49348f85 +size 722256 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..43a9dab09d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f072615f04e0d0678e70294c0d6b7486e225d2382bad8f8427390a125bb48236 +size 610163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 
0000000000..27af07cab1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83bb1c43a309a3c0ce1f56cc239e517a7596a4cbdd59c16adec1ca9fb708043c +size 779992 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..4cf4506310 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8994ba4c2ca7dd0bf592da450e1404441f358657f7f60c4cdd56b56f9c448131 +size 628668 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..da87049c7f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cdc0f6feff2b0ea40dfb73159dc2a5038a2f46b328d057da200bd76d1fec044 +size 806784 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..68b132b2b6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81396786711ebd423e67d113dc1f1d2d963385b1f56b56d51448070b538678d2 +size 501179 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..6d2239f8a0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d82db55cdd10e897a4e3f7778c09c10bf029a69a86a80da92baebe7a6dfcaac +size 632034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..f632f63e44 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c23c02a534806734e5522c54cf9f6ddc112df09c35a0834439796f357ea2c700 
+size 518895 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..4485f5c7cc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dca1a3d18d04e8d20b41bea46729faaf36d22e1aec5e75b2dc9d8c074dfca5b +size 661244 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..104fa0c8c1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f66cc836ac67fbf37e27c790d93fd56ace7f84e45e1e8e709bc92449f171c40b +size 518249 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..54a234683b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44a920497d1cac7d3c2e27898758f4a069b2e40b16851e1c671a57580a6de613 +size 653742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..27487413df --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:5de768877c4368ea3798f6410d5d66cd5010e7debc14fee453941f823cca1d75 +size 535963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..41805ff56f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aed58a7fca6462cd5f1f15181c4818071dd99fe7b436b570132c6cc3df6e97ed +size 682112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..36cfaeacd4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7294cdde3778c06141ce0ae68e293ec6b9ca990373e452c01f91852cbbb556 +size 554015 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..81de55d568 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cbbf9129f822e95d3fae46330460eb5b50878e12c5f3d495102edb04fa7ca69 +size 699918 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..8da3ce0b74 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28086a74790bb3e11722786569b95be426bf3ada28a82a36730dd3465d904fa7 +size 571731 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..a63d7844f7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92fa64e4f820b2f5d1b4db1864220d36231c2a6451c8a93b5aa13a3cd299caf4 +size 729028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..94c748b3ce 
--- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8f2365a13656aaf10d935265da62b26ff30019fd151296a2c1dea1ee633d1fe +size 539851 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..5db3fd96d6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d42abe6ddcac40aaa119421f50038adb4edf9f4e12931ba8528a858bd4de0ca +size 514477 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..7859a3975e --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b3e59916d41f8e66fa2d474819a031d2a24caaba31485dce0a62d5e12e2d24a +size 629660 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..8fc8d104fb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:917ef5d93cf7cfbd76f3d32b3c051f4e119631a0324801a4b8684bf98157857c +size 559193 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..d5d56c56a8 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:330cad5e70b49cf039b5931e9fa6686298bd04d9416e7684aa5f31c4e5e46444 +size 533821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..1ac491c269 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82707942c797cba2d1352e94fa594aca0a8b1c3c07d99646c41227a83f740766 +size 651864 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..7f8299ac92 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7f0619d6293f8448940ba954221fe2c841541bd8199fc943983eb6acd41ed61 +size 420605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..3bbf5ee43b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7882097269f622d56d99f704b42a0331d1fdc59442e803ff9c8f836b67c7d7d2 +size 417333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..8640661908 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34e40aba2b816a6bee4e8f30bc758d7ed3438b290341d6834c7985232f3c4760 +size 504443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..87b13fa745 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbf07013056fcb9c58a4a90fa2783eb38a3ee311e2ec4d2a5fb7aa855399a389 +size 443893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..0bb069d3f8 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15e4735cc2d2e2bf3fdb10698e20303a532939acdfcb6c8c691b2a2d569663a6 +size 440671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..b2669c3acf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a9d9b821d0dcb825bed9911cdd556ed063ba48b398294eb4848159a2d11ae34 +size 526055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..21889830f0 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8faa5ad1e00d5e9bfd17f288934536ad440d4655c5efac5e6a1fe01e7be245fa +size 563777 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..aecb4e1dd9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb0133260a01b83ed44b6a0b150dbcad2857f575a70476c41da2cca6a9b35f55 +size 736122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..6b74f349ae --- /dev/null 
+++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1acd2b5b82fd69f3cf568ff96ace34ac5ed120785e5c5a5b8f45dccb7a589422 +size 591605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..25066e0555 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1df67c891567c1b074da7b6dae1b3b819bde680cd35b2cf5ab89a750b8cc943a +size 761582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..9a82468a76 --- 
/dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e3bc3b8b35eba8111d94ab7d195b8b731cc9b5f441bda0bf7784e0fdec1cc09 +size 610349 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..539d9762d0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7770d7a52f7ec065e1f3df861c273edf6f7e1cc677c36ac11c0352b50be83649 +size 739970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..2e3d6fc33d 
--- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86d84a8dbf978b75d87959e2cbcc50f1863d8f2c771bf26c5c4ad47996f81e18 +size 640150 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..226404950d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:385fa25c09e6dc066957de238fea4f0636ee52cc90f7ad456c61f68390b13aad +size 766516 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp new file mode 100644 index 
0000000000..eda32eb064 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef6738ce1712ecebe6e717e09bd7d78b324f828a4b3f4df2898511275383aad4 +size 600993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..108550c5ee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4e0e4a92d078fc240e9006ae271cc08a91f214d31cc40269affe86f83b664ec +size 555211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..a245c9b0e8 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e5b78e9b3db3a45609fba97e9df3c960f5f380e6a59bfddb2423f48908bf51f +size 626404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..7820f4b9a7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6d33c4ccff36bae9207ff93168d3607d5b0681bc586d50b8c6995b409661403 +size 581313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..7e07a1dcc7 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d730384eeb2bd8f97d3f0daeabce52e0ea4ed80cac954d9755f51108b0ffce97 +size 841702 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..3bae4036eb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20642ded83c712aeeb9ba418353f6fa7552c4622777c61d128253e0e598b85e1 +size 859416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp new file mode 
100644 index 0000000000..0114b7a888 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8805e951d49c58123db07253b867803d3ae30942269c80915e7ea81eeffba66 +size 524185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..093b53806e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15c42189f82f2ea343576b589a1c28748d0ab3af0d04a11d5925f1cd97c4e23e +size 542689 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..689d1fcf71 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f5a33ba3b1201b9884329362f5192ab4d394d26b596b5d96f9ab64535f9426b +size 565625 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..81365ccdbb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecbdc8d75681767b45d8f967a7f13dc9c852fa131c274b6c22dac829a0b4a919 +size 582551 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..2097d2c376 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b496add3396b71793dbe998f4b1579c1bcae810286df86b0351f12c4197a4697 +size 633262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..bf7f917c8d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:246d7c59b39586b37dd675292254fe90654e5d158ef24aef8ce04d4543c55aed +size 650976 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..6a1f9358b2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:868f0fedc4e99546431d08068f43cdd9c3c2f523ed922844b66f4ce83717c402 +size 558219 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..5f523f2729 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d0463544c6a80fd23dbdc767dfc96f512b2757dec2d976c917a9850a16f9e6f5 +size 527297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..4994666099 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd00a01e561343de9f8bd212b5e6da151e55e68cb12e37aa2ee0edb7cf3d76d +size 575243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..2828d8fb73 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2cb24d3bc83aabe4c4ae6bac627a70fbc5cfc537f917a6c93b8d6830e1e0c373 +size 545849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..07dd143888 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c805a4f096f9f792b6402c607565f2e88fc1a56344e654618c3bbe1b615e01c +size 392855 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..fc76a27474 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:08067655e5ad721255bb86ec17617c7217ef9dc1165d1ce4bcc11c5a86dec681 +size 415845 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..908071b19a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4cc962ca87e8e42ccf3aab87d2c921cf747ff1cc8d2cb8e9b339491a47122c0 +size 418563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..c453b40432 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:342721dcf5114816a32a15d31e24711a2aadd52ea4eb578695c502324786f163 +size 439973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..0b5e17dbcf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f11f3bc7b0c137577c335e4762068a0de4785a544d85f2575b787360c300f0a8 +size 548601 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..af47072c14 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a35b8ab05817e3168b36002a8c0bfa45ff6cbe09d3a2db4af8be4dfb95fb6e6 +size 574701 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..4ba5eb66d1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c4f2e7ba6b10a99f386253c6b9315620804590f2feae7dd78351c3fce34d9ec +size 513681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..a5ae523b14 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86179f2a7d2ddbffcb8b40a49f5babc31dc6dd80acff882a67a1eae40d396b24 +size 532233 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..b9c02816a7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59566acf5116dd2221842073c5fcea6bcf70eb5ee29b14482e5d4efd33ebadb4 +size 415745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp new file mode 100644 index 0000000000..1cdf0ce039 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41933fab6978bc725dad91ccd0539a25f804f1e579a0360d3d6289eab0e076de +size 439873 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..f6e7eff70c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf376b0111b28c57f20cdd44a5edaa98edd364da22b194dcfa4188e8247068c7 +size 625426 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp new file mode 100644 index 0000000000..d0ad1d6f84 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fef76304ee09e3ca32b459db98ef67f3a5cd93e8526c208115227b97eacaf22e +size 659520 diff --git a/tests/unittest/_torch/thop/test_moe.py b/tests/unittest/_torch/thop/test_moe.py index db76f62af1..dcae98cac5 100644 --- a/tests/unittest/_torch/thop/test_moe.py +++ b/tests/unittest/_torch/thop/test_moe.py @@ -593,8 +593,6 @@ def test_moe_fp8(num_tokens, expert_info, hidden_size, intermediate_size): assert num_experts % n_groups == 0 assert num_experts % 4 == 0 assert top_k < (top_k_groups * num_experts / n_groups) - assert hidden_size % 128 == 0 - assert intermediate_size % 128 == 0 expert_logits = torch.randn((num_tokens, num_experts), device='cuda').to(torch.float) @@ -668,9 +666,9 @@ def check_accuracy(a, b, atol, rtol, percent): reason="The kernel only supports Blackwell. Current SM is %d." % getSMVersion(), ) -@pytest.mark.parametrize("num_tokens", [1, 2, 16, 64, 1024, 4096]) +@pytest.mark.parametrize("num_tokens", [1, 1024, 4096]) @pytest.mark.parametrize("hidden_size", [1024]) -@pytest.mark.parametrize("intermediate_size", [1024]) +@pytest.mark.parametrize("intermediate_size", [1024, 768, 384, 192]) @pytest.mark.parametrize( "routing_info", [ @@ -743,8 +741,6 @@ def test_moe_fp4(num_tokens, hidden_size, intermediate_size, routing_info): assert top_k <= num_experts assert top_k <= 8 - assert hidden_size % 128 == 0 - assert intermediate_size % 128 == 0 if (top_k_groups is not None) and (n_groups is not None): assert top_k_groups <= 4 assert num_experts > n_groups @@ -995,8 +991,6 @@ def test_moe_fp8_per_tensor_scale(num_tokens, expert_info, hidden_size, assert n_groups == 0 or num_experts % n_groups == 0 assert num_experts % 4 == 0 assert n_groups == 0 or top_k < (top_k_groups * num_experts / n_groups) - assert hidden_size % 128 == 0 - assert intermediate_size % 128 == 0 expert_logits = torch.randn((num_tokens, num_experts), device='cuda').to(torch.float)