According to the PTX documentation, tcgen05.ld is an asynchronous instruction, so why is tcgen05.wait (or its wrapper) not used after the TMEM -> RMEM load in 02_mma_tma_sm100.cu?
|
// Load TMEM -> RMEM |
|
copy(tiled_t2r_copy, tDtAcc, tDrAcc); |
|
|
|
// AXPBY RMEM -> RMEM: tDrC = alpha * tDrAcc + beta * tDrC |
|
axpby(alpha, tDrAcc, beta, tDrC); |
|
// Store RMEM -> GMEM |
|
copy(tDrC, tDgD); |