@@ -1183,6 +1183,93 @@ operations.
For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensorcore-5th-generation-instructions-tcgen05-fence>`_.

+ '``llvm.nvvm.tcgen05.shift``'
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ Syntax:
+ """""""
+
+ .. code-block:: llvm
+
+ declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr)
+ declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr)
+
+ Overview:
+ """""""""
+
+ The '``@llvm.nvvm.tcgen05.shift.{cg1/cg2}``' intrinsics correspond to
+ the ``tcgen05.shift.{cg1/cg2}`` PTX instructions. ``tcgen05.shift`` is
+ an asynchronous instruction that initiates the shifting of 32-byte
+ elements downwards, by one row, across all rows except the last.
+ The address operand ``%tmem_addr`` specifies the base address of the
+ matrix in Tensor Memory whose rows are to be shifted down.
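+
+ A minimal, illustrative sketch of a call is shown below; the wrapper
+ function is hypothetical, and ``%tmem_addr`` is assumed to already hold
+ a valid Tensor Memory address obtained elsewhere:
+
+ .. code-block:: llvm
+
+ ; Shift the rows of the matrix at %tmem_addr down by one row (cg1 variant).
+ define void @shift_rows_down(ptr addrspace(6) %tmem_addr) {
+   call void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr)
+   ret void
+ }
+
+ declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6))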
+
+ For more information, refer to the PTX ISA
+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-shift>`_.
+
+ '``llvm.nvvm.tcgen05.cp``'
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ Syntax:
+ """""""
+
+ .. code-block:: llvm
+
+ declare void @llvm.nvvm.tcgen05.cp.4x256b.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+ declare void @llvm.nvvm.tcgen05.cp.128x256b.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+ declare void @llvm.nvvm.tcgen05.cp.128x128b.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+ declare void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+ declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+ declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+
+ declare void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+ declare void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+ declare void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+ declare void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+ declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+ declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+
+ declare void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+ declare void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+ declare void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+ declare void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+ declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+ declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+
+ Overview:
+ """""""""
+
+ The '``@llvm.nvvm.tcgen05.cp.{shape}.{src_fmt}.{cg1/cg2}``' intrinsics
+ correspond to the ``tcgen05.cp.*`` family of PTX instructions.
+ The ``tcgen05.cp`` instruction initiates an asynchronous copy operation from
+ shared memory to the location specified by ``%tmem_addr`` in Tensor Memory.
+ The 64-bit register operand ``%sdesc`` is the matrix descriptor representing
+ the source matrix in shared memory that needs to be copied.
+
+ The valid shapes for the copy operation are:
+ {128x256b, 4x256b, 128x128b, 64x128b_warpx2_02_13, 64x128b_warpx2_01_23, 32x128b_warpx4}.
+
+ Shapes ``64x128b`` and ``32x128b`` require dedicated multicast qualifiers,
+ which are appended to the corresponding intrinsic names.
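+
+ A minimal, illustrative sketch of invoking one of these intrinsics is shown
+ below; the wrapper function is hypothetical, and ``%tmem_addr`` and ``%sdesc``
+ are assumed to have been produced elsewhere:
+
+ .. code-block:: llvm
+
+ ; Copy a 128x256b matrix from shared memory (described by %sdesc)
+ ; into Tensor Memory at %tmem_addr (cg1 variant, no decompression).
+ define void @copy_128x256b(ptr addrspace(6) %tmem_addr, i64 %sdesc) {
+   call void @llvm.nvvm.tcgen05.cp.128x256b.cg1(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+   ret void
+ }
+
+ declare void @llvm.nvvm.tcgen05.cp.128x256b.cg1(ptr addrspace(6), i64)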
+
+ Optionally, the data can be decompressed from the source format in shared memory
+ to the destination format in Tensor Memory during the copy operation. Currently,
+ only ``.b8x16`` is supported as the destination format. The valid source formats
+ are ``.b6x16_p32`` and ``.b4x16_p64``.
+
+ When the source format is ``.b6x16_p32``, a contiguous set of 16 elements of
+ 6 bits each, followed by four bytes of padding (``_p32``), in shared memory is
+ decompressed into 16 elements of 8 bits each (``.b8x16``) in Tensor Memory.
+
+ When the source format is ``.b4x16_p64``, a contiguous set of 16 elements of
+ 4 bits each, followed by eight bytes of padding (``_p64``), in shared memory is
+ decompressed into 16 elements of 8 bits each (``.b8x16``) in Tensor Memory.
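+
+ For example, a hypothetical call using the ``.b6x16_p32`` source format (the
+ wrapper function and operands below are illustrative only) might look like:
+
+ .. code-block:: llvm
+
+ ; Copy a 4x256b matrix, decompressing .b6x16_p32 data in shared memory
+ ; into .b8x16 elements in Tensor Memory (cg1 variant).
+ define void @copy_4x256b_decompress(ptr addrspace(6) %tmem_addr, i64 %sdesc) {
+   call void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.cg1(ptr addrspace(6) %tmem_addr, i64 %sdesc)
+   ret void
+ }
+
+ declare void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.cg1(ptr addrspace(6), i64)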
+
+ For more information on the decompression schemes, refer to the PTX ISA
+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/#optional-decompression>`_.
+
+ For more information on the ``tcgen05.cp`` instruction, refer to the PTX ISA
+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-cp>`_.

Other Intrinsics
----------------