-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[mlir][amdgpu][docs] Add op examples to dialect docs #146848
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -106,6 +106,16 @@ def AMDGPU_ExtPackedFp8Op : | |
If the passed-in vector has fewer than four elements, or the input is scalar, | ||
the remaining values in the <4 x i8> will be filled with | ||
undefined values as needed. | ||
|
||
#### Example | ||
```mlir | ||
// Extract single FP8 element to scalar f32 | ||
%element = amdgpu.ext_packed_fp8 %src_vector[0] : vector<4xf8E4M3FNUZ> to f32 | ||
|
||
// Extract two FP8 elements to vector<2xf32> | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same as above. |
||
%elements = amdgpu.ext_packed_fp8 %src_vector[0] : vector<4xf8E4M3FNUZ> to vector<2xf32> | ||
``` | ||
|
||
}]; | ||
let assemblyFormat = [{ | ||
attr-dict $source `[` $index `]` `:` type($source) `to` type($res) | ||
|
@@ -162,6 +172,12 @@ def AMDGPU_PackedTrunc2xFp8Op : | |
sub-registers, and so the conversion intrinsics (which are currently the | ||
only way to work with 8-bit float types) take packed vectors of 4 8-bit | ||
values. | ||
|
||
#### Example | ||
```mlir | ||
%result = amdgpu.packed_trunc_2xfp8 %src1, %src2 into %dest[word 1] | ||
: f32 to vector<4xf8E5M2FNUZ> into vector<4xf8E5M2FNUZ> | ||
``` | ||
}]; | ||
let assemblyFormat = [{ | ||
attr-dict $sourceA `,` ($sourceB^):(`undef`)? | ||
|
@@ -220,6 +236,12 @@ def AMDGPU_PackedStochRoundFp8Op : | |
sub-registers, and so the conversion intrinsics (which are currently the | ||
only way to work with 8-bit float types) take packed vectors of 4 8-bit | ||
values. | ||
|
||
#### Example | ||
```mlir | ||
%result = amdgpu.packed_stoch_round_fp8 %src + %stoch_seed into %dest[2] | ||
: f32 to vector<4xf8E5M2FNUZ> into vector<4xf8E5M2FNUZ> | ||
``` | ||
}]; | ||
let assemblyFormat = [{ | ||
attr-dict $source `+` $stochiasticParam | ||
|
@@ -275,6 +297,18 @@ def AMDGPU_FatRawBufferCastOp : | |
If the value of the memref's offset is not uniform (independent of the lane/thread ID), | ||
this will lead to substantially decreased performance due to the need for | ||
a waterfall loop on the base address of the buffer resource. | ||
|
||
#### Example | ||
```mlir | ||
// Simple cast | ||
%converted = amdgpu.fat_raw_buffer_cast %src | ||
: memref<8xi32> to memref<8xi32, #amdgpu.address_space<fat_raw_buffer>> | ||
// Cast with memory attributes | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be great if you can be a bit more specific here. |
||
%converted = amdgpu.fat_raw_buffer_cast %src validBytes(%valid) | ||
cacheSwizzleStride(%swizzle) boundsCheck(false) resetOffset | ||
: memref<8xi32, strided<[1], offset: ?>> | ||
to memref<8xi32, strided<[1]>, #amdgpu.address_space<fat_raw_buffer>> | ||
``` | ||
}]; | ||
|
||
let extraClassDeclaration = [{ | ||
|
@@ -333,6 +367,17 @@ def AMDGPU_RawBufferLoadOp : | |
- If `boundsCheck` is false and the target chipset is RDNA, OOB_SELECT is set | ||
to 2 to disable bounds checks, otherwise it is 3 | ||
- The cache coherency bits are off | ||
|
||
#### Example | ||
```mlir | ||
// Load scalar f32 from 1D buffer | ||
%scalar = amdgpu.raw_buffer_load %src[%idx] : memref<128xf32>, i32 -> f32 | ||
// Load vector<4xf32> from 4D buffer | ||
%vector = amdgpu.raw_buffer_load %src[%idx0, %idx1, %idx2, %idx3] | ||
: memref<128x64x32x16xf32>, i32, i32, i32, i32 -> vector<4xf32> | ||
// Load from scalar buffer | ||
%value = amdgpu.raw_buffer_load %src[] : memref<f32> -> f32 | ||
``` | ||
}]; | ||
let assemblyFormat = [{ | ||
attr-dict $memref `[` $indices `]` | ||
|
@@ -372,6 +417,17 @@ def AMDGPU_RawBufferStoreOp : | |
|
||
See `amdgpu.raw_buffer_load` for a description of how the underlying | ||
instruction is constructed. | ||
|
||
#### Example | ||
```mlir | ||
// Store scalar f32 to 1D buffer | ||
amdgpu.raw_buffer_store %value -> %dst[%idx] : f32 -> memref<128xf32>, i32 | ||
// Store vector<4xf32> to 4D buffer | ||
amdgpu.raw_buffer_store %vec -> %dst[%idx0, %idx1, %idx2, %idx3] | ||
: vector<4xf32> -> memref<128x64x32x16xf32>, i32, i32, i32, i32 | ||
// Store to scalar buffer | ||
amdgpu.raw_buffer_store %value -> %dst[] : f32 -> memref<f32> | ||
``` | ||
}]; | ||
let assemblyFormat = [{ | ||
attr-dict $value `->` $memref `[` $indices `]` | ||
|
@@ -414,6 +470,13 @@ def AMDGPU_RawBufferAtomicCmpswapOp : | |
|
||
See `amdgpu.raw_buffer_load` for a description of how the underlying | ||
instruction is constructed. | ||
|
||
#### Example | ||
```mlir | ||
// Atomic compare-swap | ||
amdgpu.raw_buffer_atomic_cmpswap %src, %cmp -> %dst[%idx] | ||
: f32 -> memref<128xf32>, i32 | ||
``` | ||
}]; | ||
let assemblyFormat = [{ | ||
attr-dict $src `,` $cmp `->` $memref `[` $indices `]` | ||
|
@@ -453,6 +516,13 @@ def AMDGPU_RawBufferAtomicFaddOp : | |
|
||
See `amdgpu.raw_buffer_load` for a description of how the underlying | ||
instruction is constructed. | ||
|
||
#### Example | ||
```mlir | ||
// Atomic floating-point add | ||
amdgpu.raw_buffer_atomic_fadd %value -> %dst[%idx] | ||
: f32 -> memref<128xf32>, i32 | ||
``` | ||
}]; | ||
let assemblyFormat = [{ | ||
attr-dict $value `->` $memref `[` $indices `]` | ||
|
@@ -647,11 +717,16 @@ def AMDGPU_SwizzleBitModeOp : AMDGPU_Op<"swizzle_bitmode", | |
|
||
Supports arbitrary int/float/vector types, which will be repacked to i32 and | ||
one or more `rocdl.ds_swizzle` ops during lowering. | ||
|
||
#### Example | ||
```mlir | ||
%result = amdgpu.swizzle_bitmode %src 1 2 4 : f32 | ||
``` | ||
}]; | ||
let results = (outs AnyIntegerOrFloatOr1DVector:$result); | ||
let assemblyFormat = [{ | ||
$src $and_mask $or_mask $xor_mask attr-dict `:` type($result) | ||
}]; | ||
}]; | ||
} | ||
|
||
def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> { | ||
|
@@ -673,6 +748,11 @@ def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> { | |
(those which will implement this barrier by emitting inline assembly), | ||
use of this operation will impede the usability of memory watches (including | ||
breakpoints set on variables) when debugging. | ||
|
||
#### Example | ||
```mlir | ||
amdgpu.lds_barrier | ||
``` | ||
}]; | ||
let assemblyFormat = "attr-dict"; | ||
} | ||
|
@@ -711,6 +791,14 @@ def AMDGPU_SchedBarrierOp : | |
`amdgpu.sched_barrier` serves as a barrier that could be | ||
configured to restrict movements of instructions through it as | ||
defined by sched_barrier_opts. | ||
|
||
#### Example | ||
```mlir | ||
// Barrier allowing no dependent instructions | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I find the use of "dependent" here confusing. Personally, I think it is more clear to say something along the lines of "no instructions may be scheduled across the barrier" |
||
amdgpu.sched_barrier allow = <none> | ||
// Barrier allowing specific execution units | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Similarly, I find the use of execution units here confusing. I find something like "barrier allowing VALU and VMEM instructions to be scheduled across it" to be more straightforward. |
||
amdgpu.sched_barrier allow = <valu|all_vmem> | ||
``` | ||
}]; | ||
let assemblyFormat = [{ | ||
`allow` `=` $opts attr-dict | ||
|
@@ -810,6 +898,13 @@ def AMDGPU_MFMAOp : | |
|
||
The negateA, negateB, and negateC flags are only supported for double-precision | ||
operations on gfx94x. | ||
|
||
#### Example | ||
```mlir | ||
%result = amdgpu.mfma %a * %b + %c | ||
{ abid = 1 : i32, cbsz = 1 : i32, k = 1 : i32, m = 32 : i32, n = 32 : i32, blocks = 2 : i32 } | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is missing the |
||
: f32, f32, vector<32xf32> | ||
``` | ||
}]; | ||
let assemblyFormat = [{ | ||
$sourceA `*` $sourceB `+` $destC | ||
|
@@ -851,6 +946,12 @@ def AMDGPU_WMMAOp : | |
|
||
The `clamp` flag is used to saturate the output of type T to numeric_limits<T>::max() | ||
in case of overflow. | ||
|
||
#### Example | ||
```mlir | ||
%result = amdgpu.wmma %a * %b + %c | ||
: vector<16xf16>, vector<16xf16>, vector<8xf16> | ||
``` | ||
}]; | ||
let assemblyFormat = [{ | ||
$sourceA `*` $sourceB `+` $destC | ||
|
@@ -973,6 +1074,14 @@ def AMDGPU_ScaledMFMAOp : | |
are omitted from this wrapper. | ||
- The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for | ||
double-precision operations on gfx94x and so are not included here. | ||
|
||
#### Example | ||
```mlir | ||
%result = amdgpu.scaled_mfma | ||
(%scale_a[0] * %vec_a) * (%scale_b[1] * %vec_b) + %accum | ||
{ k = 64 : i32, m = 32 : i32, n = 32 : i32 } | ||
: f8E8M0FNU, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32> | ||
``` | ||
}]; | ||
let assemblyFormat = [{ | ||
`(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*` `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For me, it is more intuitive to use "extend" instead of "extract" here as that aligns more with the operation semantics.