Skip to content

Commit 8635b8e

Browse files
authored
[AMDGPU][True16][MC] true16 for v_alignbit_b32 (#119409)
Support the true16 format for v_alignbit_b32 in MC. Since we are replacing `v_alignbit_b32` with `v_alignbit_b32_t16`/`v_alignbit_b32_fake16` on GFX11 and later, we have to update the CodeGen patterns to use `v_alignbit_b32_fake16` in order to keep the CodeGen tests passing. No new pattern logic is introduced; the existing `v_alignbit_b32` patterns are simply re-expressed with the fake16 format. Some of the true16 CodeGen tests are impacted, since `v_alignbit_b32` selection is removed on GFX11 and later while `v_alignbit_b32_t16` selection is not yet supported. The CodeGen patch for `v_alignbit_b32_t16` will follow in a separate patch.
1 parent 1357279 commit 8635b8e

16 files changed

+342
-101
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 98 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2499,6 +2499,7 @@ def : AMDGPUPat <
24992499
$src1), sub1)
25002500
>;
25012501

2502+
let True16Predicate = NotHasTrue16BitInsts in {
25022503
def : ROTRPattern <V_ALIGNBIT_B32_e64>;
25032504

25042505
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
@@ -2508,6 +2509,42 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
25082509
def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
25092510
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
25102511
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
2512+
} // end True16Predicate = NotHasTrue16BitInsts
2513+
2514+
let True16Predicate = UseFakeTrue16Insts in {
2515+
def : GCNPat <
2516+
(rotr i32:$src0, i32:$src1),
2517+
(V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0,
2518+
/* src1_modifiers */ 0, $src0,
2519+
/* src2_modifiers */ 0,
2520+
$src1, /* clamp */ 0, /* op_sel */ 0)
2521+
>;
2522+
2523+
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
2524+
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
2525+
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
2526+
0, /* src1_modifiers */
2527+
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)),
2528+
0, /* src2_modifiers */
2529+
$src1, /* clamp */ 0, /* op_sel */ 0)
2530+
>;
2531+
2532+
def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
2533+
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
2534+
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
2535+
0, /* src1_modifiers */
2536+
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)),
2537+
0, /* src2_modifiers */
2538+
$src1, /* clamp */ 0, /* op_sel */ 0)
2539+
>;
2540+
2541+
def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
2542+
(V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0,
2543+
/* src1_modifiers */ 0, $src1,
2544+
/* src2_modifiers */ 0,
2545+
$src2, /* clamp */ 0, /* op_sel */ 0)
2546+
>;
2547+
} // end True16Predicate = UseFakeTrue16Insts
25112548

25122549
/********** ====================== **********/
25132550
/********** Indirect addressing **********/
@@ -3015,35 +3052,69 @@ def : GCNPat <
30153052
(i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
30163053
>;
30173054

3055+
let True16Predicate = NotHasTrue16BitInsts in
30183056
def : GCNPat <
30193057
(i32 (bswap i32:$a)),
30203058
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
30213059
(V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 24)),
30223060
(V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 8)))
30233061
>;
30243062

3025-
// FIXME: This should have been narrowed to i32 during legalization.
3026-
// This pattern should also be skipped for GlobalISel
3063+
let True16Predicate = UseFakeTrue16Insts in
3064+
def : GCNPat <
3065+
(i32 (bswap i32:$a)),
3066+
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
3067+
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
3068+
VSrc_b32:$a,
3069+
0, /* src1_modifiers */
3070+
VSrc_b32:$a,
3071+
0, /* src2_modifiers */
3072+
(i32 24), /* clamp */ 0, /* op_sel */ 0),
3073+
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
3074+
VSrc_b32:$a,
3075+
0, /* src1_modifiers */
3076+
VSrc_b32:$a,
3077+
0, /* src2_modifiers */
3078+
(i32 8), /* clamp */ 0, /* op_sel */ 0))
3079+
>;
3080+
3081+
class AlignBit32Inst<dag op1, dag op2, dag op3, bit isTrue16> {
3082+
defvar inst = !if(isTrue16, V_ALIGNBIT_B32_fake16_e64, V_ALIGNBIT_B32_e64);
3083+
defvar NoMods = !if(isTrue16, (inst 0), (inst));
3084+
dag ret = !con(NoMods, (inst op1), NoMods, (inst op2),
3085+
NoMods, (inst op3), NoMods, NoMods);
3086+
}
3087+
3088+
multiclass bswapi64ExtPat<bit hasTrue16> {
30273089
def : GCNPat <
30283090
(i64 (bswap i64:$a)),
30293091
(REG_SEQUENCE VReg_64,
30303092
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
3031-
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3032-
(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3033-
(i32 24)),
3034-
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3035-
(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3036-
(i32 8))),
3093+
AlignBit32Inst<(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3094+
(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3095+
(i32 24), hasTrue16>.ret,
3096+
AlignBit32Inst<(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3097+
(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3098+
(i32 8), hasTrue16>.ret),
30373099
sub0,
30383100
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
3039-
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3040-
(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3041-
(i32 24)),
3042-
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3043-
(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3044-
(i32 8))),
3101+
AlignBit32Inst<(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3102+
(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3103+
(i32 24), hasTrue16>.ret,
3104+
AlignBit32Inst<(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3105+
(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3106+
(i32 8), hasTrue16>.ret),
30453107
sub1)
30463108
>;
3109+
}
3110+
3111+
// FIXME: This should have been narrowed to i32 during legalization.
3112+
// This pattern should also be skipped for GlobalISel
3113+
let True16Predicate = NotHasTrue16BitInsts in
3114+
defm : bswapi64ExtPat</*hasTrue16*/0>;
3115+
3116+
let True16Predicate = UseFakeTrue16Insts in
3117+
defm : bswapi64ExtPat</*hasTrue16*/1>;
30473118

30483119
// FIXME: The AddedComplexity should not be needed, but in GlobalISel
30493120
// the BFI pattern ends up taking precedence without it.
@@ -3456,8 +3527,7 @@ def : GCNPat <
34563527

34573528
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
34583529
// Special case, can use V_ALIGNBIT (always uses encoded literal)
3459-
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
3460-
let True16Predicate = p in {
3530+
let True16Predicate = NotHasTrue16BitInsts in
34613531
def : GCNPat <
34623532
(vecTy (DivergentBinFrag<build_vector>
34633533
(Ty !if(!eq(Ty, i16),
@@ -3467,7 +3537,19 @@ def : GCNPat <
34673537
(V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16))
34683538
>;
34693539

3540+
let True16Predicate = UseFakeTrue16Insts in
3541+
def : GCNPat <
3542+
(vecTy (DivergentBinFrag<build_vector>
3543+
(Ty !if(!eq(Ty, i16),
3544+
(Ty (trunc (srl VGPR_32:$a, (i32 16)))),
3545+
(Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))),
3546+
(Ty VGPR_32:$b))),
3547+
(V_ALIGNBIT_B32_fake16_e64 0, VGPR_32:$b, 0, VGPR_32:$a, 0, (i16 16), 0, 0)
3548+
>;
3549+
34703550
// Take the upper 16 bits from each VGPR_32 and concat them
3551+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
3552+
let True16Predicate = p in
34713553
def : GCNPat <
34723554
(vecTy (DivergentBinFrag<build_vector>
34733555
(Ty !if(!eq(Ty, i16),
@@ -3478,7 +3560,6 @@ def : GCNPat <
34783560
(Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
34793561
(V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x07060302)))
34803562
>;
3481-
}
34823563

34833564
} // end foreach Ty
34843565

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -211,9 +211,12 @@ defm V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>,
211211
defm V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>;
212212
defm V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>;
213213
defm V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfi>;
214-
defm V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, fshr>;
214+
defm V_ALIGNBIT_B32 : VOP3Inst_t16_with_profiles <"v_alignbit_b32",
215+
VOP3_Profile<VOP_I32_I32_I32_I32>,
216+
VOP3_Profile_True16<VOP_I32_I32_I32_I16, VOP3_OPSEL>,
217+
VOP3_Profile_Fake16<VOP_I32_I32_I32_I16, VOP3_OPSEL>,
218+
fshr, null_frag>;
215219

216-
let True16Predicate = NotHasTrue16BitInsts in
217220
defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
218221
let True16Predicate = UseRealTrue16Insts in
219222
defm V_ALIGNBYTE_B32_t16 : VOP3Inst <"v_alignbyte_b32_t16", VOP3_Profile_True16<VOP_I32_I32_I32_I16, VOP3_OPSEL>>;
@@ -1726,7 +1729,7 @@ defm V_BFI_B32 : VOP3_Realtriple_gfx11_gfx12<0x212>;
17261729
defm V_FMA_F32 : VOP3_Realtriple_gfx11_gfx12<0x213>;
17271730
defm V_FMA_F64 : VOP3_Real_Base_gfx11_gfx12<0x214>;
17281731
defm V_LERP_U8 : VOP3_Realtriple_gfx11_gfx12<0x215>;
1729-
defm V_ALIGNBIT_B32 : VOP3_Realtriple_gfx11_gfx12<0x216>;
1732+
defm V_ALIGNBIT_B32 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x216, "v_alignbit_b32">;
17301733
defm V_ALIGNBYTE_B32 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x217, "v_alignbyte_b32">;
17311734
defm V_MULLIT_F32 : VOP3_Realtriple_gfx11_gfx12<0x218>;
17321735
defm V_MIN3_F32 : VOP3_Realtriple_gfx11<0x219>;

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
44
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
55
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
6-
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
6+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s
77

88
---
99

@@ -23,6 +23,15 @@ body: |
2323
; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
2424
; GCN-NEXT: [[V_ALIGNBIT_B32_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
2525
; GCN-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_e64_]]
26+
;
27+
; GFX11-LABEL: name: fshr_s32
28+
; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2
29+
; GFX11-NEXT: {{ $}}
30+
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
31+
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
32+
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
33+
; GFX11-NEXT: [[V_ALIGNBIT_B32_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
34+
; GFX11-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_fake16_e64_]]
2635
%0:vgpr(s32) = COPY $vgpr0
2736
%1:vgpr(s32) = COPY $vgpr1
2837
%2:vgpr(s32) = COPY $vgpr2

llvm/test/CodeGen/AMDGPU/bswap.ll

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -737,15 +737,25 @@ define i64 @v_bswap_i48(i64 %src) {
737737
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
738738
; VI-NEXT: s_setpc_b64 s[30:31]
739739
;
740-
; GFX11-LABEL: v_bswap_i48:
741-
; GFX11: ; %bb.0:
742-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
743-
; GFX11-NEXT: v_perm_b32 v2, 0, v0, 0x10203
744-
; GFX11-NEXT: v_perm_b32 v0, 0, v1, 0x10203
745-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
746-
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
747-
; GFX11-NEXT: v_alignbit_b32 v0, v2, v0, 16
748-
; GFX11-NEXT: s_setpc_b64 s[30:31]
740+
; GFX11-REAL16-LABEL: v_bswap_i48:
741+
; GFX11-REAL16: ; %bb.0:
742+
; GFX11-REAL16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
743+
; GFX11-REAL16-NEXT: v_perm_b32 v2, 0, v0, 0x10203
744+
; GFX11-REAL16-NEXT: v_perm_b32 v1, 0, v1, 0x10203
745+
; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
746+
; GFX11-REAL16-NEXT: v_lshrrev_b64 v[0:1], 16, v[1:2]
747+
; GFX11-REAL16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
748+
; GFX11-REAL16-NEXT: s_setpc_b64 s[30:31]
749+
;
750+
; GFX11-FAKE16-LABEL: v_bswap_i48:
751+
; GFX11-FAKE16: ; %bb.0:
752+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
753+
; GFX11-FAKE16-NEXT: v_perm_b32 v2, 0, v0, 0x10203
754+
; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, v1, 0x10203
755+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
756+
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
757+
; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v2, v0, 16
758+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
749759
%trunc = trunc i64 %src to i48
750760
%bswap = call i48 @llvm.bswap.i48(i48 %trunc)
751761
%zext = zext i48 %bswap to i64

llvm/test/MC/AMDGPU/gfx11_asm_vop3.s

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -416,11 +416,11 @@ v_alignbit_b32 v5, s1, v255, s3
416416
v_alignbit_b32 v5, s105, s105, s105
417417
// GFX11: v_alignbit_b32 v5, s105, s105, s105 ; encoding: [0x05,0x00,0x16,0xd6,0x69,0xd2,0xa4,0x01]
418418

419-
v_alignbit_b32 v5, vcc_lo, ttmp15, v3
420-
// GFX11: v_alignbit_b32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x16,0xd6,0x6a,0xf6,0x0c,0x04]
419+
v_alignbit_b32 v5, vcc_lo, ttmp15, v3.l
420+
// GFX11: v_alignbit_b32 v5, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x16,0xd6,0x6a,0xf6,0x0c,0x04]
421421

422-
v_alignbit_b32 v5, vcc_hi, 0xaf123456, v255
423-
// GFX11: v_alignbit_b32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x16,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf]
422+
v_alignbit_b32 v5, vcc_hi, 0xaf123456, v255.l
423+
// GFX11: v_alignbit_b32 v5, vcc_hi, 0xaf123456, v255.l ; encoding: [0x05,0x00,0x16,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf]
424424

425425
v_alignbit_b32 v5, ttmp15, src_scc, ttmp15
426426
// GFX11: v_alignbit_b32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x16,0xd6,0x7b,0xfa,0xed,0x01]
@@ -449,6 +449,9 @@ v_alignbit_b32 v5, src_scc, vcc_lo, -1
449449
v_alignbit_b32 v255, 0xaf123456, vcc_hi, null
450450
// GFX11: v_alignbit_b32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x16,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf]
451451

452+
v_alignbit_b32 v5, vcc_hi, 0xaf123456, v255.h
453+
// GFX11: v_alignbit_b32 v5, vcc_hi, 0xaf123456, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x16,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf]
454+
452455
v_alignbyte_b32 v5, v1, v2, s3
453456
// GFX11: v_alignbyte_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x17,0xd6,0x01,0x05,0x0e,0x00]
454457

@@ -543,13 +546,13 @@ v_and_b16 v255.l, 0xfe0b, vcc_hi
543546
// GFX11: v_and_b16 v255.l, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x62,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
544547

545548
v_and_b16 v5.l, v1.h, v2.l
546-
// GFX11: [0x05,0x08,0x62,0xd7,0x01,0x05,0x02,0x00]
549+
// GFX11: v_and_b16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x62,0xd7,0x01,0x05,0x02,0x00]
547550

548551
v_and_b16 v5.l, v255.l, v255.h
549-
// GFX11: [0x05,0x10,0x62,0xd7,0xff,0xff,0x03,0x00]
552+
// GFX11: v_and_b16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x62,0xd7,0xff,0xff,0x03,0x00]
550553

551554
v_and_b16 v255.h, 0xfe0b, vcc_hi
552-
// GFX11: [0xff,0x40,0x62,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
555+
// GFX11: v_and_b16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x62,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
553556

554557
v_and_or_b32 v5, v1, v2, s3
555558
// GFX11: v_and_or_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x57,0xd6,0x01,0x05,0x0e,0x00]
@@ -5577,13 +5580,13 @@ v_or_b16 v255.l, 0xfe0b, vcc_hi
55775580
// GFX11: v_or_b16 v255.l, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x63,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
55785581

55795582
v_or_b16 v5.l, v1.h, v2.l
5580-
// GFX11: [0x05,0x08,0x63,0xd7,0x01,0x05,0x02,0x00]
5583+
// GFX11: v_or_b16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x63,0xd7,0x01,0x05,0x02,0x00]
55815584

55825585
v_or_b16 v5.l, v255.l, v255.h
5583-
// GFX11: [0x05,0x10,0x63,0xd7,0xff,0xff,0x03,0x00]
5586+
// GFX11: v_or_b16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x63,0xd7,0xff,0xff,0x03,0x00]
55845587

55855588
v_or_b16 v255.h, 0xfe0b, vcc_hi
5586-
// GFX11: [0xff,0x40,0x63,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
5589+
// GFX11: v_or_b16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x63,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
55875590

55885591
v_pack_b32_f16 v5, v1.l, v2.l
55895592
// GFX11: v_pack_b32_f16 v5, v1.l, v2.l ; encoding: [0x05,0x00,0x11,0xd7,0x01,0x05,0x02,0x00]
@@ -6632,10 +6635,10 @@ v_xor_b16 v255.l, 0xfe0b, vcc_hi
66326635
// GFX11: v_xor_b16 v255.l, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x64,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
66336636

66346637
v_xor_b16 v5.l, v1.h, v2.l
6635-
// GFX11: [0x05,0x08,0x64,0xd7,0x01,0x05,0x02,0x00]
6638+
// GFX11: v_xor_b16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x64,0xd7,0x01,0x05,0x02,0x00]
66366639

66376640
v_xor_b16 v5.l, v255.l, v255.h
6638-
// GFX11: [0x05,0x10,0x64,0xd7,0xff,0xff,0x03,0x00]
6641+
// GFX11: v_xor_b16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x64,0xd7,0xff,0xff,0x03,0x00]
66396642

66406643
v_xor_b16 v255.h, 0xfe0b, vcc_hi
6641-
// GFX11: [0xff,0x40,0x64,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
6644+
// GFX11: v_xor_b16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x64,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]

0 commit comments

Comments
 (0)