Skip to content

Commit 1a48e1d

Browse files
[AMDGPU] Do not fold COPY with implicit operands (#136003)
Folding may remove a COPY from inside a divergent loop.
1 parent d7d1706 commit 1a48e1d

File tree

5 files changed

+160
-126
lines changed

5 files changed

+160
-126
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1091,7 +1091,8 @@ void SIFoldOperandsImpl::foldOperand(
10911091
} else {
10921092
if (UseMI->isCopy() && OpToFold.isReg() &&
10931093
UseMI->getOperand(0).getReg().isVirtual() &&
1094-
!UseMI->getOperand(1).getSubReg()) {
1094+
!UseMI->getOperand(1).getSubReg() &&
1095+
OpToFold.getParent()->implicit_operands().empty()) {
10951096
LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
10961097
unsigned Size = TII->getOpSize(*UseMI, 1);
10971098
Register UseReg = OpToFold.getReg();
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-fold-operands -o - %s | FileCheck %s
3+
4+
---
5+
liveins:
6+
name: do_not_fold_copy_with_implicit_exec
7+
tracksRegLiveness: true
8+
body: |
9+
; CHECK-LABEL: name: do_not_fold_copy_with_implicit_exec
10+
; CHECK: bb.0:
11+
; CHECK-NEXT: successors: %bb.1(0x80000000)
12+
; CHECK-NEXT: {{ $}}
13+
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
14+
; CHECK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
15+
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0, implicit $exec
16+
; CHECK-NEXT: S_BRANCH %bb.1
17+
; CHECK-NEXT: {{ $}}
18+
; CHECK-NEXT: bb.1:
19+
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
20+
; CHECK-NEXT: {{ $}}
21+
; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_1]], %bb.0, %4, %bb.1
22+
; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %6, %bb.1
23+
; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI1]], 1, implicit-def dead $scc
24+
; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK [[S_MOV_B64_]], [[PHI]], implicit-def dead $scc
25+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]], implicit $exec
26+
; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
27+
; CHECK-NEXT: S_BRANCH %bb.2
28+
; CHECK-NEXT: {{ $}}
29+
; CHECK-NEXT: bb.2:
30+
; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
31+
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
32+
; CHECK-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, 0, killed [[DEF]], implicit $exec
33+
; CHECK-NEXT: S_ENDPGM 0
34+
35+
bb.0:
36+
%0:sreg_64 = S_MOV_B64 0
37+
%1:sreg_64 = S_MOV_B64 0
38+
%2:sreg_32 = S_MOV_B32 0, implicit $exec
39+
S_BRANCH %bb.1
40+
41+
bb.1:
42+
%3:sreg_64 = PHI %1, %bb.0, %4, %bb.1
43+
%5:sreg_32 = PHI %2, %bb.0, %6, %bb.1
44+
%6:sreg_32 = S_ADD_I32 %5, 1, implicit-def dead $scc
45+
%4:sreg_64 = SI_IF_BREAK %0, %3, implicit-def dead $scc
46+
%7:vgpr_32 = COPY %6, implicit $exec
47+
SI_LOOP %4, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
48+
S_BRANCH %bb.2
49+
50+
bb.2:
51+
SI_END_CF %4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
52+
%9:vgpr_32 = COPY %7
53+
%10:sreg_64_xexec = IMPLICIT_DEF
54+
%11:vgpr_32 = V_SET_INACTIVE_B32 0, %9, 0, 0, killed %10, implicit $exec
55+
S_ENDPGM 0
56+
...

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -429,13 +429,14 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call
429429
; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s9, 0, v0
430430
; DAGISEL12-NEXT: s_mov_b32 exec_lo, s8
431431
; DAGISEL12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v13, v1
432+
; DAGISEL12-NEXT: v_mov_b32_e32 v11, s9
432433
; DAGISEL12-NEXT: s_or_b32 s4, vcc_lo, s4
433434
; DAGISEL12-NEXT: s_wait_alu 0xfffe
434435
; DAGISEL12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
435436
; DAGISEL12-NEXT: s_cbranch_execnz .LBB3_2
436437
; DAGISEL12-NEXT: ; %bb.3: ; %tail.loopexit
437438
; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s4
438-
; DAGISEL12-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_add_nc_u32 v10, 42, v1
439+
; DAGISEL12-NEXT: v_add_nc_u32_e32 v10, 42, v1
439440
; DAGISEL12-NEXT: .LBB3_4: ; %Flow1
440441
; DAGISEL12-NEXT: s_wait_alu 0xfffe
441442
; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
@@ -526,13 +527,13 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call
526527
; DAGISEL10-NEXT: v_cmp_ne_u32_e64 s9, 0, v0
527528
; DAGISEL10-NEXT: s_mov_b32 exec_lo, s8
528529
; DAGISEL10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v13, v1
530+
; DAGISEL10-NEXT: v_mov_b32_e32 v11, s9
529531
; DAGISEL10-NEXT: s_or_b32 s4, vcc_lo, s4
530532
; DAGISEL10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
531533
; DAGISEL10-NEXT: s_cbranch_execnz .LBB3_2
532534
; DAGISEL10-NEXT: ; %bb.3: ; %tail.loopexit
533535
; DAGISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s4
534536
; DAGISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v1
535-
; DAGISEL10-NEXT: v_mov_b32_e32 v11, s9
536537
; DAGISEL10-NEXT: .LBB3_4: ; %Flow1
537538
; DAGISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s3
538539
; DAGISEL10-NEXT: s_mov_b32 s3, exec_lo

llvm/test/CodeGen/AMDGPU/mfma-loop.ll

Lines changed: 96 additions & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -1425,41 +1425,42 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
14251425
; GFX90A: ; %bb.0: ; %entry
14261426
; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c
14271427
; GFX90A-NEXT: s_mov_b32 s0, 16
1428-
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
14291428
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
14301429
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1431-
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s1
1432-
; GFX90A-NEXT: v_accvgpr_write_b32 a30, s1
1433-
; GFX90A-NEXT: v_accvgpr_write_b32 a29, s1
1434-
; GFX90A-NEXT: v_accvgpr_write_b32 a28, s1
1435-
; GFX90A-NEXT: v_accvgpr_write_b32 a27, s1
1436-
; GFX90A-NEXT: v_accvgpr_write_b32 a26, s1
1437-
; GFX90A-NEXT: v_accvgpr_write_b32 a25, s1
1438-
; GFX90A-NEXT: v_accvgpr_write_b32 a24, s1
1439-
; GFX90A-NEXT: v_accvgpr_write_b32 a23, s1
1440-
; GFX90A-NEXT: v_accvgpr_write_b32 a22, s1
1441-
; GFX90A-NEXT: v_accvgpr_write_b32 a21, s1
1442-
; GFX90A-NEXT: v_accvgpr_write_b32 a20, s1
1443-
; GFX90A-NEXT: v_accvgpr_write_b32 a19, s1
1444-
; GFX90A-NEXT: v_accvgpr_write_b32 a18, s1
1445-
; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
1446-
; GFX90A-NEXT: v_accvgpr_write_b32 a16, s1
1447-
; GFX90A-NEXT: v_accvgpr_write_b32 a15, s1
1448-
; GFX90A-NEXT: v_accvgpr_write_b32 a14, s1
1449-
; GFX90A-NEXT: v_accvgpr_write_b32 a13, s1
1450-
; GFX90A-NEXT: v_accvgpr_write_b32 a12, s1
1451-
; GFX90A-NEXT: v_accvgpr_write_b32 a11, s1
1452-
; GFX90A-NEXT: v_accvgpr_write_b32 a10, s1
1453-
; GFX90A-NEXT: v_accvgpr_write_b32 a9, s1
1454-
; GFX90A-NEXT: v_accvgpr_write_b32 a8, s1
1455-
; GFX90A-NEXT: v_accvgpr_write_b32 a7, s1
1456-
; GFX90A-NEXT: v_accvgpr_write_b32 a6, s1
1457-
; GFX90A-NEXT: v_accvgpr_write_b32 a5, s1
1458-
; GFX90A-NEXT: v_accvgpr_write_b32 a4, s1
1459-
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s1
1460-
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s1
1461-
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
1462-
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s1
1430+
; GFX90A-NEXT: v_mov_b32_e32 v0, s1
1431+
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
1432+
; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
1433+
; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
1434+
; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
1435+
; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
1436+
; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
1437+
; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
1438+
; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
1439+
; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
1440+
; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
1441+
; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
1442+
; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
1443+
; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
1444+
; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
1445+
; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
1446+
; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
1447+
; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
1448+
; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
1449+
; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
1450+
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
1451+
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
1452+
; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
1453+
; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
1454+
; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
1455+
; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
1456+
; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
1457+
; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
1458+
; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
1459+
; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
1460+
; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
1461+
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
1462+
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
1463+
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
14631464
; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader
14641465
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
14651466
; GFX90A-NEXT: s_nop 1
@@ -1487,41 +1488,42 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
14871488
; GFX942: ; %bb.0: ; %entry
14881489
; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
14891490
; GFX942-NEXT: s_mov_b32 s0, 16
1490-
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
14911491
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
14921492
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
1493-
; GFX942-NEXT: v_accvgpr_write_b32 a31, s1
1494-
; GFX942-NEXT: v_accvgpr_write_b32 a30, s1
1495-
; GFX942-NEXT: v_accvgpr_write_b32 a29, s1
1496-
; GFX942-NEXT: v_accvgpr_write_b32 a28, s1
1497-
; GFX942-NEXT: v_accvgpr_write_b32 a27, s1
1498-
; GFX942-NEXT: v_accvgpr_write_b32 a26, s1
1499-
; GFX942-NEXT: v_accvgpr_write_b32 a25, s1
1500-
; GFX942-NEXT: v_accvgpr_write_b32 a24, s1
1501-
; GFX942-NEXT: v_accvgpr_write_b32 a23, s1
1502-
; GFX942-NEXT: v_accvgpr_write_b32 a22, s1
1503-
; GFX942-NEXT: v_accvgpr_write_b32 a21, s1
1504-
; GFX942-NEXT: v_accvgpr_write_b32 a20, s1
1505-
; GFX942-NEXT: v_accvgpr_write_b32 a19, s1
1506-
; GFX942-NEXT: v_accvgpr_write_b32 a18, s1
1507-
; GFX942-NEXT: v_accvgpr_write_b32 a17, s1
1508-
; GFX942-NEXT: v_accvgpr_write_b32 a16, s1
1509-
; GFX942-NEXT: v_accvgpr_write_b32 a15, s1
1510-
; GFX942-NEXT: v_accvgpr_write_b32 a14, s1
1511-
; GFX942-NEXT: v_accvgpr_write_b32 a13, s1
1512-
; GFX942-NEXT: v_accvgpr_write_b32 a12, s1
1513-
; GFX942-NEXT: v_accvgpr_write_b32 a11, s1
1514-
; GFX942-NEXT: v_accvgpr_write_b32 a10, s1
1515-
; GFX942-NEXT: v_accvgpr_write_b32 a9, s1
1516-
; GFX942-NEXT: v_accvgpr_write_b32 a8, s1
1517-
; GFX942-NEXT: v_accvgpr_write_b32 a7, s1
1518-
; GFX942-NEXT: v_accvgpr_write_b32 a6, s1
1519-
; GFX942-NEXT: v_accvgpr_write_b32 a5, s1
1520-
; GFX942-NEXT: v_accvgpr_write_b32 a4, s1
1521-
; GFX942-NEXT: v_accvgpr_write_b32 a3, s1
1522-
; GFX942-NEXT: v_accvgpr_write_b32 a2, s1
1523-
; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
1524-
; GFX942-NEXT: v_accvgpr_write_b32 a0, s1
1493+
; GFX942-NEXT: v_mov_b32_e32 v0, s1
1494+
; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
1495+
; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
1496+
; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
1497+
; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
1498+
; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
1499+
; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
1500+
; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
1501+
; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
1502+
; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
1503+
; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
1504+
; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
1505+
; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
1506+
; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
1507+
; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
1508+
; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
1509+
; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
1510+
; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
1511+
; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
1512+
; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
1513+
; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
1514+
; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
1515+
; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
1516+
; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
1517+
; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
1518+
; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
1519+
; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
1520+
; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
1521+
; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
1522+
; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
1523+
; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
1524+
; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
1525+
; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
1526+
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
15251527
; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader
15261528
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
15271529
; GFX942-NEXT: s_nop 1
@@ -1696,6 +1698,8 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
16961698
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
16971699
; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
16981700
; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
1701+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1702+
; GFX90A-NEXT: v_mov_b32_e32 v0, s1
16991703
; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
17001704
; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
17011705
; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
@@ -1725,8 +1729,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
17251729
; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
17261730
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
17271731
; GFX90A-NEXT: s_mov_b32 s0, 16
1728-
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1729-
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
1732+
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
17301733
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
17311734
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
17321735
; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader
@@ -1759,6 +1762,8 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
17591762
; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
17601763
; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
17611764
; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
1765+
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
1766+
; GFX942-NEXT: v_mov_b32_e32 v0, s1
17621767
; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
17631768
; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
17641769
; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
@@ -1788,8 +1793,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
17881793
; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
17891794
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
17901795
; GFX942-NEXT: s_mov_b32 s0, 16
1791-
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
1792-
; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
1796+
; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
17931797
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
17941798
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
17951799
; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader
@@ -2050,66 +2054,38 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
20502054
; GFX908-NEXT: s_nop 7
20512055
; GFX908-NEXT: s_nop 1
20522056
; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2053-
; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2054-
; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2057+
; GFX908-NEXT: s_nop 1
2058+
; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
20552059
; GFX908-NEXT: v_accvgpr_write_b32 a1, v2
2056-
; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2057-
; GFX908-NEXT: v_accvgpr_write_b32 a2, v3
2058-
; GFX908-NEXT: v_accvgpr_write_b32 a3, v33
2060+
; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
2061+
; GFX908-NEXT: v_accvgpr_write_b32 a3, v2
20592062
; GFX908-NEXT: v_accvgpr_write_b32 a4, v2
2060-
; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2061-
; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2062-
; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2063-
; GFX908-NEXT: v_accvgpr_write_b32 a5, v3
2064-
; GFX908-NEXT: v_accvgpr_write_b32 a6, v33
2063+
; GFX908-NEXT: v_accvgpr_write_b32 a5, v2
2064+
; GFX908-NEXT: v_accvgpr_write_b32 a6, v2
20652065
; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
2066-
; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2067-
; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2068-
; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2069-
; GFX908-NEXT: v_accvgpr_write_b32 a8, v3
2070-
; GFX908-NEXT: v_accvgpr_write_b32 a9, v33
2066+
; GFX908-NEXT: v_accvgpr_write_b32 a8, v2
2067+
; GFX908-NEXT: v_accvgpr_write_b32 a9, v2
20712068
; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
2072-
; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2073-
; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2074-
; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2075-
; GFX908-NEXT: v_accvgpr_write_b32 a11, v3
2076-
; GFX908-NEXT: v_accvgpr_write_b32 a12, v33
2069+
; GFX908-NEXT: v_accvgpr_write_b32 a11, v2
2070+
; GFX908-NEXT: v_accvgpr_write_b32 a12, v2
20772071
; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
2078-
; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2079-
; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2080-
; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2081-
; GFX908-NEXT: v_accvgpr_write_b32 a14, v3
2082-
; GFX908-NEXT: v_accvgpr_write_b32 a15, v33
2072+
; GFX908-NEXT: v_accvgpr_write_b32 a14, v2
2073+
; GFX908-NEXT: v_accvgpr_write_b32 a15, v2
20832074
; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
2084-
; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2085-
; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2086-
; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2087-
; GFX908-NEXT: v_accvgpr_write_b32 a17, v3
2088-
; GFX908-NEXT: v_accvgpr_write_b32 a18, v33
2075+
; GFX908-NEXT: v_accvgpr_write_b32 a17, v2
2076+
; GFX908-NEXT: v_accvgpr_write_b32 a18, v2
20892077
; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
2090-
; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2091-
; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2092-
; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2093-
; GFX908-NEXT: v_accvgpr_write_b32 a20, v3
2094-
; GFX908-NEXT: v_accvgpr_write_b32 a21, v33
2078+
; GFX908-NEXT: v_accvgpr_write_b32 a20, v2
2079+
; GFX908-NEXT: v_accvgpr_write_b32 a21, v2
20952080
; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
2096-
; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2097-
; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2098-
; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2099-
; GFX908-NEXT: v_accvgpr_write_b32 a23, v3
2100-
; GFX908-NEXT: v_accvgpr_write_b32 a24, v33
2081+
; GFX908-NEXT: v_accvgpr_write_b32 a23, v2
2082+
; GFX908-NEXT: v_accvgpr_write_b32 a24, v2
21012083
; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
2102-
; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2103-
; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2104-
; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2105-
; GFX908-NEXT: v_accvgpr_write_b32 a26, v3
2106-
; GFX908-NEXT: v_accvgpr_write_b32 a27, v33
2084+
; GFX908-NEXT: v_accvgpr_write_b32 a26, v2
2085+
; GFX908-NEXT: v_accvgpr_write_b32 a27, v2
21072086
; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
2108-
; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2109-
; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2110-
; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2111-
; GFX908-NEXT: v_accvgpr_write_b32 a29, v3
2112-
; GFX908-NEXT: v_accvgpr_write_b32 a30, v33
2087+
; GFX908-NEXT: v_accvgpr_write_b32 a29, v2
2088+
; GFX908-NEXT: v_accvgpr_write_b32 a30, v2
21132089
; GFX908-NEXT: v_accvgpr_write_b32 a31, v2
21142090
; GFX908-NEXT: .LBB8_1: ; %for.cond.preheader
21152091
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1

llvm/test/CodeGen/AMDGPU/mul.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2619,13 +2619,13 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
26192619
; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
26202620
; SI-NEXT: v_add_i32_e32 v0, vcc, s5, v0
26212621
; SI-NEXT: s_mul_i32 s5, s14, s9
2622-
; SI-NEXT: s_mul_i32 s4, s12, s10
26232622
; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v1
26242623
; SI-NEXT: s_mul_i32 s5, s15, s8
26252624
; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v1
26262625
; SI-NEXT: s_mul_i32 s5, s14, s8
2627-
; SI-NEXT: v_mov_b32_e32 v2, s4
2628-
; SI-NEXT: v_add_i32_e32 v2, vcc, s5, v2
2626+
; SI-NEXT: s_mul_i32 s4, s12, s10
2627+
; SI-NEXT: v_mov_b32_e32 v2, s5
2628+
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
26292629
; SI-NEXT: v_addc_u32_e32 v0, vcc, v1, v0, vcc
26302630
; SI-NEXT: v_mov_b32_e32 v1, s12
26312631
; SI-NEXT: v_mul_hi_u32 v5, s8, v1

0 commit comments

Comments
 (0)