@@ -1425,41 +1425,42 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
1425
1425
; GFX90A: ; %bb.0: ; %entry
1426
1426
; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c
1427
1427
; GFX90A-NEXT: s_mov_b32 s0, 16
1428
- ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
1429
1428
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
1430
1429
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1431
- ; GFX90A-NEXT: v_accvgpr_write_b32 a31, s1
1432
- ; GFX90A-NEXT: v_accvgpr_write_b32 a30, s1
1433
- ; GFX90A-NEXT: v_accvgpr_write_b32 a29, s1
1434
- ; GFX90A-NEXT: v_accvgpr_write_b32 a28, s1
1435
- ; GFX90A-NEXT: v_accvgpr_write_b32 a27, s1
1436
- ; GFX90A-NEXT: v_accvgpr_write_b32 a26, s1
1437
- ; GFX90A-NEXT: v_accvgpr_write_b32 a25, s1
1438
- ; GFX90A-NEXT: v_accvgpr_write_b32 a24, s1
1439
- ; GFX90A-NEXT: v_accvgpr_write_b32 a23, s1
1440
- ; GFX90A-NEXT: v_accvgpr_write_b32 a22, s1
1441
- ; GFX90A-NEXT: v_accvgpr_write_b32 a21, s1
1442
- ; GFX90A-NEXT: v_accvgpr_write_b32 a20, s1
1443
- ; GFX90A-NEXT: v_accvgpr_write_b32 a19, s1
1444
- ; GFX90A-NEXT: v_accvgpr_write_b32 a18, s1
1445
- ; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
1446
- ; GFX90A-NEXT: v_accvgpr_write_b32 a16, s1
1447
- ; GFX90A-NEXT: v_accvgpr_write_b32 a15, s1
1448
- ; GFX90A-NEXT: v_accvgpr_write_b32 a14, s1
1449
- ; GFX90A-NEXT: v_accvgpr_write_b32 a13, s1
1450
- ; GFX90A-NEXT: v_accvgpr_write_b32 a12, s1
1451
- ; GFX90A-NEXT: v_accvgpr_write_b32 a11, s1
1452
- ; GFX90A-NEXT: v_accvgpr_write_b32 a10, s1
1453
- ; GFX90A-NEXT: v_accvgpr_write_b32 a9, s1
1454
- ; GFX90A-NEXT: v_accvgpr_write_b32 a8, s1
1455
- ; GFX90A-NEXT: v_accvgpr_write_b32 a7, s1
1456
- ; GFX90A-NEXT: v_accvgpr_write_b32 a6, s1
1457
- ; GFX90A-NEXT: v_accvgpr_write_b32 a5, s1
1458
- ; GFX90A-NEXT: v_accvgpr_write_b32 a4, s1
1459
- ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s1
1460
- ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s1
1461
- ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
1462
- ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s1
1430
+ ; GFX90A-NEXT: v_mov_b32_e32 v0, s1
1431
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
1432
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
1433
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
1434
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
1435
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
1436
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
1437
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
1438
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
1439
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
1440
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
1441
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
1442
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
1443
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
1444
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
1445
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
1446
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
1447
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
1448
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
1449
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
1450
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
1451
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
1452
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
1453
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
1454
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
1455
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
1456
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
1457
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
1458
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
1459
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
1460
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
1461
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
1462
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
1463
+ ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
1463
1464
; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader
1464
1465
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
1465
1466
; GFX90A-NEXT: s_nop 1
@@ -1487,41 +1488,42 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
1487
1488
; GFX942: ; %bb.0: ; %entry
1488
1489
; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
1489
1490
; GFX942-NEXT: s_mov_b32 s0, 16
1490
- ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
1491
1491
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
1492
1492
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
1493
- ; GFX942-NEXT: v_accvgpr_write_b32 a31, s1
1494
- ; GFX942-NEXT: v_accvgpr_write_b32 a30, s1
1495
- ; GFX942-NEXT: v_accvgpr_write_b32 a29, s1
1496
- ; GFX942-NEXT: v_accvgpr_write_b32 a28, s1
1497
- ; GFX942-NEXT: v_accvgpr_write_b32 a27, s1
1498
- ; GFX942-NEXT: v_accvgpr_write_b32 a26, s1
1499
- ; GFX942-NEXT: v_accvgpr_write_b32 a25, s1
1500
- ; GFX942-NEXT: v_accvgpr_write_b32 a24, s1
1501
- ; GFX942-NEXT: v_accvgpr_write_b32 a23, s1
1502
- ; GFX942-NEXT: v_accvgpr_write_b32 a22, s1
1503
- ; GFX942-NEXT: v_accvgpr_write_b32 a21, s1
1504
- ; GFX942-NEXT: v_accvgpr_write_b32 a20, s1
1505
- ; GFX942-NEXT: v_accvgpr_write_b32 a19, s1
1506
- ; GFX942-NEXT: v_accvgpr_write_b32 a18, s1
1507
- ; GFX942-NEXT: v_accvgpr_write_b32 a17, s1
1508
- ; GFX942-NEXT: v_accvgpr_write_b32 a16, s1
1509
- ; GFX942-NEXT: v_accvgpr_write_b32 a15, s1
1510
- ; GFX942-NEXT: v_accvgpr_write_b32 a14, s1
1511
- ; GFX942-NEXT: v_accvgpr_write_b32 a13, s1
1512
- ; GFX942-NEXT: v_accvgpr_write_b32 a12, s1
1513
- ; GFX942-NEXT: v_accvgpr_write_b32 a11, s1
1514
- ; GFX942-NEXT: v_accvgpr_write_b32 a10, s1
1515
- ; GFX942-NEXT: v_accvgpr_write_b32 a9, s1
1516
- ; GFX942-NEXT: v_accvgpr_write_b32 a8, s1
1517
- ; GFX942-NEXT: v_accvgpr_write_b32 a7, s1
1518
- ; GFX942-NEXT: v_accvgpr_write_b32 a6, s1
1519
- ; GFX942-NEXT: v_accvgpr_write_b32 a5, s1
1520
- ; GFX942-NEXT: v_accvgpr_write_b32 a4, s1
1521
- ; GFX942-NEXT: v_accvgpr_write_b32 a3, s1
1522
- ; GFX942-NEXT: v_accvgpr_write_b32 a2, s1
1523
- ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
1524
- ; GFX942-NEXT: v_accvgpr_write_b32 a0, s1
1493
+ ; GFX942-NEXT: v_mov_b32_e32 v0, s1
1494
+ ; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
1495
+ ; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
1496
+ ; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
1497
+ ; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
1498
+ ; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
1499
+ ; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
1500
+ ; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
1501
+ ; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
1502
+ ; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
1503
+ ; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
1504
+ ; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
1505
+ ; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
1506
+ ; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
1507
+ ; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
1508
+ ; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
1509
+ ; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
1510
+ ; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
1511
+ ; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
1512
+ ; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
1513
+ ; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
1514
+ ; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
1515
+ ; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
1516
+ ; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
1517
+ ; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
1518
+ ; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
1519
+ ; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
1520
+ ; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
1521
+ ; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
1522
+ ; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
1523
+ ; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
1524
+ ; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
1525
+ ; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
1526
+ ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
1525
1527
; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader
1526
1528
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
1527
1529
; GFX942-NEXT: s_nop 1
@@ -1696,6 +1698,8 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
1696
1698
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
1697
1699
; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
1698
1700
; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
1701
+ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1702
+ ; GFX90A-NEXT: v_mov_b32_e32 v0, s1
1699
1703
; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
1700
1704
; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
1701
1705
; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
@@ -1725,8 +1729,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
1725
1729
; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
1726
1730
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
1727
1731
; GFX90A-NEXT: s_mov_b32 s0, 16
1728
- ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1729
- ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
1732
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
1730
1733
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
1731
1734
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
1732
1735
; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader
@@ -1759,6 +1762,8 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
1759
1762
; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
1760
1763
; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
1761
1764
; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
1765
+ ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
1766
+ ; GFX942-NEXT: v_mov_b32_e32 v0, s1
1762
1767
; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
1763
1768
; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
1764
1769
; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
@@ -1788,8 +1793,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
1788
1793
; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
1789
1794
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
1790
1795
; GFX942-NEXT: s_mov_b32 s0, 16
1791
- ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
1792
- ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
1796
+ ; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
1793
1797
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
1794
1798
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
1795
1799
; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader
@@ -2050,66 +2054,38 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
2050
2054
; GFX908-NEXT: s_nop 7
2051
2055
; GFX908-NEXT: s_nop 1
2052
2056
; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2053
- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2054
- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2057
+ ; GFX908-NEXT: s_nop 1
2058
+ ; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
2055
2059
; GFX908-NEXT: v_accvgpr_write_b32 a1, v2
2056
- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2057
- ; GFX908-NEXT: v_accvgpr_write_b32 a2, v3
2058
- ; GFX908-NEXT: v_accvgpr_write_b32 a3, v33
2060
+ ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
2061
+ ; GFX908-NEXT: v_accvgpr_write_b32 a3, v2
2059
2062
; GFX908-NEXT: v_accvgpr_write_b32 a4, v2
2060
- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2061
- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2062
- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2063
- ; GFX908-NEXT: v_accvgpr_write_b32 a5, v3
2064
- ; GFX908-NEXT: v_accvgpr_write_b32 a6, v33
2063
+ ; GFX908-NEXT: v_accvgpr_write_b32 a5, v2
2064
+ ; GFX908-NEXT: v_accvgpr_write_b32 a6, v2
2065
2065
; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
2066
- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2067
- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2068
- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2069
- ; GFX908-NEXT: v_accvgpr_write_b32 a8, v3
2070
- ; GFX908-NEXT: v_accvgpr_write_b32 a9, v33
2066
+ ; GFX908-NEXT: v_accvgpr_write_b32 a8, v2
2067
+ ; GFX908-NEXT: v_accvgpr_write_b32 a9, v2
2071
2068
; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
2072
- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2073
- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2074
- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2075
- ; GFX908-NEXT: v_accvgpr_write_b32 a11, v3
2076
- ; GFX908-NEXT: v_accvgpr_write_b32 a12, v33
2069
+ ; GFX908-NEXT: v_accvgpr_write_b32 a11, v2
2070
+ ; GFX908-NEXT: v_accvgpr_write_b32 a12, v2
2077
2071
; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
2078
- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2079
- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2080
- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2081
- ; GFX908-NEXT: v_accvgpr_write_b32 a14, v3
2082
- ; GFX908-NEXT: v_accvgpr_write_b32 a15, v33
2072
+ ; GFX908-NEXT: v_accvgpr_write_b32 a14, v2
2073
+ ; GFX908-NEXT: v_accvgpr_write_b32 a15, v2
2083
2074
; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
2084
- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2085
- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2086
- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2087
- ; GFX908-NEXT: v_accvgpr_write_b32 a17, v3
2088
- ; GFX908-NEXT: v_accvgpr_write_b32 a18, v33
2075
+ ; GFX908-NEXT: v_accvgpr_write_b32 a17, v2
2076
+ ; GFX908-NEXT: v_accvgpr_write_b32 a18, v2
2089
2077
; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
2090
- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2091
- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2092
- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2093
- ; GFX908-NEXT: v_accvgpr_write_b32 a20, v3
2094
- ; GFX908-NEXT: v_accvgpr_write_b32 a21, v33
2078
+ ; GFX908-NEXT: v_accvgpr_write_b32 a20, v2
2079
+ ; GFX908-NEXT: v_accvgpr_write_b32 a21, v2
2095
2080
; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
2096
- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2097
- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2098
- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2099
- ; GFX908-NEXT: v_accvgpr_write_b32 a23, v3
2100
- ; GFX908-NEXT: v_accvgpr_write_b32 a24, v33
2081
+ ; GFX908-NEXT: v_accvgpr_write_b32 a23, v2
2082
+ ; GFX908-NEXT: v_accvgpr_write_b32 a24, v2
2101
2083
; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
2102
- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2103
- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2104
- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2105
- ; GFX908-NEXT: v_accvgpr_write_b32 a26, v3
2106
- ; GFX908-NEXT: v_accvgpr_write_b32 a27, v33
2084
+ ; GFX908-NEXT: v_accvgpr_write_b32 a26, v2
2085
+ ; GFX908-NEXT: v_accvgpr_write_b32 a27, v2
2107
2086
; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
2108
- ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
2109
- ; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
2110
- ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
2111
- ; GFX908-NEXT: v_accvgpr_write_b32 a29, v3
2112
- ; GFX908-NEXT: v_accvgpr_write_b32 a30, v33
2087
+ ; GFX908-NEXT: v_accvgpr_write_b32 a29, v2
2088
+ ; GFX908-NEXT: v_accvgpr_write_b32 a30, v2
2113
2089
; GFX908-NEXT: v_accvgpr_write_b32 a31, v2
2114
2090
; GFX908-NEXT: .LBB8_1: ; %for.cond.preheader
2115
2091
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
0 commit comments