@@ -557,11 +557,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
557
557
; GFX908-NEXT: s_mul_hi_u32 s9, s0, s7
558
558
; GFX908-NEXT: s_mul_i32 s0, s0, s7
559
559
; GFX908-NEXT: s_add_i32 s1, s9, s1
560
- ; GFX908-NEXT: s_lshl_b64 s[0:1 ], s[0:1], 5
560
+ ; GFX908-NEXT: s_lshl_b64 s[14:15 ], s[0:1], 5
561
561
; GFX908-NEXT: s_branch .LBB3_2
562
562
; GFX908-NEXT: .LBB3_1: ; %Flow20
563
563
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
564
- ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15 ]
564
+ ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[0:1 ]
565
565
; GFX908-NEXT: s_cbranch_vccz .LBB3_12
566
566
; GFX908-NEXT: .LBB3_2: ; %bb9
567
567
; GFX908-NEXT: ; =>This Loop Header: Depth=1
@@ -571,15 +571,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
571
571
; GFX908-NEXT: ; %bb.3: ; %bb14
572
572
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
573
573
; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
574
+ ; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
574
575
; GFX908-NEXT: s_mov_b32 s7, s6
576
+ ; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
575
577
; GFX908-NEXT: v_mov_b32_e32 v4, s6
578
+ ; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6
576
579
; GFX908-NEXT: v_mov_b32_e32 v6, s6
577
580
; GFX908-NEXT: v_mov_b32_e32 v9, s7
578
581
; GFX908-NEXT: v_mov_b32_e32 v5, s7
579
582
; GFX908-NEXT: v_mov_b32_e32 v7, s7
580
583
; GFX908-NEXT: v_mov_b32_e32 v8, s6
581
- ; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[4:5], 0
582
- ; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[4:5], -1
584
+ ; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
583
585
; GFX908-NEXT: v_mov_b32_e32 v11, v5
584
586
; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11]
585
587
; GFX908-NEXT: v_mov_b32_e32 v10, v4
@@ -599,9 +601,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
599
601
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
600
602
; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
601
603
; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
602
- ; GFX908-NEXT: s_add_u32 s18, s18, s0
604
+ ; GFX908-NEXT: s_add_u32 s18, s18, s14
603
605
; GFX908-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[2:3]
604
- ; GFX908-NEXT: s_addc_u32 s19, s19, s1
606
+ ; GFX908-NEXT: s_addc_u32 s19, s19, s15
605
607
; GFX908-NEXT: s_mov_b64 s[20:21], 0
606
608
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23]
607
609
; GFX908-NEXT: s_cbranch_vccz .LBB3_9
@@ -620,7 +622,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
620
622
; GFX908-NEXT: s_waitcnt vmcnt(0)
621
623
; GFX908-NEXT: ds_read_b64 v[12:13], v19
622
624
; GFX908-NEXT: ds_read_b64 v[14:15], v0
623
- ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17 ]
625
+ ; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1 ]
624
626
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
625
627
; GFX908-NEXT: s_cbranch_vccnz .LBB3_7
626
628
; GFX908-NEXT: ; %bb.6: ; %bb51
@@ -648,7 +650,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
648
650
; GFX908-NEXT: s_mov_b64 s[20:21], -1
649
651
; GFX908-NEXT: s_branch .LBB3_4
650
652
; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
651
- ; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15 ]
653
+ ; GFX908-NEXT: s_mov_b64 s[20:21], s[16:17 ]
652
654
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21]
653
655
; GFX908-NEXT: s_cbranch_vccz .LBB3_4
654
656
; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
@@ -659,7 +661,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
659
661
; GFX908-NEXT: s_xor_b64 s[16:17], s[20:21], -1
660
662
; GFX908-NEXT: .LBB3_10: ; %Flow19
661
663
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
662
- ; GFX908-NEXT: s_mov_b64 s[14:15 ], -1
664
+ ; GFX908-NEXT: s_mov_b64 s[0:1 ], -1
663
665
; GFX908-NEXT: s_and_b64 vcc, exec, s[16:17]
664
666
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
665
667
; GFX908-NEXT: ; %bb.11: ; %bb12
@@ -668,7 +670,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
668
670
; GFX908-NEXT: s_addc_u32 s5, s5, 0
669
671
; GFX908-NEXT: s_add_u32 s10, s10, s12
670
672
; GFX908-NEXT: s_addc_u32 s11, s11, s13
671
- ; GFX908-NEXT: s_mov_b64 s[14:15 ], 0
673
+ ; GFX908-NEXT: s_mov_b64 s[0:1 ], 0
672
674
; GFX908-NEXT: s_branch .LBB3_1
673
675
; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock
674
676
; GFX908-NEXT: s_endpgm
@@ -718,11 +720,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
718
720
; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s7
719
721
; GFX90A-NEXT: s_mul_i32 s0, s0, s7
720
722
; GFX90A-NEXT: s_add_i32 s1, s9, s1
721
- ; GFX90A-NEXT: s_lshl_b64 s[0:1 ], s[0:1], 5
723
+ ; GFX90A-NEXT: s_lshl_b64 s[14:15 ], s[0:1], 5
722
724
; GFX90A-NEXT: s_branch .LBB3_2
723
725
; GFX90A-NEXT: .LBB3_1: ; %Flow20
724
726
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
725
- ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[14:15 ]
727
+ ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[0:1 ]
726
728
; GFX90A-NEXT: s_cbranch_vccz .LBB3_12
727
729
; GFX90A-NEXT: .LBB3_2: ; %bb9
728
730
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
@@ -732,12 +734,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
732
734
; GFX90A-NEXT: ; %bb.3: ; %bb14
733
735
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
734
736
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
737
+ ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
735
738
; GFX90A-NEXT: s_mov_b32 s7, s6
739
+ ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1]
736
740
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
741
+ ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8
737
742
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[6:7], s[6:7] op_sel:[0,1]
738
743
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
739
- ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[4:5], 0
740
- ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[4:5], -1
744
+ ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
741
745
; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11]
742
746
; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
743
747
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -756,8 +760,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
756
760
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
757
761
; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
758
762
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
759
- ; GFX90A-NEXT: s_add_u32 s18, s18, s0
760
- ; GFX90A-NEXT: s_addc_u32 s19, s19, s1
763
+ ; GFX90A-NEXT: s_add_u32 s18, s18, s14
764
+ ; GFX90A-NEXT: s_addc_u32 s19, s19, s15
761
765
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[4:5]
762
766
; GFX90A-NEXT: s_mov_b64 s[20:21], 0
763
767
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23]
@@ -777,7 +781,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
777
781
; GFX90A-NEXT: s_waitcnt vmcnt(0)
778
782
; GFX90A-NEXT: ds_read_b64 v[14:15], v19
779
783
; GFX90A-NEXT: ds_read_b64 v[16:17], v0
780
- ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17 ]
784
+ ; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1 ]
781
785
; GFX90A-NEXT: ; kill: killed $sgpr20 killed $sgpr21
782
786
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
783
787
; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7
@@ -798,7 +802,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
798
802
; GFX90A-NEXT: s_mov_b64 s[20:21], -1
799
803
; GFX90A-NEXT: s_branch .LBB3_4
800
804
; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
801
- ; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15 ]
805
+ ; GFX90A-NEXT: s_mov_b64 s[20:21], s[16:17 ]
802
806
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[20:21]
803
807
; GFX90A-NEXT: s_cbranch_vccz .LBB3_4
804
808
; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
@@ -809,7 +813,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
809
813
; GFX90A-NEXT: s_xor_b64 s[16:17], s[20:21], -1
810
814
; GFX90A-NEXT: .LBB3_10: ; %Flow19
811
815
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
812
- ; GFX90A-NEXT: s_mov_b64 s[14:15 ], -1
816
+ ; GFX90A-NEXT: s_mov_b64 s[0:1 ], -1
813
817
; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17]
814
818
; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
815
819
; GFX90A-NEXT: ; %bb.11: ; %bb12
@@ -818,7 +822,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
818
822
; GFX90A-NEXT: s_addc_u32 s5, s5, 0
819
823
; GFX90A-NEXT: s_add_u32 s10, s10, s12
820
824
; GFX90A-NEXT: s_addc_u32 s11, s11, s13
821
- ; GFX90A-NEXT: s_mov_b64 s[14:15 ], 0
825
+ ; GFX90A-NEXT: s_mov_b64 s[0:1 ], 0
822
826
; GFX90A-NEXT: s_branch .LBB3_1
823
827
; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock
824
828
; GFX90A-NEXT: s_endpgm
0 commit comments