@@ -762,32 +762,32 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
762
762
; SM70-NEXT: // %bb.0:
763
763
; SM70-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
764
764
; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
765
- ; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r1;
766
- ; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
767
- ; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r3;
768
- ; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r4;
769
- ; SM70-NEXT: cvt.u32.u16 %r5, %rs8;
765
+ ; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4;
766
+ ; SM70-NEXT: cvt.u32.u16 %r5, %rs2;
770
767
; SM70-NEXT: shl.b32 %r6, %r5, 16;
771
768
; SM70-NEXT: mov.b32 %f1, %r6;
772
- ; SM70-NEXT: cvt.u32.u16 %r7, %rs7;
769
+ ; SM70-NEXT: cvt.u32.u16 %r7, %rs1;
773
770
; SM70-NEXT: shl.b32 %r8, %r7, 16;
774
771
; SM70-NEXT: mov.b32 %f2, %r8;
775
- ; SM70-NEXT: cvt.u32.u16 %r9, %rs6;
772
+ ; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r3;
773
+ ; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
776
774
; SM70-NEXT: shl.b32 %r10, %r9, 16;
777
775
; SM70-NEXT: mov.b32 %f3, %r10;
778
- ; SM70-NEXT: cvt.u32.u16 %r11, %rs5;
776
+ ; SM70-NEXT: cvt.u32.u16 %r11, %rs3;
779
777
; SM70-NEXT: shl.b32 %r12, %r11, 16;
780
778
; SM70-NEXT: mov.b32 %f4, %r12;
781
- ; SM70-NEXT: cvt.u32.u16 %r13, %rs4;
779
+ ; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r2;
780
+ ; SM70-NEXT: cvt.u32.u16 %r13, %rs6;
782
781
; SM70-NEXT: shl.b32 %r14, %r13, 16;
783
782
; SM70-NEXT: mov.b32 %f5, %r14;
784
- ; SM70-NEXT: cvt.u32.u16 %r15, %rs3;
783
+ ; SM70-NEXT: cvt.u32.u16 %r15, %rs5;
785
784
; SM70-NEXT: shl.b32 %r16, %r15, 16;
786
785
; SM70-NEXT: mov.b32 %f6, %r16;
787
- ; SM70-NEXT: cvt.u32.u16 %r17, %rs2;
786
+ ; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1;
787
+ ; SM70-NEXT: cvt.u32.u16 %r17, %rs8;
788
788
; SM70-NEXT: shl.b32 %r18, %r17, 16;
789
789
; SM70-NEXT: mov.b32 %f7, %r18;
790
- ; SM70-NEXT: cvt.u32.u16 %r19, %rs1;
790
+ ; SM70-NEXT: cvt.u32.u16 %r19, %rs7;
791
791
; SM70-NEXT: shl.b32 %r20, %r19, 16;
792
792
; SM70-NEXT: mov.b32 %f8, %r20;
793
793
; SM70-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
@@ -804,18 +804,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
804
804
; SM80-NEXT: // %bb.0:
805
805
; SM80-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
806
806
; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
807
- ; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1;
808
- ; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
809
- ; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r3;
810
- ; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r4;
811
- ; SM80-NEXT: cvt.f32.bf16 %f1, %rs8;
812
- ; SM80-NEXT: cvt.f32.bf16 %f2, %rs7;
813
- ; SM80-NEXT: cvt.f32.bf16 %f3, %rs6;
814
- ; SM80-NEXT: cvt.f32.bf16 %f4, %rs5;
815
- ; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
816
- ; SM80-NEXT: cvt.f32.bf16 %f6, %rs3;
817
- ; SM80-NEXT: cvt.f32.bf16 %f7, %rs2;
818
- ; SM80-NEXT: cvt.f32.bf16 %f8, %rs1;
807
+ ; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4;
808
+ ; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
809
+ ; SM80-NEXT: cvt.f32.bf16 %f2, %rs1;
810
+ ; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r3;
811
+ ; SM80-NEXT: cvt.f32.bf16 %f3, %rs4;
812
+ ; SM80-NEXT: cvt.f32.bf16 %f4, %rs3;
813
+ ; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r2;
814
+ ; SM80-NEXT: cvt.f32.bf16 %f5, %rs6;
815
+ ; SM80-NEXT: cvt.f32.bf16 %f6, %rs5;
816
+ ; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r1;
817
+ ; SM80-NEXT: cvt.f32.bf16 %f7, %rs8;
818
+ ; SM80-NEXT: cvt.f32.bf16 %f8, %rs7;
819
819
; SM80-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
820
820
; SM80-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
821
821
; SM80-NEXT: ret;
@@ -830,18 +830,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
830
830
; SM80-FTZ-NEXT: // %bb.0:
831
831
; SM80-FTZ-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
832
832
; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
833
- ; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r1;
834
- ; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r2;
835
- ; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r3;
836
- ; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r4;
837
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs8;
838
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs7;
839
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f3, %rs6;
840
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs5;
841
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
842
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f6, %rs3;
843
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f7, %rs2;
844
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f8, %rs1;
833
+ ; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r4;
834
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
835
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs1;
836
+ ; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r3;
837
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f3, %rs4;
838
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs3;
839
+ ; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r2;
840
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs6;
841
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f6, %rs5;
842
+ ; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r1;
843
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f7, %rs8;
844
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f8, %rs7;
845
845
; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
846
846
; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
847
847
; SM80-FTZ-NEXT: ret;
@@ -856,18 +856,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
856
856
; SM90-NEXT: // %bb.0:
857
857
; SM90-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
858
858
; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
859
- ; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r1;
860
- ; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r2;
861
- ; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r3;
862
- ; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r4;
863
- ; SM90-NEXT: cvt.f32.bf16 %f1, %rs8;
864
- ; SM90-NEXT: cvt.f32.bf16 %f2, %rs7;
865
- ; SM90-NEXT: cvt.f32.bf16 %f3, %rs6;
866
- ; SM90-NEXT: cvt.f32.bf16 %f4, %rs5;
867
- ; SM90-NEXT: cvt.f32.bf16 %f5, %rs4;
868
- ; SM90-NEXT: cvt.f32.bf16 %f6, %rs3;
869
- ; SM90-NEXT: cvt.f32.bf16 %f7, %rs2;
870
- ; SM90-NEXT: cvt.f32.bf16 %f8, %rs1;
859
+ ; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r4;
860
+ ; SM90-NEXT: cvt.f32.bf16 %f1, %rs2;
861
+ ; SM90-NEXT: cvt.f32.bf16 %f2, %rs1;
862
+ ; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r3;
863
+ ; SM90-NEXT: cvt.f32.bf16 %f3, %rs4;
864
+ ; SM90-NEXT: cvt.f32.bf16 %f4, %rs3;
865
+ ; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r2;
866
+ ; SM90-NEXT: cvt.f32.bf16 %f5, %rs6;
867
+ ; SM90-NEXT: cvt.f32.bf16 %f6, %rs5;
868
+ ; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r1;
869
+ ; SM90-NEXT: cvt.f32.bf16 %f7, %rs8;
870
+ ; SM90-NEXT: cvt.f32.bf16 %f8, %rs7;
871
871
; SM90-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
872
872
; SM90-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
873
873
; SM90-NEXT: ret;
0 commit comments