@@ -712,25 +712,25 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
712
712
; SM70-NEXT: // %bb.0:
713
713
; SM70-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
714
714
; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
715
- ; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r1;
716
- ; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
717
- ; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r3;
718
- ; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r4;
719
- ; SM70-NEXT: cvt.u32.u16 %r5, %rs8;
715
+ ; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4;
716
+ ; SM70-NEXT: cvt.u32.u16 %r5, %rs2;
720
717
; SM70-NEXT: shl.b32 %r29, %r5, 16;
721
- ; SM70-NEXT: cvt.u32.u16 %r8, %rs7 ;
718
+ ; SM70-NEXT: cvt.u32.u16 %r8, %rs1 ;
722
719
; SM70-NEXT: shl.b32 %r30, %r8, 16;
723
- ; SM70-NEXT: cvt.u32.u16 %r11, %rs6;
720
+ ; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r3;
721
+ ; SM70-NEXT: cvt.u32.u16 %r11, %rs4;
724
722
; SM70-NEXT: shl.b32 %r31, %r11, 16;
725
- ; SM70-NEXT: cvt.u32.u16 %r14, %rs5 ;
723
+ ; SM70-NEXT: cvt.u32.u16 %r14, %rs3 ;
726
724
; SM70-NEXT: shl.b32 %r32, %r14, 16;
727
- ; SM70-NEXT: cvt.u32.u16 %r17, %rs4;
725
+ ; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r2;
726
+ ; SM70-NEXT: cvt.u32.u16 %r17, %rs6;
728
727
; SM70-NEXT: shl.b32 %r33, %r17, 16;
729
- ; SM70-NEXT: cvt.u32.u16 %r20, %rs3 ;
728
+ ; SM70-NEXT: cvt.u32.u16 %r20, %rs5 ;
730
729
; SM70-NEXT: shl.b32 %r34, %r20, 16;
731
- ; SM70-NEXT: cvt.u32.u16 %r23, %rs2;
730
+ ; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1;
731
+ ; SM70-NEXT: cvt.u32.u16 %r23, %rs8;
732
732
; SM70-NEXT: shl.b32 %r35, %r23, 16;
733
- ; SM70-NEXT: cvt.u32.u16 %r26, %rs1 ;
733
+ ; SM70-NEXT: cvt.u32.u16 %r26, %rs7 ;
734
734
; SM70-NEXT: shl.b32 %r36, %r26, 16;
735
735
; SM70-NEXT: st.param.v4.b32 [func_retval0], {%r36, %r35, %r34, %r33};
736
736
; SM70-NEXT: st.param.v4.b32 [func_retval0+16], {%r32, %r31, %r30, %r29};
@@ -745,18 +745,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
745
745
; SM80-NEXT: // %bb.0:
746
746
; SM80-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
747
747
; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
748
- ; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1 ;
749
- ; SM80-NEXT: mov.b32 {%rs3 , %rs4}, %r2 ;
750
- ; SM80-NEXT: mov.b32 {%rs5 , %rs6}, %r3 ;
751
- ; SM80-NEXT: mov.b32 {%rs7 , %rs8 }, %r4 ;
752
- ; SM80-NEXT: cvt.f32.bf16 %r5 , %rs8 ;
753
- ; SM80-NEXT: cvt.f32.bf16 %r6 , %rs7 ;
754
- ; SM80-NEXT: cvt.f32.bf16 %r7 , %rs6;
755
- ; SM80-NEXT: cvt.f32.bf16 %r8 , %rs5 ;
756
- ; SM80-NEXT: cvt.f32.bf16 %r9 , %rs4 ;
757
- ; SM80-NEXT: cvt.f32.bf16 %r10 , %rs3 ;
758
- ; SM80-NEXT: cvt.f32.bf16 %r11, %rs2 ;
759
- ; SM80-NEXT: cvt.f32.bf16 %r12, %rs1 ;
748
+ ; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4 ;
749
+ ; SM80-NEXT: cvt.f32.bf16 %r5 , %rs2 ;
750
+ ; SM80-NEXT: cvt.f32.bf16 %r6 , %rs1 ;
751
+ ; SM80-NEXT: mov.b32 {%rs3 , %rs4 }, %r3 ;
752
+ ; SM80-NEXT: cvt.f32.bf16 %r7 , %rs4 ;
753
+ ; SM80-NEXT: cvt.f32.bf16 %r8 , %rs3 ;
754
+ ; SM80-NEXT: mov.b32 {%rs5 , %rs6}, %r2 ;
755
+ ; SM80-NEXT: cvt.f32.bf16 %r9 , %rs6 ;
756
+ ; SM80-NEXT: cvt.f32.bf16 %r10 , %rs5 ;
757
+ ; SM80-NEXT: mov.b32 {%rs7 , %rs8}, %r1 ;
758
+ ; SM80-NEXT: cvt.f32.bf16 %r11, %rs8 ;
759
+ ; SM80-NEXT: cvt.f32.bf16 %r12, %rs7 ;
760
760
; SM80-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
761
761
; SM80-NEXT: st.param.v4.b32 [func_retval0+16], {%r8, %r7, %r6, %r5};
762
762
; SM80-NEXT: ret;
@@ -770,18 +770,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
770
770
; SM80-FTZ-NEXT: // %bb.0:
771
771
; SM80-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
772
772
; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
773
- ; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r1 ;
774
- ; SM80-FTZ-NEXT: mov.b32 {%rs3 , %rs4}, %r2 ;
775
- ; SM80-FTZ-NEXT: mov.b32 {%rs5 , %rs6}, %r3 ;
776
- ; SM80-FTZ-NEXT: mov.b32 {%rs7 , %rs8 }, %r4 ;
777
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5 , %rs8 ;
778
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6 , %rs7 ;
779
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7 , %rs6;
780
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8 , %rs5 ;
781
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9 , %rs4 ;
782
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10 , %rs3 ;
783
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2 ;
784
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1 ;
773
+ ; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r4 ;
774
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5 , %rs2 ;
775
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6 , %rs1 ;
776
+ ; SM80-FTZ-NEXT: mov.b32 {%rs3 , %rs4 }, %r3 ;
777
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7 , %rs4 ;
778
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8 , %rs3 ;
779
+ ; SM80-FTZ-NEXT: mov.b32 {%rs5 , %rs6}, %r2 ;
780
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9 , %rs6 ;
781
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10 , %rs5 ;
782
+ ; SM80-FTZ-NEXT: mov.b32 {%rs7 , %rs8}, %r1 ;
783
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs8 ;
784
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs7 ;
785
785
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
786
786
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r8, %r7, %r6, %r5};
787
787
; SM80-FTZ-NEXT: ret;
@@ -795,18 +795,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
795
795
; SM90-NEXT: // %bb.0:
796
796
; SM90-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
797
797
; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
798
- ; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r1 ;
799
- ; SM90-NEXT: mov.b32 {%rs3 , %rs4}, %r2 ;
800
- ; SM90-NEXT: mov.b32 {%rs5 , %rs6}, %r3 ;
801
- ; SM90-NEXT: mov.b32 {%rs7 , %rs8 }, %r4 ;
802
- ; SM90-NEXT: cvt.f32.bf16 %r5 , %rs8 ;
803
- ; SM90-NEXT: cvt.f32.bf16 %r6 , %rs7 ;
804
- ; SM90-NEXT: cvt.f32.bf16 %r7 , %rs6;
805
- ; SM90-NEXT: cvt.f32.bf16 %r8 , %rs5 ;
806
- ; SM90-NEXT: cvt.f32.bf16 %r9 , %rs4 ;
807
- ; SM90-NEXT: cvt.f32.bf16 %r10 , %rs3 ;
808
- ; SM90-NEXT: cvt.f32.bf16 %r11, %rs2 ;
809
- ; SM90-NEXT: cvt.f32.bf16 %r12, %rs1 ;
798
+ ; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r4 ;
799
+ ; SM90-NEXT: cvt.f32.bf16 %r5 , %rs2 ;
800
+ ; SM90-NEXT: cvt.f32.bf16 %r6 , %rs1 ;
801
+ ; SM90-NEXT: mov.b32 {%rs3 , %rs4 }, %r3 ;
802
+ ; SM90-NEXT: cvt.f32.bf16 %r7 , %rs4 ;
803
+ ; SM90-NEXT: cvt.f32.bf16 %r8 , %rs3 ;
804
+ ; SM90-NEXT: mov.b32 {%rs5 , %rs6}, %r2 ;
805
+ ; SM90-NEXT: cvt.f32.bf16 %r9 , %rs6 ;
806
+ ; SM90-NEXT: cvt.f32.bf16 %r10 , %rs5 ;
807
+ ; SM90-NEXT: mov.b32 {%rs7 , %rs8}, %r1 ;
808
+ ; SM90-NEXT: cvt.f32.bf16 %r11, %rs8 ;
809
+ ; SM90-NEXT: cvt.f32.bf16 %r12, %rs7 ;
810
810
; SM90-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
811
811
; SM90-NEXT: st.param.v4.b32 [func_retval0+16], {%r8, %r7, %r6, %r5};
812
812
; SM90-NEXT: ret;
0 commit comments