Skip to content

Commit 841c8d4

Browse files
committed
[LV] Add tests for more interleave group factors on AArch64 and RISC-V. NFC
The plan is to eventually add support for scalably vectorizing these for non-power-of-2 factors; see llvm#139893. Simultaneously, we need to add a test to make sure we don't generate @llvm.vector.[de]interleave3 for AArch64 if we can't lower it (yet).
1 parent 4b09eed commit 841c8d4

File tree

2 files changed

+1253
-80
lines changed

2 files changed

+1253
-80
lines changed

llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll

Lines changed: 99 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1353,6 +1353,103 @@ end:
13531353
ret void
13541354
}
13551355

1356+
; Check vectorization on an interleaved load/store groups of factor 3
1357+
1358+
; for (int i = 0; i < 1024; ++i) {
1359+
; dst[i].x = a[i].x + b[i].x;
1360+
; dst[i].y = a[i].y - b[i].y;
1361+
; dst[i].z = a[i].z << b[i].z;
1362+
; }
1363+
;
1364+
; TODO: Support scalable interleave groups once we can also codegen
1365+
; @llvm.[de]interleave3
1366+
%struct.xyz = type { i32, i32, i32 }
1367+
1368+
define void @interleave_deinterleave_factor3(ptr writeonly noalias %dst, ptr readonly %a, ptr readonly %b) {
1369+
; CHECK-LABEL: @interleave_deinterleave_factor3(
1370+
; CHECK-NEXT: entry:
1371+
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
1372+
; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
1373+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024
1374+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1375+
; CHECK: vector.ph:
1376+
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
1377+
; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
1378+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
1379+
; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]]
1380+
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
1381+
; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2
1382+
; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
1383+
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0
1384+
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
1385+
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1386+
; CHECK: vector.body:
1387+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1388+
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1389+
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_XYZ:%.*]], ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]]
1390+
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
1391+
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_XYZ]], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]]
1392+
; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP20]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
1393+
; CHECK-NEXT: [[TMP14:%.*]] = add nsw <vscale x 4 x i32> [[TMP11]], [[TMP7]]
1394+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_XYZ]], ptr [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]]
1395+
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x ptr> [[TMP10]], i32 4, <vscale x 4 x i1> splat (i1 true))
1396+
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i8, <vscale x 4 x ptr> [[TMP19]], i64 4
1397+
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP21]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
1398+
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i8, <vscale x 4 x ptr> [[TMP20]], i64 4
1399+
; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP22]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
1400+
; CHECK-NEXT: [[TMP16:%.*]] = sub nsw <vscale x 4 x i32> [[TMP8]], [[TMP12]]
1401+
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, <vscale x 4 x ptr> [[TMP10]], i64 4
1402+
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x ptr> [[TMP23]], i32 4, <vscale x 4 x i1> splat (i1 true))
1403+
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, <vscale x 4 x ptr> [[TMP19]], i64 8
1404+
; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP15]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
1405+
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw i8, <vscale x 4 x ptr> [[TMP20]], i64 8
1406+
; CHECK-NEXT: [[TMP13:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP24]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
1407+
; CHECK-NEXT: [[TMP17:%.*]] = shl <vscale x 4 x i32> [[TMP9]], [[TMP13]]
1408+
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw i8, <vscale x 4 x ptr> [[TMP10]], i64 8
1409+
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP17]], <vscale x 4 x ptr> [[TMP25]], i32 4, <vscale x 4 x i1> splat (i1 true))
1410+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
1411+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
1412+
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1413+
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
1414+
; CHECK: middle.block:
1415+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
1416+
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1417+
; CHECK: scalar.ph:
1418+
;
1419+
entry:
1420+
br label %for.body
1421+
1422+
for.body:
1423+
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
1424+
%arrayidx = getelementptr inbounds %struct.xyz, ptr %a, i64 %indvars.iv
1425+
%0 = load i32, ptr %arrayidx, align 4
1426+
%arrayidx2 = getelementptr inbounds %struct.xyz, ptr %b, i64 %indvars.iv
1427+
%1 = load i32, ptr %arrayidx2, align 4
1428+
%add = add nsw i32 %1, %0
1429+
%arrayidx5 = getelementptr inbounds %struct.xyz, ptr %dst, i64 %indvars.iv
1430+
store i32 %add, ptr %arrayidx5, align 4
1431+
%y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4
1432+
%2 = load i32, ptr %y, align 4
1433+
%y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4
1434+
%3 = load i32, ptr %y11, align 4
1435+
%sub = sub nsw i32 %2, %3
1436+
%y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4
1437+
store i32 %sub, ptr %y14, align 4
1438+
%z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8
1439+
%4 = load i32, ptr %z, align 4
1440+
%z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8
1441+
%5 = load i32, ptr %z19, align 4
1442+
%shl = shl i32 %4, %5
1443+
%z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8
1444+
store i32 %shl, ptr %z22, align 4
1445+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
1446+
%exitcond.not = icmp eq i64 %indvars.iv.next, 1024
1447+
br i1 %exitcond.not, label %for.end, label %for.body
1448+
1449+
for.end:
1450+
ret void
1451+
}
1452+
13561453
; Check vectorization on an interleaved load/store groups of factor 4
13571454

13581455
; for (int i = 0; i < 1024; ++i) {
@@ -1413,7 +1510,7 @@ define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a
14131510
; CHECK-NEXT: store <vscale x 16 x i32> [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4
14141511
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
14151512
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1416-
; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
1513+
; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
14171514
; CHECK: middle.block:
14181515
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
14191516
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -1532,7 +1629,7 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
15321629
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
15331630
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
15341631
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
1535-
; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
1632+
; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
15361633
; CHECK: middle.block:
15371634
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
15381635
; CHECK: scalar.ph:

0 commit comments

Comments
 (0)