Commit ff37b11

[LegalizeVectorOps][X86] Don't defer BITREVERSE expansion to LegalizeDAG.
By expanding early, the shifts can be custom lowered in LegalizeVectorOps, and a DAG combine is then able to run on them before LegalizeDAG handles the BUILD_VECTORs for the masks used.

v16i8 shift lowering on X86 requires a mask to be applied to a v8i16 shift. The BITREVERSE expansion applied an AND mask before the SHL ops and after the SRL ops so that both shifts could share the same mask constant. It looks like this patch allows a DAG combine to remove the AND mask that X86 lowering adds after the v16i8 SHL, which maintains the mask sharing that the BITREVERSE expansion was trying to achieve. Prior to this patch it looks like we kept the mask after the SHL instead, which required an extra constant pool entry or a PANDN to invert it.

This is dependent on D112248 because RISCV will end up scalarizing the BSWAP portion of the BITREVERSE expansion if we don't disable BSWAP scalarization in LegalizeVectorOps first.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D112254
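For reference, a minimal scalar sketch (in C++, illustrative only, not the LLVM code itself) of the shift-and-mask pattern the expansion uses, where each step applies the same mask constant before the left shift and after the right shift:

    // Hypothetical helper: reverses the bits of one byte with the shared-mask
    // shift pattern described above (0x0F, 0x33, 0x55 are the usual swap masks).
    #include <cstdint>

    uint8_t bitreverse8(uint8_t x) {
      x = (uint8_t)(((x & 0x0F) << 4) | ((x >> 4) & 0x0F)); // swap nibbles
      x = (uint8_t)(((x & 0x33) << 2) | ((x >> 2) & 0x33)); // swap bit pairs
      x = (uint8_t)(((x & 0x55) << 1) | ((x >> 1) & 0x55)); // swap adjacent bits
      return x;
    }

The vector expansion applies the same per-byte steps lane-wise (after the BSWAP portion for wider element types), which is why reusing one mask constant per step matters for the generated constants.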
1 parent c0d6e1b commit ff37b11

4 files changed: +355, -372 lines


llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp

Lines changed: 3 additions & 2 deletions
@@ -1162,9 +1162,10 @@ void VectorLegalizer::ExpandBITREVERSE(SDNode *Node,
   if (TLI.isOperationLegalOrCustom(ISD::SHL, VT) &&
       TLI.isOperationLegalOrCustom(ISD::SRL, VT) &&
       TLI.isOperationLegalOrCustomOrPromote(ISD::AND, VT) &&
-      TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT))
-    // Let LegalizeDAG handle this later.
+      TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT)) {
+    Results.push_back(TLI.expandBITREVERSE(Node, DAG));
     return;
+  }
 
   // Otherwise unroll.
   SDValue Tmp = DAG.UnrollVectorOp(Node);

llvm/test/CodeGen/X86/bitreverse.ll

Lines changed: 5 additions & 4 deletions
@@ -58,10 +58,11 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
 ; X64-NEXT:    psllw $8, %xmm0
 ; X64-NEXT:    por %xmm1, %xmm0
 ; X64-NEXT:    movdqa %xmm0, %xmm1
-; X64-NEXT:    psllw $4, %xmm1
-; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-NEXT:    psrlw $4, %xmm0
-; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    psrlw $4, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X64-NEXT:    pand %xmm2, %xmm1
+; X64-NEXT:    pand %xmm2, %xmm0
+; X64-NEXT:    psllw $4, %xmm0
 ; X64-NEXT:    por %xmm1, %xmm0
 ; X64-NEXT:    movdqa %xmm0, %xmm1
 ; X64-NEXT:    psrlw $2, %xmm1
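For context on the psrlw/pand pairs in the checks above: SSE2 has no byte-element shift, so X86 emulates a v16i8 logical shift with a v8i16 shift plus a byte mask. A hedged sketch with SSE2 intrinsics (illustrative only; the helper name is made up, not compiler output):

    // Logical right shift of 16 x i8 by 4, emulated as an 8 x i16 shift followed
    // by a mask that clears the bits that crossed in from the neighboring byte,
    // matching the psrlw $4 / pand [15,15,...] pairing checked above.
    #include <emmintrin.h>

    static __m128i srl_v16i8_by_4(__m128i v) {
      __m128i shifted = _mm_srli_epi16(v, 4);  // shift as 8 x i16 lanes
      __m128i mask = _mm_set1_epi8(0x0F);      // keep only bits that stayed in the byte
      return _mm_and_si128(shifted, mask);
    }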

llvm/test/CodeGen/X86/combine-bitreverse.ll

Lines changed: 5 additions & 4 deletions
@@ -50,10 +50,11 @@ define <4 x i32> @test_demandedbits_bitreverse(<4 x i32> %a0) nounwind {
 ; X86-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
 ; X86-NEXT:    packuswb %xmm2, %xmm0
 ; X86-NEXT:    movdqa %xmm0, %xmm1
-; X86-NEXT:    psllw $4, %xmm1
-; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-NEXT:    psrlw $4, %xmm0
-; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    psrlw $4, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X86-NEXT:    pand %xmm2, %xmm1
+; X86-NEXT:    pand %xmm2, %xmm0
+; X86-NEXT:    psllw $4, %xmm0
 ; X86-NEXT:    por %xmm1, %xmm0
 ; X86-NEXT:    movdqa %xmm0, %xmm1
 ; X86-NEXT:    psrlw $2, %xmm1
