
Commit e28376e

[X86] Add i32->float and i64->double bitcast pseudo instructions to store folding table.
We have pseudo instructions that we use for bitcasts between these types. They are in the load folding table, but not the store folding table. This adds them there so they can be used for stack spills.

I added an exact size check so that we don't fold when the stack slot is larger than the GPR. Otherwise the upper bits in the stack slot would be garbage. That would be fine for Eli's test case in PR47874, but I'm not sure it's safe in general.

A step towards fixing PR47874. Next steps are to change the ADDSSrr_Int pseudo instructions to use FR32 as the second source register class instead of VR128. That will keep the coalescer from promoting the register class of the bitcast instruction, which will make the stack slot 4 bytes instead of 16 bytes.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D89656
1 parent ae3625d commit e28376e
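To make the size concern from the commit message concrete, here is a minimal standalone C++ model of why folding a 4-byte GPR store into a larger spill slot leaves the slot's upper bytes undefined. This is illustrative only; the buffer and names below are hypothetical and not part of the patch.

#include <cstdint>
#include <cstring>

int main() {
  // Stands in for a 16-byte stack slot that was sized for a 128-bit vector
  // register class rather than for the 32-bit GPR actually being spilled.
  alignas(16) unsigned char Slot[16];
  uint32_t Val = 0x12345678; // the value held in a 32-bit GPR

  // A folded store (MOV32mr in the patch) writes only the low 4 bytes.
  std::memcpy(Slot, &Val, sizeof(Val));

  // Slot[4..15] still hold whatever was there before; a later full-width
  // reload from the slot would see garbage in its upper bytes. Hence the
  // patch only folds when the slot size equals the register-class size.
  return 0;
}

The test changes at the bottom of this commit show the payoff once the slot is correctly sized: the spill becomes a single movl/movq straight from the GPR to the stack slot.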

3 files changed: +25 -14 lines changed


llvm/lib/Target/X86/X86InstrFoldTables.cpp

Lines changed: 6 additions & 0 deletions
@@ -300,11 +300,13 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
  { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
  { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
  { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
+ { X86::MOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
  { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
  { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
  { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
  { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
  { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
  { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
  { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
  { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
@@ -357,6 +359,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
  { X86::VEXTRACTI64x4Zrr, X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
  { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
  { X86::VEXTRACTPSrr, X86::VEXTRACTPSmr, TB_FOLDED_STORE },
+ { X86::VMOV64toSDZrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
  { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
  { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
  { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
@@ -367,6 +371,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
  { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
  { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
  { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDI2SSZrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
  { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
  { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
  { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
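The entries added above pair each register-form bitcast pseudo with a plain GPR store opcode. As a reading aid, here is a minimal, self-contained C++ model of how such a store-folding entry can be consulted; FoldEntry, findStoreFold, and the enum values are hypothetical stand-ins, not LLVM's actual types or lookup API.

#include <cstdint>

// Illustrative flag bits; the real table uses LLVM's TB_* constants.
enum : uint16_t {
  TB_FOLDED_STORE = 1 << 0, // entry may be used when folding a store
  TB_NO_REVERSE   = 1 << 1  // one-way mapping: never unfold back to register form
};

// Hypothetical opcode tags standing in for the X86::* instruction opcodes.
enum Opcode { MOVDI2SSrr, MOV32mr, MOV64toSDrr, MOV64mr, NoOpcode };

struct FoldEntry {
  Opcode RegOpc;   // register-form bitcast pseudo
  Opcode MemOpc;   // plain GPR store to emit instead
  uint16_t Flags;
};

// The entries this patch adds: spilling the GPR source of a bitcast pseudo
// becomes a direct integer store to the stack slot.
static const FoldEntry StoreFolds[] = {
    {MOVDI2SSrr, MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE},
    {MOV64toSDrr, MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE},
};

// Return the memory-form opcode for a store fold of RegOpc, if any.
Opcode findStoreFold(Opcode RegOpc) {
  for (const FoldEntry &E : StoreFolds)
    if (E.RegOpc == RegOpc && (E.Flags & TB_FOLDED_STORE))
      return E.MemOpc;
  return NoOpcode;
}

In this model, TB_NO_REVERSE simply records that the mapping is one-way: it is only consulted when folding a register into a store, never to reconstruct the bitcast pseudo from a memory form.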

llvm/lib/Target/X86/X86InstrInfo.cpp

Lines changed: 15 additions & 6 deletions
@@ -5526,6 +5526,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
 
    if (I != nullptr) {
      unsigned Opcode = I->DstOp;
+     bool FoldedLoad =
+         isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_LOAD) || OpNum > 0;
+     bool FoldedStore =
+         isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_STORE);
      MaybeAlign MinAlign =
          decodeMaybeAlign((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT);
      if (MinAlign && Alignment < *MinAlign)
@@ -5536,20 +5540,25 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
        const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum,
                                                    &RI, MF);
        unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
-       if (Size < RCSize) {
-         // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
-         // Check if it's safe to fold the load. If the size of the object is
-         // narrower than the load width, then it's not.
-         if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
-           return nullptr;
+       // Check if it's safe to fold the load. If the size of the object is
+       // narrower than the load width, then it's not.
+       // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
+       if (FoldedLoad && Size < RCSize) {
          // If this is a 64-bit load, but the spill slot is 32, then we can do
          // a 32-bit load which is implicitly zero-extended. This likely is
          // due to live interval analysis remat'ing a load from stack slot.
+         if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
+           return nullptr;
          if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
            return nullptr;
          Opcode = X86::MOV32rm;
          NarrowToMOV32rm = true;
        }
+       // For stores, make sure the size of the object is equal to the size of
+       // the store. If the object is larger, the extra bits would be garbage. If
+       // the object is smaller we might overwrite another object or fault.
+       if (FoldedStore && Size != RCSize)
+         return nullptr;
      }
 
      if (isTwoAddrFold)
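The new checks can be read as a small decision function. The sketch below is a hypothetical distillation, not the actual LLVM code path: classifyFold and FoldAction are invented names, the subregister checks from the real code are omitted, and it assumes Size is the spill-slot size in bytes and RCSize the register-class size in bytes.

// Hypothetical distillation of the size checks above; illustrative only.
enum class FoldAction { NoFold, Fold, NarrowToMOV32rm };

FoldAction classifyFold(bool FoldedLoad, bool FoldedStore, bool IsMOV64rm,
                        unsigned Size, unsigned RCSize) {
  if (FoldedLoad && Size < RCSize) {
    // Loading more bytes than the slot holds is only tolerated for the
    // MOV64rm-from-4-byte-slot case, which is narrowed to a zero-extending
    // 32-bit load. (The real code also rejects operands with subregisters.)
    if (!IsMOV64rm || RCSize != 8 || Size != 4)
      return FoldAction::NoFold;
    return FoldAction::NarrowToMOV32rm;
  }
  // Stores must match the slot exactly: a larger slot would be left with
  // garbage in its upper bytes, a smaller one could be overwritten past its
  // end or fault.
  if (FoldedStore && Size != RCSize)
    return FoldAction::NoFold;
  return FoldAction::Fold;
}

For example, classifyFold(false, true, false, 16, 4) returns NoFold, the oversized 16-byte-slot case the commit message warns about, while classifyFold(false, true, false, 4, 4) allows the fold.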

llvm/test/CodeGen/X86/pr47874.ll

Lines changed: 4 additions & 8 deletions
@@ -9,8 +9,7 @@ define void @a(float* %arg, i32 %arg1) {
  ; SSE2-NEXT: testl %esi, %esi
  ; SSE2-NEXT: jle LBB0_3
  ; SSE2-NEXT: ## %bb.1: ## %bb2
- ; SSE2-NEXT: movd %esi, %xmm0
- ; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
+ ; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
  ; SSE2-NEXT: movl %esi, %eax
  ; SSE2-NEXT: .p2align 4, 0x90
  ; SSE2-NEXT: LBB0_2: ## %bb6
@@ -31,8 +30,7 @@ define void @a(float* %arg, i32 %arg1) {
  ; AVX-NEXT: testl %esi, %esi
  ; AVX-NEXT: jle LBB0_3
  ; AVX-NEXT: ## %bb.1: ## %bb2
- ; AVX-NEXT: vmovd %esi, %xmm0
- ; AVX-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
+ ; AVX-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
  ; AVX-NEXT: movl %esi, %eax
  ; AVX-NEXT: .p2align 4, 0x90
  ; AVX-NEXT: LBB0_2: ## %bb6
@@ -78,8 +76,7 @@ define void @b(double* %arg, i64 %arg1) {
  ; SSE2-NEXT: testq %rsi, %rsi
  ; SSE2-NEXT: jle LBB1_3
  ; SSE2-NEXT: ## %bb.1: ## %bb2
- ; SSE2-NEXT: movq %rsi, %xmm0
- ; SSE2-NEXT: movq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Spill
+ ; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
  ; SSE2-NEXT: .p2align 4, 0x90
  ; SSE2-NEXT: LBB1_2: ## %bb6
  ; SSE2-NEXT: ## =>This Inner Loop Header: Depth=1
@@ -99,8 +96,7 @@ define void @b(double* %arg, i64 %arg1) {
  ; AVX-NEXT: testq %rsi, %rsi
  ; AVX-NEXT: jle LBB1_3
  ; AVX-NEXT: ## %bb.1: ## %bb2
- ; AVX-NEXT: vmovq %rsi, %xmm0
- ; AVX-NEXT: vmovq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Spill
+ ; AVX-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
  ; AVX-NEXT: .p2align 4, 0x90
  ; AVX-NEXT: LBB1_2: ## %bb6
  ; AVX-NEXT: ## =>This Inner Loop Header: Depth=1
