swiotlb: optimize get_max_slots()

Petr Tesarik · Christoph Hellwig · commit d069ed288ac7 · 2023-08-08T10:29:21.000-07:00
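Use a simple logical shift and increment to calculate the number of slots
taken by the DMA segment boundary. For any mask of the form 2^n - 1,
including ~0UL, (boundary_mask >> IO_TLB_SHIFT) + 1 gives the same result
as the old code, without needing a special case to avoid the overflow of
boundary_mask + 1.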

At least GCC-13 is not able to optimize the expression, producing this
horrible assembly code on x86:

	cmpq	$-1, %rcx
	je	.L364
	addq	$2048, %rcx
	shrq	$11, %rcx
	movq	%rcx, %r13
.L331:
	// rest of the function here...

	// after function epilogue and return:
.L364:
	movabsq $9007199254740992, %r13
	jmp	.L331

After the optimization, the code looks more reasonable:

	shrq	$11, %r11
	leaq	1(%r11), %rbx

Signed-off-by: Petr Tesarik <petr.tesarik.ext@huawei.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
@@ -903,9 +903,7 @@ static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx)
  */
 static inline unsigned long get_max_slots(unsigned long boundary_mask)
 {
-	if (boundary_mask == ~0UL)
-		return 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
-	return nr_slots(boundary_mask + 1);
+	return (boundary_mask >> IO_TLB_SHIFT) + 1;
 }
 
 static unsigned int wrap_area_index(struct io_tlb_pool *mem, unsigned int index)

Original file line number	Diff line number	Diff line change
`@@ -903,9 +903,7 @@ static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx)`
`903`	`903`	`*/`
`904`	`904`	`static inline unsigned long get_max_slots(unsigned long boundary_mask)`
`905`	`905`	`{`
`906`		`- if (boundary_mask == ~0UL)`
`907`		`- return 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);`
`908`		`- return nr_slots(boundary_mask + 1);`
	`906`	`+ return (boundary_mask >> IO_TLB_SHIFT) + 1;`
`909`	`907`	`}`
`910`	`908`
`911`	`909`	`static unsigned int wrap_area_index(struct io_tlb_pool *mem, unsigned int index)`