
Commit 1b03d82

Jacob Pan authored and KAGA-KOKO committed
x86/irq: Install posted MSI notification handler
All MSI vectors are multiplexed into a single notification vector when posted MSI is enabled. It is the responsibility of the notification vector handler to demultiplex MSI vectors. In the handler, the MSI vector handlers are dispatched without IDT delivery for each pending MSI interrupt.

For example, the interrupt flow will change as follows (3 MSIs of different vectors arrive in a high frequency burst):

BEFORE:
interrupt(MSI)
    irq_enter()
    handler() /* EOI */
    irq_exit()
        process_softirq()
interrupt(MSI)
    irq_enter()
    handler() /* EOI */
    irq_exit()
        process_softirq()
interrupt(MSI)
    irq_enter()
    handler() /* EOI */
    irq_exit()
        process_softirq()

AFTER:
interrupt /* Posted MSI notification vector */
    irq_enter()
    atomic_xchg(PIR)
    handler()
    handler()
    handler()
    pi_clear_on()
    apic_eoi()
    irq_exit()
        process_softirq()

Except for the leading MSI, CPU notifications are skipped/coalesced.

For MSIs which arrive at a low frequency, the demultiplexing loop does not wait for more interrupts to coalesce. Therefore, there is no additional latency other than the processing time.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20240423174114.526704-9-jacob.jun.pan@linux.intel.com
1 parent 6087c7f commit 1b03d82
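To make the demultiplexing step in the AFTER flow concrete, here is a minimal user-space sketch (C11 atomics, not kernel code) of scanning a 256-bit PIR with a copy-then-clear pass. The dispatch_vector() helper, PIR_WORDS layout, and main() harness are illustrative assumptions; the real logic is handle_pending_pir() in the arch/x86/kernel/irq.c diff below.

#include <stdint.h>
#include <stdio.h>
#include <stdatomic.h>

#define PIR_WORDS 4 /* 4 x 64 bits = 256 vector bits */

/* Hypothetical stand-in for invoking a vector's interrupt handler. */
static void dispatch_vector(int vec)
{
	printf("dispatch handler for vector 0x%x\n", vec);
}

/* Snapshot all words first, then atomically clear only the non-zero
 * ones, then walk the snapshot; mirrors the shape of the kernel code. */
static int demux_pir(_Atomic uint64_t *pir)
{
	uint64_t snapshot[PIR_WORDS];
	int handled = 0;

	for (int i = 0; i < PIR_WORDS; i++)
		snapshot[i] = atomic_load(&pir[i]);

	for (int i = 0; i < PIR_WORDS; i++) {
		if (!snapshot[i])
			continue;
		snapshot[i] = atomic_exchange(&pir[i], 0);
		handled = 1;
	}

	if (handled) {
		for (int i = 0; i < PIR_WORDS; i++)
			for (int b = 0; b < 64; b++)
				if (snapshot[i] & (1ULL << b))
					dispatch_vector(i * 64 + b);
	}
	return handled;
}

int main(void)
{
	_Atomic uint64_t pir[PIR_WORDS] = {0};

	/* Pretend the IOMMU posted vectors 0x31, 0x52 and 0x81 in a burst. */
	atomic_fetch_or(&pir[0], 1ULL << 0x31);
	atomic_fetch_or(&pir[1], 1ULL << (0x52 - 64));
	atomic_fetch_or(&pir[2], 1ULL << (0x81 - 128));

	while (demux_pir(pir))
		; /* one pass drains the burst; the next pass sees nothing */
	return 0;
}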

File tree

arch/x86/entry/entry_fred.c
arch/x86/include/asm/hardirq.h
arch/x86/include/asm/idtentry.h
arch/x86/kernel/idt.c
arch/x86/kernel/irq.c

5 files changed: +135 -4 lines changed


arch/x86/entry/entry_fred.c

Lines changed: 2 additions & 0 deletions

@@ -117,6 +117,8 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = {
 	SYSVEC(POSTED_INTR_VECTOR,		kvm_posted_intr_ipi),
 	SYSVEC(POSTED_INTR_WAKEUP_VECTOR,	kvm_posted_intr_wakeup_ipi),
 	SYSVEC(POSTED_INTR_NESTED_VECTOR,	kvm_posted_intr_nested_ipi),
+
+	SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR,	posted_msi_notification),
 };
 
 static bool fred_setup_done __initdata;

arch/x86/include/asm/hardirq.h

Lines changed: 3 additions & 0 deletions

@@ -44,6 +44,9 @@ typedef struct {
 	unsigned int irq_hv_reenlightenment_count;
 	unsigned int hyperv_stimer0_count;
 #endif
+#ifdef CONFIG_X86_POSTED_MSI
+	unsigned int posted_msi_notification_count;
+#endif
 } ____cacheline_aligned irq_cpustat_t;
 
 DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);

arch/x86/include/asm/idtentry.h

Lines changed: 6 additions & 0 deletions

@@ -751,6 +751,12 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested
 # define fred_sysvec_kvm_posted_intr_nested_ipi	NULL
 #endif
 
+# ifdef CONFIG_X86_POSTED_MSI
+DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification);
+#else
+# define fred_sysvec_posted_msi_notification	NULL
+# endif
+
 #if IS_ENABLED(CONFIG_HYPERV)
 DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR,	sysvec_hyperv_callback);
 DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR,	sysvec_hyperv_reenlightenment);

arch/x86/kernel/idt.c

Lines changed: 3 additions & 0 deletions

@@ -163,6 +163,9 @@ static const __initconst struct idt_data apic_idts[] = {
 # endif
 	INTG(SPURIOUS_APIC_VECTOR,		asm_sysvec_spurious_apic_interrupt),
 	INTG(ERROR_APIC_VECTOR,			asm_sysvec_error_interrupt),
+# ifdef CONFIG_X86_POSTED_MSI
+	INTG(POSTED_MSI_NOTIFICATION_VECTOR,	asm_sysvec_posted_msi_notification),
+# endif
 #endif
 };

arch/x86/kernel/irq.c

Lines changed: 121 additions & 4 deletions

@@ -183,6 +183,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 		seq_printf(p, "%10u ",
 			   irq_stats(j)->kvm_posted_intr_wakeup_ipis);
 	seq_puts(p, "  Posted-interrupt wakeup event\n");
+#endif
+#ifdef CONFIG_X86_POSTED_MSI
+	seq_printf(p, "%*s: ", prec, "PMN");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ",
+			   irq_stats(j)->posted_msi_notification_count);
+	seq_puts(p, "  Posted MSI notification event\n");
 #endif
 	return 0;
 }
@@ -242,16 +249,16 @@ static __always_inline void handle_irq(struct irq_desc *desc,
 		__handle_irq(desc, regs);
 }
 
-static __always_inline void call_irq_handler(int vector, struct pt_regs *regs)
+static __always_inline int call_irq_handler(int vector, struct pt_regs *regs)
 {
 	struct irq_desc *desc;
+	int ret = 0;
 
 	desc = __this_cpu_read(vector_irq[vector]);
 	if (likely(!IS_ERR_OR_NULL(desc))) {
 		handle_irq(desc, regs);
 	} else {
-		apic_eoi();
-
+		ret = -EINVAL;
 		if (desc == VECTOR_UNUSED) {
 			pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n",
 					     __func__, smp_processor_id(),
@@ -260,6 +267,8 @@ static __always_inline void call_irq_handler(int vector, struct pt_regs *regs)
 			__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
 		}
 	}
+
+	return ret;
 }
 
 /*
@@ -273,7 +282,9 @@ DEFINE_IDTENTRY_IRQ(common_interrupt)
 	/* entry code tells RCU that we're not quiescent.  Check it. */
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
 
-	call_irq_handler(vector, regs);
+	if (unlikely(call_irq_handler(vector, regs)))
+		apic_eoi();
+
 	set_irq_regs(old_regs);
 }
 
@@ -361,6 +372,112 @@ void intel_posted_msi_init(void)
 	destination = x2apic_enabled() ? apic_id : apic_id << 8;
 	this_cpu_write(posted_msi_pi_desc.ndst, destination);
 }
+
+/*
+ * De-multiplexing posted interrupts is on the performance path, so the code
+ * below is written to optimize cache performance based on the following
+ * considerations:
+ * 1. The posted interrupt descriptor (PID) fits in a cache line that is
+ *    frequently accessed by both the CPU and the IOMMU.
+ * 2. During posted MSI processing, the CPU needs to do 64-bit reads and
+ *    xchgs for checking and clearing the posted interrupt request (PIR),
+ *    a 256-bit field within the PID.
+ * 3. On the other side, the IOMMU does atomic swaps of the entire PID
+ *    cache line when posting interrupts and setting control bits.
+ * 4. The CPU can access the cache line a magnitude faster than the IOMMU.
+ * 5. Each interrupt posting by the IOMMU to the PIR will evict the PID
+ *    cache line. The cache line states after each operation are as follows:
+ *    CPU		IOMMU			PID cache line state
+ *    ---------------------------------------------------------------
+ *    read64					exclusive
+ *    lock xchg64				modified
+ *			post/atomic swap	invalid
+ *    ---------------------------------------------------------------
+ *
+ * To reduce L1 data cache misses, it is important to avoid contention with
+ * the IOMMU's interrupt posting/atomic swap. Therefore, a copy of the PIR
+ * is used to dispatch interrupt handlers.
+ *
+ * In addition, the code tries to keep the cache line state consistent as
+ * much as possible. E.g. when making a copy and clearing the PIR (assuming
+ * non-zero PIR bits are present in the entire PIR), it does:
+ *     read, read, read, read, xchg, xchg, xchg, xchg
+ * instead of:
+ *     read, xchg, read, xchg, read, xchg, read, xchg
+ */
+static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs)
+{
+	int i, vec = FIRST_EXTERNAL_VECTOR;
+	unsigned long pir_copy[4];
+	bool handled = false;
+
+	for (i = 0; i < 4; i++)
+		pir_copy[i] = pir[i];
+
+	for (i = 0; i < 4; i++) {
+		if (!pir_copy[i])
+			continue;
+
+		pir_copy[i] = arch_xchg(&pir[i], 0);
+		handled = true;
+	}
+
+	if (handled) {
+		for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
+			call_irq_handler(vec, regs);
+	}
+
+	return handled;
+}
+
+/*
+ * Performance data shows that 3 is good enough to harvest 90+% of the
+ * benefit on high IRQ rate workloads.
+ */
+#define MAX_POSTED_MSI_COALESCING_LOOP 3
+
+/*
+ * For MSIs that are delivered as posted interrupts, the CPU notifications
+ * can be coalesced if the MSIs arrive in high frequency bursts.
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+	struct pi_desc *pid;
+	int i = 0;
+
+	pid = this_cpu_ptr(&posted_msi_pi_desc);
+
+	inc_irq_stat(posted_msi_notification_count);
+	irq_enter();
+
+	/*
+	 * The max coalescing count includes the extra round of
+	 * handle_pending_pir() after clearing the outstanding notification
+	 * bit. Hence, at most MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are
+	 * executed here.
+	 */
+	while (++i < MAX_POSTED_MSI_COALESCING_LOOP) {
+		if (!handle_pending_pir(pid->pir64, regs))
+			break;
+	}
+
+	/*
+	 * Clear the outstanding notification bit to allow new IRQ
+	 * notifications. Do this last to maximize the window of interrupt
+	 * coalescing.
+	 */
+	pi_clear_on(pid);
+
+	/*
+	 * There could be a race between a PI notification and the clearing
+	 * of the ON bit, so process the PIR bits one last time such that
+	 * handling the new interrupts is not delayed until the next IRQ.
+	 */
+	handle_pending_pir(pid->pir64, regs);
+
+	apic_eoi();
+	irq_exit();
+	set_irq_regs(old_regs);
+}
 #endif /* X86_POSTED_MSI */
 
 #ifdef CONFIG_HOTPLUG_CPU
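To make the coalescing bound and the ON-bit race handling in sysvec_posted_msi_notification() easier to follow, here is a small single-threaded user-space model. It is a sketch under stated assumptions: pir_word, pi_on, handle_pending() and notification_handler() are stand-ins invented for illustration, not kernel APIs.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_POSTED_MSI_COALESCING_LOOP 3

static _Atomic uint64_t pir_word;	/* stand-in for the 256-bit PIR */
static _Atomic bool pi_on = true;	/* models the PID's ON bit */

/* Drain whatever is pending; returns true if anything was handled. */
static bool handle_pending(void)
{
	uint64_t pending = atomic_exchange(&pir_word, 0);

	if (!pending)
		return false;
	printf("handled PIR bits %#llx\n", (unsigned long long)pending);
	return true;
}

static void notification_handler(void)
{
	int i = 0;

	/*
	 * At most MAX_POSTED_MSI_COALESCING_LOOP - 1 passes here; the
	 * final pass after clearing the ON bit brings the total to the
	 * maximum, matching the kernel handler's structure.
	 */
	while (++i < MAX_POSTED_MSI_COALESCING_LOOP) {
		if (!handle_pending())
			break;
	}

	/* Re-enable notifications as late as possible (models pi_clear_on()). */
	atomic_store(&pi_on, false);

	/* Scan once more: a posting may have raced with the clear above. */
	handle_pending();
}

int main(void)
{
	atomic_store(&pir_word, 0x30000ULL);	/* vectors 0x10 and 0x11 pending */
	notification_handler();
	return 0;
}

The design point the model illustrates: while ON stays set, further MSIs only set PIR bits without generating new notifications, so each extra loop pass harvests interrupts for free; clearing ON last and rescanning once covers the window where the IOMMU posts between the clear and the EOI.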
