Commit b39d157

Merge tag 'kvm-x86-generic-6.13' of https://github.com/kvm-x86/linux into HEAD
KVM generic changes for 6.13:

- Rework kvm_vcpu_on_spin() to use a single for-loop instead of making two
  partial passes over "all" vCPUs. Opportunistically expand the comment to
  better explain the motivation and logic.

- Protect vcpu->pid accesses outside of vcpu->mutex with a rwlock instead of
  RCU, so that running a vCPU on a different task doesn't encounter long
  stalls due to having to wait for all CPUs to become quiescent.
2 parents 185e02d + 3e7f431 commit b39d157
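
As a quick orientation for the kvm_vcpu_on_spin() diff below: the rework replaces the old two-pass walk with a single loop that starts one slot past kvm->last_boosted_vcpu and wraps around using modular arithmetic. The following standalone C sketch (hypothetical values, not KVM code) merely illustrates the iteration order produced by the (start + i) % nr_vcpus indexing:

#include <stdio.h>

/*
 * Illustration only: visit every slot except "me", starting just after the
 * last boosted slot and wrapping around, in one pass.
 */
int main(void)
{
	int nr_vcpus = 6;	/* hypothetical VM with 6 vCPUs */
	int last_boosted = 3;	/* stand-in for kvm->last_boosted_vcpu */
	int me = 5;		/* index of the spinning vCPU */
	int start = last_boosted + 1;
	int i, idx;

	for (i = 0; i < nr_vcpus; i++) {
		idx = (start + i) % nr_vcpus;
		if (idx == me)
			continue;
		printf("would consider vCPU[%d]\n", idx);
	}
	return 0;
}

With these values the loop considers indices 4, 0, 1, 2 and finally 3, skipping only index 5 (the spinning vCPU itself): a single pass that still approximates round-robin.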

3 files changed: +86, -62 lines

arch/arm64/include/asm/kvm_host.h

Lines changed: 1 addition & 1 deletion

@@ -1140,7 +1140,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
 void kvm_arm_halt_guest(struct kvm *kvm);
 void kvm_arm_resume_guest(struct kvm *kvm);
 
-#define vcpu_has_run_once(vcpu)	!!rcu_access_pointer((vcpu)->pid)
+#define vcpu_has_run_once(vcpu)	(!!READ_ONCE((vcpu)->pid))
 
 #ifndef __KVM_NVHE_HYPERVISOR__
 #define kvm_call_hyp_nvhe(f, ...)						\

include/linux/kvm_host.h

Lines changed: 2 additions & 1 deletion

@@ -334,7 +334,8 @@ struct kvm_vcpu {
 #ifndef __KVM_HAVE_ARCH_WQP
 	struct rcuwait wait;
 #endif
-	struct pid __rcu *pid;
+	struct pid *pid;
+	rwlock_t pid_lock;
 	int sigset_active;
 	sigset_t sigset;
 	unsigned int halt_poll_ns;

virt/kvm/kvm_main.c

Lines changed: 83 additions & 60 deletions

@@ -447,6 +447,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->kvm = kvm;
 	vcpu->vcpu_id = id;
 	vcpu->pid = NULL;
+	rwlock_init(&vcpu->pid_lock);
 #ifndef __KVM_HAVE_ARCH_WQP
 	rcuwait_init(&vcpu->wait);
 #endif
@@ -474,7 +475,7 @@ static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 	 * the vcpu->pid pointer, and at destruction time all file descriptors
 	 * are already gone.
 	 */
-	put_pid(rcu_dereference_protected(vcpu->pid, 1));
+	put_pid(vcpu->pid);
 
 	free_page((unsigned long)vcpu->run);
 	kmem_cache_free(kvm_vcpu_cache, vcpu);
@@ -3770,17 +3771,19 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
 
 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
 {
-	struct pid *pid;
 	struct task_struct *task = NULL;
-	int ret = 0;
+	int ret;
+
+	if (!read_trylock(&target->pid_lock))
+		return 0;
+
+	if (target->pid)
+		task = get_pid_task(target->pid, PIDTYPE_PID);
+
+	read_unlock(&target->pid_lock);
 
-	rcu_read_lock();
-	pid = rcu_dereference(target->pid);
-	if (pid)
-		task = get_pid_task(pid, PIDTYPE_PID);
-	rcu_read_unlock();
 	if (!task)
-		return ret;
+		return 0;
 	ret = yield_to(task, 1);
 	put_task_struct(task);
 
@@ -3869,59 +3872,71 @@ bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
 
 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 {
+	int nr_vcpus, start, i, idx, yielded;
 	struct kvm *kvm = me->kvm;
 	struct kvm_vcpu *vcpu;
-	int last_boosted_vcpu;
-	unsigned long i;
-	int yielded = 0;
 	int try = 3;
-	int pass;
 
-	last_boosted_vcpu = READ_ONCE(kvm->last_boosted_vcpu);
+	nr_vcpus = atomic_read(&kvm->online_vcpus);
+	if (nr_vcpus < 2)
+		return;
+
+	/* Pairs with the smp_wmb() in kvm_vm_ioctl_create_vcpu(). */
+	smp_rmb();
+
 	kvm_vcpu_set_in_spin_loop(me, true);
+
 	/*
-	 * We boost the priority of a VCPU that is runnable but not
-	 * currently running, because it got preempted by something
-	 * else and called schedule in __vcpu_run. Hopefully that
-	 * VCPU is holding the lock that we need and will release it.
-	 * We approximate round-robin by starting at the last boosted VCPU.
+	 * The current vCPU ("me") is spinning in kernel mode, i.e. is likely
+	 * waiting for a resource to become available. Attempt to yield to a
+	 * vCPU that is runnable, but not currently running, e.g. because the
+	 * vCPU was preempted by a higher priority task. With luck, the vCPU
+	 * that was preempted is holding a lock or some other resource that the
+	 * current vCPU is waiting to acquire, and yielding to the other vCPU
+	 * will allow it to make forward progress and release the lock (or kick
+	 * the spinning vCPU, etc).
+	 *
+	 * Since KVM has no insight into what exactly the guest is doing,
+	 * approximate a round-robin selection by iterating over all vCPUs,
+	 * starting at the last boosted vCPU. I.e. if N=kvm->last_boosted_vcpu,
+	 * iterate over vCPU[N+1]..vCPU[N-1], wrapping as needed.
+	 *
+	 * Note, this is inherently racy, e.g. if multiple vCPUs are spinning,
+	 * they may all try to yield to the same vCPU(s). But as above, this
+	 * is all best effort due to KVM's lack of visibility into the guest.
 	 */
-	for (pass = 0; pass < 2 && !yielded && try; pass++) {
-		kvm_for_each_vcpu(i, vcpu, kvm) {
-			if (!pass && i <= last_boosted_vcpu) {
-				i = last_boosted_vcpu;
-				continue;
-			} else if (pass && i > last_boosted_vcpu)
-				break;
-			if (!READ_ONCE(vcpu->ready))
-				continue;
-			if (vcpu == me)
-				continue;
-			if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
-				continue;
+	start = READ_ONCE(kvm->last_boosted_vcpu) + 1;
+	for (i = 0; i < nr_vcpus; i++) {
+		idx = (start + i) % nr_vcpus;
+		if (idx == me->vcpu_idx)
+			continue;
 
-			/*
-			 * Treat the target vCPU as being in-kernel if it has a
-			 * pending interrupt, as the vCPU trying to yield may
-			 * be spinning waiting on IPI delivery, i.e. the target
-			 * vCPU is in-kernel for the purposes of directed yield.
-			 */
-			if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
-			    !kvm_arch_dy_has_pending_interrupt(vcpu) &&
-			    !kvm_arch_vcpu_preempted_in_kernel(vcpu))
-				continue;
-			if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
-				continue;
+		vcpu = xa_load(&kvm->vcpu_array, idx);
+		if (!READ_ONCE(vcpu->ready))
+			continue;
+		if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
+			continue;
 
-			yielded = kvm_vcpu_yield_to(vcpu);
-			if (yielded > 0) {
-				WRITE_ONCE(kvm->last_boosted_vcpu, i);
-				break;
-			} else if (yielded < 0) {
-				try--;
-				if (!try)
-					break;
-			}
+		/*
+		 * Treat the target vCPU as being in-kernel if it has a pending
+		 * interrupt, as the vCPU trying to yield may be spinning
+		 * waiting on IPI delivery, i.e. the target vCPU is in-kernel
+		 * for the purposes of directed yield.
+		 */
+		if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
+		    !kvm_arch_dy_has_pending_interrupt(vcpu) &&
+		    !kvm_arch_vcpu_preempted_in_kernel(vcpu))
+			continue;
+
+		if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
+			continue;
+
+		yielded = kvm_vcpu_yield_to(vcpu);
+		if (yielded > 0) {
+			WRITE_ONCE(kvm->last_boosted_vcpu, i);
+			break;
+		} else if (yielded < 0 && !--try) {
+			break;
 		}
 	}
 	kvm_vcpu_set_in_spin_loop(me, false);
@@ -4018,9 +4033,9 @@ static int vcpu_get_pid(void *data, u64 *val)
 {
 	struct kvm_vcpu *vcpu = data;
 
-	rcu_read_lock();
-	*val = pid_nr(rcu_dereference(vcpu->pid));
-	rcu_read_unlock();
+	read_lock(&vcpu->pid_lock);
+	*val = pid_nr(vcpu->pid);
+	read_unlock(&vcpu->pid_lock);
 	return 0;
 }
 
@@ -4306,7 +4321,14 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 		if (arg)
 			goto out;
-		oldpid = rcu_access_pointer(vcpu->pid);
+
+		/*
+		 * Note, vcpu->pid is primarily protected by vcpu->mutex. The
+		 * dedicated r/w lock allows other tasks, e.g. other vCPUs, to
+		 * read vcpu->pid while this vCPU is in KVM_RUN, e.g. to yield
+		 * directly to this vCPU.
+		 */
+		oldpid = vcpu->pid;
 		if (unlikely(oldpid != task_pid(current))) {
 			/* The thread running this VCPU changed. */
 			struct pid *newpid;
@@ -4316,9 +4338,10 @@ static long kvm_vcpu_ioctl(struct file *filp,
 				break;
 
 			newpid = get_task_pid(current, PIDTYPE_PID);
-			rcu_assign_pointer(vcpu->pid, newpid);
-			if (oldpid)
-				synchronize_rcu();
+			write_lock(&vcpu->pid_lock);
+			vcpu->pid = newpid;
+			write_unlock(&vcpu->pid_lock);
+
 			put_pid(oldpid);
 		}
 		vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);
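
Taken together, the pid changes amount to a plain reader/writer discipline: the vCPU's own task, already serialized by vcpu->mutex, updates vcpu->pid under the write lock, while cross-vCPU readers such as kvm_vcpu_yield_to() take the read lock, using a trylock so a spinning vCPU never sleeps behind the writer, and no RCU grace period is needed when the backing task changes. As a rough userspace analogy only, with POSIX rwlocks and made-up names rather than the kernel API, the pattern looks like this:

#include <pthread.h>
#include <stdio.h>
#include <sys/types.h>

/* Illustrative stand-ins, not KVM structures. */
struct fake_vcpu {
	pthread_rwlock_t pid_lock;
	pid_t pid;
};

/* Writer side: the vCPU's own task, analogous to the KVM_RUN path above. */
static void set_pid(struct fake_vcpu *vcpu, pid_t newpid)
{
	pthread_rwlock_wrlock(&vcpu->pid_lock);
	vcpu->pid = newpid;
	pthread_rwlock_unlock(&vcpu->pid_lock);
	/* No synchronize_rcu()-style grace period is required here. */
}

/* Reader side: another task peeking at the pid, e.g. to yield to it. */
static pid_t try_get_pid(struct fake_vcpu *vcpu)
{
	pid_t pid;

	/* Trylock: if the writer holds the lock, bail out rather than block. */
	if (pthread_rwlock_tryrdlock(&vcpu->pid_lock))
		return 0;

	pid = vcpu->pid;
	pthread_rwlock_unlock(&vcpu->pid_lock);
	return pid;
}

int main(void)
{
	struct fake_vcpu vcpu = { .pid = 0 };

	pthread_rwlock_init(&vcpu.pid_lock, NULL);
	set_pid(&vcpu, 1234);
	printf("reader saw pid %d\n", (int)try_get_pid(&vcpu));
	pthread_rwlock_destroy(&vcpu.pid_lock);
	return 0;
}

The read-side trylock mirrors read_trylock() in kvm_vcpu_yield_to(): if the target vCPU happens to be updating its pid, the would-be yielder simply gives up instead of stalling.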
