@@ -447,6 +447,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->kvm = kvm;
 	vcpu->vcpu_id = id;
 	vcpu->pid = NULL;
+	rwlock_init(&vcpu->pid_lock);
 #ifndef __KVM_HAVE_ARCH_WQP
 	rcuwait_init(&vcpu->wait);
 #endif
@@ -474,7 +475,7 @@ static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 	 * the vcpu->pid pointer, and at destruction time all file descriptors
 	 * are already gone.
 	 */
-	put_pid(rcu_dereference_protected(vcpu->pid, 1));
+	put_pid(vcpu->pid);
 
 	free_page((unsigned long)vcpu->run);
 	kmem_cache_free(kvm_vcpu_cache, vcpu);
@@ -3770,17 +3771,19 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
 
 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
 {
-	struct pid *pid;
 	struct task_struct *task = NULL;
-	int ret = 0;
+	int ret;
+
+	if (!read_trylock(&target->pid_lock))
+		return 0;
+
+	if (target->pid)
+		task = get_pid_task(target->pid, PIDTYPE_PID);
+
+	read_unlock(&target->pid_lock);
 
-	rcu_read_lock();
-	pid = rcu_dereference(target->pid);
-	if (pid)
-		task = get_pid_task(pid, PIDTYPE_PID);
-	rcu_read_unlock();
 	if (!task)
-		return ret;
+		return 0;
 	ret = yield_to(task, 1);
 	put_task_struct(task);
 
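
The reader side above deliberately uses read_trylock() rather than a blocking read_lock(): if the target's PID is being updated at that moment, the would-be booster simply reports "no yield" instead of waiting. Below is a rough standalone userspace sketch of that pattern (not part of the commit); pthread_rwlock_tryrdlock() stands in for the kernel's read_trylock(), a plain int stands in for struct pid, and the vcpu_sim/yield_to_sim names are invented for illustration.

/*
 * Standalone sketch: a reader that uses a try-lock and gives up instead of
 * blocking, mirroring the read_trylock() pattern in kvm_vcpu_yield_to().
 */
#include <pthread.h>
#include <stdio.h>

struct vcpu_sim {
	pthread_rwlock_t pid_lock;
	int pid;		/* 0 means "no task attached" */
};

/* Returns 0 ("did not yield") if the lock is write-held or no task is set. */
static int yield_to_sim(struct vcpu_sim *target)
{
	int pid;

	if (pthread_rwlock_tryrdlock(&target->pid_lock) != 0)
		return 0;	/* writer active: don't wait, just bail */

	pid = target->pid;
	pthread_rwlock_unlock(&target->pid_lock);

	if (!pid)
		return 0;

	printf("would yield to the task with pid %d\n", pid);
	return 1;
}

int main(void)
{
	struct vcpu_sim v = {
		.pid_lock = PTHREAD_RWLOCK_INITIALIZER,
		.pid = 1234,	/* made-up example value */
	};

	return yield_to_sim(&v) == 1 ? 0 : 1;
}

In the kernel function itself the read section additionally takes a reference on the task via get_pid_task() before dropping the lock, so the task can be used safely after read_unlock().
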
@@ -3869,59 +3872,71 @@ bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
 
 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 {
+	int nr_vcpus, start, i, idx, yielded;
 	struct kvm *kvm = me->kvm;
 	struct kvm_vcpu *vcpu;
-	int last_boosted_vcpu;
-	unsigned long i;
-	int yielded = 0;
 	int try = 3;
-	int pass;
 
-	last_boosted_vcpu = READ_ONCE(kvm->last_boosted_vcpu);
+	nr_vcpus = atomic_read(&kvm->online_vcpus);
+	if (nr_vcpus < 2)
+		return;
+
+	/* Pairs with the smp_wmb() in kvm_vm_ioctl_create_vcpu(). */
+	smp_rmb();
+
 	kvm_vcpu_set_in_spin_loop(me, true);
+
 	/*
-	 * We boost the priority of a VCPU that is runnable but not
-	 * currently running, because it got preempted by something
-	 * else and called schedule in __vcpu_run. Hopefully that
-	 * VCPU is holding the lock that we need and will release it.
-	 * We approximate round-robin by starting at the last boosted VCPU.
+	 * The current vCPU ("me") is spinning in kernel mode, i.e. is likely
+	 * waiting for a resource to become available. Attempt to yield to a
+	 * vCPU that is runnable, but not currently running, e.g. because the
+	 * vCPU was preempted by a higher priority task. With luck, the vCPU
+	 * that was preempted is holding a lock or some other resource that the
+	 * current vCPU is waiting to acquire, and yielding to the other vCPU
+	 * will allow it to make forward progress and release the lock (or kick
+	 * the spinning vCPU, etc).
+	 *
+	 * Since KVM has no insight into what exactly the guest is doing,
+	 * approximate a round-robin selection by iterating over all vCPUs,
+	 * starting at the last boosted vCPU. I.e. if N=kvm->last_boosted_vcpu,
+	 * iterate over vCPU[N+1]..vCPU[N-1], wrapping as needed.
+	 *
+	 * Note, this is inherently racy, e.g. if multiple vCPUs are spinning,
+	 * they may all try to yield to the same vCPU(s). But as above, this
+	 * is all best effort due to KVM's lack of visibility into the guest.
	 */
-	for (pass = 0; pass < 2 && !yielded && try; pass++) {
-		kvm_for_each_vcpu(i, vcpu, kvm) {
-			if (!pass && i <= last_boosted_vcpu) {
-				i = last_boosted_vcpu;
-				continue;
-			} else if (pass && i > last_boosted_vcpu)
-				break;
-			if (!READ_ONCE(vcpu->ready))
-				continue;
-			if (vcpu == me)
-				continue;
-			if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
-				continue;
+	start = READ_ONCE(kvm->last_boosted_vcpu) + 1;
+	for (i = 0; i < nr_vcpus; i++) {
+		idx = (start + i) % nr_vcpus;
+		if (idx == me->vcpu_idx)
+			continue;
 
-			/*
-			 * Treat the target vCPU as being in-kernel if it has a
-			 * pending interrupt, as the vCPU trying to yield may
-			 * be spinning waiting on IPI delivery, i.e. the target
-			 * vCPU is in-kernel for the purposes of directed yield.
-			 */
-			if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
-			    !kvm_arch_dy_has_pending_interrupt(vcpu) &&
-			    !kvm_arch_vcpu_preempted_in_kernel(vcpu))
-				continue;
-			if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
-				continue;
+		vcpu = xa_load(&kvm->vcpu_array, idx);
+		if (!READ_ONCE(vcpu->ready))
+			continue;
+		if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
+			continue;
 
-			yielded = kvm_vcpu_yield_to(vcpu);
-			if (yielded > 0) {
-				WRITE_ONCE(kvm->last_boosted_vcpu, i);
-				break;
-			} else if (yielded < 0) {
-				try--;
-				if (!try)
-					break;
-			}
+		/*
+		 * Treat the target vCPU as being in-kernel if it has a pending
+		 * interrupt, as the vCPU trying to yield may be spinning
+		 * waiting on IPI delivery, i.e. the target vCPU is in-kernel
+		 * for the purposes of directed yield.
+		 */
+		if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
+		    !kvm_arch_dy_has_pending_interrupt(vcpu) &&
+		    !kvm_arch_vcpu_preempted_in_kernel(vcpu))
+			continue;
+
+		if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
+			continue;
+
+		yielded = kvm_vcpu_yield_to(vcpu);
+		if (yielded > 0) {
+			WRITE_ONCE(kvm->last_boosted_vcpu, i);
+			break;
+		} else if (yielded < 0 && !--try) {
+			break;
 		}
 	}
 	kvm_vcpu_set_in_spin_loop(me, false);
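
The rewritten loop visits every online vCPU at most once, starting one past the last boosted vCPU and wrapping around modulo the number of online vCPUs, while skipping the spinning vCPU's own index. A tiny standalone program (not part of the commit) showing just that index arithmetic, with made-up example values for nr_vcpus and last_boosted_vcpu:

/*
 * Standalone sketch of the round-robin index selection used above:
 * start one past the last boosted vCPU and wrap modulo nr_vcpus.
 */
#include <stdio.h>

int main(void)
{
	int nr_vcpus = 4;		/* example value */
	int last_boosted_vcpu = 2;	/* example value */
	int start = last_boosted_vcpu + 1;
	int i, idx;

	for (i = 0; i < nr_vcpus; i++) {
		idx = (start + i) % nr_vcpus;
		/* The real loop also skips the spinning vCPU's own index. */
		printf("candidate vCPU[%d]\n", idx);	/* prints 3, 0, 1, 2 */
	}
	return 0;
}
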
@@ -4018,9 +4033,9 @@ static int vcpu_get_pid(void *data, u64 *val)
 {
 	struct kvm_vcpu *vcpu = data;
 
-	rcu_read_lock();
-	*val = pid_nr(rcu_dereference(vcpu->pid));
-	rcu_read_unlock();
+	read_lock(&vcpu->pid_lock);
+	*val = pid_nr(vcpu->pid);
+	read_unlock(&vcpu->pid_lock);
 	return 0;
 }
 
@@ -4306,7 +4321,14 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 		if (arg)
 			goto out;
-		oldpid = rcu_access_pointer(vcpu->pid);
+
+		/*
+		 * Note, vcpu->pid is primarily protected by vcpu->mutex. The
+		 * dedicated r/w lock allows other tasks, e.g. other vCPUs, to
+		 * read vcpu->pid while this vCPU is in KVM_RUN, e.g. to yield
+		 * directly to this vCPU.
+		 */
+		oldpid = vcpu->pid;
 		if (unlikely(oldpid != task_pid(current))) {
 			/* The thread running this VCPU changed. */
 			struct pid *newpid;
@@ -4316,9 +4338,10 @@ static long kvm_vcpu_ioctl(struct file *filp,
 				break;
 
 			newpid = get_task_pid(current, PIDTYPE_PID);
-			rcu_assign_pointer(vcpu->pid, newpid);
-			if (oldpid)
-				synchronize_rcu();
+			write_lock(&vcpu->pid_lock);
+			vcpu->pid = newpid;
+			write_unlock(&vcpu->pid_lock);
+
 			put_pid(oldpid);
 		}
 		vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);
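
Taken together, the new scheme is: vcpu->pid is written only by the task that holds vcpu->mutex (the KVM_RUN path above), with the pointer swap additionally wrapped in write_lock(&vcpu->pid_lock) so that readers which do not hold vcpu->mutex (kvm_vcpu_yield_to() and the debugfs "pid" file) can take the read lock instead; put_pid() on the old value happens after the write lock is dropped. A rough standalone userspace sketch of that writer/reader split follows (not part of the commit); pthreads stand in for the kernel primitives, a heap-allocated int stands in for struct pid, and vcpu_sim, update_pid() and read_pid() are invented names.

/*
 * Standalone sketch: a pointer published under a mutex plus rwlock pair.
 * The single writer serializes on "mutex" (vcpu->mutex analogue) and only
 * holds the rwlock for the pointer swap; readers take only the read lock.
 */
#include <pthread.h>
#include <stdlib.h>

struct vcpu_sim {
	pthread_mutex_t mutex;		/* vcpu->mutex analogue */
	pthread_rwlock_t pid_lock;	/* vcpu->pid_lock analogue */
	int *pid;
};

/* Writer: only runs with "mutex" held, like the KVM_RUN path above. */
static void update_pid(struct vcpu_sim *v, int *newpid)
{
	int *oldpid;

	pthread_mutex_lock(&v->mutex);
	oldpid = v->pid;		/* safe: the mutex excludes other writers */

	pthread_rwlock_wrlock(&v->pid_lock);
	v->pid = newpid;
	pthread_rwlock_unlock(&v->pid_lock);

	free(oldpid);			/* put_pid() analogue, outside the rwlock */
	pthread_mutex_unlock(&v->mutex);
}

/* Reader: e.g. another vCPU yielding, or the debugfs "pid" file. */
static int read_pid(struct vcpu_sim *v)
{
	int val;

	pthread_rwlock_rdlock(&v->pid_lock);
	val = v->pid ? *v->pid : 0;	/* consume the value under the lock */
	pthread_rwlock_unlock(&v->pid_lock);
	return val;
}

int main(void)
{
	struct vcpu_sim v = {
		.mutex = PTHREAD_MUTEX_INITIALIZER,
		.pid_lock = PTHREAD_RWLOCK_INITIALIZER,
		.pid = NULL,
	};
	int *pid = malloc(sizeof(*pid));
	int ok;

	if (!pid)
		return 1;
	*pid = 42;
	update_pid(&v, pid);

	ok = read_pid(&v) == 42;
	free(v.pid);
	return ok ? 0 : 1;
}

Unlike the RCU scheme it replaces, the writer no longer needs synchronize_rcu() before dropping the old PID reference; mutual exclusion with readers is provided directly by the rwlock.
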