Skip to content

Commit 7c9ff3d

Browse files
haiyangzliuw
authored and committed
Drivers: hv: vmbus: Fix duplicate CPU assignments within a device
The vmbus module uses a rotational algorithm to assign target CPUs to a device's channels. Depending on the timing of different device's channel offers, different channels of a device may be assigned to the same CPU. For example on a VM with 2 CPUs, if NIC A and B's channels are offered in the following order, NIC A will have both channels on CPU0, and NIC B will have both channels on CPU1 -- see below. This kind of assignment causes RSS load that is spreading across different channels to end up on the same CPU. Timing of channel offers: NIC A channel 0 NIC B channel 0 NIC A channel 1 NIC B channel 1 VMBUS ID 14: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter Device_ID = {cab064cd-1f31-47d5-a8b4-9d57e320cccd} Sysfs path: /sys/bus/vmbus/devices/cab064cd-1f31-47d5-a8b4-9d57e320cccd Rel_ID=14, target_cpu=0 Rel_ID=17, target_cpu=0 VMBUS ID 16: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter Device_ID = {244225ca-743e-4020-a17d-d7baa13d6cea} Sysfs path: /sys/bus/vmbus/devices/244225ca-743e-4020-a17d-d7baa13d6cea Rel_ID=16, target_cpu=1 Rel_ID=18, target_cpu=1 Update the vmbus CPU assignment algorithm to avoid duplicate CPU assignments within a device. The new algorithm iterates num_online_cpus + 1 times. The existing rotational algorithm to find "next NUMA & CPU" is still here. But if the resulting CPU is already used by the same device, it will try the next CPU. In the last iteration, it assigns the channel to the next available CPU like the existing algorithm. This is not normally expected, because during device probe, we limit the number of channels of a device to be <= number of online CPUs. Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com> Reviewed-by: Michael Kelley <mikelley@microsoft.com> Tested-by: Michael Kelley <mikelley@microsoft.com> Link: https://lore.kernel.org/r/1626459673-17420-1-git-send-email-haiyangz@microsoft.com Signed-off-by: Wei Liu <wei.liu@kernel.org>
1 parent e73f0f0 commit 7c9ff3d

File tree

1 file changed

+64
-32
lines changed

1 file changed

+64
-32
lines changed

drivers/hv/channel_mgmt.c

Lines changed: 64 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -605,6 +605,17 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
605605
*/
606606
mutex_lock(&vmbus_connection.channel_mutex);
607607

608+
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
609+
if (guid_equal(&channel->offermsg.offer.if_type,
610+
&newchannel->offermsg.offer.if_type) &&
611+
guid_equal(&channel->offermsg.offer.if_instance,
612+
&newchannel->offermsg.offer.if_instance)) {
613+
fnew = false;
614+
newchannel->primary_channel = channel;
615+
break;
616+
}
617+
}
618+
608619
init_vp_index(newchannel);
609620

610621
/* Remember the channels that should be cleaned up upon suspend. */
@@ -617,16 +628,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
617628
*/
618629
atomic_dec(&vmbus_connection.offer_in_progress);
619630

620-
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
621-
if (guid_equal(&channel->offermsg.offer.if_type,
622-
&newchannel->offermsg.offer.if_type) &&
623-
guid_equal(&channel->offermsg.offer.if_instance,
624-
&newchannel->offermsg.offer.if_instance)) {
625-
fnew = false;
626-
break;
627-
}
628-
}
629-
630631
if (fnew) {
631632
list_add_tail(&newchannel->listentry,
632633
&vmbus_connection.chn_list);
@@ -647,7 +648,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
647648
/*
648649
* Process the sub-channel.
649650
*/
650-
newchannel->primary_channel = channel;
651651
list_add_tail(&newchannel->sc_list, &channel->sc_list);
652652
}
653653

@@ -683,6 +683,30 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
683683
queue_work(wq, &newchannel->add_channel_work);
684684
}
685685

686+
/*
687+
* Check if CPUs used by other channels of the same device.
688+
* It should only be called by init_vp_index().
689+
*/
690+
static bool hv_cpuself_used(u32 cpu, struct vmbus_channel *chn)
691+
{
692+
struct vmbus_channel *primary = chn->primary_channel;
693+
struct vmbus_channel *sc;
694+
695+
lockdep_assert_held(&vmbus_connection.channel_mutex);
696+
697+
if (!primary)
698+
return false;
699+
700+
if (primary->target_cpu == cpu)
701+
return true;
702+
703+
list_for_each_entry(sc, &primary->sc_list, sc_list)
704+
if (sc != chn && sc->target_cpu == cpu)
705+
return true;
706+
707+
return false;
708+
}
709+
686710
/*
687711
* We use this state to statically distribute the channel interrupt load.
688712
*/
@@ -702,6 +726,7 @@ static int next_numa_node_id;
702726
static void init_vp_index(struct vmbus_channel *channel)
703727
{
704728
bool perf_chn = hv_is_perf_channel(channel);
729+
u32 i, ncpu = num_online_cpus();
705730
cpumask_var_t available_mask;
706731
struct cpumask *alloced_mask;
707732
u32 target_cpu;
@@ -724,31 +749,38 @@ static void init_vp_index(struct vmbus_channel *channel)
724749
return;
725750
}
726751

727-
while (true) {
728-
numa_node = next_numa_node_id++;
729-
if (numa_node == nr_node_ids) {
730-
next_numa_node_id = 0;
731-
continue;
752+
for (i = 1; i <= ncpu + 1; i++) {
753+
while (true) {
754+
numa_node = next_numa_node_id++;
755+
if (numa_node == nr_node_ids) {
756+
next_numa_node_id = 0;
757+
continue;
758+
}
759+
if (cpumask_empty(cpumask_of_node(numa_node)))
760+
continue;
761+
break;
762+
}
763+
alloced_mask = &hv_context.hv_numa_map[numa_node];
764+
765+
if (cpumask_weight(alloced_mask) ==
766+
cpumask_weight(cpumask_of_node(numa_node))) {
767+
/*
768+
* We have cycled through all the CPUs in the node;
769+
* reset the alloced map.
770+
*/
771+
cpumask_clear(alloced_mask);
732772
}
733-
if (cpumask_empty(cpumask_of_node(numa_node)))
734-
continue;
735-
break;
736-
}
737-
alloced_mask = &hv_context.hv_numa_map[numa_node];
738773

739-
if (cpumask_weight(alloced_mask) ==
740-
cpumask_weight(cpumask_of_node(numa_node))) {
741-
/*
742-
* We have cycled through all the CPUs in the node;
743-
* reset the alloced map.
744-
*/
745-
cpumask_clear(alloced_mask);
746-
}
774+
cpumask_xor(available_mask, alloced_mask,
775+
cpumask_of_node(numa_node));
747776

748-
cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node));
777+
target_cpu = cpumask_first(available_mask);
778+
cpumask_set_cpu(target_cpu, alloced_mask);
749779

750-
target_cpu = cpumask_first(available_mask);
751-
cpumask_set_cpu(target_cpu, alloced_mask);
780+
if (channel->offermsg.offer.sub_channel_index >= ncpu ||
781+
i > ncpu || !hv_cpuself_used(target_cpu, channel))
782+
break;
783+
}
752784

753785
channel->target_cpu = target_cpu;
754786

0 commit comments

Comments
 (0)