Skip to content

Commit 9f89c9e

Browse files
Brian-PerkinsBrian Perkins
andauthored
netvsp: fix race between VF device arrival and switch data path (#1615)
Adding a VF to the guest should block and only restart the primary worker if it is not waiting for an action Co-authored-by: Brian Perkins <brian.perkins@microsoft.com>
1 parent c765316 commit 9f89c9e

File tree

4 files changed

+120
-41
lines changed

4 files changed

+120
-41
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4496,6 +4496,7 @@ dependencies = [
44964496
"event-listener",
44974497
"futures",
44984498
"futures-concurrency",
4499+
"getrandom 0.3.2",
44994500
"guestmem",
45004501
"guid",
45014502
"hvdef",

vm/devices/net/netvsp/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,5 +46,8 @@ hvdef.workspace = true
4646
event-listener.workspace = true
4747
test_with_tracing.workspace = true
4848

49+
getrandom.workspace = true
50+
51+
4952
[lints]
5053
workspace = true

vm/devices/net/netvsp/src/lib.rs

Lines changed: 43 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3746,16 +3746,46 @@ impl Coordinator {
37463746
PendingVfStateComplete,
37473747
TimerExpired,
37483748
}
3749-
let timer_sleep = async {
3750-
if let Some(sleep_duration) = sleep_duration {
3751-
let mut timer = PolledTimer::new(&state.adapter.driver);
3752-
timer.sleep_until(sleep_duration).await;
3753-
} else {
3754-
pending::<()>().await;
3749+
let message = if matches!(
3750+
state.pending_vf_state,
3751+
CoordinatorStatePendingVfState::Pending
3752+
) {
3753+
// The primary worker is allowed to run, but as no
3754+
// notifications are being processed, if it is waiting for an
3755+
// action then it should remain stopped until this completes
3756+
// and the regular message processing logic resumes. Currently
3757+
// the only message that requires processing is
3758+
// DataPathSwitchPending, so check for that here.
3759+
if !self.workers[0].is_running()
3760+
&& self.primary_mut().is_none_or(|primary| {
3761+
!matches!(
3762+
primary.guest_vf_state,
3763+
PrimaryChannelGuestVfState::DataPathSwitchPending { result: None, .. }
3764+
)
3765+
})
3766+
{
3767+
self.workers[0].start();
37553768
}
3756-
Message::TimerExpired
3757-
};
3758-
let message = {
3769+
3770+
// guest_ready_for_device is not restartable, so do not poll on
3771+
// stop.
3772+
state
3773+
.virtual_function
3774+
.as_mut()
3775+
.expect("Pending requires a VF")
3776+
.guest_ready_for_device()
3777+
.await;
3778+
Message::PendingVfStateComplete
3779+
} else {
3780+
let timer_sleep = async {
3781+
if let Some(sleep_duration) = sleep_duration {
3782+
let mut timer = PolledTimer::new(&state.adapter.driver);
3783+
timer.sleep_until(sleep_duration).await;
3784+
} else {
3785+
pending::<()>().await;
3786+
}
3787+
Message::TimerExpired
3788+
};
37593789
let wait_for_message = async {
37603790
let internal_msg = self
37613791
.recv
@@ -3791,21 +3821,7 @@ impl Coordinator {
37913821
.race()
37923822
.await
37933823
}
3794-
CoordinatorStatePendingVfState::Pending => {
3795-
// Allow the network workers to continue while
3796-
// waiting for the Vf add/remove call to
3797-
// complete, but block any other notifications
3798-
// while it is running. This is necessary to
3799-
// support Vf removal, which may trigger the
3800-
// guest to send a switch data path request and
3801-
// wait for a completion message as part of
3802-
// its eject handling. The switch data path
3803-
// request won't send a message here because
3804-
// the Vf is not available -- it will be a
3805-
// no-op.
3806-
vf.guest_ready_for_device().await;
3807-
Message::PendingVfStateComplete
3808-
}
3824+
CoordinatorStatePendingVfState::Pending => unreachable!(),
38093825
}
38103826
} else {
38113827
(internal_msg, endpoint_restart, timer_sleep).race().await
@@ -3984,12 +4000,13 @@ impl Coordinator {
39844000
_ => (true, None),
39854001
};
39864002
// Cancel any outstanding delay timers for VF offers if the data path is
3987-
// getting switched. Those timers are essentially no-op at this point.
4003+
// getting switched, since the guest is already issuing
4004+
// commands assuming a VF.
39884005
if matches!(
39894006
c_state.pending_vf_state,
39904007
CoordinatorStatePendingVfState::Delay { .. }
39914008
) {
3992-
c_state.pending_vf_state = CoordinatorStatePendingVfState::Ready;
4009+
c_state.pending_vf_state = CoordinatorStatePendingVfState::Pending;
39934010
}
39944011
let result = c_state.endpoint.set_data_path_to_guest_vf(to_guest).await;
39954012
let result = if let Err(err) = result {

vm/devices/net/netvsp/src/test.rs

Lines changed: 73 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,7 @@ impl net_backend::Endpoint for TestNicEndpoint {
275275
}
276276

277277
async fn set_data_path_to_guest_vf(&self, use_vf: bool) -> anyhow::Result<()> {
278+
tracing::info!(use_vf, "set_data_path_to_guest_vf");
278279
let inner = self.inner.clone();
279280
let mut iter = {
280281
let locked_inner = inner.lock().await;
@@ -657,14 +658,17 @@ impl TestNicDevice {
657658

658659
pub async fn retarget_vp(&self, vp: u32) {
659660
let modify_request = ModifyRequest::TargetVp { target_vp: vp };
660-
let modify_response = self
661-
.send_to_channel(
662-
0,
663-
ChannelRequest::Modify,
664-
modify_request,
665-
ChannelResponse::Modify,
666-
)
661+
let send_request = self.send_to_channel(
662+
0,
663+
ChannelRequest::Modify,
664+
modify_request,
665+
ChannelResponse::Modify,
666+
);
667+
let modify_response = mesh::CancelContext::new()
668+
.with_timeout(Duration::from_millis(333))
669+
.until_cancelled(send_request)
667670
.await
671+
.expect("response received")
668672
.expect("modify successful");
669673

670674
assert!(matches!(modify_response, ChannelResponse::Modify(0)));
@@ -1493,6 +1497,13 @@ impl VirtualFunction for TestVirtualFunction {
14931497
self.state.id()
14941498
}
14951499
async fn guest_ready_for_device(&mut self) {
1500+
// Wait a random amount of time before completing the request.
1501+
let mut wait_ms: u64 = 0;
1502+
getrandom::fill(wait_ms.as_mut_bytes()).expect("rng failure");
1503+
wait_ms %= 50;
1504+
tracing::info!(id = self.state.id(), wait_ms, "Readying VF...");
1505+
let mut ctx = mesh::CancelContext::new().with_timeout(Duration::from_millis(wait_ms));
1506+
let _ = ctx.until_cancelled(pending::<()>()).await;
14961507
tracing::info!(id = self.state.id(), "VF ready");
14971508
self.state.set_ready(true).await;
14981509
}
@@ -2538,7 +2549,7 @@ async fn stop_start_with_vf(driver: DefaultDriver) {
25382549
// 'guest VF' state logic.
25392550
assert!(
25402551
test_vf_state
2541-
.await_ready(true, Duration::ZERO)
2552+
.await_ready(true, Duration::from_millis(333))
25422553
.await
25432554
.is_ok()
25442555
);
@@ -4640,6 +4651,22 @@ async fn race_coordinator_and_worker_stop_events(driver: DefaultDriver) {
46404651
} else {
46414652
false
46424653
};
4654+
// Change the VF availability every other instance.
4655+
if (i % 2) == 0 {
4656+
let is_add = (i % 4) == 0;
4657+
test_vf_state
4658+
.update_id(
4659+
if is_add { Some(124) } else { None },
4660+
Some(Duration::from_millis(100)),
4661+
)
4662+
.await
4663+
.unwrap();
4664+
4665+
if is_add {
4666+
PolledTimer::new(&driver).sleep(VF_DEVICE_DELAY).await;
4667+
}
4668+
}
4669+
46434670
// send switch data path message
46444671
channel
46454672
.write(OutgoingPacket {
@@ -4672,6 +4699,7 @@ async fn race_coordinator_and_worker_stop_events(driver: DefaultDriver) {
46724699
)
46734700
.await;
46744701
}
4702+
46754703
// Trigger a retarget VP 2/3 of the time offset with the link update,
46764704
// such that 1/3 times only link update or retarget VP will be
46774705
// triggered.
@@ -4690,14 +4718,44 @@ async fn race_coordinator_and_worker_stop_events(driver: DefaultDriver) {
46904718
.read_with(|packet| match packet {
46914719
IncomingPacket::Completion(_) => None,
46924720
IncomingPacket::Data(data) => {
4693-
let (rndis_header, _) = rndis_parser.parse_control_message(data);
4694-
if rndis_header.message_type == rndisprot::MESSAGE_TYPE_KEEPALIVE_CMPLT {
4695-
tracing::info!("Got keepalive completion");
4696-
Some(data.transaction_id().expect("should request completion"))
4697-
} else {
4698-
tracing::info!(rndis_header.message_type, "Got link status update");
4699-
Some(data.transaction_id().expect("should request completion"))
4721+
let mut reader = data.reader();
4722+
let header: protocol::MessageHeader = reader.read_plain().unwrap();
4723+
match header.message_type {
4724+
protocol::MESSAGE4_TYPE_SEND_VF_ASSOCIATION => {
4725+
let association_data: protocol::Message4SendVfAssociation =
4726+
reader.read_plain().unwrap();
4727+
tracing::info!(
4728+
is_vf = association_data.vf_allocated,
4729+
vfid = association_data.serial_number,
4730+
"Message: VF association"
4731+
);
4732+
}
4733+
protocol::MESSAGE4_TYPE_SWITCH_DATA_PATH => {
4734+
tracing::info!("Message: switch data path");
4735+
let switch_result: protocol::Message4SwitchDataPath =
4736+
reader.read_plain().unwrap();
4737+
// Switch data path is expected when the data
4738+
// path is forced to synthetic.
4739+
assert_eq!(
4740+
switch_result.active_data_path,
4741+
protocol::DataPath::SYNTHETIC.0
4742+
);
4743+
}
4744+
_ => {
4745+
let (rndis_header, _) = rndis_parser.parse_control_message(data);
4746+
if rndis_header.message_type
4747+
== rndisprot::MESSAGE_TYPE_KEEPALIVE_CMPLT
4748+
{
4749+
tracing::info!("Message: keepalive completion");
4750+
} else {
4751+
tracing::info!(
4752+
rndis_header.message_type,
4753+
"Message: link status update"
4754+
);
4755+
}
4756+
}
47004757
}
4758+
Some(data.transaction_id().expect("should request completion"))
47014759
}
47024760
})
47034761
.await

0 commit comments

Comments
 (0)