Skip to content

Commit 6cea64b

Browse files
Yishai Hadas authored and awilliam committed
vfio/virtio: Add PRE_COPY support for live migration
Add PRE_COPY support for live migration. This functionality may reduce the downtime upon STOP_COPY by letting the target machine obtain some 'initial data' from the source while the machine is still in its RUNNING state, so that it can prepare itself ahead of time for the final STOP_COPY data. The Virtio specification does not support reading partial or incremental device contexts; this means that during the PRE_COPY state, the vfio-virtio driver reads the full device state. Since the device state can change, and the benefit is highest when the pre-copy data closely matches the final data, we read it in a rate-limited mode: we avoid reading new data from the device for a specified time interval after the last read. With PRE_COPY enabled, we observed a downtime reduction of approximately 70-75% in various scenarios compared to when PRE_COPY was disabled, while keeping the total migration time nearly the same. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Link: https://lore.kernel.org/r/20241113115200.209269-7-yishaih@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
1 parent 0bbc82e commit 6cea64b

File tree

2 files changed

+227
-8
lines changed

2 files changed

+227
-8
lines changed

drivers/vfio/pci/virtio/common.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111
/* Lifecycle state of the saving-side migration file */
enum virtiovf_migf_state {
	/* A save-path failure occurred; the file data can no longer be used */
	VIRTIOVF_MIGF_STATE_ERROR = 1,
	/* PRE_COPY is active: further device context may still be read */
	VIRTIOVF_MIGF_STATE_PRECOPY = 2,
	/* The full (final) device context has been captured */
	VIRTIOVF_MIGF_STATE_COMPLETE = 3,
};
1416

1517
enum virtiovf_load_state {
@@ -57,6 +59,8 @@ struct virtiovf_migration_file {
5759
/* synchronize access to the file state */
5860
struct mutex lock;
5961
loff_t max_pos;
62+
u64 pre_copy_initial_bytes;
63+
struct ratelimit_state pre_copy_rl_state;
6064
u64 record_size;
6165
u32 record_tag;
6266
u8 has_obj_id:1;

drivers/vfio/pci/virtio/migrate.c

Lines changed: 223 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626
/* Initial target buffer size */
2727
#define VIRTIOVF_TARGET_INITIAL_BUF_SIZE SZ_1M
2828

29+
static int
30+
virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
31+
u32 ctx_size);
32+
2933
static struct page *
3034
virtiovf_get_migration_page(struct virtiovf_data_buffer *buf,
3135
unsigned long offset)
@@ -159,6 +163,41 @@ virtiovf_pci_free_obj_id(struct virtiovf_pci_core_device *virtvdev, u32 obj_id)
159163
VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id);
160164
}
161165

166+
static struct virtiovf_data_buffer *
167+
virtiovf_get_data_buffer(struct virtiovf_migration_file *migf, size_t length)
168+
{
169+
struct virtiovf_data_buffer *buf, *temp_buf;
170+
struct list_head free_list;
171+
172+
INIT_LIST_HEAD(&free_list);
173+
174+
spin_lock_irq(&migf->list_lock);
175+
list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
176+
list_del_init(&buf->buf_elm);
177+
if (buf->allocated_length >= length) {
178+
spin_unlock_irq(&migf->list_lock);
179+
goto found;
180+
}
181+
/*
182+
* Prevent holding redundant buffers. Put in a free
183+
* list and call at the end not under the spin lock
184+
* (&migf->list_lock) to minimize its scope usage.
185+
*/
186+
list_add(&buf->buf_elm, &free_list);
187+
}
188+
spin_unlock_irq(&migf->list_lock);
189+
buf = virtiovf_alloc_data_buffer(migf, length);
190+
191+
found:
192+
while ((temp_buf = list_first_entry_or_null(&free_list,
193+
struct virtiovf_data_buffer, buf_elm))) {
194+
list_del(&temp_buf->buf_elm);
195+
virtiovf_free_data_buffer(temp_buf);
196+
}
197+
198+
return buf;
199+
}
200+
162201
static void virtiovf_clean_migf_resources(struct virtiovf_migration_file *migf)
163202
{
164203
struct virtiovf_data_buffer *entry;
@@ -345,6 +384,7 @@ static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t le
345384
{
346385
struct virtiovf_migration_file *migf = filp->private_data;
347386
struct virtiovf_data_buffer *vhca_buf;
387+
bool first_loop_call = true;
348388
bool end_of_data;
349389
ssize_t done = 0;
350390

@@ -362,6 +402,19 @@ static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t le
362402
ssize_t count;
363403

364404
vhca_buf = virtiovf_get_data_buff_from_pos(migf, *pos, &end_of_data);
405+
if (first_loop_call) {
406+
first_loop_call = false;
407+
/* Temporary end of file as part of PRE_COPY */
408+
if (end_of_data && migf->state == VIRTIOVF_MIGF_STATE_PRECOPY) {
409+
done = -ENOMSG;
410+
goto out_unlock;
411+
}
412+
if (end_of_data && migf->state != VIRTIOVF_MIGF_STATE_COMPLETE) {
413+
done = -EINVAL;
414+
goto out_unlock;
415+
}
416+
}
417+
365418
if (end_of_data)
366419
goto out_unlock;
367420

@@ -383,9 +436,101 @@ static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t le
383436
return done;
384437
}
385438

439+
/*
 * Handle VFIO_MIG_GET_PRECOPY_INFO on the saving migration file.
 *
 * Fills struct vfio_precopy_info with the bytes still available at the
 * current file position: initial_bytes while the reader is still inside
 * the initially captured context, dirty_bytes afterwards. When the
 * reader has consumed everything and the device reports a non-zero
 * context size, a fresh device context chunk is read so the next
 * read() sees updated data.
 *
 * Returns 0 on success, -ENOTTY for any other ioctl, -EFAULT on user
 * copy failure, -EINVAL for a bad argsz or wrong migration state, and
 * -ENODEV when the file is already in the ERROR state.
 */
static long virtiovf_precopy_ioctl(struct file *filp, unsigned int cmd,
				   unsigned long arg)
{
	struct virtiovf_migration_file *migf = filp->private_data;
	struct virtiovf_pci_core_device *virtvdev = migf->virtvdev;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	bool end_of_data = false;
	unsigned long minsz;
	u32 ctx_size = 0;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	/* Only valid while the device is in a PRE_COPY state */
	mutex_lock(&virtvdev->state_mutex);
	if (virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * The virtio specification does not include a PRE_COPY concept.
	 * Since we can expect the data to remain the same for a certain period,
	 * we use a rate limiter mechanism before making a call to the device.
	 * When rate-limited, ctx_size stays 0 and no new chunk is read below.
	 */
	if (__ratelimit(&migf->pre_copy_rl_state)) {

		ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
					VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
					VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
					&ctx_size);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	/*
	 * Report data still inside the initially captured context as
	 * initial_bytes; beyond that, report dirty_bytes including what
	 * the device says is pending (ctx_size).
	 */
	if (migf->pre_copy_initial_bytes > *pos) {
		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
	} else {
		info.dirty_bytes = migf->max_pos - *pos;
		if (!info.dirty_bytes)
			end_of_data = true;
		info.dirty_bytes += ctx_size;
	}

	if (!end_of_data || !ctx_size) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device has a
	 * dirty state, read a new state.
	 */
	ret = virtiovf_read_device_context_chunk(migf, ctx_size);
	if (ret)
		/*
		 * The machine is running, and the context size could grow, so
		 * there is no reason to mark the device state as
		 * VIRTIOVF_MIGF_STATE_ERROR.
		 */
		goto err_state_unlock;

done:
	virtiovf_state_mutex_unlock(virtvdev);
	if (copy_to_user((void __user *)arg, &info, minsz))
		return -EFAULT;
	return 0;

err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	virtiovf_state_mutex_unlock(virtvdev);
	return ret;
}
528+
386529
static const struct file_operations virtiovf_save_fops = {
387530
.owner = THIS_MODULE,
388531
.read = virtiovf_save_read,
532+
.unlocked_ioctl = virtiovf_precopy_ioctl,
533+
.compat_ioctl = compat_ptr_ioctl,
389534
.release = virtiovf_release_file,
390535
};
391536

@@ -429,7 +574,7 @@ virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
429574
int nent;
430575
int ret;
431576

432-
buf = virtiovf_alloc_data_buffer(migf, ctx_size);
577+
buf = virtiovf_get_data_buffer(migf, ctx_size);
433578
if (IS_ERR(buf))
434579
return PTR_ERR(buf);
435580

@@ -464,7 +609,7 @@ virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
464609
goto out;
465610

466611
buf->length = res_size;
467-
header_buf = virtiovf_alloc_data_buffer(migf,
612+
header_buf = virtiovf_get_data_buffer(migf,
468613
sizeof(struct virtiovf_migration_header));
469614
if (IS_ERR(header_buf)) {
470615
ret = PTR_ERR(header_buf);
@@ -489,8 +634,43 @@ virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
489634
return ret;
490635
}
491636

637+
static int
638+
virtiovf_pci_save_device_final_data(struct virtiovf_pci_core_device *virtvdev)
639+
{
640+
struct virtiovf_migration_file *migf = virtvdev->saving_migf;
641+
u32 ctx_size;
642+
int ret;
643+
644+
if (migf->state == VIRTIOVF_MIGF_STATE_ERROR)
645+
return -ENODEV;
646+
647+
ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
648+
VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
649+
VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
650+
&ctx_size);
651+
if (ret)
652+
goto err;
653+
654+
if (!ctx_size) {
655+
ret = -EINVAL;
656+
goto err;
657+
}
658+
659+
ret = virtiovf_read_device_context_chunk(migf, ctx_size);
660+
if (ret)
661+
goto err;
662+
663+
migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
664+
return 0;
665+
666+
err:
667+
migf->state = VIRTIOVF_MIGF_STATE_ERROR;
668+
return ret;
669+
}
670+
492671
static struct virtiovf_migration_file *
493-
virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev)
672+
virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev,
673+
bool pre_copy)
494674
{
495675
struct virtiovf_migration_file *migf;
496676
u32 ctx_size;
@@ -541,6 +721,18 @@ virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev)
541721
if (ret)
542722
goto out_clean;
543723

724+
if (pre_copy) {
725+
migf->pre_copy_initial_bytes = migf->max_pos;
726+
/* Arbitrarily set the pre-copy rate limit to 1-second intervals */
727+
ratelimit_state_init(&migf->pre_copy_rl_state, 1 * HZ, 1);
728+
/* Prevent any rate messages upon its usage */
729+
ratelimit_set_flags(&migf->pre_copy_rl_state,
730+
RATELIMIT_MSG_ON_RELEASE);
731+
migf->state = VIRTIOVF_MIGF_STATE_PRECOPY;
732+
} else {
733+
migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
734+
}
735+
544736
return migf;
545737

546738
out_clean:
@@ -950,15 +1142,17 @@ virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
9501142
return NULL;
9511143
}
9521144

953-
if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
1145+
if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
1146+
(cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
9541147
ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev,
9551148
BIT(VIRTIO_ADMIN_CMD_DEV_MODE_F_STOPPED));
9561149
if (ret)
9571150
return ERR_PTR(ret);
9581151
return NULL;
9591152
}
9601153

961-
if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
1154+
if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
1155+
(cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
9621156
ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev, 0);
9631157
if (ret)
9641158
return ERR_PTR(ret);
@@ -968,15 +1162,17 @@ virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
9681162
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
9691163
struct virtiovf_migration_file *migf;
9701164

971-
migf = virtiovf_pci_save_device_data(virtvdev);
1165+
migf = virtiovf_pci_save_device_data(virtvdev, false);
9721166
if (IS_ERR(migf))
9731167
return ERR_CAST(migf);
9741168
get_file(migf->filp);
9751169
virtvdev->saving_migf = migf;
9761170
return migf->filp;
9771171
}
9781172

979-
if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
1173+
if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
1174+
(cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
1175+
(cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
9801176
virtiovf_disable_fds(virtvdev);
9811177
return NULL;
9821178
}
@@ -997,6 +1193,24 @@ virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
9971193
return NULL;
9981194
}
9991195

1196+
if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
1197+
(cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
1198+
new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1199+
struct virtiovf_migration_file *migf;
1200+
1201+
migf = virtiovf_pci_save_device_data(virtvdev, true);
1202+
if (IS_ERR(migf))
1203+
return ERR_CAST(migf);
1204+
get_file(migf->filp);
1205+
virtvdev->saving_migf = migf;
1206+
return migf->filp;
1207+
}
1208+
1209+
if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
1210+
ret = virtiovf_pci_save_device_final_data(virtvdev);
1211+
return ret ? ERR_PTR(ret) : NULL;
1212+
}
1213+
10001214
/*
10011215
* vfio_mig_get_next_state() does not use arcs other than the above
10021216
*/
@@ -1101,7 +1315,8 @@ void virtiovf_set_migratable(struct virtiovf_pci_core_device *virtvdev)
11011315
spin_lock_init(&virtvdev->reset_lock);
11021316
virtvdev->core_device.vdev.migration_flags =
11031317
VFIO_MIGRATION_STOP_COPY |
1104-
VFIO_MIGRATION_P2P;
1318+
VFIO_MIGRATION_P2P |
1319+
VFIO_MIGRATION_PRE_COPY;
11051320
virtvdev->core_device.vdev.mig_ops = &virtvdev_pci_mig_ops;
11061321
}
11071322

0 commit comments

Comments
 (0)