
Commit 5798e4d

Yishai Hadas authored and Alex Williamson committed
vfio/mlx5: Pre-allocate chunks for the STOP_COPY phase
This patch is another preparation step towards working in chunk mode. It pre-allocates chunks for the STOP_COPY phase so the driver can use them immediately and avoid an extra allocation during that phase.

Before this patch there was a single large buffer dedicated to the STOP_COPY phase, as the source issued a single SAVE for the last image. Once we move to chunk mode, the idea is to have several small buffers used during the STOP_COPY phase: the driver reads ahead the full state from the firmware in small, optimized chunks while letting QEMU/user space read the available data in parallel. Each buffer holds its chunk number so it can be recognized later by the coming patches.

The chunk buffer size is picked based on the minimum size that the firmware requires, the total full size, and a max value in the driver code, which was set to 8MB to achieve an optimized downtime in the general case.

As chunk mode is applicable even when moving directly to STOP_COPY, the buffer preparation and related handling are done unconditionally with regard to STOP/PRE-COPY.

Note: At this point in the series chunk mode is still not activated, and the first buffer is used everywhere.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20230911093856.81910-7-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
1 parent 9114100 commit 5798e4d
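
As a rough, standalone illustration of the chunk-sizing rule described in the commit message (a minimal sketch only; pick_chunk_size() is a made-up helper for this example, while the driver implements the same rule inline in mlx5vf_prep_stop_copy() in the diff below):

/*
 * Sketch of the sizing rule: a STOP_COPY chunk is capped at 8MB, never
 * exceeds the total full state size, and is never smaller than the
 * minimum single-SAVE size the firmware reports.
 */
#include <stddef.h>

#define MAX_CHUNK_SIZE (8UL * 1024 * 1024)	/* SZ_8M in the driver */

/* hypothetical helper, for illustration only */
static size_t pick_chunk_size(size_t min_save_size, size_t full_size)
{
	size_t chunk = full_size < MAX_CHUNK_SIZE ? full_size : MAX_CHUNK_SIZE;

	/* the firmware requires at least 'min_save_size' per SAVE */
	return chunk > min_save_size ? chunk : min_save_size;
}

For example, with a 100MB full state and a 2MB firmware minimum, each pre-allocated STOP_COPY buffer ends up at 8MB; with a 3MB full state and the same minimum it would be 3MB.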

File tree: 3 files changed (+116, −65 lines)

drivers/vfio/pci/mlx5/cmd.c

Lines changed: 13 additions & 10 deletions

@@ -632,9 +632,9 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
 	}

 	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
-		if (async_data->stop_copy_chunk && migf->buf_header) {
-			header_buf = migf->buf_header;
-			migf->buf_header = NULL;
+		if (async_data->stop_copy_chunk && migf->buf_header[0]) {
+			header_buf = migf->buf_header[0];
+			migf->buf_header[0] = NULL;
 		} else {
 			header_buf = mlx5vf_get_data_buffer(migf,
 				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
@@ -721,18 +721,21 @@ void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
 void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
 {
 	struct mlx5_vhca_data_buffer *entry;
+	int i;

 	lockdep_assert_held(&migf->mvdev->state_mutex);
 	WARN_ON(migf->mvdev->mdev_detach);

-	if (migf->buf) {
-		mlx5vf_free_data_buffer(migf->buf);
-		migf->buf = NULL;
-	}
+	for (i = 0; i < MAX_NUM_CHUNKS; i++) {
+		if (migf->buf[i]) {
+			mlx5vf_free_data_buffer(migf->buf[i]);
+			migf->buf[i] = NULL;
+		}

-	if (migf->buf_header) {
-		mlx5vf_free_data_buffer(migf->buf_header);
-		migf->buf_header = NULL;
+		if (migf->buf_header[i]) {
+			mlx5vf_free_data_buffer(migf->buf_header[i]);
+			migf->buf_header[i] = NULL;
+		}
 	}

 	list_splice(&migf->avail_list, &migf->buf_list);

drivers/vfio/pci/mlx5/cmd.h

Lines changed: 6 additions & 2 deletions

@@ -64,6 +64,7 @@ struct mlx5_vhca_data_buffer {
 	u32 mkey;
 	enum dma_data_direction dma_dir;
 	u8 dmaed:1;
+	u8 stop_copy_chunk_num;
 	struct list_head buf_elm;
 	struct mlx5_vf_migration_file *migf;
 	/* Optimize mlx5vf_get_migration_page() for sequential access */
@@ -82,6 +83,8 @@ struct mlx5vf_async_data {
 	void *out;
 };

+#define MAX_NUM_CHUNKS 2
+
 struct mlx5_vf_migration_file {
 	struct file *filp;
 	struct mutex lock;
@@ -94,8 +97,9 @@ struct mlx5_vf_migration_file {
 	u32 record_tag;
 	u64 stop_copy_prep_size;
 	u64 pre_copy_initial_bytes;
-	struct mlx5_vhca_data_buffer *buf;
-	struct mlx5_vhca_data_buffer *buf_header;
+	/* Upon chunk mode preserve another set of buffers for stop_copy phase */
+	struct mlx5_vhca_data_buffer *buf[MAX_NUM_CHUNKS];
+	struct mlx5_vhca_data_buffer *buf_header[MAX_NUM_CHUNKS];
 	spinlock_t list_lock;
 	struct list_head buf_list;
 	struct list_head avail_list;

drivers/vfio/pci/mlx5/main.c

Lines changed: 97 additions & 53 deletions

@@ -24,6 +24,8 @@
 /* Device specification max LOAD size */
 #define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)

+#define MAX_CHUNK_SIZE SZ_8M
+
 static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
 {
 	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
@@ -304,7 +306,8 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
 	wake_up_interruptible(&migf->poll_wait);
 }

-static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
+static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
+				       bool track)
 {
 	size_t size = sizeof(struct mlx5_vf_migration_header) +
 		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
@@ -331,7 +334,7 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
 	to_buff = kmap_local_page(page);
 	memcpy(to_buff, &header, sizeof(header));
 	header_buf->length = sizeof(header);
-	data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length);
+	data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
 	memcpy(to_buff + sizeof(header), &data, sizeof(data));
 	header_buf->length += sizeof(data);
 	kunmap_local(to_buff);
@@ -340,48 +343,83 @@
 	spin_lock_irqsave(&migf->list_lock, flags);
 	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
 	spin_unlock_irqrestore(&migf->list_lock, flags);
-	migf->pre_copy_initial_bytes = size;
+	if (track)
+		migf->pre_copy_initial_bytes = size;
 	return 0;
 err:
 	mlx5vf_put_data_buffer(header_buf);
 	return ret;
 }

-static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf,
-				 size_t state_size)
+static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
+				 struct mlx5_vf_migration_file *migf,
+				 size_t state_size, u64 full_size,
+				 bool track)
 {
 	struct mlx5_vhca_data_buffer *buf;
 	size_t inc_state_size;
+	int num_chunks;
 	int ret;
+	int i;

-	/* let's be ready for stop_copy size that might grow by 10 percents */
-	if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
-		inc_state_size = state_size;
+	if (mvdev->chunk_mode) {
+		size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);

-	buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
-	if (IS_ERR(buf))
-		return PTR_ERR(buf);
+		/* from firmware perspective at least 'state_size' buffer should be set */
+		inc_state_size = max(state_size, chunk_size);
+	} else {
+		if (track) {
+			/* let's be ready for stop_copy size that might grow by 10 percents */
+			if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
+				inc_state_size = state_size;
+		} else {
+			inc_state_size = state_size;
+		}
+	}

-	migf->buf = buf;
-	buf = mlx5vf_get_data_buffer(migf,
-			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
-	if (IS_ERR(buf)) {
-		ret = PTR_ERR(buf);
-		goto err;
+	/* let's not overflow the device specification max SAVE size */
+	inc_state_size = min_t(size_t, inc_state_size,
+		(BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));
+
+	num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
+	for (i = 0; i < num_chunks; i++) {
+		buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
+		if (IS_ERR(buf)) {
+			ret = PTR_ERR(buf);
+			goto err;
+		}
+
+		migf->buf[i] = buf;
+		buf = mlx5vf_get_data_buffer(migf,
+				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+		if (IS_ERR(buf)) {
+			ret = PTR_ERR(buf);
+			goto err;
+		}
+		migf->buf_header[i] = buf;
+		if (mvdev->chunk_mode) {
+			migf->buf[i]->stop_copy_chunk_num = i + 1;
+			migf->buf_header[i]->stop_copy_chunk_num = i + 1;
+		}
 	}

-	migf->buf_header = buf;
-	ret = mlx5vf_add_stop_copy_header(migf);
+	ret = mlx5vf_add_stop_copy_header(migf, track);
 	if (ret)
-		goto err_header;
+		goto err;
 	return 0;

-err_header:
-	mlx5vf_put_data_buffer(migf->buf_header);
-	migf->buf_header = NULL;
 err:
-	mlx5vf_put_data_buffer(migf->buf);
-	migf->buf = NULL;
+	for (i = 0; i < num_chunks; i++) {
+		if (migf->buf[i]) {
+			mlx5vf_put_data_buffer(migf->buf[i]);
+			migf->buf[i] = NULL;
+		}
+		if (migf->buf_header[i]) {
+			mlx5vf_put_data_buffer(migf->buf_header[i]);
+			migf->buf_header[i] = NULL;
+		}
+	}
+
 	return ret;
 }

@@ -511,9 +549,9 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
 		goto err;

 	/* Checking whether we have a matching pre-allocated buffer that can fit */
-	if (migf->buf && migf->buf->allocated_length >= length) {
-		buf = migf->buf;
-		migf->buf = NULL;
+	if (migf->buf[0]->allocated_length >= length) {
+		buf = migf->buf[0];
+		migf->buf[0] = NULL;
 	} else {
 		buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
 		if (IS_ERR(buf)) {
@@ -541,6 +579,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
 	struct mlx5_vf_migration_file *migf;
 	struct mlx5_vhca_data_buffer *buf;
 	size_t length;
+	u64 full_size;
 	int ret;

 	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
@@ -574,20 +613,25 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
 	INIT_LIST_HEAD(&migf->buf_list);
 	INIT_LIST_HEAD(&migf->avail_list);
 	spin_lock_init(&migf->list_lock);
-	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL, 0);
+	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
+	if (ret)
+		goto out_pd;
+
+	ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
 	if (ret)
 		goto out_pd;

 	if (track) {
-		ret = mlx5vf_prep_stop_copy(migf, length);
-		if (ret)
+		/* leave the allocated buffer ready for the stop-copy phase */
+		buf = mlx5vf_alloc_data_buffer(migf,
+			migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
+		if (IS_ERR(buf)) {
+			ret = PTR_ERR(buf);
 			goto out_pd;
-		}
-
-		buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
-		if (IS_ERR(buf)) {
-			ret = PTR_ERR(buf);
-			goto out_pd;
+		}
+	} else {
+		buf = migf->buf[0];
+		migf->buf[0] = NULL;
 	}

 	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
@@ -820,8 +864,8 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
 				   size_t len, loff_t *pos)
 {
 	struct mlx5_vf_migration_file *migf = filp->private_data;
-	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
-	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
+	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
+	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
 	loff_t requested_length;
 	bool has_work = false;
 	ssize_t done = 0;
@@ -856,15 +900,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
 			if (vhca_buf_header->allocated_length < migf->record_size) {
 				mlx5vf_free_data_buffer(vhca_buf_header);

-				migf->buf_header = mlx5vf_alloc_data_buffer(migf,
+				migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
 						migf->record_size, DMA_NONE);
-				if (IS_ERR(migf->buf_header)) {
-					ret = PTR_ERR(migf->buf_header);
-					migf->buf_header = NULL;
+				if (IS_ERR(migf->buf_header[0])) {
+					ret = PTR_ERR(migf->buf_header[0]);
+					migf->buf_header[0] = NULL;
 					goto out_unlock;
 				}

-				vhca_buf_header = migf->buf_header;
+				vhca_buf_header = migf->buf_header[0];
 			}
@@ -884,15 +928,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
 			if (vhca_buf->allocated_length < size) {
 				mlx5vf_free_data_buffer(vhca_buf);

-				migf->buf = mlx5vf_alloc_data_buffer(migf,
+				migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
 						size, DMA_TO_DEVICE);
-				if (IS_ERR(migf->buf)) {
-					ret = PTR_ERR(migf->buf);
-					migf->buf = NULL;
+				if (IS_ERR(migf->buf[0])) {
+					ret = PTR_ERR(migf->buf[0]);
+					migf->buf[0] = NULL;
 					goto out_unlock;
 				}

-				vhca_buf = migf->buf;
+				vhca_buf = migf->buf[0];
 			}
@@ -974,7 +1018,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
 		goto out_pd;
 	}

-	migf->buf = buf;
+	migf->buf[0] = buf;
 	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
 		buf = mlx5vf_alloc_data_buffer(migf,
 			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
@@ -983,7 +1027,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
 			goto out_buf;
 		}

-		migf->buf_header = buf;
+		migf->buf_header[0] = buf;
 		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
 	} else {
 		/* Initial state will be to read the image */
@@ -997,7 +1041,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
 	spin_lock_init(&migf->list_lock);
 	return migf;
 out_buf:
-	mlx5vf_free_data_buffer(migf->buf);
+	mlx5vf_free_data_buffer(migf->buf[0]);
 out_pd:
 	mlx5vf_cmd_dealloc_pd(migf);
 out_free:
@@ -1101,7 +1145,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
 		if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
 			ret = mlx5vf_cmd_load_vhca_state(mvdev,
 							 mvdev->resuming_migf,
-							 mvdev->resuming_migf->buf);
+							 mvdev->resuming_migf->buf[0]);
 			if (ret)
 				return ERR_PTR(ret);
 		}
