Skip to content

Commit 75cf822

Browse files
committed
images/inventory: add plugins field
This patch extends the inventory image with a `plugins` field that contains an array of plugins which were used during checkpoint, for example, to save GPU state. In particular, the CUDA and AMDGPU plugins are added to this field only when the checkpoint contains GPU state. This makes it possible to disable unnecessary plugins during restore, to show appropriate error messages if required CRIU plugins are missing, and to migrate a process that does not use a GPU from a GPU-enabled system to a CPU-only environment. We use the `optional plugins_entry` for backwards compatibility. This entry allows us to distinguish between a *missing* and an *empty* field: - When the field is missing, it indicates that the checkpoint was created with a previous version of CRIU, and all plugins should be enabled during restore. - When the field is empty, it indicates that no plugins were used during the dump. Thus, all plugins can be disabled during restore. Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
1 parent 56bc739 commit 75cf822

File tree

7 files changed

+190
-5
lines changed

7 files changed

+190
-5
lines changed

criu/cr-restore.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2354,12 +2354,12 @@ int cr_restore_tasks(void)
23542354
if (init_service_fd())
23552355
return 1;
23562356

2357-
if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
2358-
return -1;
2359-
23602357
if (check_img_inventory(/* restore = */ true) < 0)
23612358
goto err;
23622359

2360+
if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
2361+
return -1;
2362+
23632363
if (init_stats(RESTORE_STATS))
23642364
goto err;
23652365

criu/image.c

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,14 @@ TaskKobjIdsEntry *root_ids;
2626
u32 root_cg_set;
2727
Lsmtype image_lsm;
2828

29+
/* One plugin name tracked for the inventory image's 'plugins' field. */
struct inventory_plugin {
30+
/* Links this entry into inventory_plugins_list. */
struct list_head node;
31+
/* Copy of the plugin name made with xstrdup() in add_inventory_plugin(). */
char *name;
32+
};
33+
34+
/* Plugin names collected during dump or read from the inventory during restore. */
struct list_head inventory_plugins_list = LIST_HEAD_INIT(inventory_plugins_list);
35+
/*
 * Number of entries in inventory_plugins_list. Set to -1 by
 * check_img_inventory() when the image has no 'plugins_entry',
 * which check_and_remove_inventory_plugin() treats as "enable all".
 */
static int n_inventory_plugins;
36+
2937
int check_img_inventory(bool restore)
3038
{
3139
int ret = -1;
@@ -99,6 +107,19 @@ int check_img_inventory(bool restore)
99107
} else {
100108
opts.network_lock_method = he->network_lock_method;
101109
}
110+
111+
if (!he->plugins_entry) {
112+
/* backwards compatibility: if the 'plugins_entry' field is missing,
113+
* all plugins should be enabled during restore.
114+
*/
115+
n_inventory_plugins = -1;
116+
} else {
117+
PluginsEntry *pe = he->plugins_entry;
118+
for (int i = 0; i < pe->n_plugins; i++) {
119+
if (add_inventory_plugin(pe->plugins[i]))
120+
goto out_err;
121+
}
122+
}
102123
}
103124

104125
ret = 0;
@@ -110,8 +131,87 @@ int check_img_inventory(bool restore)
110131
return ret;
111132
}
112133

134+
/**
135+
* Check if the 'plugins' field in the inventory image contains
136+
* the specified plugin name. If found, the plugin is removed
137+
* from the linked list.
138+
*/
139+
bool check_and_remove_inventory_plugin(const char *name, size_t n)
140+
{
141+
if (n_inventory_plugins == -1)
142+
return true; /* backwards compatibility */
143+
144+
if (n_inventory_plugins > 0) {
145+
struct inventory_plugin *p, *tmp;
146+
list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) {
147+
if (!strncmp(name, p->name, n)) {
148+
xfree(p->name);
149+
list_del(&p->node);
150+
n_inventory_plugins--;
151+
return true;
152+
}
153+
}
154+
}
155+
156+
return false;
157+
}
158+
159+
/**
160+
* We expect during restore all loaded plugins to be removed from
161+
* the inventory_plugins_list. If the list is not empty, show an
162+
* error message for each missing plugin.
163+
*/
164+
int check_inventory_plugins(void)
165+
{
166+
struct inventory_plugin *p;
167+
168+
if (n_inventory_plugins <= 0)
169+
return 0;
170+
171+
list_for_each_entry(p, &inventory_plugins_list, node) {
172+
pr_err("Required plugin is missing: %s\n", p->name);
173+
}
174+
175+
return -1;
176+
}
177+
178+
/**
179+
* Add a plugin name to the inventory image. This array of names
180+
* is used to load only the necessary plugins during restore.
181+
*/
182+
int add_inventory_plugin(const char *name)
183+
{
184+
struct inventory_plugin *p;
185+
186+
p = xmalloc(sizeof(struct inventory_plugin));
187+
if (p == NULL)
188+
return -1;
189+
190+
p->name = xstrdup(name);
191+
if (!p->name) {
192+
xfree(p);
193+
return -1;
194+
}
195+
list_add(&p->node, &inventory_plugins_list);
196+
n_inventory_plugins++;
197+
198+
return 0;
199+
}
200+
201+
void free_inventory_plugins_list(void)
202+
{
203+
struct inventory_plugin *p;
204+
205+
if (!list_empty(&inventory_plugins_list)) {
206+
list_for_each_entry(p, &inventory_plugins_list, node) {
207+
xfree(p->name);
208+
}
209+
}
210+
}
211+
113212
int write_img_inventory(InventoryEntry *he)
114213
{
214+
PluginsEntry pe = PLUGINS_ENTRY__INIT;
115215
struct cr_img *img;
116216
int ret;
117217

@@ -121,8 +221,27 @@ int write_img_inventory(InventoryEntry *he)
121221
if (!img)
122222
return -1;
123223

224+
if (!list_empty(&inventory_plugins_list)) {
225+
struct inventory_plugin *p;
226+
int i = 0;
227+
228+
pe.n_plugins = n_inventory_plugins;
229+
pe.plugins = xmalloc(n_inventory_plugins * sizeof(char *));
230+
if (!pe.plugins)
231+
return -1;
232+
233+
list_for_each_entry(p, &inventory_plugins_list, node) {
234+
pe.plugins[i] = p->name;
235+
i++;
236+
}
237+
}
238+
he->plugins_entry = &pe;
239+
124240
ret = pb_write_one(img, he, PB_INVENTORY);
125241

242+
free_inventory_plugins_list();
243+
xfree(pe.plugins);
244+
126245
xfree(he->root_ids);
127246
close_image(img);
128247
if (ret < 0)

criu/include/image.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,4 +177,8 @@ extern int read_img_str(struct cr_img *, char **pstr, int size);
177177

178178
extern void close_image(struct cr_img *);
179179

180+
extern int add_inventory_plugin(const char *name);
181+
extern int check_inventory_plugins(void);
182+
extern bool check_and_remove_inventory_plugin(const char *name, size_t n);
183+
180184
#endif /* __CR_IMAGE_H__ */

criu/plugin.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,9 @@ int cr_plugin_init(int stage)
256256
goto err;
257257
}
258258

259+
if (stage == CR_PLUGIN_STAGE__RESTORE && check_inventory_plugins())
260+
goto err;
261+
259262
exit_code = 0;
260263
err:
261264
closedir(d);

images/inventory.proto

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,13 @@ enum lsmtype {
1010
APPARMOR = 2;
1111
}
1212

13+
// It is not possible to distinguish between an empty repeated field
14+
// and unset repeated field. To solve this problem and provide backwards
15+
// compatibility, we use the 'plugins_entry' message.
16+
message plugins_entry {
17+
repeated string plugins = 12;
18+
};
19+
1320
message inventory_entry {
1421
required uint32 img_version = 1;
1522
optional bool fdinfo_per_id = 2;
@@ -21,4 +28,5 @@ message inventory_entry {
2128
optional uint32 pre_dump_mode = 9;
2229
optional bool tcp_close = 10;
2330
optional uint32 network_lock_method = 11;
31+
optional plugins_entry plugins_entry = 12;
2432
}

plugins/amdgpu/amdgpu_plugin.c

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,11 @@ static LIST_HEAD(update_vma_info_list);
6060

6161
size_t kfd_max_buffer_size;
6262

63+
/* Indicates if the plugin has been added to the inventory image */
64+
bool plugin_added_to_inventory = false;
65+
66+
bool plugin_disabled = false;
67+
6368
/**************************************************************************************************/
6469

6570
/* Call ioctl, restarting if it is interrupted */
@@ -332,6 +337,13 @@ void getenv_size_t(const char *var, size_t *value)
332337

333338
int amdgpu_plugin_init(int stage)
334339
{
340+
if (stage == CR_PLUGIN_STAGE__RESTORE) {
341+
if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) {
342+
plugin_disabled = true;
343+
return 0;
344+
}
345+
}
346+
335347
pr_info("initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name);
336348

337349
topology_init(&src_topology);
@@ -365,6 +377,9 @@ int amdgpu_plugin_init(int stage)
365377

366378
void amdgpu_plugin_fini(int stage, int ret)
367379
{
380+
if (plugin_disabled)
381+
return;
382+
368383
pr_info("finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name);
369384

370385
if (stage == CR_PLUGIN_STAGE__RESTORE)
@@ -414,6 +429,14 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf)
414429
if (ret)
415430
pr_perror("%s(), Can't handle VMAs of input device", __func__);
416431

432+
if (!plugin_added_to_inventory) {
433+
ret = add_inventory_plugin(CR_PLUGIN_DESC.name);
434+
if (ret)
435+
pr_err("Failed to add AMDGPU plugin to inventory image\n");
436+
else
437+
plugin_added_to_inventory = true;
438+
}
439+
417440
return ret;
418441
}
419442
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma)
@@ -1540,6 +1563,9 @@ int amdgpu_plugin_restore_file(int id)
15401563
size_t img_size;
15411564
FILE *img_fp = NULL;
15421565

1566+
if (plugin_disabled)
1567+
return -ENOTSUP;
1568+
15431569
pr_info("Initialized kfd plugin restorer with ID = %d\n", id);
15441570

15451571
snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id);
@@ -1746,6 +1772,9 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const
17461772
char *p_end;
17471773
bool is_kfd = false, is_renderD = false;
17481774

1775+
if (plugin_disabled)
1776+
return -ENOTSUP;
1777+
17491778
plugin_log_msg("Enter %s\n", __func__);
17501779

17511780
strncpy(path, in_path, sizeof(path));
@@ -1805,6 +1834,9 @@ int amdgpu_plugin_resume_devices_late(int target_pid)
18051834
struct kfd_ioctl_criu_args args = { 0 };
18061835
int fd, exit_code = 0;
18071836

1837+
if (plugin_disabled)
1838+
return -ENOTSUP;
1839+
18081840
pr_info("Inside %s for target pid = %d\n", __func__, target_pid);
18091841

18101842
fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC);

plugins/cuda/cuda_plugin.c

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@
3838
*/
3939
bool plugin_disabled = false;
4040

41+
/* Indicates if the plugin has been added to the inventory image */
42+
bool plugin_added_to_inventory = false;
43+
4144
struct pid_info {
4245
int pid;
4346
char checkpointed;
@@ -319,7 +322,7 @@ int cuda_plugin_checkpoint_devices(int pid)
319322
k_rtsigset_t save_sigset;
320323

321324
if (plugin_disabled) {
322-
return 0;
325+
return -ENOTSUP;
323326
}
324327

325328
restore_tid = get_cuda_restore_tid(pid);
@@ -354,6 +357,15 @@ int cuda_plugin_checkpoint_devices(int pid)
354357
pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid);
355358
}
356359
}
360+
361+
if (!plugin_added_to_inventory) {
362+
status = add_inventory_plugin(CR_PLUGIN_DESC.name);
363+
if (status)
364+
pr_err("Failed to add CUDA plugin to inventory image\n");
365+
else
366+
plugin_added_to_inventory = true;
367+
}
368+
357369
interrupt:
358370
int_ret = interrupt_restore_thread(restore_tid, &save_sigset);
359371

@@ -367,7 +379,7 @@ int cuda_plugin_pause_devices(int pid)
367379
char msg_buf[CUDA_CKPT_BUF_SIZE];
368380

369381
if (plugin_disabled) {
370-
return 0;
382+
return -ENOTSUP;
371383
}
372384

373385
restore_tid = get_cuda_restore_tid(pid);
@@ -463,6 +475,13 @@ int cuda_plugin_init(int stage)
463475
{
464476
int ret;
465477

478+
if (stage == CR_PLUGIN_STAGE__RESTORE) {
479+
if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) {
480+
plugin_disabled = true;
481+
return 0;
482+
}
483+
}
484+
466485
if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) {
467486
pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n");
468487
plugin_disabled = true;

0 commit comments

Comments
 (0)