@@ -57,13 +57,6 @@ struct vma_metadata {
5757
5858/************************************ Global Variables ********************************************/
5959
60- /**
61- * FD of KFD device used to checkpoint. On a multi-process
62- * tree the order of checkpointing goes from parent to child
63- * and so on - so saving the FD will not be overwritten
64- */
65- static int kfd_checkpoint_fd ;
66-
6760static LIST_HEAD (update_vma_info_list );
6861
6962static LIST_HEAD (amdgpu_processes );
@@ -1041,28 +1034,34 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
10411034 return 0 ;
10421035}
10431036
1044- static int unpause_process (int fd )
1037+ int amdgpu_unpause_processes (int pid )
10451038{
10461039 int ret = 0 ;
10471040 struct kfd_ioctl_criu_args args = { 0 };
1041+ struct list_head * l = get_dumped_fds ();
1042+ struct dumped_fd * st ;
10481043
1049- args .op = KFD_CRIU_OP_UNPAUSE ;
1044+ list_for_each_entry (st , l , l ) {
1045+ if (st -> is_drm ) {
1046+ close (st -> fd );
1047+ } else {
1048+ args .op = KFD_CRIU_OP_UNPAUSE ;
10501049
1051- ret = kmtIoctl (fd , AMDKFD_IOC_CRIU_OP , & args );
1052- if (ret ) {
1053- pr_perror ("Failed to unpause process" );
1054- goto exit ;
1050+ ret = kmtIoctl (st -> fd , AMDKFD_IOC_CRIU_OP , & args );
1051+ if (ret ) {
1052+ pr_perror ("Failed to unpause process" );
1053+ goto exit ;
1054+ }
1055+ }
10551056 }
10561057
1057- // Reset the KFD FD
1058- kfd_checkpoint_fd = -1 ;
1059- sys_close_drm_render_devices (& src_topology );
1060-
10611058exit :
10621059 pr_info ("Process unpaused %s (ret:%d)\n" , ret ? "Failed" : "Ok" , ret );
1060+ clear_dumped_fds ();
10631061
10641062 return ret ;
10651063}
1064+ CR_PLUGIN_REGISTER_HOOK (CR_PLUGIN_HOOK__DUMP_DEVICE_LATE , amdgpu_unpause_processes )
10661065
10671066static void dmabuf_socket_name_gen (struct sockaddr_un * addr , int * len , int pid )
10681067{
@@ -1382,9 +1381,6 @@ int amdgpu_plugin_dump_file(int fd, int id)
13821381 return -1 ;
13831382 }
13841383
1385- /* Initialize number of device files that will be checkpointed */
1386- init_gpu_count (& src_topology );
1387-
13881384 /* Check whether this plugin was called for kfd or render nodes */
13891385 if (major (st .st_rdev ) != major (st_kfd .st_rdev ) || minor (st .st_rdev ) != 0 ) {
13901386
@@ -1396,11 +1392,9 @@ int amdgpu_plugin_dump_file(int fd, int id)
13961392 if (ret )
13971393 return ret ;
13981394
1399- /* Invoke unpause process if needed */
1400- decrement_checkpoint_count ();
1401- if (checkpoint_is_complete ()) {
1402- ret = unpause_process (kfd_checkpoint_fd );
1403- }
1395+ ret = record_dumped_fd (fd , true);
1396+ if (ret )
1397+ return ret ;
14041398
14051399 /* Need to return success here so that criu can call plugins for renderD nodes */
14061400 return ret ;
@@ -1498,14 +1492,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
14981492
14991493 xfree (buf );
15001494
1501- exit :
1502- /* Restore all queues if conditions permit */
1503- kfd_checkpoint_fd = fd ;
1504- decrement_checkpoint_count ();
1505- if (checkpoint_is_complete ()) {
1506- ret = unpause_process (fd );
1507- }
1495+ ret = record_dumped_fd (fd , false);
1496+ if (ret )
1497+ goto exit ;
15081498
1499+ exit :
15091500 xfree ((void * )args .devices );
15101501 xfree ((void * )args .bos );
15111502 xfree ((void * )args .priv_data );
0 commit comments