@@ -58,13 +58,6 @@ struct vma_metadata {
5858
5959/************************************ Global Variables ********************************************/
6060
61- /**
62- * FD of KFD device used to checkpoint. On a multi-process
63- * tree the order of checkpointing goes from parent to child
64- * and so on - so saving the FD will not be overwritten
65- */
66- static int kfd_checkpoint_fd ;
67-
6861static LIST_HEAD (update_vma_info_list );
6962
7063size_t kfd_max_buffer_size ;
@@ -1050,28 +1043,34 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
10501043 return 0 ;
10511044}
10521045
1053- static int unpause_process (int fd )
1046+ int amdgpu_unpause_processes (int pid )
10541047{
10551048 int ret = 0 ;
10561049 struct kfd_ioctl_criu_args args = { 0 };
1050+ struct list_head * l = get_dumped_fds ();
1051+ struct dumped_fd * st ;
10571052
1058- args .op = KFD_CRIU_OP_UNPAUSE ;
1053+ list_for_each_entry (st , l , l ) {
1054+ if (st -> is_drm ) {
1055+ close (st -> fd );
1056+ } else {
1057+ args .op = KFD_CRIU_OP_UNPAUSE ;
10591058
1060- ret = kmtIoctl (fd , AMDKFD_IOC_CRIU_OP , & args );
1061- if (ret ) {
1062- pr_perror ("Failed to unpause process" );
1063- goto exit ;
1059+ ret = kmtIoctl (st -> fd , AMDKFD_IOC_CRIU_OP , & args );
1060+ if (ret ) {
1061+ pr_perror ("Failed to unpause process" );
1062+ goto exit ;
1063+ }
1064+ }
10641065 }
10651066
1066- // Reset the KFD FD
1067- kfd_checkpoint_fd = -1 ;
1068- sys_close_drm_render_devices (& src_topology );
1069-
10701067exit :
10711068 pr_info ("Process unpaused %s (ret:%d)\n" , ret ? "Failed" : "Ok" , ret );
1069+ clear_dumped_fds ();
10721070
10731071 return ret ;
10741072}
1073+ CR_PLUGIN_REGISTER_HOOK (CR_PLUGIN_HOOK__DUMP_DEVICES_LATE , amdgpu_unpause_processes )
10751074
10761075int store_dmabuf_fd (int handle , int fd )
10771076{
@@ -1404,9 +1403,6 @@ int amdgpu_plugin_dump_file(int fd, int id)
14041403 return -1 ;
14051404 }
14061405
1407- /* Initialize number of device files that will be checkpointed */
1408- init_gpu_count (& src_topology );
1409-
14101406 /* Check whether this plugin was called for kfd or render nodes */
14111407 if (major (st .st_rdev ) != major (st_kfd .st_rdev ) || minor (st .st_rdev ) != 0 ) {
14121408
@@ -1418,11 +1414,9 @@ int amdgpu_plugin_dump_file(int fd, int id)
14181414 if (ret )
14191415 return ret ;
14201416
1421- /* Invoke unpause process if needed */
1422- decrement_checkpoint_count ();
1423- if (checkpoint_is_complete ()) {
1424- ret = unpause_process (kfd_checkpoint_fd );
1425- }
1417+ ret = record_dumped_fd (fd , true);
1418+ if (ret )
1419+ return ret ;
14261420
14271421 /* Need to return success here so that criu can call plugins for renderD nodes */
14281422 return ret ;
@@ -1520,14 +1514,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
15201514
15211515 xfree (buf );
15221516
1523- exit :
1524- /* Restore all queues if conditions permit */
1525- kfd_checkpoint_fd = fd ;
1526- decrement_checkpoint_count ();
1527- if (checkpoint_is_complete ()) {
1528- ret = unpause_process (fd );
1529- }
1517+ ret = record_dumped_fd (fd , false);
1518+ if (ret )
1519+ goto exit ;
15301520
1521+ exit :
15311522 xfree ((void * )args .devices );
15321523 xfree ((void * )args .bos );
15331524 xfree ((void * )args .priv_data );
0 commit comments