@@ -58,13 +58,6 @@ struct vma_metadata {
5858
5959/************************************ Global Variables ********************************************/
6060
61- /**
62- * FD of KFD device used to checkpoint. On a multi-process
63- * tree the order of checkpointing goes from parent to child
64- * and so on - so saving the FD will not be overwritten
65- */
66- static int kfd_checkpoint_fd ;
67-
6861static LIST_HEAD (update_vma_info_list );
6962
7063size_t kfd_max_buffer_size ;
@@ -1050,28 +1043,34 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
10501043 return 0 ;
10511044}
10521045
1053- static int unpause_process (int fd )
1046+ int amdgpu_unpause_processes (int pid )
10541047{
10551048 int ret = 0 ;
10561049 struct kfd_ioctl_criu_args args = { 0 };
1050+ struct list_head * l = get_dumped_fds ();
1051+ struct dumped_fd * st ;
10571052
1058- args .op = KFD_CRIU_OP_UNPAUSE ;
1053+ list_for_each_entry (st , l , l ) {
1054+ if (st -> is_drm ) {
1055+ close (st -> fd );
1056+ } else {
1057+ args .op = KFD_CRIU_OP_UNPAUSE ;
10591058
1060- ret = kmtIoctl (fd , AMDKFD_IOC_CRIU_OP , & args );
1061- if (ret ) {
1062- pr_perror ("Failed to unpause process" );
1063- goto exit ;
1059+ ret = kmtIoctl (st -> fd , AMDKFD_IOC_CRIU_OP , & args );
1060+ if (ret ) {
1061+ pr_perror ("Failed to unpause process" );
1062+ goto exit ;
1063+ }
1064+ }
10641065 }
10651066
1066- // Reset the KFD FD
1067- kfd_checkpoint_fd = -1 ;
1068- sys_close_drm_render_devices (& src_topology );
1069-
10701067exit :
10711068 pr_info ("Process unpaused %s (ret:%d)\n" , ret ? "Failed" : "Ok" , ret );
1069+ clear_dumped_fds ();
10721070
10731071 return ret ;
10741072}
1073+ CR_PLUGIN_REGISTER_HOOK (CR_PLUGIN_HOOK__DUMP_DEVICES_LATE , amdgpu_unpause_processes )
10751074
10761075int store_dmabuf_fd (int handle , int fd )
10771076{
@@ -1401,9 +1400,6 @@ int amdgpu_plugin_dump_file(int fd, int id)
14011400 return -1 ;
14021401 }
14031402
1404- /* Initialize number of device files that will be checkpointed */
1405- init_gpu_count (& src_topology );
1406-
14071403 /* Check whether this plugin was called for kfd or render nodes */
14081404 if (major (st .st_rdev ) != major (st_kfd .st_rdev ) || minor (st .st_rdev ) != 0 ) {
14091405
@@ -1415,11 +1411,9 @@ int amdgpu_plugin_dump_file(int fd, int id)
14151411 if (ret )
14161412 return ret ;
14171413
1418- /* Invoke unpause process if needed */
1419- decrement_checkpoint_count ();
1420- if (checkpoint_is_complete ()) {
1421- ret = unpause_process (kfd_checkpoint_fd );
1422- }
1414+ ret = record_dumped_fd (fd , true);
1415+ if (ret )
1416+ return ret ;
14231417
14241418 /* Need to return success here so that criu can call plugins for renderD nodes */
14251419 return ret ;
@@ -1517,14 +1511,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
15171511
15181512 xfree (buf );
15191513
1520- exit :
1521- /* Restore all queues if conditions permit */
1522- kfd_checkpoint_fd = fd ;
1523- decrement_checkpoint_count ();
1524- if (checkpoint_is_complete ()) {
1525- ret = unpause_process (fd );
1526- }
1514+ ret = record_dumped_fd (fd , false);
1515+ if (ret )
1516+ goto exit ;
15271517
1518+ exit :
15281519 xfree ((void * )args .devices );
15291520 xfree ((void * )args .bos );
15301521 xfree ((void * )args .priv_data );
0 commit comments