@@ -54,13 +54,6 @@ struct vma_metadata {
5454
5555/************************************ Global Variables ********************************************/
5656
57- /**
58- * FD of KFD device used to checkpoint. On a multi-process
59- * tree the order of checkpointing goes from parent to child
60- * and so on - so saving the FD will not be overwritten
61- */
62- static int kfd_checkpoint_fd ;
63-
6457static LIST_HEAD (update_vma_info_list );
6558
6659static LIST_HEAD (amdgpu_processes );
@@ -1018,28 +1011,34 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
10181011 return 0 ;
10191012}
10201013
/*
 * NOTE(review): this hunk replaces the static unpause_process(fd) helper with
 * amdgpu_unpause_processes(pid), which is registered at the end of the hunk
 * for CR_PLUGIN_HOOK__DUMP_DEVICE_LATE.
 *
 * The new function walks the list returned by get_dumped_fds(). Entries with
 * is_drm set have their fd closed; every other entry gets a
 * KFD_CRIU_OP_UNPAUSE request issued through kmtIoctl() on its fd. The first
 * failing ioctl stops the walk and its return value becomes the function's
 * return value; otherwise the function returns 0. Both paths log the outcome
 * and then call clear_dumped_fds().
 *
 * The pid argument is not read anywhere in the visible body.
 *
 * NOTE(review): the early "goto exit" skips the remaining list entries, so
 * DRM fds that come after the failing entry are not closed by this loop, and
 * non-DRM fds are never closed here at all. Confirm whether
 * clear_dumped_fds() is responsible for closing them.
 */
1021- static int unpause_process (int fd )
1014+ int amdgpu_unpause_processes (int pid )
10221015{
10231016 int ret = 0 ;
10241017 struct kfd_ioctl_criu_args args = { 0 };
1018+ struct list_head * l = get_dumped_fds ();
1019+ struct dumped_fd * st ;
10251020
1026- args .op = KFD_CRIU_OP_UNPAUSE ;
1021+ list_for_each_entry (st , l , l ) {
1022+ if (st -> is_drm ) {
1023+ close (st -> fd );
1024+ } else {
1025+ args .op = KFD_CRIU_OP_UNPAUSE ;
10271026
1028- ret = kmtIoctl (fd , AMDKFD_IOC_CRIU_OP , & args );
1029- if (ret ) {
1030- pr_perror ("Failed to unpause process" );
1031- goto exit ;
1027+ ret = kmtIoctl (st -> fd , AMDKFD_IOC_CRIU_OP , & args );
1028+ if (ret ) {
1029+ pr_perror ("Failed to unpause process" );
1030+ goto exit ;
1031+ }
1032+ }
10321033 }
10331034
1034- // Reset the KFD FD
1035- kfd_checkpoint_fd = -1 ;
1036- sys_close_drm_render_devices (& src_topology );
1037-
10381035exit :
10391036 pr_info ("Process unpaused %s (ret:%d)\n" , ret ? "Failed" : "Ok" , ret );
1037+ clear_dumped_fds ();
10401038
10411039 return ret ;
10421040}
1041+ CR_PLUGIN_REGISTER_HOOK (CR_PLUGIN_HOOK__DUMP_DEVICE_LATE , amdgpu_unpause_processes )
10431042
10441043static void dmabuf_socket_name_gen (struct sockaddr_un * addr , int * len , int pid )
10451044{
@@ -1359,9 +1358,6 @@ int amdgpu_plugin_dump_file(int fd, int id)
13591358 return -1 ;
13601359 }
13611360
1362- /* Initialize number of device files that will be checkpointed */
1363- init_gpu_count (& src_topology );
1364-
13651361 /* Check whether this plugin was called for kfd or render nodes */
13661362 if (major (st .st_rdev ) != major (st_kfd .st_rdev ) || minor (st .st_rdev ) != 0 ) {
13671363
@@ -1373,11 +1369,9 @@ int amdgpu_plugin_dump_file(int fd, int id)
13731369 if (ret )
13741370 return ret ;
13751371
1376- /* Invoke unpause process if needed */
1377- decrement_checkpoint_count ();
1378- if (checkpoint_is_complete ()) {
1379- ret = unpause_process (kfd_checkpoint_fd );
1380- }
1372+ ret = record_dumped_fd (fd , true);
1373+ if (ret )
1374+ return ret ;
13811375
13821376 /* Need to return success here so that criu can call plugins for renderD nodes */
13831377 return ret ;
@@ -1475,14 +1469,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
14751469
14761470 xfree (buf );
14771471
1478- exit :
1479- /* Restore all queues if conditions permit */
1480- kfd_checkpoint_fd = fd ;
1481- decrement_checkpoint_count ();
1482- if (checkpoint_is_complete ()) {
1483- ret = unpause_process (fd );
1484- }
1472+ ret = record_dumped_fd (fd , false);
1473+ if (ret )
1474+ goto exit ;
14851475
1476+ exit :
14861477 xfree ((void * )args .devices );
14871478 xfree ((void * )args .bos );
14881479 xfree ((void * )args .priv_data );
0 commit comments