@@ -54,13 +54,6 @@ struct vma_metadata {
5454
5555/************************************ Global Variables ********************************************/
5656
57- /**
58- * FD of KFD device used to checkpoint. On a multi-process
59- * tree the order of checkpointing goes from parent to child
60- * and so on - so saving the FD will not be overwritten
61- */
62- static int kfd_checkpoint_fd ;
63-
6457static LIST_HEAD (update_vma_info_list );
6558
6659static LIST_HEAD (amdgpu_processes );
@@ -1018,28 +1011,39 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
10181011 return 0 ;
10191012}
10201013
1021- static int unpause_process (int fd )
1014+ int amdgpu_unpause_processes (int pid )
10221015{
10231016 int ret = 0 ;
10241017 struct kfd_ioctl_criu_args args = { 0 };
1018+ struct list_head * l = get_dumped_fds ();
1019+ struct dumped_fd * st ;
1020+
1021+ list_for_each_entry (st , l , l ) {
1022+ if (st -> is_drm ) {
1023+ ret = amdgpu_plugin_drm_unpause_file (st -> fd );
1024+ if (ret ) {
1025+ pr_perror ("Failed to unpause drm device file" );
1026+ goto exit ;
1027+ }
1028+ close (st -> fd );
1029+ } else {
1030+ args .op = KFD_CRIU_OP_UNPAUSE ;
10251031
1026- args . op = KFD_CRIU_OP_UNPAUSE ;
1027-
1028- ret = kmtIoctl ( fd , AMDKFD_IOC_CRIU_OP , & args );
1029- if ( ret ) {
1030- pr_perror ( "Failed to unpause process" );
1031- goto exit ;
1032+ ret = kmtIoctl ( st -> fd , AMDKFD_IOC_CRIU_OP , & args ) ;
1033+ if ( ret ) {
1034+ pr_perror ( "Failed to unpause process" );
1035+ goto exit ;
1036+ }
1037+ }
10321038 }
10331039
1034- // Reset the KFD FD
1035- kfd_checkpoint_fd = -1 ;
1036- sys_close_drm_render_devices (& src_topology );
1037-
10381040exit :
10391041 pr_info ("Process unpaused %s (ret:%d)\n" , ret ? "Failed" : "Ok" , ret );
1042+ clear_dumped_fds ();
10401043
10411044 return ret ;
10421045}
1046+ CR_PLUGIN_REGISTER_HOOK (CR_PLUGIN_HOOK__DUMP_DEVICE_LATE , amdgpu_unpause_processes )
10431047
10441048static void dmabuf_socket_name_gen (struct sockaddr_un * addr , int * len , int pid )
10451049{
@@ -1359,9 +1363,6 @@ int amdgpu_plugin_dump_file(int fd, int id)
13591363 return -1 ;
13601364 }
13611365
1362- /* Initialize number of device files that will be checkpointed */
1363- init_gpu_count (& src_topology );
1364-
13651366 /* Check whether this plugin was called for kfd or render nodes */
13661367 if (major (st .st_rdev ) != major (st_kfd .st_rdev ) || minor (st .st_rdev ) != 0 ) {
13671368
@@ -1373,11 +1374,9 @@ int amdgpu_plugin_dump_file(int fd, int id)
13731374 if (ret )
13741375 return ret ;
13751376
1376- /* Invoke unpause process if needed */
1377- decrement_checkpoint_count ();
1378- if (checkpoint_is_complete ()) {
1379- ret = unpause_process (kfd_checkpoint_fd );
1380- }
1377+ ret = record_dumped_fd (fd , true);
1378+ if (ret )
1379+ return ret ;
13811380
13821381 /* Need to return success here so that criu can call plugins for renderD nodes */
13831382 return ret ;
@@ -1475,14 +1474,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
14751474
14761475 xfree (buf );
14771476
1478- exit :
1479- /* Restore all queues if conditions permit */
1480- kfd_checkpoint_fd = fd ;
1481- decrement_checkpoint_count ();
1482- if (checkpoint_is_complete ()) {
1483- ret = unpause_process (fd );
1484- }
1477+ ret = record_dumped_fd (fd , false);
1478+ if (ret )
1479+ goto exit ;
14851480
1481+ exit :
14861482 xfree ((void * )args .devices );
14871483 xfree ((void * )args .bos );
14881484 xfree ((void * )args .priv_data );
0 commit comments