@@ -49,13 +49,6 @@ struct vma_metadata {
4949
5050/************************************ Global Variables ********************************************/
5151
52- /**
53- * FD of KFD device used to checkpoint. On a multi-process
54- * tree the order of checkpointing goes from parent to child
55- * and so on - so saving the FD will not be overwritten
56- */
57- static int kfd_checkpoint_fd ;
58-
5952static LIST_HEAD (update_vma_info_list );
6053
6154size_t kfd_max_buffer_size ;
@@ -1007,28 +1000,39 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
10071000 return 0 ;
10081001}
10091002
1010- static int unpause_process (int fd )
1003+ int amdgpu_unpause_processes (int pid )
10111004{
10121005 int ret = 0 ;
10131006 struct kfd_ioctl_criu_args args = { 0 };
1007+ struct list_head * l = get_dumped_fds ();
1008+ struct dumped_fd * st ;
1009+
1010+ list_for_each_entry (st , l , l ) {
1011+ if (st -> is_drm ) {
1012+ ret = amdgpu_plugin_drm_unpause_file (st -> fd );
1013+ if (ret ) {
1014+ pr_perror ("Failed to unpause drm device file" );
1015+ goto exit ;
1016+ }
1017+ close (st -> fd );
1018+ } else {
1019+ args .op = KFD_CRIU_OP_UNPAUSE ;
10141020
1015- args . op = KFD_CRIU_OP_UNPAUSE ;
1016-
1017- ret = kmtIoctl ( fd , AMDKFD_IOC_CRIU_OP , & args );
1018- if ( ret ) {
1019- pr_perror ( "Failed to unpause process" );
1020- goto exit ;
1021+ ret = kmtIoctl ( st -> fd , AMDKFD_IOC_CRIU_OP , & args ) ;
1022+ if ( ret ) {
1023+ pr_perror ( "Failed to unpause process" );
1024+ goto exit ;
1025+ }
1026+ }
10211027 }
10221028
1023- // Reset the KFD FD
1024- kfd_checkpoint_fd = -1 ;
1025- sys_close_drm_render_devices (& src_topology );
1026-
10271029exit :
10281030 pr_info ("Process unpaused %s (ret:%d)\n" , ret ? "Failed" : "Ok" , ret );
1031+ clear_dumped_fds ();
10291032
10301033 return ret ;
10311034}
1035+ CR_PLUGIN_REGISTER_HOOK (CR_PLUGIN_HOOK__DUMP_DEVICE_LATE , amdgpu_unpause_processes )
10321036
10331037static int save_devices (int fd , struct kfd_ioctl_criu_args * args , struct kfd_criu_device_bucket * device_buckets ,
10341038 CriuKfd * e )
@@ -1230,9 +1234,6 @@ int amdgpu_plugin_dump_file(int fd, int id)
12301234 return -1 ;
12311235 }
12321236
1233- /* Initialize number of device files that will be checkpointed */
1234- init_gpu_count (& src_topology );
1235-
12361237 /* Check whether this plugin was called for kfd or render nodes */
12371238 if (major (st .st_rdev ) != major (st_kfd .st_rdev ) || minor (st .st_rdev ) != 0 ) {
12381239
@@ -1244,11 +1245,9 @@ int amdgpu_plugin_dump_file(int fd, int id)
12441245 if (ret )
12451246 return ret ;
12461247
1247- /* Invoke unpause process if needed */
1248- decrement_checkpoint_count ();
1249- if (checkpoint_is_complete ()) {
1250- ret = unpause_process (kfd_checkpoint_fd );
1251- }
1248+ ret = record_dumped_fd (fd , true);
1249+ if (ret )
1250+ return ret ;
12521251
12531252 /* Need to return success here so that criu can call plugins for renderD nodes */
12541253 return ret ;
@@ -1346,14 +1345,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
13461345
13471346 xfree (buf );
13481347
1349- exit :
1350- /* Restore all queues if conditions permit */
1351- kfd_checkpoint_fd = fd ;
1352- decrement_checkpoint_count ();
1353- if (checkpoint_is_complete ()) {
1354- ret = unpause_process (fd );
1355- }
1348+ ret = record_dumped_fd (fd , false);
1349+ if (ret )
1350+ goto exit ;
13561351
1352+ exit :
13571353 xfree ((void * )args .devices );
13581354 xfree ((void * )args .bos );
13591355 xfree ((void * )args .priv_data );
0 commit comments