Skip to content
This repository was archived by the owner on Nov 8, 2023. It is now read-only.

Commit eb85341

Browse files
Lang Yu authored and alexdeucher committed
drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs
Small APUs (i.e., consumer and embedded products) usually have only a small carveout of device memory, which can't satisfy the memory allocation requirements of most compute workloads. We can't even run the Basic MNIST Example with a default 512MB carveout (https://github.com/pytorch/examples/tree/main/mnist). Error log: "torch.cuda.OutOfMemoryError: HIP out of memory. Tried to allocate 84.00 MiB. GPU 0 has a total capacity of 512.00 MiB of which 0 bytes is free. Of the allocated memory 103.83 MiB is allocated by PyTorch, and 22.17 MiB is reserved by PyTorch but unallocated" Although we can change BIOS settings to enlarge the carveout size, doing so is inflexible and may draw complaints. In addition, the memory resource can't be used effectively between host and device. The solution is the MI300A approach, i.e., let VRAM allocations go to GTT. Then the device and host can flexibly and effectively share the memory resource. v2: Report local_mem_size_private as 0. (Felix) Signed-off-by: Lang Yu <Lang.Yu@amd.com> Reviewed-by: Felix Kuehling <felix.kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 2a705f3 commit eb85341

File tree

5 files changed

+23
-13
lines changed

5 files changed

+23
-13
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,9 @@ void amdgpu_amdkfd_get_local_mem_info(struct amdgpu_device *adev,
455455
else
456456
mem_info->local_mem_size_private =
457457
KFD_XCP_MEMORY_SIZE(adev, xcp->id);
458+
} else if (adev->flags & AMD_IS_APU) {
459+
mem_info->local_mem_size_public = (ttm_tt_pages_limit() << PAGE_SHIFT);
460+
mem_info->local_mem_size_private = 0;
458461
} else {
459462
mem_info->local_mem_size_public = adev->gmc.visible_vram_size;
460463
mem_info->local_mem_size_private = adev->gmc.real_vram_size -
@@ -824,6 +827,8 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id)
824827
}
825828
do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
826829
return ALIGN_DOWN(tmp, PAGE_SIZE);
830+
} else if (adev->flags & AMD_IS_APU) {
831+
return (ttm_tt_pages_limit() << PAGE_SHIFT);
827832
} else {
828833
return adev->gmc.real_vram_size;
829834
}

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
196196
return -EINVAL;
197197

198198
vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id);
199-
if (adev->gmc.is_app_apu) {
199+
if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
200200
system_mem_needed = size;
201201
ttm_mem_needed = size;
202202
}
@@ -232,7 +232,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
232232
"adev reference can't be null when vram is used");
233233
if (adev && xcp_id >= 0) {
234234
adev->kfd.vram_used[xcp_id] += vram_needed;
235-
adev->kfd.vram_used_aligned[xcp_id] += adev->gmc.is_app_apu ?
235+
adev->kfd.vram_used_aligned[xcp_id] +=
236+
(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) ?
236237
vram_needed :
237238
ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);
238239
}
@@ -260,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
260261

261262
if (adev) {
262263
adev->kfd.vram_used[xcp_id] -= size;
263-
if (adev->gmc.is_app_apu) {
264+
if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
264265
adev->kfd.vram_used_aligned[xcp_id] -= size;
265266
kfd_mem_limit.system_mem_used -= size;
266267
kfd_mem_limit.ttm_mem_used -= size;
@@ -889,7 +890,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem,
889890
* if peer device has large BAR. In contrast, access over xGMI is
890891
* allowed for both small and large BAR configurations of peer device
891892
*/
892-
if ((adev != bo_adev && !adev->gmc.is_app_apu) &&
893+
if ((adev != bo_adev && !(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU)) &&
893894
((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) ||
894895
(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) ||
895896
(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) {
@@ -1657,7 +1658,7 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev,
16571658
- atomic64_read(&adev->vram_pin_size)
16581659
- reserved_for_pt;
16591660

1660-
if (adev->gmc.is_app_apu) {
1661+
if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
16611662
system_mem_available = no_system_mem_limit ?
16621663
kfd_mem_limit.max_system_mem_limit :
16631664
kfd_mem_limit.max_system_mem_limit -
@@ -1705,7 +1706,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
17051706
if (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
17061707
domain = alloc_domain = AMDGPU_GEM_DOMAIN_VRAM;
17071708

1708-
if (adev->gmc.is_app_apu) {
1709+
if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
17091710
domain = AMDGPU_GEM_DOMAIN_GTT;
17101711
alloc_domain = AMDGPU_GEM_DOMAIN_GTT;
17111712
alloc_flags = 0;
@@ -1952,7 +1953,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
19521953
if (size) {
19531954
if (!is_imported &&
19541955
(mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM ||
1955-
(adev->gmc.is_app_apu &&
1956+
((adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) &&
19561957
mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT)))
19571958
*size = bo_size;
19581959
else
@@ -2374,8 +2375,9 @@ static int import_obj_create(struct amdgpu_device *adev,
23742375
(*mem)->dmabuf = dma_buf;
23752376
(*mem)->bo = bo;
23762377
(*mem)->va = va;
2377-
(*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) && !adev->gmc.is_app_apu ?
2378-
AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT;
2378+
(*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) &&
2379+
!(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) ?
2380+
AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT;
23792381

23802382
(*mem)->mapped_to_gpu_memory = 0;
23812383
(*mem)->process_info = avm->process_info;

drivers/gpu/drm/amd/amdkfd/kfd_migrate.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1023,7 +1023,7 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev)
10231023
if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 0, 1))
10241024
return -EINVAL;
10251025

1026-
if (adev->gmc.is_app_apu)
1026+
if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU)
10271027
return 0;
10281028

10291029
pgmap = &kfddev->pgmap;

drivers/gpu/drm/amd/amdkfd/kfd_svm.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2619,7 +2619,8 @@ svm_range_best_restore_location(struct svm_range *prange,
26192619
return -1;
26202620
}
26212621

2622-
if (node->adev->gmc.is_app_apu)
2622+
if (node->adev->gmc.is_app_apu ||
2623+
node->adev->flags & AMD_IS_APU)
26232624
return 0;
26242625

26252626
if (prange->preferred_loc == gpuid ||
@@ -3337,7 +3338,8 @@ svm_range_best_prefetch_location(struct svm_range *prange)
33373338
goto out;
33383339
}
33393340

3340-
if (bo_node->adev->gmc.is_app_apu) {
3341+
if (bo_node->adev->gmc.is_app_apu ||
3342+
bo_node->adev->flags & AMD_IS_APU) {
33413343
best_loc = 0;
33423344
goto out;
33433345
}

drivers/gpu/drm/amd/amdkfd/kfd_svm.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,8 @@ void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_s
201201
* is initialized to not 0 when page migration register device memory.
202202
*/
203203
#define KFD_IS_SVM_API_SUPPORTED(adev) ((adev)->kfd.pgmap.type != 0 ||\
204-
(adev)->gmc.is_app_apu)
204+
(adev)->gmc.is_app_apu ||\
205+
((adev)->flags & AMD_IS_APU))
205206

206207
void svm_range_bo_unref_async(struct svm_range_bo *svm_bo);
207208

0 commit comments

Comments
 (0)