Skip to content

Commit 3f882f2

Browse files
matt-auldrodrigovivi
authored andcommitted
drm/i915: improve the catch-all evict to handle lock contention
The catch-all evict can fail due to object lock contention, since it only goes as far as trylocking the object, due to us already holding the vm->mutex. Doing a full object lock here can deadlock, since the vm->mutex is always our inner lock. Add another execbuf pass which drops the vm->mutex and then tries to grab the object will the full lock, before then retrying the eviction. This should be good enough for now to fix the immediate regression with userspace seeing -ENOSPC from execbuf due to contended object locks during GTT eviction. v2 (Mani) - Also revamp the docs for the different passes. Testcase: igt@gem_ppgtt@shrink-vs-evict-* Fixes: 7e00897 ("drm/i915: Add object locking to i915_gem_evict_for_node and i915_gem_evict_something, v2.") References: https://gitlab.freedesktop.org/drm/intel/-/issues/7627 References: https://gitlab.freedesktop.org/drm/intel/-/issues/7570 References: https://bugzilla.mozilla.org/show_bug.cgi?id=1779558 Signed-off-by: Matthew Auld <matthew.auld@intel.com> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Cc: Andrzej Hajda <andrzej.hajda@intel.com> Cc: Mani Milani <mani@chromium.org> Cc: <stable@vger.kernel.org> # v5.18+ Reviewed-by: Mani Milani <mani@chromium.org> Tested-by: Mani Milani <mani@chromium.org> Link: https://patchwork.freedesktop.org/patch/msgid/20221216113456.414183-1-matthew.auld@intel.com (cherry picked from commit 801fa7a) Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
1 parent fff7586 commit 3f882f2

File tree

6 files changed

+82
-26
lines changed

6 files changed

+82
-26
lines changed

drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c

Lines changed: 48 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -730,32 +730,69 @@ static int eb_reserve(struct i915_execbuffer *eb)
730730
bool unpinned;
731731

732732
/*
733-
* Attempt to pin all of the buffers into the GTT.
734-
* This is done in 2 phases:
733+
* We have one more buffers that we couldn't bind, which could be due to
734+
* various reasons. To resolve this we have 4 passes, with every next
735+
* level turning the screws tighter:
735736
*
736-
* 1. Unbind all objects that do not match the GTT constraints for
737-
* the execbuffer (fenceable, mappable, alignment etc).
738-
* 2. Bind new objects.
737+
* 0. Unbind all objects that do not match the GTT constraints for the
738+
* execbuffer (fenceable, mappable, alignment etc). Bind all new
739+
* objects. This avoids unnecessary unbinding of later objects in order
740+
* to make room for the earlier objects *unless* we need to defragment.
739741
*
740-
* This avoid unnecessary unbinding of later objects in order to make
741-
* room for the earlier objects *unless* we need to defragment.
742+
* 1. Reorder the buffers, where objects with the most restrictive
743+
* placement requirements go first (ignoring fixed location buffers for
744+
* now). For example, objects needing the mappable aperture (the first
745+
* 256M of GTT), should go first vs objects that can be placed just
746+
* about anywhere. Repeat the previous pass.
742747
*
743-
* Defragmenting is skipped if all objects are pinned at a fixed location.
748+
* 2. Consider buffers that are pinned at a fixed location. Also try to
749+
* evict the entire VM this time, leaving only objects that we were
750+
* unable to lock. Try again to bind the buffers. (still using the new
751+
* buffer order).
752+
*
753+
* 3. We likely have object lock contention for one or more stubborn
754+
* objects in the VM, for which we need to evict to make forward
755+
* progress (perhaps we are fighting the shrinker?). When evicting the
756+
* VM this time around, anything that we can't lock we now track using
757+
* the busy_bo, using the full lock (after dropping the vm->mutex to
758+
* prevent deadlocks), instead of trylock. We then continue to evict the
759+
* VM, this time with the stubborn object locked, which we can now
760+
* hopefully unbind (if still bound in the VM). Repeat until the VM is
761+
* evicted. Finally we should be able bind everything.
744762
*/
745-
for (pass = 0; pass <= 2; pass++) {
763+
for (pass = 0; pass <= 3; pass++) {
746764
int pin_flags = PIN_USER | PIN_VALIDATE;
747765

748766
if (pass == 0)
749767
pin_flags |= PIN_NONBLOCK;
750768

751769
if (pass >= 1)
752-
unpinned = eb_unbind(eb, pass == 2);
770+
unpinned = eb_unbind(eb, pass >= 2);
753771

754772
if (pass == 2) {
755773
err = mutex_lock_interruptible(&eb->context->vm->mutex);
756774
if (!err) {
757-
err = i915_gem_evict_vm(eb->context->vm, &eb->ww);
775+
err = i915_gem_evict_vm(eb->context->vm, &eb->ww, NULL);
776+
mutex_unlock(&eb->context->vm->mutex);
777+
}
778+
if (err)
779+
return err;
780+
}
781+
782+
if (pass == 3) {
783+
retry:
784+
err = mutex_lock_interruptible(&eb->context->vm->mutex);
785+
if (!err) {
786+
struct drm_i915_gem_object *busy_bo = NULL;
787+
788+
err = i915_gem_evict_vm(eb->context->vm, &eb->ww, &busy_bo);
758789
mutex_unlock(&eb->context->vm->mutex);
790+
if (err && busy_bo) {
791+
err = i915_gem_object_lock(busy_bo, &eb->ww);
792+
i915_gem_object_put(busy_bo);
793+
if (!err)
794+
goto retry;
795+
}
759796
}
760797
if (err)
761798
return err;

drivers/gpu/drm/i915/gem/i915_gem_mman.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf)
369369
if (vma == ERR_PTR(-ENOSPC)) {
370370
ret = mutex_lock_interruptible(&ggtt->vm.mutex);
371371
if (!ret) {
372-
ret = i915_gem_evict_vm(&ggtt->vm, &ww);
372+
ret = i915_gem_evict_vm(&ggtt->vm, &ww, NULL);
373373
mutex_unlock(&ggtt->vm.mutex);
374374
}
375375
if (ret)

drivers/gpu/drm/i915/i915_gem_evict.c

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,11 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
416416
* @vm: Address space to cleanse
417417
* @ww: An optional struct i915_gem_ww_ctx. If not NULL, i915_gem_evict_vm
418418
* will be able to evict vma's locked by the ww as well.
419+
* @busy_bo: Optional pointer to struct drm_i915_gem_object. If not NULL, then
420+
* in the event i915_gem_evict_vm() is unable to trylock an object for eviction,
421+
* then @busy_bo will point to it. -EBUSY is also returned. The caller must drop
422+
* the vm->mutex, before trying again to acquire the contended lock. The caller
423+
* also owns a reference to the object.
419424
*
420425
* This function evicts all vmas from a vm.
421426
*
@@ -425,7 +430,8 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
425430
* To clarify: This is for freeing up virtual address space, not for freeing
426431
* memory in e.g. the shrinker.
427432
*/
428-
int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww)
433+
int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww,
434+
struct drm_i915_gem_object **busy_bo)
429435
{
430436
int ret = 0;
431437

@@ -457,41 +463,52 @@ int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww)
457463
* the resv is shared among multiple objects, we still
458464
* need the object ref.
459465
*/
460-
if (dying_vma(vma) ||
466+
if (!i915_gem_object_get_rcu(vma->obj) ||
461467
(ww && (dma_resv_locking_ctx(vma->obj->base.resv) == &ww->ctx))) {
462468
__i915_vma_pin(vma);
463469
list_add(&vma->evict_link, &locked_eviction_list);
464470
continue;
465471
}
466472

467-
if (!i915_gem_object_trylock(vma->obj, ww))
473+
if (!i915_gem_object_trylock(vma->obj, ww)) {
474+
if (busy_bo) {
475+
*busy_bo = vma->obj; /* holds ref */
476+
ret = -EBUSY;
477+
break;
478+
}
479+
i915_gem_object_put(vma->obj);
468480
continue;
481+
}
469482

470483
__i915_vma_pin(vma);
471484
list_add(&vma->evict_link, &eviction_list);
472485
}
473486
if (list_empty(&eviction_list) && list_empty(&locked_eviction_list))
474487
break;
475488

476-
ret = 0;
477489
/* Unbind locked objects first, before unlocking the eviction_list */
478490
list_for_each_entry_safe(vma, vn, &locked_eviction_list, evict_link) {
479491
__i915_vma_unpin(vma);
480492

481-
if (ret == 0)
493+
if (ret == 0) {
482494
ret = __i915_vma_unbind(vma);
483-
if (ret != -EINTR) /* "Get me out of here!" */
484-
ret = 0;
495+
if (ret != -EINTR) /* "Get me out of here!" */
496+
ret = 0;
497+
}
498+
if (!dying_vma(vma))
499+
i915_gem_object_put(vma->obj);
485500
}
486501

487502
list_for_each_entry_safe(vma, vn, &eviction_list, evict_link) {
488503
__i915_vma_unpin(vma);
489-
if (ret == 0)
504+
if (ret == 0) {
490505
ret = __i915_vma_unbind(vma);
491-
if (ret != -EINTR) /* "Get me out of here!" */
492-
ret = 0;
506+
if (ret != -EINTR) /* "Get me out of here!" */
507+
ret = 0;
508+
}
493509

494510
i915_gem_object_unlock(vma->obj);
511+
i915_gem_object_put(vma->obj);
495512
}
496513
} while (ret == 0);
497514

drivers/gpu/drm/i915/i915_gem_evict.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
struct drm_mm_node;
1212
struct i915_address_space;
1313
struct i915_gem_ww_ctx;
14+
struct drm_i915_gem_object;
1415

1516
int __must_check i915_gem_evict_something(struct i915_address_space *vm,
1617
struct i915_gem_ww_ctx *ww,
@@ -23,6 +24,7 @@ int __must_check i915_gem_evict_for_node(struct i915_address_space *vm,
2324
struct drm_mm_node *node,
2425
unsigned int flags);
2526
int i915_gem_evict_vm(struct i915_address_space *vm,
26-
struct i915_gem_ww_ctx *ww);
27+
struct i915_gem_ww_ctx *ww,
28+
struct drm_i915_gem_object **busy_bo);
2729

2830
#endif /* __I915_GEM_EVICT_H__ */

drivers/gpu/drm/i915/i915_vma.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1566,7 +1566,7 @@ static int __i915_ggtt_pin(struct i915_vma *vma, struct i915_gem_ww_ctx *ww,
15661566
* locked objects when called from execbuf when pinning
15671567
* is removed. This would probably regress badly.
15681568
*/
1569-
i915_gem_evict_vm(vm, NULL);
1569+
i915_gem_evict_vm(vm, NULL, NULL);
15701570
mutex_unlock(&vm->mutex);
15711571
}
15721572
} while (1);

drivers/gpu/drm/i915/selftests/i915_gem_evict.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ static int igt_evict_vm(void *arg)
344344

345345
/* Everything is pinned, nothing should happen */
346346
mutex_lock(&ggtt->vm.mutex);
347-
err = i915_gem_evict_vm(&ggtt->vm, NULL);
347+
err = i915_gem_evict_vm(&ggtt->vm, NULL, NULL);
348348
mutex_unlock(&ggtt->vm.mutex);
349349
if (err) {
350350
pr_err("i915_gem_evict_vm on a full GGTT returned err=%d]\n",
@@ -356,7 +356,7 @@ static int igt_evict_vm(void *arg)
356356

357357
for_i915_gem_ww(&ww, err, false) {
358358
mutex_lock(&ggtt->vm.mutex);
359-
err = i915_gem_evict_vm(&ggtt->vm, &ww);
359+
err = i915_gem_evict_vm(&ggtt->vm, &ww, NULL);
360360
mutex_unlock(&ggtt->vm.mutex);
361361
}
362362

0 commit comments

Comments
 (0)