Skip to content

Commit fe52c64

Browse files
Tvrtko Ursulin authored and Christian König (AMD) committed
dma-fence: Use kernel's sort for merging fences
One alternative to the fix Christian proposed in https://lore.kernel.org/dri-devel/20241024124159.4519-3-christian.koenig@amd.com/ is to replace the rather complex open coded sorting loops with the kernel standard sort followed by a context squashing pass. Proposed advantage of this would be readability but one concern Christian raised was that there could be many fences, that they are typically mostly sorted, and so the kernel's heap sort would be much worse by the proposed algorithm. I had a look running some games and vkcube to see what are the typical number of input fences. Tested scenarios: 1) Hogwarts Legacy under Gamescope 450 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.91% 1 69.40% 2-3 28.72% 9.4% (90.6% resolved to one fence) 4-5 0.93% 6-9 0.03% 10+ 2) Cyberpunk 2077 under Gamescope 1050 calls per second, amounting to 0.01% CPU time according to perf top. N Before After 0 1.13% 1 52.30% 2-3 40.34% 55.57% 4-5 1.46% 0.50% 6-9 2.44% 10+ 2.34% 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-3 100% 0% (Ie. all resolved to a single fence) 4-5 6-9 10+ In the case of vkcube all invocations in the 2-3 bucket were actually just two input fences. From these numbers it looks like the heap sort should not be a disadvantage, given how the dominant case is <= 2 input fences which heap sort solves with just one compare and swap. (And for the case of one input fence we have a fast path in the previous patch.) A complementary possibility is to implement a different sorting algorithm under the same API as the kernel's sort() and so keep the simplicity, potentially moving the new sort under lib/ if it would be found more widely useful. v2: * Hold on to fence references and reduce commentary. (Christian) * Record and use latest signaled timestamp in the 2nd loop too. * Consolidate zero or one fences fast paths. 
v3: * Reverse the seqno sort order for a simpler squashing pass. (Christian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> Fixes: 245a4a7 ("dma-buf: generalize dma_fence unwrap & merging v3") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3617 Cc: Christian König <christian.koenig@amd.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: Sumit Semwal <sumit.semwal@linaro.org> Cc: Gustavo Padovan <gustavo@padovan.org> Cc: Friedrich Vock <friedrich.vock@gmx.de> Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-sig@lists.linaro.org Cc: <stable@vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Christian König <christian.koenig@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-3-tursulin@igalia.com
1 parent 949291c commit fe52c64

File tree

1 file changed

+61
-67
lines changed

1 file changed

+61
-67
lines changed

drivers/dma-buf/dma-fence-unwrap.c

Lines changed: 61 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <linux/dma-fence-chain.h>
1313
#include <linux/dma-fence-unwrap.h>
1414
#include <linux/slab.h>
15+
#include <linux/sort.h>
1516

1617
/* Internal helper to start new array iteration, don't use directly */
1718
static struct dma_fence *
@@ -59,6 +60,25 @@ struct dma_fence *dma_fence_unwrap_next(struct dma_fence_unwrap *cursor)
5960
}
6061
EXPORT_SYMBOL_GPL(dma_fence_unwrap_next);
6162

63+
64+
static int fence_cmp(const void *_a, const void *_b)
65+
{
66+
struct dma_fence *a = *(struct dma_fence **)_a;
67+
struct dma_fence *b = *(struct dma_fence **)_b;
68+
69+
if (a->context < b->context)
70+
return -1;
71+
else if (a->context > b->context)
72+
return 1;
73+
74+
if (dma_fence_is_later(b, a))
75+
return 1;
76+
else if (dma_fence_is_later(a, b))
77+
return -1;
78+
79+
return 0;
80+
}
81+
6282
/* Implementation for the dma_fence_merge() macro, don't use directly */
6383
struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
6484
struct dma_fence **fences,
@@ -67,8 +87,7 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
6787
struct dma_fence_array *result;
6888
struct dma_fence *tmp, **array;
6989
ktime_t timestamp;
70-
unsigned int i;
71-
size_t count;
90+
int i, j, count;
7291

7392
count = 0;
7493
timestamp = ns_to_ktime(0);
@@ -96,80 +115,55 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
96115
if (!array)
97116
return NULL;
98117

99-
/*
100-
* This trashes the input fence array and uses it as position for the
101-
* following merge loop. This works because the dma_fence_merge()
102-
* wrapper macro is creating this temporary array on the stack together
103-
* with the iterators.
104-
*/
105-
for (i = 0; i < num_fences; ++i)
106-
fences[i] = dma_fence_unwrap_first(fences[i], &iter[i]);
107-
108118
count = 0;
109-
do {
110-
unsigned int sel;
111-
112-
restart:
113-
tmp = NULL;
114-
for (i = 0; i < num_fences; ++i) {
115-
struct dma_fence *next;
116-
117-
while (fences[i] && dma_fence_is_signaled(fences[i]))
118-
fences[i] = dma_fence_unwrap_next(&iter[i]);
119-
120-
next = fences[i];
121-
if (!next)
122-
continue;
123-
124-
/*
125-
* We can't guarantee that inpute fences are ordered by
126-
* context, but it is still quite likely when this
127-
* function is used multiple times. So attempt to order
128-
* the fences by context as we pass over them and merge
129-
* fences with the same context.
130-
*/
131-
if (!tmp || tmp->context > next->context) {
132-
tmp = next;
133-
sel = i;
134-
135-
} else if (tmp->context < next->context) {
136-
continue;
137-
138-
} else if (dma_fence_is_later(tmp, next)) {
139-
fences[i] = dma_fence_unwrap_next(&iter[i]);
140-
goto restart;
119+
for (i = 0; i < num_fences; ++i) {
120+
dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) {
121+
if (!dma_fence_is_signaled(tmp)) {
122+
array[count++] = dma_fence_get(tmp);
141123
} else {
142-
fences[sel] = dma_fence_unwrap_next(&iter[sel]);
143-
goto restart;
144-
}
145-
}
124+
ktime_t t = dma_fence_timestamp(tmp);
146125

147-
if (tmp) {
148-
array[count++] = dma_fence_get(tmp);
149-
fences[sel] = dma_fence_unwrap_next(&iter[sel]);
126+
if (ktime_after(t, timestamp))
127+
timestamp = t;
128+
}
150129
}
151-
} while (tmp);
152-
153-
if (count == 0) {
154-
tmp = dma_fence_allocate_private_stub(ktime_get());
155-
goto return_tmp;
156130
}
157131

158-
if (count == 1) {
159-
tmp = array[0];
160-
goto return_tmp;
161-
}
132+
if (count == 0 || count == 1)
133+
goto return_fastpath;
134+
135+
sort(array, count, sizeof(*array), fence_cmp, NULL);
162136

163-
result = dma_fence_array_create(count, array,
164-
dma_fence_context_alloc(1),
165-
1, false);
166-
if (!result) {
167-
for (i = 0; i < count; i++)
137+
/*
138+
* Only keep the most recent fence for each context.
139+
*/
140+
j = 0;
141+
for (i = 1; i < count; i++) {
142+
if (array[i]->context == array[j]->context)
168143
dma_fence_put(array[i]);
169-
tmp = NULL;
170-
goto return_tmp;
144+
else
145+
array[++j] = array[i];
146+
}
147+
count = ++j;
148+
149+
if (count > 1) {
150+
result = dma_fence_array_create(count, array,
151+
dma_fence_context_alloc(1),
152+
1, false);
153+
if (!result) {
154+
for (i = 0; i < count; i++)
155+
dma_fence_put(array[i]);
156+
tmp = NULL;
157+
goto return_tmp;
158+
}
159+
return &result->base;
171160
}
172-
return &result->base;
161+
162+
return_fastpath:
163+
if (count == 0)
164+
tmp = dma_fence_allocate_private_stub(timestamp);
165+
else
166+
tmp = array[0];
173167

174168
return_tmp:
175169
kfree(array);

0 commit comments

Comments
 (0)