
Commit 8622e45

Authored by anakryiko, committed by Ingo Molnar

uprobes: Reuse return_instances between multiple uretprobes within task
Instead of constantly allocating and freeing very short-lived struct return_instance, reuse it as much as possible within the current task. For that, store a linked list of reusable return_instances within current->utask.

The only complication is that ri_timer() might still be processing such a return_instance. So while the main uretprobe processing logic might already be done with a return_instance and would be OK to reuse it immediately for the next uretprobe instance, it's not correct to reuse it unconditionally just like that.

Instead, we make sure that ri_timer() can't possibly be processing it by using a seqcount_t, with ri_timer() being "a writer" and free_ret_instance() being "a reader". If, after we unlink a return instance from the utask->return_instances list, we know that ri_timer() hasn't gotten to processing utask->return_instances yet, then we can be sure that immediate return_instance reuse is OK, and so we put it onto utask->ri_pool for future (potentially almost immediate) reuse.

This change shows improvements both in single-CPU performance (by avoiding the relatively expensive kmalloc/free combo) and in multi-CPU scalability, where per-CPU throughput no longer declines as steeply with an increasing number of CPUs (a decline previously attributed to kmalloc()/free() through profiling):

BASELINE (latest perf/core)
===========================
uretprobe-nop         ( 1 cpus):    1.898 ± 0.002M/s  (  1.898M/s/cpu)
uretprobe-nop         ( 2 cpus):    3.574 ± 0.011M/s  (  1.787M/s/cpu)
uretprobe-nop         ( 3 cpus):    5.279 ± 0.066M/s  (  1.760M/s/cpu)
uretprobe-nop         ( 4 cpus):    6.824 ± 0.047M/s  (  1.706M/s/cpu)
uretprobe-nop         ( 5 cpus):    8.339 ± 0.060M/s  (  1.668M/s/cpu)
uretprobe-nop         ( 6 cpus):    9.812 ± 0.047M/s  (  1.635M/s/cpu)
uretprobe-nop         ( 7 cpus):   11.030 ± 0.048M/s  (  1.576M/s/cpu)
uretprobe-nop         ( 8 cpus):   12.453 ± 0.126M/s  (  1.557M/s/cpu)
uretprobe-nop         (10 cpus):   14.838 ± 0.044M/s  (  1.484M/s/cpu)
uretprobe-nop         (12 cpus):   17.092 ± 0.115M/s  (  1.424M/s/cpu)
uretprobe-nop         (14 cpus):   19.576 ± 0.022M/s  (  1.398M/s/cpu)
uretprobe-nop         (16 cpus):   22.264 ± 0.015M/s  (  1.391M/s/cpu)
uretprobe-nop         (24 cpus):   33.534 ± 0.078M/s  (  1.397M/s/cpu)
uretprobe-nop         (32 cpus):   43.262 ± 0.127M/s  (  1.352M/s/cpu)
uretprobe-nop         (40 cpus):   53.252 ± 0.080M/s  (  1.331M/s/cpu)
uretprobe-nop         (48 cpus):   55.778 ± 0.045M/s  (  1.162M/s/cpu)
uretprobe-nop         (56 cpus):   56.850 ± 0.227M/s  (  1.015M/s/cpu)
uretprobe-nop         (64 cpus):   62.005 ± 0.077M/s  (  0.969M/s/cpu)
uretprobe-nop         (72 cpus):   66.445 ± 0.236M/s  (  0.923M/s/cpu)
uretprobe-nop         (80 cpus):   68.353 ± 0.180M/s  (  0.854M/s/cpu)

THIS PATCHSET (on top of latest perf/core)
==========================================
uretprobe-nop         ( 1 cpus):    2.253 ± 0.004M/s  (  2.253M/s/cpu)
uretprobe-nop         ( 2 cpus):    4.281 ± 0.003M/s  (  2.140M/s/cpu)
uretprobe-nop         ( 3 cpus):    6.389 ± 0.027M/s  (  2.130M/s/cpu)
uretprobe-nop         ( 4 cpus):    8.328 ± 0.005M/s  (  2.082M/s/cpu)
uretprobe-nop         ( 5 cpus):   10.353 ± 0.001M/s  (  2.071M/s/cpu)
uretprobe-nop         ( 6 cpus):   12.513 ± 0.010M/s  (  2.086M/s/cpu)
uretprobe-nop         ( 7 cpus):   14.525 ± 0.017M/s  (  2.075M/s/cpu)
uretprobe-nop         ( 8 cpus):   15.633 ± 0.013M/s  (  1.954M/s/cpu)
uretprobe-nop         (10 cpus):   19.532 ± 0.011M/s  (  1.953M/s/cpu)
uretprobe-nop         (12 cpus):   21.405 ± 0.009M/s  (  1.784M/s/cpu)
uretprobe-nop         (14 cpus):   24.857 ± 0.020M/s  (  1.776M/s/cpu)
uretprobe-nop         (16 cpus):   26.466 ± 0.018M/s  (  1.654M/s/cpu)
uretprobe-nop         (24 cpus):   40.513 ± 0.222M/s  (  1.688M/s/cpu)
uretprobe-nop         (32 cpus):   54.180 ± 0.074M/s  (  1.693M/s/cpu)
uretprobe-nop         (40 cpus):   66.100 ± 0.082M/s  (  1.652M/s/cpu)
uretprobe-nop         (48 cpus):   70.544 ± 0.068M/s  (  1.470M/s/cpu)
uretprobe-nop         (56 cpus):   74.494 ± 0.055M/s  (  1.330M/s/cpu)
uretprobe-nop         (64 cpus):   79.317 ± 0.029M/s  (  1.239M/s/cpu)
uretprobe-nop         (72 cpus):   84.875 ± 0.020M/s  (  1.179M/s/cpu)
uretprobe-nop         (80 cpus):   92.318 ± 0.224M/s  (  1.154M/s/cpu)

For reference, with uprobe-nop we hit the following throughput:

uprobe-nop            (80 cpus):  143.485 ± 0.035M/s  (  1.794M/s/cpu)

So now uretprobe stays a bit closer to that performance.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20241206002417.3295533-5-andrii@kernel.org
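The seqcount handshake described above is the heart of the change: ri_timer() brackets its scan of utask->return_instances with write_seqcount_begin()/write_seqcount_end(), and free_ret_instance() recycles a return_instance into utask->ri_pool only when raw_seqcount_try_begin() shows no such scan is in flight, otherwise falling back to RCU-delayed freeing. Below is a minimal userspace sketch of that pattern, not kernel code: a hand-rolled atomic counter stands in for seqcount_t, and the names (struct task_ctx, timer_scan(), release_obj(), acquire_obj()) are illustrative only.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
        struct obj *next;
};

/* Analogue of struct uprobe_task: a reuse pool plus a sequence counter. */
struct task_ctx {
        struct obj *pool;       /* LIFO of reusable objects (like utask->ri_pool) */
        atomic_uint seq;        /* even: no scan in flight; odd: scan in progress */
};

/* "Writer" side, standing in for ri_timer(): bracket the scan with seq increments. */
static void timer_scan(struct task_ctx *ctx)
{
        atomic_fetch_add(&ctx->seq, 1);         /* begin: seq becomes odd */
        /* ... walk the live objects here ... */
        atomic_fetch_add(&ctx->seq, 1);         /* end: seq becomes even again */
}

/* "Reader" side, standing in for free_ret_instance(): reuse only if no scan is running. */
static void release_obj(struct task_ctx *ctx, struct obj *o)
{
        unsigned int seq = atomic_load(&ctx->seq);

        if (!(seq & 1)) {
                /* no scan was in flight: safe to recycle o immediately */
                o->next = ctx->pool;
                ctx->pool = o;
        } else {
                /* a scan might still reference o: fall back to the slow path */
                free(o);
        }
}

/* Allocation prefers the pool, falling back to the allocator (like alloc_return_instance()). */
static struct obj *acquire_obj(struct task_ctx *ctx)
{
        struct obj *o = ctx->pool;

        if (o) {
                ctx->pool = o->next;
                return o;
        }
        return calloc(1, sizeof(*o));
}

int main(void)
{
        struct task_ctx ctx = { .pool = NULL, .seq = 0 };
        struct obj *o = acquire_obj(&ctx);

        release_obj(&ctx, o);           /* recycled into ctx.pool */
        timer_scan(&ctx);
        o = acquire_obj(&ctx);          /* served from the pool, no allocation */
        release_obj(&ctx, o);
        printf("pool is %sempty\n", ctx.pool ? "non-" : "");
        return 0;
}

The real kernel version additionally relies on the return_instance already being unlinked from utask->return_instances before the check, and on seqcount_t's memory ordering and lockdep annotations, which this sketch glosses over.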
1 parent 0cf981d commit 8622e45

File tree: 2 files changed (+75, -14 lines)

include/linux/uprobes.h

Lines changed: 5 additions & 1 deletion
@@ -16,6 +16,7 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <linux/timer.h>
+#include <linux/seqlock.h>

 struct uprobe;
 struct vm_area_struct;
@@ -124,6 +125,10 @@ struct uprobe_task {
        unsigned int depth;
        struct return_instance *return_instances;

+       struct return_instance *ri_pool;
+       struct timer_list ri_timer;
+       seqcount_t ri_seqcount;
+
        union {
                struct {
                        struct arch_uprobe_task autask;
@@ -137,7 +142,6 @@ struct uprobe_task {
        };

        struct uprobe *active_uprobe;
-       struct timer_list ri_timer;
        unsigned long xol_vaddr;

        struct arch_uprobe *auprobe;
kernel/events/uprobes.c

Lines changed: 70 additions & 13 deletions
@@ -1888,17 +1888,57 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
        return instruction_pointer(regs);
 }

-static void free_ret_instance(struct return_instance *ri, bool cleanup_hprobe)
+static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri)
 {
+       ri->cons_cnt = 0;
+       ri->next = utask->ri_pool;
+       utask->ri_pool = ri;
+}
+
+static struct return_instance *ri_pool_pop(struct uprobe_task *utask)
+{
+       struct return_instance *ri = utask->ri_pool;
+
+       if (likely(ri))
+               utask->ri_pool = ri->next;
+
+       return ri;
+}
+
+static void ri_free(struct return_instance *ri)
+{
+       kfree(ri->extra_consumers);
+       kfree_rcu(ri, rcu);
+}
+
+static void free_ret_instance(struct uprobe_task *utask,
+                              struct return_instance *ri, bool cleanup_hprobe)
+{
+       unsigned seq;
+
        if (cleanup_hprobe) {
                enum hprobe_state hstate;

                (void)hprobe_consume(&ri->hprobe, &hstate);
                hprobe_finalize(&ri->hprobe, hstate);
        }

-       kfree(ri->extra_consumers);
-       kfree_rcu(ri, rcu);
+       /*
+        * At this point return_instance is unlinked from utask's
+        * return_instances list and this has become visible to ri_timer().
+        * If seqcount now indicates that ri_timer's return instance
+        * processing loop isn't active, we can return ri into the pool of
+        * to-be-reused return instances for future uretprobes. If ri_timer()
+        * happens to be running right now, though, we fallback to safety and
+        * just perform RCU-delated freeing of ri.
+        */
+       if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) {
+               /* immediate reuse of ri without RCU GP is OK */
+               ri_pool_push(utask, ri);
+       } else {
+               /* we might be racing with ri_timer(), so play it safe */
+               ri_free(ri);
+       }
 }

 /*
@@ -1920,7 +1960,15 @@ void uprobe_free_utask(struct task_struct *t)
        ri = utask->return_instances;
        while (ri) {
                ri_next = ri->next;
-               free_ret_instance(ri, true /* cleanup_hprobe */);
+               free_ret_instance(utask, ri, true /* cleanup_hprobe */);
+               ri = ri_next;
+       }
+
+       /* free_ret_instance() above might add to ri_pool, so this loop should come last */
+       ri = utask->ri_pool;
+       while (ri) {
+               ri_next = ri->next;
+               ri_free(ri);
                ri = ri_next;
        }

@@ -1943,8 +1991,12 @@ static void ri_timer(struct timer_list *timer)
        /* RCU protects return_instance from freeing. */
        guard(rcu)();

+       write_seqcount_begin(&utask->ri_seqcount);
+
        for_each_ret_instance_rcu(ri, utask->return_instances)
                hprobe_expire(&ri->hprobe, false);
+
+       write_seqcount_end(&utask->ri_seqcount);
 }

 static struct uprobe_task *alloc_utask(void)
@@ -1956,6 +2008,7 @@ static struct uprobe_task *alloc_utask(void)
                return NULL;

        timer_setup(&utask->ri_timer, ri_timer, 0);
+       seqcount_init(&utask->ri_seqcount);

        return utask;
 }
@@ -1975,10 +2028,14 @@ static struct uprobe_task *get_utask(void)
        return current->utask;
 }

-static struct return_instance *alloc_return_instance(void)
+static struct return_instance *alloc_return_instance(struct uprobe_task *utask)
 {
        struct return_instance *ri;

+       ri = ri_pool_pop(utask);
+       if (ri)
+               return ri;
+
        ri = kzalloc(sizeof(*ri), GFP_KERNEL);
        if (!ri)
                return ZERO_SIZE_PTR;
@@ -2119,7 +2176,7 @@ static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
                rcu_assign_pointer(utask->return_instances, ri_next);
                utask->depth--;

-               free_ret_instance(ri, true /* cleanup_hprobe */);
+               free_ret_instance(utask, ri, true /* cleanup_hprobe */);
                ri = ri_next;
        }
 }
@@ -2186,7 +2243,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,

        return;
 free:
-       kfree(ri);
+       ri_free(ri);
 }

 /* Prepare to single-step probed instruction out of line. */
@@ -2385,8 +2442,7 @@ static struct return_instance *push_consumer(struct return_instance *ri, __u64 i
        if (unlikely(ri->cons_cnt > 0)) {
                ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL);
                if (!ric) {
-                       kfree(ri->extra_consumers);
-                       kfree_rcu(ri, rcu);
+                       ri_free(ri);
                        return ZERO_SIZE_PTR;
                }
                ri->extra_consumers = ric;
@@ -2428,8 +2484,9 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
        struct uprobe_consumer *uc;
        bool has_consumers = false, remove = true;
        struct return_instance *ri = NULL;
+       struct uprobe_task *utask = current->utask;

-       current->utask->auprobe = &uprobe->arch;
+       utask->auprobe = &uprobe->arch;

        list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
                bool session = uc->handler && uc->ret_handler;
@@ -2449,12 +2506,12 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
                        continue;

                if (!ri)
-                       ri = alloc_return_instance();
+                       ri = alloc_return_instance(utask);

                if (session)
                        ri = push_consumer(ri, uc->id, cookie);
        }
-       current->utask->auprobe = NULL;
+       utask->auprobe = NULL;

        if (!ZERO_OR_NULL_PTR(ri))
                prepare_uretprobe(uprobe, regs, ri);
@@ -2554,7 +2611,7 @@ void uprobe_handle_trampoline(struct pt_regs *regs)
                        hprobe_finalize(&ri->hprobe, hstate);

                        /* We already took care of hprobe, no need to waste more time on that. */
-                       free_ret_instance(ri, false /* !cleanup_hprobe */);
+                       free_ret_instance(utask, ri, false /* !cleanup_hprobe */);
                        ri = ri_next;
                } while (ri != next_chain);
        } while (!valid);
