
Commit 3b8a9b2

Merge tag 'trace-v6.7-rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace
Pull tracing fixes from Steven Rostedt:

 - Fix eventfs to check creating new files for events with names greater than NAME_MAX. The eventfs lookup needs to check the return result of simple_lookup().

 - Fix the ring buffer to check the proper max data size. Events must be able to fit on a ring buffer sub-buffer; if one cannot, it fails to be written and the logic to add the event is avoided. The code to check if an event can fit failed to add the possible absolute timestamp, which may make the event not able to fit. This causes the ring buffer to go into an infinite loop trying to find a sub-buffer that would fit the event. Luckily, there's a check that will bail out if it loops over 1000 times, and it also warns. The real fix is to not add the absolute timestamp to an event that is starting at the beginning of a sub-buffer, because such an event uses the sub-buffer timestamp. Avoiding the timestamp at the start of the sub-buffer allows events that pass the first check to always find a sub-buffer they fit on (a toy version of this size check is sketched after the commit metadata below).

 - Have large events that do not fit on a trace_seq print "[LINE TOO BIG]", as the trace_pipe output does, instead of silently dropping the output.

 - Fix a memory leak: the spare page saved by a trace instance was never freed.

 - Update the size of the snapshot buffer when the main buffer is updated, if the snapshot buffer is allocated.

 - Fix the ring buffer timestamp logic by removing all the places that tried to put the before_stamp back to the write_stamp so that the next event doesn't add an absolute timestamp. Each of these updates added a race: by making the two timestamps equal, it validated the write_stamp so that it could be incorrectly used for calculating the delta of an event.

 - There's a temp buffer used for printing the event that was using the event data size for allocation when it needed to use the size of the entire event (meta data and payload data).

 - For hardening, use "%.*s" for printing the trace_marker output, to limit the amount printed to the size of the event. This was discovered by development that added a bug that truncated the '\0' and caused a crash.

 - Fix a use-after-free bug in the use of the histogram files when an instance is being removed.

 - Remove a useless update of the write_stamp in rb_try_to_discard(). The before_stamp was already changed to force the next event to add an absolute timestamp, so the write_stamp is not used; yet the write_stamp was modified again using an unneeded 64-bit cmpxchg.

 - Fix several races in the 32-bit implementation of rb_time_cmpxchg(), which emulates a 64-bit cmpxchg.

 - While looking at fixing the 64-bit cmpxchg, I noticed that because the ring buffer uses normal cmpxchg, and this can be done in NMI context, there are some architectures that do not have a working cmpxchg in NMI context. For these architectures, fail recording events that happen in NMI context.
* tag 'trace-v6.7-rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  ring-buffer: Do not record in NMI if the arch does not support cmpxchg in NMI
  ring-buffer: Have rb_time_cmpxchg() set the msb counter too
  ring-buffer: Fix 32-bit rb_time_read() race with rb_time_cmpxchg()
  ring-buffer: Fix a race in rb_time_cmpxchg() for 32 bit archs
  ring-buffer: Remove useless update to write_stamp in rb_try_to_discard()
  ring-buffer: Do not try to put back write_stamp
  tracing: Fix uaf issue when open the hist or hist_debug file
  tracing: Add size check when printing trace_marker output
  ring-buffer: Have saved event hold the entire event
  ring-buffer: Do not update before stamp when switching sub-buffers
  tracing: Update snapshot buffer on resize if it is allocated
  ring-buffer: Fix memory leak of free page
  eventfs: Fix events beyond NAME_MAX blocking tasks
  tracing: Have large events show up as '[LINE TOO BIG]' instead of nothing
  ring-buffer: Fix writing to the buffer with max_data_size
2 parents c8e97fc + 7122923 commit 3b8a9b2
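The ring buffer size-check fix above is easiest to see as arithmetic. Below is a minimal, self-contained C sketch of the "can this event ever fit?" decision, using invented toy constants (TOY_SUB_BUFFER_SIZE and friends are illustrative stand-ins, not the kernel's BUF_PAGE_SIZE/BUF_MAX_DATA_SIZE values). The point it demonstrates is the one in the fix: if the possible absolute-timestamp extension is not counted when checking the size, an event that "fits" by this check can still fail to fit on every sub-buffer, which is the infinite loop described above.

#include <stdbool.h>
#include <stdio.h>

#define TOY_SUB_BUFFER_SIZE   4096u	/* stand-in for BUF_PAGE_SIZE */
#define TOY_EVENT_HEADER_SIZE 8u	/* stand-in for the event meta data */
#define TOY_TIME_EXTEND_SIZE  8u	/* stand-in for RB_LEN_TIME_EXTEND */
#define TOY_MAX_DATA_SIZE (TOY_SUB_BUFFER_SIZE - 2 * TOY_EVENT_HEADER_SIZE)

static bool event_can_fit(unsigned int payload, bool abs_timestamp)
{
	unsigned int length = TOY_EVENT_HEADER_SIZE + payload;

	/* An absolute timestamp makes the event bigger: count it here,
	 * before deciding the event can ever fit on a sub-buffer. */
	if (abs_timestamp)
		length += TOY_TIME_EXTEND_SIZE;

	return length <= TOY_MAX_DATA_SIZE;
}

int main(void)
{
	/* Barely fits without a timestamp; with one it never fits and
	 * must be rejected up front rather than retried forever. */
	unsigned int payload = TOY_MAX_DATA_SIZE - TOY_EVENT_HEADER_SIZE;

	printf("plain: %d, with abs ts: %d\n",
	       event_can_fit(payload, false), event_can_fit(payload, true));
	return 0;
}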

6 files changed: 72 additions, 82 deletions

fs/tracefs/event_inode.c

4 additions, 0 deletions

@@ -546,6 +546,8 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
 		if (strcmp(ei_child->name, name) != 0)
 			continue;
 		ret = simple_lookup(dir, dentry, flags);
+		if (IS_ERR(ret))
+			goto out;
 		create_dir_dentry(ei, ei_child, ei_dentry, true);
 		created = true;
 		break;
@@ -568,6 +570,8 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
 		if (r <= 0)
 			continue;
 		ret = simple_lookup(dir, dentry, flags);
+		if (IS_ERR(ret))
+			goto out;
 		create_file_dentry(ei, i, ei_dentry, name, mode, cdata,
 				   fops, true);
 		break;
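For context on the two added checks: simple_lookup() returns either a dentry pointer or an error encoded in the pointer itself, so the result must be screened with IS_ERR() before use. Below is a userspace sketch of that kernel error-pointer convention; MAX_ERRNO and the casts mirror include/linux/err.h, while toy_lookup() is an invented stand-in for simple_lookup() that fails the way the eventfs fix cares about, on a name longer than NAME_MAX.

#include <stdio.h>

#define MAX_ERRNO	4095
#define ENAMETOOLONG	36
#define NAME_MAX	255

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* The top MAX_ERRNO addresses are reserved for encoded errors */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Hypothetical stand-in for simple_lookup() */
static void *toy_lookup(const char *name, size_t len)
{
	static char dentry[16];	/* pretend this is a struct dentry */

	if (len > NAME_MAX)
		return ERR_PTR(-ENAMETOOLONG);
	(void)name;
	return dentry;
}

int main(void)
{
	void *ret = toy_lookup("way-too-long-name", 300);

	if (IS_ERR(ret))	/* the missing check the fix adds */
		printf("lookup failed: %ld\n", PTR_ERR(ret));
	return 0;
}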

kernel/trace/ring_buffer.c

42 additions, 73 deletions

@@ -644,8 +644,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
 
 	*cnt = rb_time_cnt(top);
 
-	/* If top and msb counts don't match, this interrupted a write */
-	if (*cnt != rb_time_cnt(msb))
+	/* If top, msb or bottom counts don't match, this interrupted a write */
+	if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom))
 		return false;
 
 	/* The shift to msb will lose its cnt bits */
@@ -706,24 +706,28 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
 	unsigned long cnt2, top2, bottom2, msb2;
 	u64 val;
 
+	/* Any interruptions in this function should cause a failure */
+	cnt = local_read(&t->cnt);
+
 	/* The cmpxchg always fails if it interrupted an update */
 	if (!__rb_time_read(t, &val, &cnt2))
 		return false;
 
 	if (val != expect)
 		return false;
 
-	cnt = local_read(&t->cnt);
 	if ((cnt & 3) != cnt2)
 		return false;
 
 	cnt2 = cnt + 1;
 
 	rb_time_split(val, &top, &bottom, &msb);
+	msb = rb_time_val_cnt(msb, cnt);
 	top = rb_time_val_cnt(top, cnt);
 	bottom = rb_time_val_cnt(bottom, cnt);
 
 	rb_time_split(set, &top2, &bottom2, &msb2);
+	msb2 = rb_time_val_cnt(msb2, cnt);
 	top2 = rb_time_val_cnt(top2, cnt2);
 	bottom2 = rb_time_val_cnt(bottom2, cnt2);
 
@@ -1787,6 +1791,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 		free_buffer_page(bpage);
 	}
 
+	free_page((unsigned long)cpu_buffer->free_page);
+
 	kfree(cpu_buffer);
 }
 
@@ -2407,7 +2413,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
 	 */
 	barrier();
 
-	if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE)
+	if ((iter->head + length) > commit || length > BUF_PAGE_SIZE)
 		/* Writer corrupted the read? */
 		goto reset;
 
@@ -2981,34 +2987,13 @@ static unsigned rb_calculate_event_length(unsigned length)
 	return length;
 }
 
-static u64 rb_time_delta(struct ring_buffer_event *event)
-{
-	switch (event->type_len) {
-	case RINGBUF_TYPE_PADDING:
-		return 0;
-
-	case RINGBUF_TYPE_TIME_EXTEND:
-		return rb_event_time_stamp(event);
-
-	case RINGBUF_TYPE_TIME_STAMP:
-		return 0;
-
-	case RINGBUF_TYPE_DATA:
-		return event->time_delta;
-	default:
-		return 0;
-	}
-}
-
 static inline bool
 rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 		  struct ring_buffer_event *event)
 {
 	unsigned long new_index, old_index;
 	struct buffer_page *bpage;
 	unsigned long addr;
-	u64 write_stamp;
-	u64 delta;
 
 	new_index = rb_event_index(event);
 	old_index = new_index + rb_event_ts_length(event);
@@ -3017,14 +3002,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 
 	bpage = READ_ONCE(cpu_buffer->tail_page);
 
-	delta = rb_time_delta(event);
-
-	if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
-		return false;
-
-	/* Make sure the write stamp is read before testing the location */
-	barrier();
-
+	/*
+	 * Make sure the tail_page is still the same and
+	 * the next write location is the end of this event
+	 */
 	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
 		unsigned long write_mask =
 			local_read(&bpage->write) & ~RB_WRITE_MASK;
@@ -3035,20 +3016,20 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 		 * to make sure that the next event adds an absolute
 		 * value and does not rely on the saved write stamp, which
 		 * is now going to be bogus.
+		 *
+		 * By setting the before_stamp to zero, the next event
+		 * is not going to use the write_stamp and will instead
+		 * create an absolute timestamp. This means there's no
+		 * reason to update the wirte_stamp!
		 */
 		rb_time_set(&cpu_buffer->before_stamp, 0);
 
-		/* Something came in, can't discard */
-		if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
-				     write_stamp, write_stamp - delta))
-			return false;
-
 		/*
 		 * If an event were to come in now, it would see that the
 		 * write_stamp and the before_stamp are different, and assume
 		 * that this event just added itself before updating
 		 * the write stamp. The interrupting event will fix the
-		 * write stamp for us, and use the before stamp as its delta.
+		 * write stamp for us, and use an absolute timestamp.
 		 */
 
 		/*
@@ -3485,7 +3466,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
 		return;
 
 	/*
-	 * If this interrupted another event, 
+	 * If this interrupted another event,
 	 */
 	if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
 		goto out;
@@ -3579,7 +3560,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	 * absolute timestamp.
 	 * Don't bother if this is the start of a new page (w == 0).
 	 */
-	if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) {
+	if (!w) {
+		/* Use the sub-buffer timestamp */
+		info->delta = 0;
+	} else if (unlikely(!a_ok || !b_ok || info->before != info->after)) {
 		info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
 		info->length += RB_LEN_TIME_EXTEND;
 	} else {
@@ -3602,51 +3586,27 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
 	/* See if we shot pass the end of this buffer page */
 	if (unlikely(write > BUF_PAGE_SIZE)) {
-		/* before and after may now different, fix it up*/
-		b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
-		a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
-		if (a_ok && b_ok && info->before != info->after)
-			(void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
-					      info->before, info->after);
-		if (a_ok && b_ok)
-			check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
+		check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
 		return rb_move_tail(cpu_buffer, tail, info);
 	}
 
 	if (likely(tail == w)) {
-		u64 save_before;
-		bool s_ok;
-
 		/* Nothing interrupted us between A and C */
 		/*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts);
-		barrier();
-		/*E*/ s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before);
-		RB_WARN_ON(cpu_buffer, !s_ok);
+		/*
+		 * If something came in between C and D, the write stamp
+		 * may now not be in sync. But that's fine as the before_stamp
+		 * will be different and then next event will just be forced
+		 * to use an absolute timestamp.
+		 */
 		if (likely(!(info->add_timestamp &
			     (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
 			/* This did not interrupt any time update */
 			info->delta = info->ts - info->after;
 		else
 			/* Just use full timestamp for interrupting event */
 			info->delta = info->ts;
-		barrier();
 		check_buffer(cpu_buffer, info, tail);
-		if (unlikely(info->ts != save_before)) {
-			/* SLOW PATH - Interrupted between C and E */
-
-			a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
-			RB_WARN_ON(cpu_buffer, !a_ok);
-
-			/* Write stamp must only go forward */
-			if (save_before > info->after) {
-				/*
-				 * We do not care about the result, only that
-				 * it gets updated atomically.
-				 */
-				(void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
-						      info->after, save_before);
-			}
-		}
 	} else {
 		u64 ts;
 		/* SLOW PATH - Interrupted between A and C */
@@ -3714,6 +3674,12 @@ rb_reserve_next_event(struct trace_buffer *buffer,
 	int nr_loops = 0;
 	int add_ts_default;
 
+	/* ring buffer does cmpxchg, make sure it is safe in NMI context */
+	if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
+	    (unlikely(in_nmi()))) {
+		return NULL;
+	}
+
 	rb_start_commit(cpu_buffer);
 	/* The commit page can not change after this */
 
@@ -3737,6 +3703,8 @@ rb_reserve_next_event(struct trace_buffer *buffer,
 	if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
 		add_ts_default = RB_ADD_STAMP_ABSOLUTE;
 		info.length += RB_LEN_TIME_EXTEND;
+		if (info.length > BUF_MAX_DATA_SIZE)
+			goto out_fail;
 	} else {
 		add_ts_default = RB_ADD_STAMP_NONE;
 	}
@@ -5118,7 +5086,8 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 	if (!iter)
 		return NULL;
 
-	iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags);
+	/* Holds the entire event: data and meta data */
+	iter->event = kmalloc(BUF_PAGE_SIZE, flags);
 	if (!iter->event) {
 		kfree(iter);
 		return NULL;
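Several of the hunks above harden the 32-bit rb_time code, where a 64-bit timestamp is stored as multiple 32-bit words, each tagged with an update counter. The sketch below is a deliberately simplified two-word model with invented names (the real code splits the value into top/bottom/msb parts and uses local_t, not C11 atomics). What it demonstrates is the rule the __rb_time_read() fix enforces: a reader may only trust the assembled value when every word carries the same counter, otherwise it interrupted a writer mid-update.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CNT_SHIFT 30
#define VAL_MASK  ((1u << CNT_SHIFT) - 1)

struct split_time {
	atomic_uint cnt;	/* update counter */
	atomic_uint top;	/* bits 59..30 of the stamp, tagged */
	atomic_uint bottom;	/* bits 29..0 of the stamp, tagged */
};

static unsigned tag(unsigned val, unsigned cnt)
{
	/* Pack a 2-bit counter into the high bits of each word */
	return (val & VAL_MASK) | ((cnt & 3u) << CNT_SHIFT);
}

static void split_time_set(struct split_time *t, uint64_t val)
{
	unsigned cnt = atomic_fetch_add(&t->cnt, 1) + 1;

	atomic_store(&t->top,    tag((unsigned)(val >> CNT_SHIFT), cnt));
	atomic_store(&t->bottom, tag((unsigned)val, cnt));
}

static bool split_time_read(struct split_time *t, uint64_t *ret)
{
	unsigned top = atomic_load(&t->top);
	unsigned bottom = atomic_load(&t->bottom);

	/* If the counters don't match, this interrupted a write */
	if ((top >> CNT_SHIFT) != (bottom >> CNT_SHIFT))
		return false;

	*ret = ((uint64_t)(top & VAL_MASK) << CNT_SHIFT) | (bottom & VAL_MASK);
	return true;
}

int main(void)
{
	struct split_time t = {0};
	uint64_t val;

	split_time_set(&t, 123456789ULL);
	if (split_time_read(&t, &val))
		printf("read back %llu\n", (unsigned long long)val);
	return 0;
}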

kernel/trace/trace.c

13 additions, 3 deletions

@@ -4722,7 +4722,11 @@ static int s_show(struct seq_file *m, void *v)
 			iter->leftover = ret;
 
 	} else {
-		print_trace_line(iter);
+		ret = print_trace_line(iter);
+		if (ret == TRACE_TYPE_PARTIAL_LINE) {
+			iter->seq.full = 0;
+			trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n");
+		}
 		ret = trace_print_seq(m, &iter->seq);
 		/*
 		 * If we overflow the seq_file buffer, then it will
@@ -4964,6 +4968,12 @@ int tracing_release_file_tr(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+int tracing_single_release_file_tr(struct inode *inode, struct file *filp)
+{
+	tracing_release_file_tr(inode, filp);
+	return single_release(inode, filp);
+}
+
 static int tracing_mark_open(struct inode *inode, struct file *filp)
 {
 	stream_open(inode, filp);
@@ -6344,15 +6354,15 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
 	if (!tr->array_buffer.buffer)
 		return 0;
 
-	/* Do not allow tracing while resizng ring buffer */
+	/* Do not allow tracing while resizing ring buffer */
 	tracing_stop_tr(tr);
 
 	ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu);
 	if (ret < 0)
 		goto out_start;
 
 #ifdef CONFIG_TRACER_MAX_TRACE
-	if (!tr->current_trace->use_max_tr)
+	if (!tr->allocated_snapshot)
 		goto out;
 
 	ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
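The s_show() change above replaces silent truncation with an explicit marker. Here is a minimal userspace sketch of that fallback, assuming a toy fixed-size buffer in place of struct trace_seq and snprintf() truncation in place of the TRACE_TYPE_PARTIAL_LINE return value: when the rendered line would overflow, the partial output is discarded and a visible "[LINE TOO BIG]" line is emitted instead of nothing.

#include <stdio.h>
#include <string.h>

#define SEQ_SIZE 32	/* toy stand-in for the trace_seq buffer */

static void show_line(const char *event_text)
{
	char seq[SEQ_SIZE];
	int n = snprintf(seq, sizeof(seq), "%s\n", event_text);

	if (n < 0 || n >= (int)sizeof(seq)) {
		/* Overflowed: drop the partial line and say so explicitly */
		strcpy(seq, "[LINE TOO BIG]\n");
	}
	fputs(seq, stdout);
}

int main(void)
{
	show_line("short event");
	show_line("an event whose rendered text is far too long to fit here");
	return 0;
}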

kernel/trace/trace.h

1 addition, 0 deletions

@@ -617,6 +617,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp);
 int tracing_open_generic_tr(struct inode *inode, struct file *filp);
 int tracing_open_file_tr(struct inode *inode, struct file *filp);
 int tracing_release_file_tr(struct inode *inode, struct file *filp);
+int tracing_single_release_file_tr(struct inode *inode, struct file *filp);
 bool tracing_is_disabled(void);
 bool tracer_tracing_is_on(struct trace_array *tr);
 void tracer_tracing_on(struct trace_array *tr);

kernel/trace/trace_events_hist.c

8 additions, 4 deletions

@@ -5623,18 +5623,20 @@ static int event_hist_open(struct inode *inode, struct file *file)
 {
 	int ret;
 
-	ret = security_locked_down(LOCKDOWN_TRACEFS);
+	ret = tracing_open_file_tr(inode, file);
 	if (ret)
 		return ret;
 
+	/* Clear private_data to avoid warning in single_open() */
+	file->private_data = NULL;
 	return single_open(file, hist_show, file);
 }
 
 const struct file_operations event_hist_fops = {
 	.open = event_hist_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
-	.release = single_release,
+	.release = tracing_single_release_file_tr,
 };
 
 #ifdef CONFIG_HIST_TRIGGERS_DEBUG
@@ -5900,18 +5902,20 @@ static int event_hist_debug_open(struct inode *inode, struct file *file)
 {
 	int ret;
 
-	ret = security_locked_down(LOCKDOWN_TRACEFS);
+	ret = tracing_open_file_tr(inode, file);
 	if (ret)
 		return ret;
 
+	/* Clear private_data to avoid warning in single_open() */
+	file->private_data = NULL;
 	return single_open(file, hist_debug_show, file);
 }
 
 const struct file_operations event_hist_debug_fops = {
 	.open = event_hist_debug_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
-	.release = single_release,
+	.release = tracing_single_release_file_tr,
 };
 #endif
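The open/release rewiring above closes the use-after-free: the open handler now pins the trace instance via tracing_open_file_tr(), and the new tracing_single_release_file_tr() drops that pin alongside single_release(). Below is a toy refcounting sketch of the same lifetime rule, with invented userspace stand-ins (toy_instance, hist_open(), hist_release() are illustrative, not the kernel API): as long as a file holds a reference, removing the instance cannot free the memory a reader is still using.

#include <stdio.h>
#include <stdlib.h>

struct toy_instance {
	int refcount;
	char name[32];
};

static void instance_get(struct toy_instance *ti) { ti->refcount++; }

static void instance_put(struct toy_instance *ti)
{
	if (--ti->refcount == 0) {
		printf("freeing instance %s\n", ti->name);
		free(ti);
	}
}

/* Analogue of event_hist_open() calling tracing_open_file_tr() */
static struct toy_instance *hist_open(struct toy_instance *ti)
{
	instance_get(ti);	/* pin the instance for this open file */
	return ti;
}

/* Analogue of tracing_single_release_file_tr() */
static void hist_release(struct toy_instance *ti)
{
	instance_put(ti);	/* drop the pin taken at open time */
}

int main(void)
{
	struct toy_instance *ti = calloc(1, sizeof(*ti));

	snprintf(ti->name, sizeof(ti->name), "foo");
	ti->refcount = 1;	/* reference held by the instance itself */

	struct toy_instance *file = hist_open(ti);
	instance_put(ti);	/* "rmdir instances/foo": not freed yet */
	printf("still readable: %s\n", file->name);
	hist_release(file);	/* last reference gone: now it is freed */
	return 0;
}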
kernel/trace/trace_output.c

4 additions, 2 deletions

@@ -1587,11 +1587,12 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
 {
 	struct print_entry *field;
 	struct trace_seq *s = &iter->seq;
+	int max = iter->ent_size - offsetof(struct print_entry, buf);
 
 	trace_assign_type(field, iter->ent);
 
 	seq_print_ip_sym(s, field->ip, flags);
-	trace_seq_printf(s, ": %s", field->buf);
+	trace_seq_printf(s, ": %.*s", max, field->buf);
 
 	return trace_handle_return(s);
 }
@@ -1600,10 +1601,11 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
 					 struct trace_event *event)
 {
 	struct print_entry *field;
+	int max = iter->ent_size - offsetof(struct print_entry, buf);
 
 	trace_assign_type(field, iter->ent);
 
-	trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
+	trace_seq_printf(&iter->seq, "# %lx %.*s", field->ip, max, field->buf);
 
 	return trace_handle_return(&iter->seq);
 }
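Why "%.*s" matters here: the trace_marker payload is sized by the event record, not by a terminating NUL, so an unbounded "%s" can read past the record. The standalone sketch below uses an invented toy_print_entry that mirrors the real print_entry's "ip plus flexible buf" layout; the precision argument is computed exactly as in the patch, from the event size minus the payload offset, so printing stops at the record boundary even with no '\0' present.

#include <stdio.h>
#include <stddef.h>
#include <string.h>

struct toy_print_entry {
	unsigned long ip;
	char buf[];	/* payload, possibly without a trailing '\0' */
};

int main(void)
{
	/* Event storage whose payload has NO NUL terminator */
	_Alignas(struct toy_print_entry)
	char storage[sizeof(struct toy_print_entry) + 5];
	struct toy_print_entry *field = (struct toy_print_entry *)storage;
	int max = (int)(sizeof(storage) - offsetof(struct toy_print_entry, buf));

	field->ip = 0xc0ffee;
	memcpy(field->buf, "hello", 5);	/* 5 bytes, no '\0' */

	/* printf("%s", field->buf) could read past the event; the bounded
	 * form stops at the record boundary, like the trace_output.c change */
	printf("# %lx %.*s\n", field->ip, max, field->buf);
	return 0;
}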
