Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 140 additions & 28 deletions pkg/ebpf/tracee.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ type Tracee struct {
// This does not mean they are required for tracee to function.
// TODO: remove this in favor of dependency manager nodes
requiredKsyms []string
// All possible tailcall map names from all Core events (computed once at startup)
allTailCallMapNames map[string]struct{}
}

func (t *Tracee) Stats() *metrics.Stats {
Expand Down Expand Up @@ -235,24 +237,28 @@ func New(cfg config.Config) (*Tracee, error) {
// Create Tracee

t := &Tracee{
config: cfg,
done: make(chan struct{}),
stats: metrics.NewStats(),
writtenFiles: make(map[string]string),
readFiles: make(map[string]string),
capturedFiles: make(map[string]int64),
streamsManager: streams.NewStreamsManager(),
policyManager: pm,
eventsDependencies: depsManager,
requiredKsyms: []string{},
extraProbes: make(map[string]*probes.ProbeGroup),
dataTypeDecoder: bufferdecoder.NewTypeDecoder(),
config: cfg,
done: make(chan struct{}),
stats: metrics.NewStats(),
writtenFiles: make(map[string]string),
readFiles: make(map[string]string),
capturedFiles: make(map[string]int64),
streamsManager: streams.NewStreamsManager(),
policyManager: pm,
eventsDependencies: depsManager,
requiredKsyms: []string{},
extraProbes: make(map[string]*probes.ProbeGroup),
dataTypeDecoder: bufferdecoder.NewTypeDecoder(),
allTailCallMapNames: make(map[string]struct{}),
}

// clear initial policies to avoid wrong references
initialPolicies = nil
t.config.InitialPolicies = nil

// Initialize list of all possible tailcall map names from Core events
t.initAllTailCallMapNames()

// Add/Drop capabilities to/from the Base ring (always effective)

capsToAdd, err := capabilities.ReqByString(t.config.Capabilities.AddCaps...)
Expand Down Expand Up @@ -593,6 +599,114 @@ func (t *Tracee) initTailCall(tailCall events.TailCall) error {
return nil
}

// initAllTailCallMapNames fills t.allTailCallMapNames with the map name of
// every tailcall referenced by any Core event definition, looking at both the
// primary dependencies and each fallback dependency set.
// New calls it once while constructing Tracee, so later lookups only need a
// set membership test instead of walking the event definitions again.
func (t *Tracee) initAllTailCallMapNames() {
	names := t.allTailCallMapNames

	for _, def := range events.Core.GetDefinitions() {
		strategy := def.GetDependencies()

		// Tailcalls declared by the primary dependency set.
		primary := strategy.GetPrimaryDependencies()
		for _, tc := range primary.GetTailCalls() {
			names[tc.GetMapName()] = struct{}{}
		}

		// Tailcalls declared by each fallback dependency set.
		for _, fb := range strategy.GetFallbackDependencies() {
			for _, tc := range fb.GetTailCalls() {
				names[tc.GetMapName()] = struct{}{}
			}
		}
	}
}

// rebuildAllTailCalls brings the tailcall maps in line with the current
// dependency state in two steps: first every known tailcall map is emptied,
// then the maps are repopulated from the events currently tracked by the
// dependency manager.
// It returns the first error from either step, wrapped with the step that failed.
func (t *Tracee) rebuildAllTailCalls() error {
	if err := t.clearAllTailCallMaps(); err != nil {
		return errfmt.Errorf("failed to clear tailcall maps: %v", err)
	}

	if err := t.buildAllTailCallMaps(); err != nil {
		return errfmt.Errorf("failed to build tailcall maps: %v", err)
	}

	return nil
}

// clearAllTailCallMaps empties every BPF map of the loaded module whose name
// appears in t.allTailCallMapNames (the set pre-computed from all Core events).
// Maps with any other name are left untouched. The walk stops at the first map
// that fails to clear and returns that error, annotated with the map name.
func (t *Tracee) clearAllTailCallMaps() error {
	it := t.bpfModule.Iterator()

	for {
		m := it.NextMap()
		if m == nil {
			// A nil map marks the end of the module's maps.
			return nil
		}

		name := m.Name()
		if _, known := t.allTailCallMapNames[name]; !known {
			continue
		}

		if err := t.clearTailCallMap(m); err != nil {
			return errfmt.Errorf("failed to clear tailcall map %s: %v", name, err)
		}
	}
}

// buildAllTailCallMaps installs the tailcalls required by every event currently
// tracked by the dependency manager, calling initTailCall for each one.
// It stops at, and returns, the first lookup or initialization error.
//
// NOTE(review): Manager.GetEvent acquires the manager's read lock, so running
// this from a callback that executes while the manager's write lock is held
// looks like it would block forever — confirm against the callers.
func (t *Tracee) buildAllTailCallMaps() error {
	for _, id := range t.eventsDependencies.GetEvents() {
		node, err := t.eventsDependencies.GetEvent(id)
		if err != nil {
			return errfmt.Errorf("failed to get event dependencies for %v: %v", id, err)
		}

		deps := node.GetDependencies()
		for _, tc := range deps.GetTailCalls() {
			if err := t.initTailCall(tc); err != nil {
				return errfmt.Errorf("failed to initialize tail call: %v", err)
			}
		}
	}

	return nil
}

// clearTailCallMap clears all entries in the specified tailcall map.
// It uses map key iteration to only clear existing entries, which is more efficient
// than clearing all possible indexes.
func (t *Tracee) clearTailCallMap(bpfMap *bpf.BPFMap) error {
iterator := bpfMap.Iterator()

// Collect all keys first to avoid iterator invalidation during deletion
var keysToDelete []uint32
for iterator.Next() {
keyBytes := iterator.Key()
if len(keyBytes) >= 4 { // uint32 is 4 bytes
Copy link

Copilot AI Sep 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

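The length guard before the unsafe pointer conversion is too permissive. `len(keyBytes) >= 4` already prevents reading past a short key, but it also accepts keys longer than 4 bytes, which the uint32 conversion would silently truncate to their first 4 bytes. Tailcall map keys are uint32, so require len(keyBytes) == 4 before the conversion.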

Suggested change
if len(keyBytes) >= 4 { // uint32 is 4 bytes
if len(keyBytes) == 4 { // uint32 is 4 bytes

Copilot uses AI. Check for mistakes.
key := *(*uint32)(unsafe.Pointer(&keyBytes[0]))
keysToDelete = append(keysToDelete, key)
}
}

// Now delete all the collected keys
for _, key := range keysToDelete {
err := bpfMap.DeleteKey(unsafe.Pointer(&key))
if err != nil {
// Log but don't fail on individual delete errors
logger.Debugw("Failed to delete tailcall map entry", "map", bpfMap.Name(), "key", key, "error", err)
}
}

Comment on lines +698 to +706
Copy link

Copilot AI Sep 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Silently ignoring delete errors could mask serious issues like corrupted BPF maps or permission problems. Consider collecting and returning accumulated errors, or at least logging at a higher severity level for certain error types.

Suggested change
// Now delete all the collected keys
for _, key := range keysToDelete {
err := bpfMap.DeleteKey(unsafe.Pointer(&key))
if err != nil {
// Log but don't fail on individual delete errors
logger.Debugw("Failed to delete tailcall map entry", "map", bpfMap.Name(), "key", key, "error", err)
}
}
// Now delete all the collected keys, accumulate errors
var deleteErrors []error
for _, key := range keysToDelete {
err := bpfMap.DeleteKey(unsafe.Pointer(&key))
if err != nil {
// Log at warning level and accumulate error
logger.Warnw("Failed to delete tailcall map entry", "map", bpfMap.Name(), "key", key, "error", err)
deleteErrors = append(deleteErrors, fmt.Errorf("map %s key %d: %w", bpfMap.Name(), key, err))
}
}
if len(deleteErrors) > 0 {
return errors.Join(deleteErrors...)
}

Copilot uses AI. Check for mistakes.
return nil
}

// initDerivationTable initializes tracee's events.DerivationTable. For each
// event, represented through its ID, we declare to which other events it can be
// derived and the corresponding function to derive into that Event.
Expand Down Expand Up @@ -1267,22 +1381,9 @@ func (t *Tracee) populateBPFMaps() error {
}

// Initialize tail call dependencies
// TODO: Tail calls are not updated upon events changes in the dependency manager.
// Hence, upon events addition, fallbacks or removal, tail calls will not be updated.
// This should be fixed dynamically in the future.
for _, eventID := range t.eventsDependencies.GetEvents() {
depsNode, err := t.eventsDependencies.GetEvent(eventID)
if err != nil {
return errfmt.Errorf("failed to get event dependencies: %v", err)
}
deps := depsNode.GetDependencies()
tailCalls := deps.GetTailCalls()
for _, tailCall := range tailCalls {
err := t.initTailCall(tailCall)
if err != nil {
return errfmt.Errorf("failed to initialize tail call: %v", err)
}
}
err = t.buildAllTailCallMaps()
if err != nil {
return errfmt.WrapError(err)
}

return nil
Expand Down Expand Up @@ -1453,6 +1554,17 @@ func (t *Tracee) initBPF() error {
// collector to free the BPF object
t.config.BPFObjBytes = nil

// Register state change watcher for tailcall rebuilding
// This ensures tailcalls are updated whenever the dependency tree changes
t.eventsDependencies.SubscribeStateChange(func() {
// Note: This is called within the dependency manager's mutex,
// so we should avoid any operations that might deadlock
err := t.rebuildAllTailCalls()
if err != nil {
logger.Errorw("Failed to rebuild tailcalls after dependency change", "error", err)
}
Comment on lines +1562 to +1565
Copy link

Copilot AI Sep 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

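The callback function can deadlock: it is invoked from triggerStateChanged() while the dependency manager still holds its write lock (m.mu.Lock() in SelectEvent, RemoveEvent, FailEvent, etc.), and rebuildAllTailCalls() calls back into the manager through GetEvent(), which takes m.mu.RLock(). Go's sync.RWMutex is not reentrant, so that read lock cannot be acquired until the write lock is released, which never happens because the holder is waiting on the callback. Consider using a goroutine or channel to defer the tail call rebuilding until after the manager's lock has been released.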

Suggested change
err := t.rebuildAllTailCalls()
if err != nil {
logger.Errorw("Failed to rebuild tailcalls after dependency change", "error", err)
}
go func() {
err := t.rebuildAllTailCalls()
if err != nil {
logger.Errorw("Failed to rebuild tailcalls after dependency change", "error", err)
}
}()

Copilot uses AI. Check for mistakes.
})

// Populate eBPF maps with initial data

err = t.populateBPFMaps()
Expand Down
60 changes: 57 additions & 3 deletions pkg/events/dependencies/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ type Manager struct {
probes map[probes.Handle]*ProbeNode
onAdd map[NodeType][]func(node interface{}) []Action
onRemove map[NodeType][]func(node interface{}) []Action
onStateChanged []func() // Watchers called when manager state changes
dependenciesGetter func(events.ID) events.DependencyStrategy
// Track failed probes and events to prevent issues such as incorrect fallback handling,
// duplicate processing, or inconsistent state when dependencies are shared between events.
Expand All @@ -48,6 +49,7 @@ func NewDependenciesManager(dependenciesGetter func(events.ID) events.Dependency
probes: make(map[probes.Handle]*ProbeNode),
onAdd: make(map[NodeType][]func(node interface{}) []Action),
onRemove: make(map[NodeType][]func(node interface{}) []Action),
onStateChanged: make([]func(), 0),
dependenciesGetter: dependenciesGetter,
failedProbes: make(map[probes.Handle]struct{}),
failedEvents: make(map[events.ID]struct{}),
Expand All @@ -73,6 +75,17 @@ func (m *Manager) SubscribeRemove(subscribeType NodeType, onRemove func(node int
m.onRemove[subscribeType] = append([]func(node interface{}) []Action{onRemove}, m.onRemove[subscribeType]...)
}

// SubscribeStateChange adds a watcher function called when the manager's dependency tree changes.
// State change watchers are called in the order of their subscription.
// They are invoked only when nodes are actually added/removed from the tree or when fallbacks are applied.
// Changes to the "explicitly selected" status of existing nodes do not trigger these watchers.
//
// Watchers run synchronously while the manager's mutex is held by the mutating
// call (for example SelectEvent), so a watcher must not call manager methods
// that acquire that mutex themselves.
//
// A nil watcher is ignored: storing it would cause a nil function call panic
// on the next state change, raised from inside an unrelated manager operation.
func (m *Manager) SubscribeStateChange(onStateChanged func()) {
	if onStateChanged == nil {
		return
	}

	m.mu.Lock()
	defer m.mu.Unlock()

	m.onStateChanged = append(m.onStateChanged, onStateChanged)
}

// GetEvent returns the dependencies of the given event.
func (m *Manager) GetEvent(id events.ID) (*EventNode, error) {
m.mu.RLock()
Expand Down Expand Up @@ -105,7 +118,19 @@ func (m *Manager) SelectEvent(id events.ID) (*EventNode, error) {
m.mu.Lock()
defer m.mu.Unlock()

return m.buildEvent(id, nil)
existingNode := m.getEventNode(id)

node, err := m.buildEvent(id, nil)
if err != nil {
return node, err
}

// Only trigger state change if this was a new node added to the tree
if existingNode == nil {
m.triggerStateChanged()
}

return node, err
}

// UnselectEvent marks the event as not explicitly selected.
Expand All @@ -120,8 +145,15 @@ func (m *Manager) UnselectEvent(id events.ID) bool {
if node == nil {
return false
}

node.unmarkAsExplicitlySelected()
removed := m.cleanUnreferencedEventNode(node)

// Only trigger state change if the node was actually removed from the tree
if removed {
m.triggerStateChanged()
}

return removed
}

Expand All @@ -136,7 +168,12 @@ func (m *Manager) RemoveEvent(id events.ID) error {
m.mu.Lock()
defer m.mu.Unlock()

return m.removeEvent(id)
err := m.removeEvent(id)
if err == nil {
m.triggerStateChanged()
}

return err
}

// removeEvent removes the given event from the tree.
Expand Down Expand Up @@ -398,6 +435,14 @@ func (m *Manager) triggerOnRemove(node interface{}) {
}
}

// triggerStateChanged invokes every registered state change watcher, in
// subscription order. It takes no lock of its own; the exported methods that
// call it (such as SelectEvent and RemoveEvent) do so while holding m.mu, so
// each watcher runs synchronously under the manager's lock.
func (m *Manager) triggerStateChanged() {
	watchers := m.onStateChanged
	for i := 0; i < len(watchers); i++ {
		watchers[i]()
	}
}

func getNodeType(node interface{}) (NodeType, error) {
switch node.(type) {
case *EventNode:
Expand Down Expand Up @@ -500,7 +545,15 @@ func (m *Manager) FailEvent(id events.ID) (bool, error) {
m.mu.Lock()
defer m.mu.Unlock()

return m.failEvent(id)
removed, err := m.failEvent(id)
// Always trigger state change on successful FailEvent since either:
// 1. Event was removed (state changed), or
// 2. Fallback was applied (dependencies changed, state changed)
if err == nil {
m.triggerStateChanged()
}

return removed, err
}

// failEvent attempts to switch the given event dependencies to its next available fallback ones.
Expand Down Expand Up @@ -589,6 +642,7 @@ func (m *Manager) FailProbe(handle probes.Handle) error {
m.failedProbes[handle] = struct{}{}

m.removeProbe(handle)
m.triggerStateChanged()

return nil
}
Expand Down
Loading