Skip to content

Commit cbfee70

Browse files
committed
pkg/fuzzer/throttler: avoid repetitive crashes
Track the crash rate of individual syscalls and rate limit the execution of those that crash too often.

To determine the most unsafe syscalls, keep a sliding window of the last executed progs on every instance:
* If a program has been evicted from the sliding window, it is considered safe enough.
* If an instance crashed, all programs in the window are under suspicion.

To prevent the execution of banned syscalls, add a special (skip) call property that is understood by the executor. The property is not supposed to leak into the corpus, so add appropriate assertions.
1 parent edc5149 commit cbfee70

File tree

15 files changed

+629
-57
lines changed

15 files changed

+629
-57
lines changed

executor/executor.cc

Lines changed: 32 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1236,34 +1236,38 @@ void execute_call(thread_t* th)
12361236
th->soft_fail_state = true;
12371237
}
12381238

1239-
if (flag_coverage)
1240-
cover_reset(&th->cov);
1241-
// For pseudo-syscalls and user-space functions NONFAILING can abort before assigning to th->res.
1242-
// Arrange for res = -1 and errno = EFAULT result for such case.
1243-
th->res = -1;
1244-
errno = EFAULT;
1245-
NONFAILING(th->res = execute_syscall(call, th->args));
1246-
th->reserrno = errno;
1247-
// Our pseudo-syscalls may misbehave.
1248-
if ((th->res == -1 && th->reserrno == 0) || call->attrs.ignore_return)
1249-
th->reserrno = EINVAL;
1250-
// Reset the flag before the first possible fail().
1251-
th->soft_fail_state = false;
1252-
1253-
if (flag_coverage) {
1254-
cover_collect(&th->cov);
1255-
if (th->cov.size >= kCoverSize)
1256-
failmsg("too much cover", "thr=%d, cov=%u", th->id, th->cov.size);
1257-
}
1258-
th->fault_injected = false;
1239+
if (th->call_props.skip > 0) {
1240+
th->reserrno = ENOSYS;
1241+
} else {
1242+
if (flag_coverage)
1243+
cover_reset(&th->cov);
1244+
// For pseudo-syscalls and user-space functions NONFAILING can abort before assigning to th->res.
1245+
// Arrange for res = -1 and errno = EFAULT result for such case.
1246+
th->res = -1;
1247+
errno = EFAULT;
1248+
NONFAILING(th->res = execute_syscall(call, th->args));
1249+
th->reserrno = errno;
1250+
// Our pseudo-syscalls may misbehave.
1251+
if ((th->res == -1 && th->reserrno == 0) || call->attrs.ignore_return)
1252+
th->reserrno = EINVAL;
1253+
// Reset the flag before the first possible fail().
1254+
th->soft_fail_state = false;
1255+
1256+
if (flag_coverage) {
1257+
cover_collect(&th->cov);
1258+
if (th->cov.size >= kCoverSize)
1259+
failmsg("too much cover", "thr=%d, cov=%u", th->id, th->cov.size);
1260+
}
1261+
th->fault_injected = false;
12591262

1260-
if (th->call_props.fail_nth > 0)
1261-
th->fault_injected = fault_injected(fail_fd);
1263+
if (th->call_props.fail_nth > 0)
1264+
th->fault_injected = fault_injected(fail_fd);
12621265

1263-
// If required, run the syscall some more times.
1264-
// But let's still return res, errno and coverage from the first execution.
1265-
for (int i = 0; i < th->call_props.rerun; i++)
1266-
NONFAILING(execute_syscall(call, th->args));
1266+
// If required, run the syscall some more times.
1267+
// But let's still return res, errno and coverage from the first execution.
1268+
for (int i = 0; i < th->call_props.rerun; i++)
1269+
NONFAILING(execute_syscall(call, th->args));
1270+
}
12671271

12681272
debug("#%d [%llums] <- %s=0x%llx",
12691273
th->id, current_time_ms() - start_time_ms, call->name, (uint64)th->res);
@@ -1275,6 +1279,8 @@ void execute_call(thread_t* th)
12751279
debug(" fault=%d", th->fault_injected);
12761280
if (th->call_props.rerun > 0)
12771281
debug(" rerun=%d", th->call_props.rerun);
1282+
if (th->call_props.skip > 0)
1283+
debug(" skipped");
12781284
debug("\n");
12791285
}
12801286

pkg/corpus/corpus.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ type NewItemEvent struct {
9090
}
9191

9292
func (corpus *Corpus) Save(inp NewInput) {
93+
validateCorpusProg(inp.Prog)
94+
9395
progData := inp.Prog.Serialize()
9496
sig := hash.String(progData)
9597

@@ -150,6 +152,18 @@ func (corpus *Corpus) Save(inp NewInput) {
150152
}
151153
}
152154
}
155+
156+
func validateCorpusProg(p *prog.Prog) {
157+
for _, call := range p.Calls {
158+
if call.Props.Async {
159+
panic("attempting to save a Async=true prog to corpus")
160+
}
161+
if call.Props.Skip {
162+
panic("attempting to save a Skip=true prog to corpus")
163+
}
164+
}
165+
}
166+
153167
func (corpus *Corpus) Signal() signal.Signal {
154168
corpus.mu.RLock()
155169
defer corpus.mu.RUnlock()

pkg/csource/csource.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,10 @@ func (ctx *context) generateCalls(p prog.ExecProg, trace bool) ([]string, []uint
253253
var calls []string
254254
csumSeq := 0
255255
for ci, call := range p.Calls {
256+
if call.Props.Skip {
257+
continue
258+
}
259+
256260
w := new(bytes.Buffer)
257261
// Copyin.
258262
for _, copyin := range call.Copyin {

pkg/fuzzer/fuzzer.go

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,12 +131,15 @@ func (fuzzer *Fuzzer) processResult(req *queue.Request, res *queue.Result, flags
131131
fuzzer.triageProgCall(req.Prog, res.Info.Extra, -1, &triage)
132132

133133
if len(triage) != 0 {
134+
p := req.Prog.Clone()
135+
triage = removeSkippedCalls(p, triage)
136+
134137
queue, stat := fuzzer.triageQueue, fuzzer.statJobsTriage
135138
if flags&progCandidate > 0 {
136139
queue, stat = fuzzer.triageCandidateQueue, fuzzer.statJobsTriageCandidate
137140
}
138141
fuzzer.startJob(stat, &triageJob{
139-
p: req.Prog.Clone(),
142+
p: p,
140143
flags: flags,
141144
queue: queue.Append(),
142145
calls: triage,
@@ -150,7 +153,7 @@ func (fuzzer *Fuzzer) processResult(req *queue.Request, res *queue.Result, flags
150153

151154
// Corpus candidates may have flaky coverage, so we give them a second chance.
152155
maxCandidateAttempts := 3
153-
if req.Risky() {
156+
if req.Risky {
154157
maxCandidateAttempts = 2
155158
}
156159
if len(triage) == 0 && flags&ProgFromCorpus != 0 && attempt < maxCandidateAttempts {
@@ -163,6 +166,26 @@ func (fuzzer *Fuzzer) processResult(req *queue.Request, res *queue.Result, flags
163166
return true
164167
}
165168

169+
func removeSkippedCalls(p *prog.Prog, triage map[int]*triageCall) map[int]*triageCall {
170+
ret := map[int]*triageCall{}
171+
if info := triage[-1]; info != nil {
172+
ret[-1] = info
173+
}
174+
oldPos := 0
175+
for i := 0; i < len(p.Calls); oldPos++ {
176+
if p.Calls[i].Props.Skip {
177+
p.RemoveCall(i)
178+
continue
179+
}
180+
info := triage[oldPos]
181+
if info != nil {
182+
ret[i] = info
183+
}
184+
i++
185+
}
186+
return ret
187+
}
188+
166189
type Config struct {
167190
Debug bool
168191
Corpus *corpus.Corpus

pkg/fuzzer/fuzzer_test.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,32 @@ func TestRotate(t *testing.T) {
181181
assert.Equal(t, 700, minus.Len())
182182
}
183183

184+
func TestRemoveSkippedCalls(t *testing.T) {
185+
target, err := prog.GetTarget(targets.TestOS, targets.TestArch64Fuzz)
186+
if err != nil {
187+
t.Fatal(err)
188+
}
189+
p, err := target.Deserialize([]byte(`
190+
serialize0(&AUTO) (skip)
191+
serialize1(&AUTO)
192+
serialize2(&AUTO) (skip)
193+
serialize3(&AUTO)
194+
`), prog.NonStrict)
195+
if err != nil {
196+
t.Fatal(err)
197+
}
198+
mm := map[int]*triageCall{
199+
1: {},
200+
}
201+
newMm := removeSkippedCalls(p, mm)
202+
assert.Len(t, newMm, 1)
203+
assert.NotNil(t, newMm[0])
204+
205+
assert.Len(t, p.Calls, 2)
206+
assert.Equal(t, "serialize1", p.Calls[0].Meta.Name)
207+
assert.Equal(t, "serialize3", p.Calls[1].Meta.Name)
208+
}
209+
184210
// Based on the example from Go documentation.
185211
var crc32q = crc32.MakeTable(0xD5828281)
186212

pkg/fuzzer/queue/queue.go

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ type Request struct {
4242
// Important requests will be retried even from crashed VMs.
4343
Important bool
4444

45+
// Risky requests will not be retried, even if they are important.
46+
Risky bool
47+
4548
// The callback will be called on request completion in the LIFO order.
4649
// If it returns false, all further processing will be stopped.
4750
// It allows wrappers to intercept Done() requests.
@@ -95,11 +98,6 @@ func (r *Request) Wait(ctx context.Context) *Result {
9598
}
9699
}
97100

98-
// Risky() returns true if there's a substantial risk of the input crashing the VM.
99-
func (r *Request) Risky() bool {
100-
return r.onceCrashed
101-
}
102-
103101
func (r *Request) Validate() error {
104102
collectSignal := r.ExecOpts.ExecFlags&flatrpc.ExecFlagCollectSignal > 0
105103
if len(r.ReturnAllSignal) != 0 && !collectSignal {

pkg/fuzzer/queue/retry.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ func (r *retryer) done(req *Request, res *Result) bool {
3434
return false
3535
}
3636
// Retry important requests from crashed VMs once.
37-
if res.Status == Crashed && req.Important && !req.onceCrashed {
38-
req.onceCrashed = true
37+
if res.Status == Crashed && req.Important && !req.Risky {
38+
req.Risky = true
3939
r.pq.Submit(req)
4040
return false
4141
}

0 commit comments

Comments
 (0)