Commit afc8c4f

shabalind and compnerd authored
Add script to compare benchmark results (#85)
* Add comparison script
* Add geomean-based totals per column
* Add argument parser
* Add --filter flag
* Add --filter-not flag
* Add --columns flag
* Drop fail helper function
* Add docstrings
* Add license comment
* Add a docstring with usage example to compare script
* Apply code review suggestions

Co-authored-by: Saleem Abdulrasool <compnerd@compnerd.org>
1 parent a0564bf commit afc8c4f
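For reference, the filtering flags added in this commit compose on a single command line. An illustrative invocation, reusing the `a.json` and `b.json` result files from the script's docstring (the regular expressions and column names are examples only):

$ python Scripts/compare.py a.json b.json --filter "add string" --filter-not reserved --columns time,iterations

This keeps only benchmarks whose names match "add string" but not "reserved", and restricts the output to the time and iterations columns.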

File tree

1 file changed: +288 -0 lines

Scripts/compare.py

Lines changed: 288 additions & 0 deletions
@@ -0,0 +1,288 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A command-line tool to compare benchmark results in json format.

This tool lets one see the difference between two independent runs
of the same benchmarks. This is convenient whenever one develops a
performance fix and wants to find out whether a particular change brings
a measurable performance improvement.

For example:

$ swift run -c release BenchmarkMinimalExample --format json > a.json

$ swift run -c release BenchmarkMinimalExample --format json > b.json

$ python Scripts/compare.py a.json b.json
benchmark                    column            a        b       %
-----------------------------------------------------------------
add string no capacity       time        37099.00 37160.00   -0.16
add string no capacity       std             1.13     1.30  -15.27
add string no capacity       iterations  37700.00 37618.00    0.22
add string reserved capacity time        36730.00 36743.00   -0.04
add string reserved capacity std             1.12     2.42 -116.30
add string reserved capacity iterations  38078.00 38084.00   -0.02
-----------------------------------------------------------------
                             time                             -0.10
                             std                              -57.90
                             iterations                         0.10

Here one can see an output that compares two independent runs `a` and
`b` and concludes that they differ by only about 0.1%, and are thus
probably identical results.

One can filter the results in the comparison either by benchmark name,
using the `--filter` and `--filter-not` flags, or by the columns of the
json output, using `--columns`.
"""

import argparse
from collections import defaultdict
import json
import re


def require(cond, msg):
    """Fails with a message if condition is not true."""

    if not cond: raise Exception(msg)


def validate(file_name, parsed):
    """Validates that the given json object is a valid benchmark result."""

    require("benchmarks" in parsed,
            "{}: missing key 'benchmarks'.".format(file_name))
    require(len(parsed["benchmarks"]) > 0,
            "{}: must have at least one benchmark.".format(file_name))

    for i, benchmark in enumerate(parsed["benchmarks"]):
        require("name" in benchmark,
                "{}: benchmark #{}: missing key 'name'.".format(file_name, i))

        for k, v in benchmark.items():
            if k == "name": continue
            is_num = isinstance(v, int) or isinstance(v, float)
            template = "{}: benchmark #{}: values must be numbers."
            require(is_num, template.format(file_name, i))


def parse_and_validate(args):
    """Parse command-line args, parse the given json files and validate their contents."""

    runs = []

    for file_name in args.file_names:
        with open(file_name) as f:
            parsed = None
            try:
                parsed = json.load(f)
            except Exception as err:
                raise Exception("failed to parse json: {}".format(err))
            validate(file_name, parsed)
            runs.append((file_name, parsed))

    return runs


def benchmark_predicate(args):
    """Returns a predicate used to filter benchmarks by name based on cli args."""

    include = lambda x: True

    # Bind the current regex and previous predicate as default arguments so
    # that each lambda keeps its own values even after the names are rebound.
    if args.filter:
        regex = re.compile(args.filter)
        prev_include = include
        include = (lambda x, regex=regex, prev=prev_include:
                   regex.search(x) is not None and prev(x))

    if args.filter_not:
        regex = re.compile(args.filter_not)
        prev_include = include
        include = (lambda x, regex=regex, prev=prev_include:
                   regex.search(x) is None and prev(x))

    return include


def collect_values(args, runs):
    """Collect benchmark values for the comparison, excluding filtered-out benchmarks and columns."""

    baseline_name, baseline = runs[0]

    include_benchmark = benchmark_predicate(args)
    include_column = lambda x: args.columns is None or x in args.columns

    # Confs are (benchmark name, column) pairs taken from the baseline run.
    confs = []
    values = {}

    for benchmark in baseline["benchmarks"]:
        benchmark_name = benchmark["name"]
        if not include_benchmark(benchmark_name):
            continue
        for column in benchmark.keys():
            if column == "name":
                continue
            if not include_column(column):
                continue
            conf = (benchmark_name, column)
            confs.append(conf)
            values[conf] = {}

    for conf in confs:
        bench_name, column = conf
        for (file_name, run) in runs:
            for bench in run["benchmarks"]:
                if bench["name"] == bench_name:
                    values[conf][file_name] = bench[column]

    return (confs, values)


def geomean(values):
    """Compute geometric mean for the given sequence of values."""

    product = 1.0
    for value in values:
        product *= value
    return product**(1.0 / len(values))


def to_table(confs, args, values):
    """Compute a table of relative results across all input files."""

    baseline_file_name = args.baseline
    rows = []

    # Header row.
    header = []
    header.append("benchmark")
    header.append("column")
    for (n, file_name) in enumerate(args.file_names):
        name = file_name.replace(".json", "")
        header.append(name)
        if n != 0:
            header.append("%")
    rows.append(header)

    # Body rows.
    relative_values = defaultdict(lambda: defaultdict(list))
    for conf in confs:
        bench_name, column = conf
        row = []
        row.append(bench_name)
        row.append(column)
        for n, file_name in enumerate(args.file_names):
            base_value = values[conf][baseline_file_name]
            value = values[conf][file_name]
            row.append("{:.2f}".format(value))
            if n != 0:
                relative = value / base_value
                relative_values[column][file_name].append(relative)
                relative_percentage = (1 - relative) * 100
                row.append("{:.2f}".format(relative_percentage))
        rows.append(row)

    # Compute totals for each column as a geomean of all relative results.
    cols = []
    geomean_values = defaultdict(dict)
    for (_, col) in confs:
        if col not in cols:
            cols.append(col)
            for n, file_name in enumerate(args.file_names):
                if n != 0:
                    vs = relative_values[col][file_name]
                    geomean_values[col][file_name] = geomean(vs)

    for col in cols:
        row = []
        row.append("")
        row.append(col)
        for n, file_name in enumerate(args.file_names):
            row.append("")
            if n != 0:
                value = geomean_values[col][file_name]
                percentage = (1 - value) * 100
                row.append("{:.2f}".format(percentage))
        rows.append(row)

    return rows


def pad(base, fill, count, right=False):
    """Pad the base string with the given fill until it is count characters long, on either the left or the right."""

    while len(base) < count:
        if right:
            base += fill
        else:
            base = fill + base
    return base


def print_table(table):
    """Pretty print results table as aligned human-readable text."""

    # Collect the maximum width of each column.
    widths = defaultdict(lambda: 0)
    for row in table:
        for ncol, col in enumerate(row):
            widths[ncol] = max(widths[ncol], len(str(col)))

    # Print results as aligned text to stdout.
    totals = False
    for nrow, row in enumerate(table):
        if row[0] == '' and not totals:
            print("-" * (sum(widths.values()) + len(widths) - 1))
            totals = True
        line = []
        for ncol, col in enumerate(row):
            right = ncol == 0 or ncol == 1
            line.append(pad(str(col), " ", widths[ncol], right=right))
        print(" ".join(line))
        if nrow == 0:
            print("-" * (sum(widths.values()) + len(widths) - 1))


def parse_args():
    """Parse command-line flags into a configuration object, and return it."""

    parser = argparse.ArgumentParser(description="Compare multiple swift-benchmark json files.")
    parser.add_argument("baseline", help="Baseline json file to compare against.")
    parser.add_argument("candidate", nargs="+",
                        help="Candidate json files to compare against baseline.")
    parser.add_argument("--filter", help="Only show benchmarks that match the regular expression.")
    parser.add_argument("--filter-not", help="Exclude benchmarks whose names match the regular expression.")
    parser.add_argument("--columns", help="A comma-separated list of columns to show.")

    args = parser.parse_args()
    args.file_names = [args.baseline]
    args.file_names.extend(args.candidate)
    if args.columns is not None:
        args.columns = set(args.columns.split(","))

    return args


def main():
    """Command-line entry-point."""

    args = parse_args()
    runs = parse_and_validate(args)
    confs, values = collect_values(args, runs)
    table = to_table(confs, args, values)
    print_table(table)


if __name__ == "__main__":
    main()
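The per-column totals printed below the separator are geometric means of each candidate's values relative to the baseline, reported as percentages, as computed in to_table above. A minimal standalone sketch of that aggregation, reusing the "time" values from the docstring example (illustrative only):

# Relative candidate/baseline values for the "time" column of the two
# benchmarks in the docstring example.
relatives = [37160.00 / 37099.00, 36743.00 / 36730.00]

product = 1.0
for r in relatives:
    product *= r
total = product ** (1.0 / len(relatives))  # geometric mean

# A positive percentage means the candidate's values are lower than the
# baseline's; this prints -0.10, matching the "time" total in the example.
print("{:.2f}".format((1 - total) * 100))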
