Skip to content

Commit 5ab4983

Browse files
tools: Port analysis script to Python (#149)
1 parent 7ea6ca4 commit 5ab4983

File tree

5 files changed

+307
-130
lines changed

5 files changed

+307
-130
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,5 @@
66
/compile_commands.json
77

88
/ci.bazelrc
9+
10+
/**/__pycache__

tools/analyze-compdb.sh

Lines changed: 0 additions & 82 deletions
This file was deleted.

tools/analyze_compdb.py

Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import csv
5+
import enum
6+
import io
7+
import json
8+
from multiprocessing import Pool
9+
import os
10+
from pathlib import Path
11+
import re
12+
import subprocess
13+
import sys
14+
import tempfile
15+
import time
16+
import typing
17+
18+
from compdb import *
19+
20+
21+
class Measurement(enum.Enum):
    """Kinds of per-translation-unit statistics this script can compute.

    The string values double as the accepted --measure CLI values and as
    the CSV column names in the final report.
    """

    SLOC = "sloc"  # source lines of code, as counted by tokei
    LINES = "lines"  # raw line count, as counted by `wc -l`
    PP_SLOC = "pp_sloc"  # SLOC of the preprocessed (-E) output
    PP_LINES = "pp_lines"  # raw line count of the preprocessed output
    PP_TIME = "pp_time"  # wall-clock seconds for a preprocessor-only run
    SEMA_TIME = "sema_time"  # wall-clock seconds for a -fsyntax-only run

    def __str__(self):
        # Lets argparse help text and result-dict keys use the raw value.
        return self.value
31+
32+
33+
class PathList:
    """A list of existing file paths, chunked into groups small enough to be
    passed on a single subprocess command line."""

    # Invariant: every path in every group exists on disk, and each group's
    # total encoded length stays under the size cap used in from_paths.
    groups: List[List[Path]]

    def from_compdb(compdb: "CompilationDatabase"):
        """Build a PathList from the (absolutized) TU paths of a compilation
        database's entries."""
        paths = [
            entry.filepath
            if entry.filepath.is_absolute()
            else entry.directory.joinpath(entry.filepath)
            for entry in compdb.entries
        ]
        return PathList.from_paths(paths)

    def from_paths(paths: List[Path]):
        """Group the existing paths in `paths`, skipping nonexistent ones.

        Raises AssertionError if none of the paths exist.
        """
        result = PathList()
        result.groups = [[]]
        last_group_size = 0
        for path in paths:
            if not path.exists():
                continue
            path_len = len(bytes(path))
            # The arglength limit on macOS is 1MiB, be more conservative.
            if last_group_size + path_len > 900 * 1024:
                # FIXED: this previously appended to an undefined name `p`,
                # raising NameError the first time a group filled up.
                result.groups.append([path])
                last_group_size = path_len
            else:
                result.groups[-1].append(path)
                last_group_size += path_len
        assert (
            len(result.groups[0]) > 0
        )  # all files in compilation database do not exist
        return result
64+
65+
66+
# ASSUMPTION: There aren't multiple compilation commands in the compilation
# database which index the same file in two different ways.
# Otherwise, we'd need to plumb through indexes properly rather than
# working with a Path key.
def compute_sloc(pathlist: PathList) -> dict[str, int]:
    """Count source lines of code for every file in `pathlist` via tokei.

    Returns a mapping from file path (as reported by tokei) to its SLOC.
    """
    counts = {}
    for group in pathlist.groups:
        # Tokei runs in parallel, so avoid extra process spawning overhead.
        argv = ["tokei", "--files", "--output", "json", *map(str, group)]
        proc = subprocess.run(argv, capture_output=True, encoding="utf8")
        report = json.loads(proc.stdout)
        for lang_data in report.values():
            for file_report in lang_data["reports"]:
                counts[file_report["name"]] = file_report["stats"]["code"]
    return counts
80+
81+
82+
def compute_lines_impl(path: Path) -> int:
    """Return the raw newline count of `path`, as reported by `wc -l`."""
    argv = ["wc", "-l", str(path)]
    proc = subprocess.run(argv, capture_output=True, encoding="utf8")
    # Q: Should we pass in the regex from outside?
    # wc prints "<count> <path>\n" (possibly space-padded); grab the count.
    parsed = re.search(r"\s*(\d+)\s+(.*)\n", proc.stdout)
    return int(parsed.group(1))
89+
90+
91+
def compute_lines(pool: Pool, pathlist: PathList) -> dict[str, int]:
    """Map every path in `pathlist` to its `wc -l` line count, in parallel."""
    flat_paths = [p for group in pathlist.groups for p in group]
    counts = pool.map(compute_lines_impl, flat_paths)
    return dict(zip((str(p) for p in flat_paths), counts))
95+
96+
97+
A = typing.TypeVar("A")
B = typing.TypeVar("B")


def transpose(list_of_tuples: List[tuple[A, B]]) -> tuple[List[A], List[B]]:
    """Turn a list of pairs into a pair of lists.

    FIXED: an empty input now returns ([], []) instead of (), so callers
    can always unpack the result into two lists (compute_pp_time does
    `tmp_files, timings = transpose(...)`, which crashed on an empty
    compilation database).
    """
    if not list_of_tuples:
        return ([], [])
    return tuple([list(x) for x in zip(*list_of_tuples)])
103+
104+
105+
def compute_pp_time_impl(entry: CompilationDatabaseEntry) -> tuple[Path, float]:
    """Run the preprocessor for `entry` and time it.

    Returns (path of the temp file holding the preprocessed output,
    elapsed wall-clock seconds). The caller owns — and must delete —
    the temp file.
    """
    handle, tmp_path = tempfile.mkstemp(prefix=entry.filepath.stem, suffix=".pp.cpp")
    # Only the path is needed; the compiler writes the file itself.
    os.close(handle)
    begin = time.monotonic()
    entry.run_preprocessor_only(tmp_path)
    elapsed = time.monotonic() - begin
    return (Path(tmp_path), elapsed)
112+
113+
114+
def compute_pp_time(
    pool: Pool, entries: List[CompilationDatabaseEntry]
) -> tuple[List[Path], dict[str, str], dict[str, float]]:
    """Preprocess every entry in parallel, timing each run.

    Returns a tuple of:
      - the list of temp files holding the preprocessed output,
      - a mapping from original TU path to its temp-file path,
      - a mapping from original TU path to preprocessing seconds (3 dp).

    FIXED: the timing-dict part of the return annotation previously said
    dict[str, List[float]], but the values are single rounded floats.
    """
    pairs = pool.map(compute_pp_time_impl, entries)
    tmp_files, timings = transpose(pairs)
    path_to_tmp_dict = {
        str(entries[i].filepath): str(p) for (i, p) in enumerate(tmp_files)
    }
    timing_dict = {
        str(entries[i].filepath): round(t, 3) for (i, t) in enumerate(timings)
    }
    return (tmp_files, path_to_tmp_dict, timing_dict)
127+
128+
129+
def compute_sema_time_impl(entry: CompilationDatabaseEntry) -> float:
    """Return the wall-clock seconds of a -fsyntax-only run of `entry`."""
    begin = time.monotonic()
    entry.run_sema_only()
    return time.monotonic() - begin
133+
134+
135+
def compute_sema_time(
    pool: Pool, entries: List[CompilationDatabaseEntry]
) -> dict[str, float]:
    """Map each entry's TU path to its sema-only wall time in seconds (3 dp).

    FIXED: the return annotation previously claimed List[float], but the
    function returns a dict keyed by TU path.
    """
    timings = pool.map(compute_sema_time_impl, entries)
    return {str(entries[i].filepath): round(t, 3) for (i, t) in enumerate(timings)}
140+
141+
142+
def parse_arguments() -> tuple[str, dict[str, bool]]:
    """Parse the CLI arguments.

    Returns (compilation database path, mapping of measurement name to
    whether it was requested).

    Raises ValueError for an unrecognized --measure value.
    """
    parser = argparse.ArgumentParser(
        prog="analyze_compdb",
        description="Analyze various statistics about translation units in a compilation database",
    )
    parser.add_argument(
        "--compdb-path",
        help="Path to compilation database containing a single entry",
        default="compile_commands.json",
    )
    cases = [str(m) for m in list(Measurement)]
    case_help_text = ", ".join(cases[:-1]) + " and " + cases[-1] + " (or 'all')"
    parser.add_argument(
        "--measure",
        help=f"One or more of {case_help_text}. Can be supplied multiple times",
        action="append",
    )
    args = parser.parse_args()
    requested_measurements = {str(c): False for c in cases}
    # FIXED: with action="append", args.measure is None when --measure is
    # never supplied; iterating it directly raised TypeError.
    for a in args.measure or []:
        if a not in requested_measurements:
            if a == "all":
                requested_measurements = {k: True for k in requested_measurements}
            else:
                raise ValueError(f"Expected one of {case_help_text} but found {a}")
        else:
            requested_measurements[a] = True
    return (args.compdb_path, requested_measurements)
170+
171+
172+
def default_main():
    """Entry point: compute the requested measurements for every TU in the
    compilation database and write them to stdout as CSV."""
    compdb_path, requested_measurements = parse_arguments()
    compdb = CompilationDatabase.load(compdb_path)
    results = {}
    tu_main_file_pathlist = PathList.from_compdb(compdb)
    # Maps original TU path -> preprocessed temp-file path. pp_sloc/pp_lines
    # results are keyed by the temp path, so the final join needs this.
    tmp_map = {}
    with Pool() as pool:
        if requested_measurements[str(Measurement.LINES)]:
            results[Measurement.LINES] = compute_lines(pool, tu_main_file_pathlist)
        if requested_measurements[str(Measurement.SLOC)]:
            results[Measurement.SLOC] = compute_sloc(tu_main_file_pathlist)
        pp_tmp_files = []
        if requested_measurements[str(Measurement.PP_TIME)]:
            pp_tmp_files, tmp_map, results[Measurement.PP_TIME] = compute_pp_time(
                pool, compdb.entries
            )
        if (
            requested_measurements[str(Measurement.PP_SLOC)]
            or requested_measurements[str(Measurement.PP_LINES)]
        ):
            # Reuse the preprocessed output from the pp_time measurement if we
            # have it; otherwise run the preprocessor now, discarding timings.
            if not pp_tmp_files:
                pp_tmp_files, tmp_map, _ = compute_pp_time(pool, compdb.entries)
            pp_tmp_pathlist = PathList.from_paths(pp_tmp_files)
            if requested_measurements[str(Measurement.PP_SLOC)]:
                results[Measurement.PP_SLOC] = compute_sloc(pp_tmp_pathlist)
            if requested_measurements[str(Measurement.PP_LINES)]:
                results[Measurement.PP_LINES] = compute_lines(pool, pp_tmp_pathlist)
        # FIXED: delete the preprocessed temp files unconditionally; they were
        # previously removed only inside the pp_sloc/pp_lines branch, leaking
        # them for a pp_time-only run.
        for pp_tmp_file in pp_tmp_files:
            pp_tmp_file.unlink()
        if requested_measurements[str(Measurement.SEMA_TIME)]:
            results[Measurement.SEMA_TIME] = compute_sema_time(pool, compdb.entries)

    # Re-key by measurement name so lookups below match the column strings.
    results = {str(k): v for k, v in results.items()}

    columns = [m for m, requested in requested_measurements.items() if requested]

    # One row per TU, starting with its path; measurement values are appended
    # column by column below.
    result_table = []
    for group in tu_main_file_pathlist.groups:
        for path in group:
            result_table.append([str(path)])

    for col in columns:
        data = results[col]
        for i, row in enumerate(result_table):
            p = row[0]
            try:
                val = data[p]
            except KeyError:
                # pp_* measurements are keyed by the temp-file path; translate
                # through tmp_map before giving up.
                try:
                    tmp_path = tmp_map[p]
                    val = data[tmp_path]
                except KeyError:
                    # Sentinel for "no data found for this file" (e.g. the
                    # tool reported the path in a different spelling).
                    val = -999
            row.append(val)

    # FIXED: removed a stray `compdb = CompilationDatabase.load(args.compdb_path)`
    # that stood here: `args` is undefined in this scope (guaranteed NameError)
    # and the database was already loaded above.
    writer = csv.writer(sys.stdout)
    writer.writerow(["path"] + columns)
    writer.writerows(result_table)


if __name__ == "__main__":
    default_main()

tools/compdb.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import copy
2+
import json
3+
import pathlib
4+
import shlex
5+
import subprocess
6+
from typing import List
7+
8+
9+
class CompilationDatabaseEntry:
10+
filepath: pathlib.Path
11+
directory: pathlib.Path
12+
arguments: List[str]
13+
14+
def __init__(self, entry):
15+
self.filepath = pathlib.Path(entry["file"])
16+
self.directory = pathlib.Path(entry["directory"])
17+
try:
18+
self.arguments = shlex.split(entry["command"])
19+
except KeyError:
20+
self.arguments = entry(["arguments"])
21+
22+
def change_tu_filepath(self, new_path: str):
23+
old_path = str(self.filepath)
24+
# ASSUMPTION: In a normal compilation command, we only expect there to
25+
# be a path to a single TU in the argument list, since the compilation
26+
# command will involve generating object code for a single TU.
27+
#
28+
# Moreover, files typically aren't present at the project root.
29+
#
30+
# These two factors mean that it's very unlikely for the argument list
31+
# to have two files like 'cake.c' and 'dessert/cake.c'.
32+
self.arguments = [arg.replace(old_path, new_path) for arg in self.arguments]
33+
self.filepath = pathlib.Path(new_path)
34+
35+
def to_dict(self):
36+
return {
37+
"directory": str(self.directory),
38+
"file": str(self.filepath),
39+
"arguments": self.arguments[:], # defensive copy
40+
}
41+
42+
def run_preprocessor_only(self, preprocessed_tu_path: pathlib.Path | str):
43+
args = copy.deepcopy(self.arguments)
44+
args += ["-E", "-o", str(preprocessed_tu_path)]
45+
subprocess.run(args, cwd=self.directory).check_returncode()
46+
47+
def run_sema_only(self):
48+
args = copy.deepcopy(self.arguments)
49+
args += ["-fsyntax-only", "-o", "/dev/null"]
50+
subprocess.run(args, cwd=self.directory).check_returncode()
51+
52+
53+
class CompilationDatabase:
    """An in-memory compilation database: just the list of its entries."""

    entries: List[CompilationDatabaseEntry]

    def load(path: pathlib.Path | str):
        """Read a compile_commands.json file and wrap each of its entries."""
        db = CompilationDatabase()
        with open(path) as compdb_file:
            raw_entries = json.load(compdb_file)
        db.entries = [CompilationDatabaseEntry(e) for e in raw_entries]
        return db

0 commit comments

Comments
 (0)