|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +import argparse |
| 4 | +import csv |
| 5 | +import enum |
| 6 | +import io |
| 7 | +import json |
| 8 | +from multiprocessing import Pool |
| 9 | +import os |
| 10 | +from pathlib import Path |
| 11 | +import re |
| 12 | +import subprocess |
| 13 | +import sys |
| 14 | +import tempfile |
| 15 | +import time |
| 16 | +import typing |
| 17 | + |
| 18 | +from compdb import * |
| 19 | + |
| 20 | + |
class Measurement(enum.Enum):
    """The kinds of per-translation-unit statistics this tool can gather.

    The string values double as the CLI spelling accepted by ``--measure``
    and as the CSV column names, which is why ``str()`` yields the raw value.
    """

    SLOC = "sloc"  # source lines of code (via tokei)
    LINES = "lines"  # raw line count (newline count)
    PP_SLOC = "pp_sloc"  # SLOC of the preprocessed output
    PP_LINES = "pp_lines"  # line count of the preprocessed output
    PP_TIME = "pp_time"  # wall-clock time to run the preprocessor
    SEMA_TIME = "sema_time"  # wall-clock time to run sema-only compilation

    def __str__(self) -> str:
        return self.value
| 31 | + |
| 32 | + |
class PathList:
    """File paths partitioned into groups small enough for one argv.

    Groups are capped at 900 KiB of path bytes because the groups are
    passed wholesale on a subprocess command line (e.g. to tokei), and the
    arg-length limit on macOS is 1 MiB.
    """

    # Each inner list's combined path length (in bytes) stays under the cap.
    groups: list[list[Path]]

    @staticmethod
    def from_compdb(compdb: "CompilationDatabase") -> "PathList":
        """Build a PathList from the main-file path of every compdb entry.

        Relative paths are resolved against the entry's directory.
        """
        paths = [
            entry.filepath
            if entry.filepath.is_absolute()
            else entry.directory.joinpath(entry.filepath)
            for entry in compdb.entries
        ]
        return PathList.from_paths(paths)

    @staticmethod
    def from_paths(paths: list[Path]) -> "PathList":
        """Group existing paths by size; paths that don't exist are skipped.

        Raises AssertionError when none of the paths exist (the first
        group would be empty).
        """
        result = PathList()
        result.groups = [[]]
        last_group_size = 0
        for path in paths:
            if not path.exists():
                continue
            size = len(bytes(path))
            # The arg-length limit on macOS is 1MiB, be more conservative.
            if last_group_size + size > 900 * 1024:
                # BUG FIX: was `p.groups.append(...)`; `p` was never defined,
                # so any input crossing the cap raised NameError.
                result.groups.append([path])
                last_group_size = size
            else:
                result.groups[-1].append(path)
                last_group_size += size
        assert (
            len(result.groups[0]) > 0
        )  # all files in compilation database do not exist
        return result
| 64 | + |
| 65 | + |
# ASSUMPTION: There aren't multiple compilation commands in the compilation
# database which index the same file in two different ways.
# Otherwise, we'd need to plumb through indexes properly rather than
# working with a Path key.
def compute_sloc(pathlist: PathList) -> dict[str, int]:
    """Return a map from file path to source lines of code, as counted by tokei."""
    counts: dict[str, int] = {}
    for group in pathlist.groups:
        # Tokei runs in parallel, so avoid extra process spawning overhead.
        argv = ["tokei", "--files", "--output", "json"]
        argv.extend(str(path) for path in group)
        proc = subprocess.run(argv, capture_output=True, encoding="utf8")
        report = json.loads(proc.stdout)
        # Top-level keys are language names; we only need the per-file stats.
        for lang_data in report.values():
            for file_report in lang_data["reports"]:
                counts[file_report["name"]] = file_report["stats"]["code"]
    return counts
| 80 | + |
| 81 | + |
def compute_lines_impl(path: Path) -> int:
    """Count the lines in `path`, with `wc -l` semantics.

    `wc -l` counts newline characters, so a final line without a trailing
    newline is not counted; `bytes.count(b"\\n")` matches that exactly.
    Counting in-process replaces the old `wc` subprocess plus a regex parse
    of its stdout (which raised AttributeError whenever the regex failed
    to match, e.g. on unexpected `wc` output).
    """
    return path.read_bytes().count(b"\n")
| 89 | + |
| 90 | + |
def compute_lines(pool: Pool, pathlist: PathList) -> dict[str, int]:
    """Map each path in `pathlist` to its line count, counted in parallel."""
    flat_paths = [path for group in pathlist.groups for path in group]
    counts = pool.map(compute_lines_impl, flat_paths)
    return dict(zip((str(path) for path in flat_paths), counts))
| 95 | + |
| 96 | + |
A = typing.TypeVar("A")
B = typing.TypeVar("B")


def transpose(list_of_tuples: list[tuple[A, B]]) -> tuple[list[A], list[B]]:
    """Turn a list of pairs into a pair of lists.

    `zip(*[])` produces nothing, so the old implementation returned `()`
    for empty input instead of `([], [])`, which broke the two-element
    unpack in callers (e.g. an empty compilation database). Guard the
    empty case explicitly.
    """
    if not list_of_tuples:
        return ([], [])
    firsts, seconds = zip(*list_of_tuples)
    return (list(firsts), list(seconds))
| 103 | + |
| 104 | + |
def compute_pp_time_impl(entry: CompilationDatabaseEntry) -> tuple[Path, float]:
    """Preprocess one TU into a temp file; return (temp path, elapsed seconds).

    The temp file is left on disk for later SLOC/line measurement; the
    caller is responsible for deleting it.
    """
    fd, tmp_name = tempfile.mkstemp(prefix=entry.filepath.stem, suffix=".pp.cpp")
    # Only the path is needed; the preprocessor writes the file itself.
    os.close(fd)
    t0 = time.monotonic()
    entry.run_preprocessor_only(tmp_name)
    elapsed = time.monotonic() - t0
    return (Path(tmp_name), elapsed)
| 112 | + |
| 113 | + |
def compute_pp_time(
    pool: Pool, entries: List[CompilationDatabaseEntry]
) -> tuple[List[Path], dict[str, str], dict[str, List[float]]]:
    """Preprocess every TU in parallel.

    Returns a triple of:
      - the temp files produced (the caller deletes them),
      - a mapping from original path to temp-file path,
      - a mapping from original path to preprocessing time, rounded to
        3 decimal places.
    """
    outcomes = pool.map(compute_pp_time_impl, entries)
    tmp_files, timings = transpose(outcomes)
    originals = [str(e.filepath) for e in entries]
    path_to_tmp_dict = dict(zip(originals, (str(p) for p in tmp_files)))
    timing_dict = dict(zip(originals, (round(t, 3) for t in timings)))
    return (tmp_files, path_to_tmp_dict, timing_dict)
| 127 | + |
| 128 | + |
def compute_sema_time_impl(entry: CompilationDatabaseEntry) -> float:
    """Return the wall-clock seconds taken by a sema-only compile of `entry`."""
    begin = time.monotonic()
    entry.run_sema_only()
    end = time.monotonic()
    return end - begin
| 133 | + |
| 134 | + |
def compute_sema_time(
    pool: Pool, entries: List[CompilationDatabaseEntry]
) -> dict[str, float]:
    """Map each entry's main-file path to its sema-only compile time.

    Times are wall-clock seconds rounded to 3 decimal places.
    (The previous annotation claimed List[float], but the function has
    always returned a dict keyed by path, matching the other measurements.)
    """
    timings = pool.map(compute_sema_time_impl, entries)
    return {str(entries[i].filepath): round(t, 3) for (i, t) in enumerate(timings)}
| 140 | + |
| 141 | + |
def parse_arguments() -> tuple[str, dict[str, bool]]:
    """Parse CLI arguments.

    Returns (compdb_path, requested) where `requested` maps every
    measurement name to whether it was requested; `--measure all`
    turns all of them on.

    Raises ValueError for an unrecognized measurement name.
    """
    parser = argparse.ArgumentParser(
        prog="analyze_compdb",
        description="Analyze various statistics about translation units in a compilation database",
    )
    parser.add_argument(
        "--compdb-path",
        help="Path to compilation database containing a single entry",
        default="compile_commands.json",
    )
    cases = [str(m) for m in Measurement]
    case_help_text = ", ".join(cases[:-1]) + " and " + cases[-1] + " (or 'all')"
    parser.add_argument(
        "--measure",
        help=f"One or more of {case_help_text}. Can be supplied multiple times",
        action="append",
    )
    args = parser.parse_args()
    requested_measurements = {c: False for c in cases}
    # BUG FIX: with action="append" and no --measure flag, args.measure is
    # None (not []); iterating it directly raised TypeError.
    for a in args.measure or []:
        if a in requested_measurements:
            requested_measurements[a] = True
        elif a == "all":
            requested_measurements = {k: True for k in requested_measurements}
        else:
            raise ValueError(f"Expected one of {case_help_text} but found {a}")
    return (args.compdb_path, requested_measurements)
| 170 | + |
| 171 | + |
def default_main():
    """Entry point: run the requested measurements and write a CSV to stdout.

    Output has one row per TU main file: the path, then one column per
    requested measurement.
    """
    compdb_path, requested_measurements = parse_arguments()
    compdb = CompilationDatabase.load(compdb_path)
    results = {}
    tu_main_file_pathlist = PathList.from_compdb(compdb)
    tmp_map = {}
    with Pool() as pool:
        if requested_measurements[str(Measurement.LINES)]:
            results[Measurement.LINES] = compute_lines(pool, tu_main_file_pathlist)
        if requested_measurements[str(Measurement.SLOC)]:
            results[Measurement.SLOC] = compute_sloc(tu_main_file_pathlist)
        pp_tmp_files = []
        if requested_measurements[str(Measurement.PP_TIME)]:
            pp_tmp_files, tmp_map, results[Measurement.PP_TIME] = compute_pp_time(
                pool, compdb.entries
            )
        if (
            requested_measurements[str(Measurement.PP_SLOC)]
            or requested_measurements[str(Measurement.PP_LINES)]
        ):
            # Reuse the preprocessed output from PP_TIME when available;
            # otherwise run the preprocessor just for these measurements.
            if not pp_tmp_files:
                pp_tmp_files, tmp_map, _ = compute_pp_time(pool, compdb.entries)
            pp_tmp_pathlist = PathList.from_paths(pp_tmp_files)
            if requested_measurements[str(Measurement.PP_SLOC)]:
                results[Measurement.PP_SLOC] = compute_sloc(pp_tmp_pathlist)
            if requested_measurements[str(Measurement.PP_LINES)]:
                results[Measurement.PP_LINES] = compute_lines(pool, pp_tmp_pathlist)
        for pp_tmp_file in pp_tmp_files:
            pp_tmp_file.unlink()
        if requested_measurements[str(Measurement.SEMA_TIME)]:
            results[Measurement.SEMA_TIME] = compute_sema_time(pool, compdb.entries)

    # Re-key by measurement name so lookups line up with `columns` below.
    results = {str(k): v for k, v in results.items()}

    columns = [m for m, requested in requested_measurements.items() if requested]

    # One row per TU main file; the path is the first cell of each row.
    result_table = []
    for group in tu_main_file_pathlist.groups:
        for path in group:
            result_table.append([str(path)])

    for col in columns:
        data = results[col]
        for row in result_table:
            p = row[0]
            try:
                val = data[p]
            except KeyError:
                # Preprocessed measurements are keyed by temp-file path;
                # translate through tmp_map before giving up.
                try:
                    val = data[tmp_map[p]]
                except KeyError:
                    val = -999  # sentinel: no value recorded for this file
            row.append(val)

    # BUG FIX: removed a stray `compdb = CompilationDatabase.load(args.compdb_path)`
    # here — `args` was undefined (NameError on every run reaching this point)
    # and the reloaded value was never used.
    writer = csv.writer(sys.stdout)
    writer.writerow(["path"] + columns)
    writer.writerows(result_table)
| 231 | + |
| 232 | + |
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    default_main()
0 commit comments