Skip to content

Commit 5ab4983

Browse files
tools: Port analysis script to Python (#149)
1 parent 7ea6ca4 commit 5ab4983

File tree

5 files changed

+307
-130
lines changed

5 files changed

+307
-130
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,5 @@
66
/compile_commands.json
77

88
/ci.bazelrc
9+
10+
/**/__pycache__

tools/analyze-compdb.sh

Lines changed: 0 additions & 82 deletions
This file was deleted.

tools/analyze_compdb.py

Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import csv
5+
import enum
6+
import io
7+
import json
8+
from multiprocessing import Pool
9+
import os
10+
from pathlib import Path
11+
import re
12+
import subprocess
13+
import sys
14+
import tempfile
15+
import time
16+
import typing
17+
18+
from compdb import *
19+
20+
21+
class Measurement(enum.Enum):
    """Kinds of per-translation-unit statistics this script can compute.

    The string values double as the accepted --measure CLI values and as
    the CSV column names in the final report.
    """

    SLOC = "sloc"  # source lines of code, as counted by tokei
    LINES = "lines"  # raw line count, as counted by `wc -l`
    PP_SLOC = "pp_sloc"  # SLOC of the preprocessed (-E) output
    PP_LINES = "pp_lines"  # raw line count of the preprocessed output
    PP_TIME = "pp_time"  # wall-clock seconds for a preprocessor-only run
    SEMA_TIME = "sema_time"  # wall-clock seconds for a -fsyntax-only run

    def __str__(self):
        # Lets argparse help text and result-dict keys use the raw value.
        return self.value
31+
32+
33+
class PathList:
    """A list of existing file paths, chunked into groups small enough to be
    passed on a single subprocess command line."""

    # Invariant: every path in every group exists on disk, and each group's
    # total encoded length stays under the size cap used in from_paths.
    groups: List[List[Path]]

    def from_compdb(compdb: "CompilationDatabase"):
        """Build a PathList from the (absolutized) TU paths of a compilation
        database's entries."""
        paths = [
            entry.filepath
            if entry.filepath.is_absolute()
            else entry.directory.joinpath(entry.filepath)
            for entry in compdb.entries
        ]
        return PathList.from_paths(paths)

    def from_paths(paths: List[Path]):
        """Group the existing paths in `paths`, skipping nonexistent ones.

        Raises AssertionError if none of the paths exist.
        """
        result = PathList()
        result.groups = [[]]
        last_group_size = 0
        for path in paths:
            if not path.exists():
                continue
            path_len = len(bytes(path))
            # The arglength limit on macOS is 1MiB, be more conservative.
            if last_group_size + path_len > 900 * 1024:
                # FIXED: this previously appended to an undefined name `p`,
                # raising NameError the first time a group filled up.
                result.groups.append([path])
                last_group_size = path_len
            else:
                result.groups[-1].append(path)
                last_group_size += path_len
        assert (
            len(result.groups[0]) > 0
        )  # all files in compilation database do not exist
        return result
64+
65+
66+
# ASSUMPTION: There aren't multiple compilation commands in the compilation
# database which index the same file in two different ways.
# Otherwise, we'd need to plumb through indexes properly rather than
# working with a Path key.
def compute_sloc(pathlist: PathList) -> dict[str, int]:
    """Count source lines of code for every file in `pathlist` via tokei.

    Returns a mapping from file path (as reported by tokei) to its SLOC.
    """
    counts = {}
    for group in pathlist.groups:
        # Tokei runs in parallel, so avoid extra process spawning overhead.
        argv = ["tokei", "--files", "--output", "json", *map(str, group)]
        proc = subprocess.run(argv, capture_output=True, encoding="utf8")
        report = json.loads(proc.stdout)
        for lang_data in report.values():
            for file_report in lang_data["reports"]:
                counts[file_report["name"]] = file_report["stats"]["code"]
    return counts
80+
81+
82+
def compute_lines_impl(path: Path) -> int:
    """Return the raw newline count of `path`, as reported by `wc -l`."""
    argv = ["wc", "-l", str(path)]
    proc = subprocess.run(argv, capture_output=True, encoding="utf8")
    # Q: Should we pass in the regex from outside?
    # wc prints "<count> <path>\n" (possibly space-padded); grab the count.
    parsed = re.search(r"\s*(\d+)\s+(.*)\n", proc.stdout)
    return int(parsed.group(1))
89+
90+
91+
def compute_lines(pool: Pool, pathlist: PathList) -> dict[str, int]:
    """Map every path in `pathlist` to its `wc -l` line count, in parallel."""
    flat_paths = [p for group in pathlist.groups for p in group]
    counts = pool.map(compute_lines_impl, flat_paths)
    return dict(zip((str(p) for p in flat_paths), counts))
95+
96+
97+
A = typing.TypeVar("A")
B = typing.TypeVar("B")


def transpose(list_of_tuples: List[tuple[A, B]]) -> tuple[List[A], List[B]]:
    """Turn a list of pairs into a pair of lists.

    FIXED: an empty input now returns ([], []) instead of (), so callers
    can always unpack the result into two lists (compute_pp_time does
    `tmp_files, timings = transpose(...)`, which crashed on an empty
    compilation database).
    """
    if not list_of_tuples:
        return ([], [])
    return tuple([list(x) for x in zip(*list_of_tuples)])
103+
104+
105+
def compute_pp_time_impl(entry: CompilationDatabaseEntry) -> tuple[Path, float]:
    """Run the preprocessor for `entry` and time it.

    Returns (path of the temp file holding the preprocessed output,
    elapsed wall-clock seconds). The caller owns — and must delete —
    the temp file.
    """
    handle, tmp_path = tempfile.mkstemp(prefix=entry.filepath.stem, suffix=".pp.cpp")
    # Only the path is needed; the compiler writes the file itself.
    os.close(handle)
    begin = time.monotonic()
    entry.run_preprocessor_only(tmp_path)
    elapsed = time.monotonic() - begin
    return (Path(tmp_path), elapsed)
112+
113+
114+
def compute_pp_time(
    pool: Pool, entries: List[CompilationDatabaseEntry]
) -> tuple[List[Path], dict[str, str], dict[str, float]]:
    """Preprocess every entry in parallel, timing each run.

    Returns a tuple of:
      - the list of temp files holding the preprocessed output,
      - a mapping from original TU path to its temp-file path,
      - a mapping from original TU path to preprocessing seconds (3 dp).

    FIXED: the timing-dict part of the return annotation previously said
    dict[str, List[float]], but the values are single rounded floats.
    """
    pairs = pool.map(compute_pp_time_impl, entries)
    tmp_files, timings = transpose(pairs)
    path_to_tmp_dict = {
        str(entries[i].filepath): str(p) for (i, p) in enumerate(tmp_files)
    }
    timing_dict = {
        str(entries[i].filepath): round(t, 3) for (i, t) in enumerate(timings)
    }
    return (tmp_files, path_to_tmp_dict, timing_dict)
127+
128+
129+
def compute_sema_time_impl(entry: CompilationDatabaseEntry) -> float:
    """Return the wall-clock seconds of a -fsyntax-only run of `entry`."""
    begin = time.monotonic()
    entry.run_sema_only()
    return time.monotonic() - begin
133+
134+
135+
def compute_sema_time(
    pool: Pool, entries: List[CompilationDatabaseEntry]
) -> dict[str, float]:
    """Map each entry's TU path to its sema-only wall time in seconds (3 dp).

    FIXED: the return annotation previously claimed List[float], but the
    function returns a dict keyed by TU path.
    """
    timings = pool.map(compute_sema_time_impl, entries)
    return {str(entries[i].filepath): round(t, 3) for (i, t) in enumerate(timings)}
140+
141+
142+
def parse_arguments() -> tuple[str, dict[str, bool]]:
    """Parse the CLI arguments.

    Returns (compilation database path, mapping of measurement name to
    whether it was requested).

    Raises ValueError for an unrecognized --measure value.
    """
    parser = argparse.ArgumentParser(
        prog="analyze_compdb",
        description="Analyze various statistics about translation units in a compilation database",
    )
    parser.add_argument(
        "--compdb-path",
        help="Path to compilation database containing a single entry",
        default="compile_commands.json",
    )
    cases = [str(m) for m in list(Measurement)]
    case_help_text = ", ".join(cases[:-1]) + " and " + cases[-1] + " (or 'all')"
    parser.add_argument(
        "--measure",
        help=f"One or more of {case_help_text}. Can be supplied multiple times",
        action="append",
    )
    args = parser.parse_args()
    requested_measurements = {str(c): False for c in cases}
    # FIXED: with action="append", args.measure is None when --measure is
    # never supplied; iterating it directly raised TypeError.
    for a in args.measure or []:
        if a not in requested_measurements:
            if a == "all":
                requested_measurements = {k: True for k in requested_measurements}
            else:
                raise ValueError(f"Expected one of {case_help_text} but found {a}")
        else:
            requested_measurements[a] = True
    return (args.compdb_path, requested_measurements)
170+
171+
172+
def default_main():
    """Entry point: compute the requested measurements for every TU in the
    compilation database and write them to stdout as CSV."""
    compdb_path, requested_measurements = parse_arguments()
    compdb = CompilationDatabase.load(compdb_path)
    results = {}
    tu_main_file_pathlist = PathList.from_compdb(compdb)
    # Maps original TU path -> preprocessed temp-file path. pp_sloc/pp_lines
    # results are keyed by the temp path, so the final join needs this.
    tmp_map = {}
    with Pool() as pool:
        if requested_measurements[str(Measurement.LINES)]:
            results[Measurement.LINES] = compute_lines(pool, tu_main_file_pathlist)
        if requested_measurements[str(Measurement.SLOC)]:
            results[Measurement.SLOC] = compute_sloc(tu_main_file_pathlist)
        pp_tmp_files = []
        if requested_measurements[str(Measurement.PP_TIME)]:
            pp_tmp_files, tmp_map, results[Measurement.PP_TIME] = compute_pp_time(
                pool, compdb.entries
            )
        if (
            requested_measurements[str(Measurement.PP_SLOC)]
            or requested_measurements[str(Measurement.PP_LINES)]
        ):
            # Reuse the preprocessed output from the pp_time measurement if we
            # have it; otherwise run the preprocessor now, discarding timings.
            if not pp_tmp_files:
                pp_tmp_files, tmp_map, _ = compute_pp_time(pool, compdb.entries)
            pp_tmp_pathlist = PathList.from_paths(pp_tmp_files)
            if requested_measurements[str(Measurement.PP_SLOC)]:
                results[Measurement.PP_SLOC] = compute_sloc(pp_tmp_pathlist)
            if requested_measurements[str(Measurement.PP_LINES)]:
                results[Measurement.PP_LINES] = compute_lines(pool, pp_tmp_pathlist)
        # FIXED: delete the preprocessed temp files unconditionally; they were
        # previously removed only inside the pp_sloc/pp_lines branch, leaking
        # them for a pp_time-only run.
        for pp_tmp_file in pp_tmp_files:
            pp_tmp_file.unlink()
        if requested_measurements[str(Measurement.SEMA_TIME)]:
            results[Measurement.SEMA_TIME] = compute_sema_time(pool, compdb.entries)

    # Re-key by measurement name so lookups below match the column strings.
    results = {str(k): v for k, v in results.items()}

    columns = [m for m, requested in requested_measurements.items() if requested]

    # One row per TU, starting with its path; measurement values are appended
    # column by column below.
    result_table = []
    for group in tu_main_file_pathlist.groups:
        for path in group:
            result_table.append([str(path)])

    for col in columns:
        data = results[col]
        for i, row in enumerate(result_table):
            p = row[0]
            try:
                val = data[p]
            except KeyError:
                # pp_* measurements are keyed by the temp-file path; translate
                # through tmp_map before giving up.
                try:
                    tmp_path = tmp_map[p]
                    val = data[tmp_path]
                except KeyError:
                    # Sentinel for "no data found for this file" (e.g. the
                    # tool reported the path in a different spelling).
                    val = -999
            row.append(val)

    # FIXED: removed a stray `compdb = CompilationDatabase.load(args.compdb_path)`
    # that stood here: `args` is undefined in this scope (guaranteed NameError)
    # and the database was already loaded above.
    writer = csv.writer(sys.stdout)
    writer.writerow(["path"] + columns)
    writer.writerows(result_table)


if __name__ == "__main__":
    default_main()

tools/compdb.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import copy
2+
import json
3+
import pathlib
4+
import shlex
5+
import subprocess
6+
from typing import List
7+
8+
9+
class CompilationDatabaseEntry:
10+
filepath: pathlib.Path
11+
directory: pathlib.Path
12+
arguments: List[str]
13+
14+
def __init__(self, entry):
15+
self.filepath = pathlib.Path(entry["file"])
16+
self.directory = pathlib.Path(entry["directory"])
17+
try:
18+
self.arguments = shlex.split(entry["command"])
19+
except KeyError:
20+
self.arguments = entry(["arguments"])
21+
22+
def change_tu_filepath(self, new_path: str):
23+
old_path = str(self.filepath)
24+
# ASSUMPTION: In a normal compilation command, we only expect there to
25+
# be a path to a single TU in the argument list, since the compilation
26+
# command will involve generating object code for a single TU.
27+
#
28+
# Moreover, files typically aren't present at the project root.
29+
#
30+
# These two factors mean that it's very unlikely for the argument list
31+
# to have two files like 'cake.c' and 'dessert/cake.c'.
32+
self.arguments = [arg.replace(old_path, new_path) for arg in self.arguments]
33+
self.filepath = pathlib.Path(new_path)
34+
35+
def to_dict(self):
36+
return {
37+
"directory": str(self.directory),
38+
"file": str(self.filepath),
39+
"arguments": self.arguments[:], # defensive copy
40+
}
41+
42+
def run_preprocessor_only(self, preprocessed_tu_path: pathlib.Path | str):
43+
args = copy.deepcopy(self.arguments)
44+
args += ["-E", "-o", str(preprocessed_tu_path)]
45+
subprocess.run(args, cwd=self.directory).check_returncode()
46+
47+
def run_sema_only(self):
48+
args = copy.deepcopy(self.arguments)
49+
args += ["-fsyntax-only", "-o", "/dev/null"]
50+
subprocess.run(args, cwd=self.directory).check_returncode()
51+
52+
53+
class CompilationDatabase:
    """An in-memory compilation database: just the list of its entries."""

    entries: List[CompilationDatabaseEntry]

    def load(path: pathlib.Path | str):
        """Read a compile_commands.json file and wrap each of its entries."""
        db = CompilationDatabase()
        with open(path) as compdb_file:
            raw_entries = json.load(compdb_file)
        db.entries = [CompilationDatabaseEntry(e) for e in raw_entries]
        return db

0 commit comments

Comments
 (0)