Skip to content

Commit 3e8eae2

Browse files
authored
Merge pull request #98 from Pennycook/codebase
Refactor codebase dictionary into CodeBase class
2 parents e003bc3 + f7e6b02 commit 3e8eae2

File tree

23 files changed

+363
-306
lines changed

23 files changed

+363
-306
lines changed

bin/codebasin

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import logging
1010
import os
1111
import sys
1212

13-
from codebasin import config, finder, report, util
13+
from codebasin import CodeBase, config, finder, report, util
1414
from codebasin.walkers.platform_mapper import PlatformMapper
1515

1616
version = "1.2.0"
@@ -107,14 +107,7 @@ def main():
107107
# Determine the root directory based on where codebasin is run.
108108
rootdir = os.path.realpath(os.getcwd())
109109

110-
# Set up a default codebase and configuration object.
111-
codebase = {
112-
"files": [],
113-
"platforms": [],
114-
"exclude_files": set(),
115-
"exclude_patterns": args.excludes,
116-
"rootdir": rootdir,
117-
}
110+
# Set up a default configuration object.
118111
configuration = {}
119112

120113
# Load the analysis file if it exists.
@@ -132,8 +125,7 @@ def main():
132125

133126
if "codebase" in analysis_toml:
134127
if "exclude" in analysis_toml["codebase"]:
135-
excludes = analysis_toml["codebase"]["exclude"]
136-
codebase["exclude_patterns"] += excludes
128+
args.excludes += analysis_toml["codebase"]["exclude"]
137129

138130
for name in args.platforms:
139131
if name not in analysis_toml["platform"].keys():
@@ -142,16 +134,20 @@ def main():
142134
+ "does not exist in the configuration file.",
143135
)
144136

137+
cmd_platforms = args.platforms.copy()
145138
for name in analysis_toml["platform"].keys():
146-
if args.platforms and name not in args.platforms:
139+
if cmd_platforms and name not in cmd_platforms:
147140
continue
148141
if "commands" not in analysis_toml["platform"][name]:
149142
raise ValueError(f"Missing 'commands' for platform {name}")
150143
p = analysis_toml["platform"][name]["commands"]
151144
db = config.load_database(p, rootdir)
152-
codebase["platforms"].append(name)
145+
args.platforms.append(name)
153146
configuration.update({name: db})
154147

148+
# Construct a codebase object associated with the root directory.
149+
codebase = CodeBase(rootdir, exclude_patterns=args.excludes)
150+
155151
# Parse the source tree, and determine source line associations.
156152
# The trees and associations are housed in state.
157153
state = finder.find(
@@ -180,8 +176,7 @@ def main():
180176
if report_enabled("clustering"):
181177
basename = os.path.basename(args.analysis_file)
182178
filename = os.path.splitext(basename)[0]
183-
platform_names = [p for p in codebase["platforms"]]
184-
output_prefix = "-".join([filename] + platform_names)
179+
output_prefix = "-".join([filename] + args.platforms)
185180

186181
clustering_output_name = output_prefix + "-dendrogram.png"
187182
clustering = report.clustering(clustering_output_name, setmap)

codebasin/__init__.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
# Copyright (C) 2019-2024 Intel Corporation
22
# SPDX-License-Identifier: BSD-3-Clause
3+
import os
34
import shlex
45
import warnings
6+
from collections.abc import Iterable
7+
from pathlib import Path
8+
9+
import pathspec
510

611
import codebasin.source
712
import codebasin.walkers
@@ -123,3 +128,112 @@ def from_json(cls, instance: dict):
123128
command=command,
124129
output=output,
125130
)
131+
132+
133+
class CodeBase:
134+
"""
135+
A representation of all source files in the code base.
136+
137+
Attributes
138+
----------
139+
directories: list[str | os.PathLike[str]]
140+
The set of source directories that make up the code base.
141+
142+
exclude_patterns: list[str]
143+
A set of patterns describing source files excluded from the code base.
144+
"""
145+
146+
def __init__(
147+
self,
148+
*directories: str | os.PathLike[str],
149+
exclude_patterns: Iterable[str] = [],
150+
):
151+
"""
152+
Raises
153+
------
154+
TypeError
155+
If any directory in `directories` is not a path.
156+
If `exclude_patterns` is not a list of strings.
157+
"""
158+
if not isinstance(exclude_patterns, list):
159+
raise TypeError("'exclude_patterns' must be a list.")
160+
if not all([isinstance(d, (str, os.PathLike)) for d in directories]):
161+
raise TypeError(
162+
"Each directory in 'directories' must be PathLike.",
163+
)
164+
if not all([isinstance(p, str) for p in exclude_patterns]):
165+
raise TypeError(
166+
"Each pattern in 'exclude_patterns' must be a string.",
167+
)
168+
self._directories = [Path(d).resolve() for d in directories]
169+
self._excludes = exclude_patterns
170+
171+
def __repr__(self):
172+
return (
173+
f"CodeBase(directories={self.directories}, "
174+
+ f"exclude_patterns={self.exclude_patterns})"
175+
)
176+
177+
@property
178+
def directories(self):
179+
return [str(d) for d in self._directories]
180+
181+
@property
182+
def exclude_patterns(self):
183+
return self._excludes
184+
185+
def __contains__(self, path: os.PathLike) -> bool:
186+
"""
187+
Returns
188+
-------
189+
bool
190+
True if `path` is a recognized source file in one of the code
191+
base's listed directories and does not match any exclude
192+
pattern(s).
193+
"""
194+
path = Path(path).resolve()
195+
196+
# Files that don't exist aren't part of the code base.
197+
if not path.exists():
198+
return False
199+
200+
# Directories cannot be source files.
201+
if path.is_dir():
202+
return False
203+
204+
# Files with unrecognized extensions are not source files.
205+
if not codebasin.source.is_source_file(path):
206+
return False
207+
208+
# Files outside of any directory are not in the code base.
209+
# Store the root for evaluation of relative exclude paths later.
210+
root = None
211+
for directory in self.directories:
212+
if path.is_relative_to(directory):
213+
root = directory
214+
break
215+
if root is None:
216+
return False
217+
218+
# Files matching an exclude pattern are not in the code base.
219+
#
220+
# Use GitIgnoreSpec to match git behavior in weird corner cases.
221+
# Convert relative paths to match .gitignore subdirectory behavior.
222+
spec = pathspec.GitIgnoreSpec.from_lines(self.exclude_patterns)
223+
try:
224+
relative_path = path.relative_to(root)
225+
if spec.match_file(relative_path):
226+
return False
227+
except ValueError:
228+
pass
229+
230+
return True
231+
232+
def __iter__(self):
233+
"""
234+
Iterate over all files in the code base by walking each directory.
235+
"""
236+
for directory in self.directories:
237+
for path in Path(directory).rglob("*"):
238+
if self.__contains__(path):
239+
yield str(path)

codebasin/finder.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import collections
99
import logging
1010
import os
11+
from pathlib import Path
1112

1213
from codebasin import file_parser, platform, preprocessor, util
1314
from codebasin.language import FileLanguage
@@ -140,13 +141,18 @@ def find(
140141
lines to platforms.
141142
"""
142143

144+
# Ensure rootdir is a string for compatibility with legacy code.
145+
# TODO: Remove this once all other functionality is ported to Path.
146+
if isinstance(rootdir, Path):
147+
rootdir = str(rootdir)
148+
143149
# Build a tree for each unique file for all platforms.
144150
state = ParserState(summarize_only)
145-
for f in codebase["files"]:
151+
for f in codebase:
146152
state.insert_file(f)
147153
for p in configuration:
148154
for e in configuration[p]:
149-
if e["file"] not in codebase["files"]:
155+
if e["file"] not in codebase:
150156
filename = e["file"]
151157
if legacy_warnings:
152158
log.warning(

codebasin/walkers/exporter.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77
from codebasin import util
88
from codebasin.preprocessor import CodeNode, FileNode
9-
from codebasin.walkers.platform_mapper import exclude
109
from codebasin.walkers.tree_walker import TreeWalker
1110

1211
log = logging.getLogger("codebasin")
@@ -38,10 +37,7 @@ def walk(self, state):
3837
def _export_node(self, _filename, _node, _map):
3938
# Do not export files that the user does not consider to be part of
4039
# the codebase
41-
if isinstance(_node, FileNode) and exclude(
42-
_node.filename,
43-
self.codebase,
44-
):
40+
if isinstance(_node, FileNode) and _node.filename not in self.codebase:
4541
return
4642

4743
if isinstance(_node, CodeNode):

codebasin/walkers/platform_mapper.py

Lines changed: 1 addition & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2,44 +2,13 @@
22
# SPDX-License-Identifier: BSD-3-Clause
33

44
import logging
5-
import os
6-
7-
import pathspec
85

96
from codebasin.preprocessor import CodeNode, FileNode
107
from codebasin.walkers.tree_mapper import TreeMapper
118

129
log = logging.getLogger("codebasin")
1310

1411

15-
def exclude(filename, cb):
16-
# Always exclude files that were explicitly listed as excluded.
17-
if filename in cb["exclude_files"]:
18-
log.info(f"Excluding {filename}; matches 'exclude_files'.")
19-
return True
20-
21-
# Only exclude files outside of the root directory if they weren't
22-
# explicitly listed as part of the codebase.
23-
path = os.path.realpath(filename)
24-
if not path.startswith(cb["rootdir"]):
25-
if filename in cb["files"]:
26-
return False
27-
log.info(f"Excluding {filename}; outside of root directory.")
28-
return True
29-
30-
# Exclude files matching an exclude pattern.
31-
#
32-
# Use GitIgnoreSpec to match git behavior in weird corner cases.
33-
# Convert relative paths to match .gitignore subdirectory behavior.
34-
spec = pathspec.GitIgnoreSpec.from_lines(cb["exclude_patterns"])
35-
rel = os.path.relpath(path, cb["rootdir"])
36-
if spec.match_file(rel):
37-
log.info(f"Excluding {filename}; matches exclude pattern.")
38-
return True
39-
40-
return False
41-
42-
4312
class PlatformMapper(TreeMapper):
4413
"""
4514
Specific TreeMapper that builds a mapping of nodes to platforms.
@@ -57,10 +26,7 @@ def _map_node(self, _node, _map):
5726
"""
5827
# Do not map files that the user does not consider to be part of
5928
# the codebase
60-
if isinstance(_node, FileNode) and exclude(
61-
_node.filename,
62-
self.codebase,
63-
):
29+
if isinstance(_node, FileNode) and _node.filename not in self.codebase:
6430
return
6531

6632
if isinstance(_node, CodeNode):

docs/source/analysis.rst

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,17 +36,24 @@ The table's name is the name of the platform, and we can use any meaningful
3636
string. The ``commands`` key tells CBI where to find the compilation database
3737
for this platform.
3838

39+
.. important::
40+
41+
By default, ``codebasin`` searches the current working directory for source
42+
files to include in its analysis. Since we'll be running in the ``src``
43+
directory, we need to specify the ``commands`` paths relative to the
44+
``src`` directory or as absolute paths.
45+
3946
In our example, we have two platforms that we're calling "cpu" and "gpu",
4047
and our build directories are called ``build-cpu`` and ``build-gpu``, so
4148
our platform definitions should look like this:
4249

4350
.. code-block:: toml
4451
4552
[platform.cpu]
46-
commands = "build-cpu/compile_commands.json"
53+
commands = "../build-cpu/compile_commands.json"
4754
4855
[platform.gpu]
49-
commands = "build-gpu/compile_commands.json"
56+
commands = "../build-gpu/compile_commands.json"
5057
5158
.. warning::
5259
Platform names are case sensitive! The names "cpu" and "CPU" would refer to
@@ -56,7 +63,8 @@ our platform definitions should look like this:
5663
Running ``codebasin``
5764
#####################
5865

59-
Running ``codebasin`` with this analysis file gives the following output:
66+
Running ``codebasin`` in the ``src`` directory with this analysis file gives
67+
the following output:
6068

6169
.. code-block:: text
6270
:emphasize-lines: 4,5,6,7,9
@@ -86,6 +94,15 @@ used only by the GPU compilation, and 17 lines of code shared by both
8694
platforms. Plugging these numbers into the equation for code divergence gives
8795
0.45.
8896

97+
.. caution::
98+
If we had run ``codebasin`` in the parent directory, everything in the
99+
``src``, ``build-cpu`` and ``build-gpu`` directories would have been
100+
included in the analysis. For our sample code base, this would have
101+
resulted in over 2000 lines of code being identified as unused! Why so
102+
many? CMake generates multiple ``*.cpp`` files, which it uses as part of
103+
the build process. ``codebasin`` will analyze such files unless we tell it
104+
not to (more on that later).
105+
89106

90107
Filtering Platforms
91108
###################

0 commit comments

Comments
 (0)