llvm · thevinster · Jul 4, 2025 · Jul 4, 2025 · Jul 4, 2025 · Jul 7, 2025
diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py
@@ -0,0 +1,52 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+r"""Combine multiple training corpora into a single training corpus.
+
+Currently only supports the case where the corpora share the same
+configurables except the "modules" field.
+
+Usage: we'd like to combine training corpus corpus1 and corpus2 into
+combinedcorpus; we first structure the files as follows:
+
+combinedcorpus
+combinedcorpus/corpus1
+combinedcorpus/corpus2
+
+Running this script with
+
+python3 \
+compiler_opt/tools/combine_training_corpus.py \
+  --root_dir=$PATH_TO_combinedcorpus
+
+generates combinedcorpus/corpus_description.json file. In this way corpus1
+and corpus2 are combined into combinedcorpus.
+"""
+
+import argparse
+import logging
+
+from mlgo.corpus import combine_training_corpus_lib
+from mlgo.corpus import flags
+
+
+def parse_args_and_run():
+    """Parse the command-line flags and pass the resulting namespace to main()."""
+    cli = argparse.ArgumentParser(
+        description="A tool for combining multiple training corpora"
+    )
+    cli.add_argument(
+        "--root_dir", type=str, help="The root dir of module paths to combine."
+    )
+    flags.add_verbosity_arguments(cli)
+    main(cli.parse_args())
+
+
+def main(args):
+    """Set the log level from args.verbosity, then combine corpora in args.root_dir."""
+    logging.basicConfig(level=args.verbosity)
+    combine_training_corpus_lib.combine_corpus(args.root_dir)
+
+
+if __name__ == "__main__":
+    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py
@@ -0,0 +1,184 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Extract IR for training.
+
+Extract IR for training, either from a compile_commands.json file produced by
+cmake, or a linker parameter list file.
+
+Only run with
+'python compiler_opt/tools/extract_ir.py ...'
+
+The compilation is assumed to have been performed with clang, using
+-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
+
+In a distributed ThinLTO case, the compilation is assumed to have been performed
+specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
+
+In a local ThinLTO case, the compilation is assumed to have been performed
+specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
+
+To change the logging verbosity, set the --verbosity flag to the desired level.
+Setting it to a specific level will enable all messages at that level and
+higher. Exact values can be found by invoking the script with --help.
+"""
+
+import argparse
+import json
+import logging
+
+from mlgo.corpus import extract_ir_lib
+from mlgo.corpus import flags
+
+
+def parse_args_and_run():
+    """Define and parse the extract_ir command-line flags, then call main()."""
+    parser = argparse.ArgumentParser(
+        description="A tool for making a corpus from build artifacts"
+    )
+    parser.add_argument(
+        "--input",
+        type=str,
+        help="Input file or directory - either compile_commands.json, a linker "
+        "parameter list, or a path to a directory containing object files.",
+    )
+    parser.add_argument(
+        "--input_type",
+        type=str,
+        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
+        choices=["json", "params", "directory", "bazel_aquery"],
+        default="json",
+        nargs="?",
+    )
+    parser.add_argument("--output_dir", type=str, help="Output directory")
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        help="Number of parallel workers for objcopy. `None` for maximum available.",
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--llvm_objcopy_path",
+        type=str,
+        help="Path to llvm-objcopy",
+        default="llvm-objcopy",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--obj_base_dir",
+        type=str,
+        help="Base directory for object files. Defaults to current working dir.",
+        default="",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--cmd_filter",
+        type=str,
+        help="Include only those modules with a command line matching this regular "
+        "expression. Set it to None to not perform any filtering. Note that the "
+        "regular expression is applied independently for each separate command line "
+        "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
+        "with thinlto_build=local.",
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--thinlto_build",
+        type=str,
+        help="Set if the build was performed with either 'distributed' or 'local' "
+        "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
+        "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
+        "the distributed case or -Wl,--save-temps=import and "
+        "-Wl,--thinlto-emit-index-files passed in the local case",
+        choices=["distributed", "local"],
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--cmd_section_name",
+        type=str,
+        help="The section name passed to llvm-objcopy. For ELF object files, the "
+        "default .llvmcmd is correct. For Mach-O object files, one should use "
+        "something like __LLVM,__cmdline",
+        default=".llvmcmd",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--bitcode_section_name",
+        type=str,
+        help="The section name passed to llvm-objcopy. For ELF object files, the "
+        "default .llvmbc is correct. For Mach-O object files, one should use "
+        "__LLVM,__bitcode",
+        default=".llvmbc",
+        nargs="?",
+    )
+    flags.add_verbosity_arguments(parser)
+    main(parser.parse_args())
+
+
+def main(args):
+    """Load the objects selected by `args`, extract their IR, write the manifest.
+
+    Raises ValueError on conflicting --input/--thinlto_build or unknown --input_type.
+    """
+    logging.basicConfig(level=args.verbosity)
+
+    if args.input is not None and args.thinlto_build == "local":
+        raise ValueError("--thinlto_build=local cannot be run with --input")
+    if args.input is None:
+        if args.thinlto_build != "local":
+            raise ValueError("--input or --thinlto_build=local must be provided")
+        objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
+    elif args.input_type == "json":
+        with open(args.input, encoding="utf-8") as f:
+            objs = extract_ir_lib.load_from_compile_commands(
+                json.load(f), args.output_dir
+            )
+    elif args.input_type == "params":
+        if not args.obj_base_dir:
+            logging.info(
+                "--obj_base_dir is unspecified, assuming current directory. "
+                "If no objects are found, use this option to specify the root "
+                "directory for the object file paths in the input file."
+            )
+        with open(args.input, encoding="utf-8") as f:
+            objs = extract_ir_lib.load_from_lld_params(
+                [line.strip() for line in f], args.obj_base_dir, args.output_dir
+            )
+    elif args.input_type == "directory":
+        logging.warning(
+            "Using the directory input is only recommended if the build system "
+            "your project uses does not support any structured output that "
+            "ml-compiler-opt understands. If your build system provides a "
+            "structured compilation database, use that instead"
+        )
+        objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
+    elif args.input_type == "bazel_aquery":
+        with open(args.input, encoding="utf-8") as aquery_json_handle:
+            objs = extract_ir_lib.load_bazel_aquery(
+                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
+            )
+    else:  # Reachable: a bare --input_type (nargs="?") parses to None.
+        raise ValueError(f"Unknown input type: {args.input_type}")
+
+    relative_output_paths = extract_ir_lib.run_extraction(
+        objs,
+        args.num_workers,
+        args.llvm_objcopy_path,
+        args.cmd_filter,
+        args.thinlto_build,
+        args.cmd_section_name,
+        args.bitcode_section_name,
+    )
+
+    extract_ir_lib.write_corpus_manifest(
+        args.thinlto_build, relative_output_paths, args.output_dir
+    )
+
+    converted = len(objs) - relative_output_paths.count(None)
+    logging.info("Converted %d files out of %d", converted, len(objs))
+
+
+if __name__ == "__main__":
+    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py
@@ -0,0 +1,53 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Tool for making a corpus from arbitrary bitcode.
+
+To create a corpus from a set of bitcode files in an input directory, run
+the following command:
+
+PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
+  --input_dir=<path to input directory> \
+  --output_dir=<path to output directory> \
+  --default_args="<list of space separated flags>"
+"""
+
+import argparse
+import logging
+
+from mlgo.corpus import make_corpus_lib
+
+
+def parse_args_and_run():
+    """Parse the command-line flags and pass the resulting namespace to main()."""
+    cli = argparse.ArgumentParser(
+        description="A tool for making a corpus from arbitrary bitcode"
+    )
+    cli.add_argument("--input_dir", type=str, help="The input directory.")
+    cli.add_argument("--output_dir", type=str, help="The output directory.")
+    cli.add_argument(
+        "--default_args",
+        type=str,
+        help="The compiler flags to compile with when using downstream tooling.",
+        default="",
+        nargs="?",
+    )
+    main(cli.parse_args())
+
+
+def main(args):
+    """Copy the bitcode under args.input_dir to args.output_dir and write a manifest."""
+    logging.warning(
+        "Using this tool does not guarantee that the bitcode is taken at "
+        "the correct stage for consumption during model training. Make "
+        "sure to validate assumptions about where the bitcode is coming "
+        "from before using it in production."
+    )
+    relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir)
+    make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir)
+    extra_flags = (args.default_args or "").split()  # bare --default_args gives None
+    make_corpus_lib.write_corpus_manifest(relative_paths, args.output_dir, extra_flags)
+
+
+if __name__ == "__main__":
+    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
@@ -0,0 +1 @@
+../../combine_training_corpus.py