From 497d8f76d2df3dbd396c8b734b96ce81b12dcea7 Mon Sep 17 00:00:00 2001 From: Vincent Lee Date: Thu, 3 Jul 2025 18:22:55 -0700 Subject: [PATCH 1/7] [mlgo-utils] Hoist entry script out to the correct directory --- .../mlgo-utils/{mlgo/corpus => }/combine_training_corpus.py | 0 llvm/utils/mlgo-utils/{mlgo/corpus => }/extract_ir.py | 0 llvm/utils/mlgo-utils/{mlgo/corpus => }/make_corpus.py | 0 utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 6 +++--- 4 files changed, 3 insertions(+), 3 deletions(-) rename llvm/utils/mlgo-utils/{mlgo/corpus => }/combine_training_corpus.py (100%) rename llvm/utils/mlgo-utils/{mlgo/corpus => }/extract_ir.py (100%) rename llvm/utils/mlgo-utils/{mlgo/corpus => }/make_corpus.py (100%) diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py similarity index 100% rename from llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py rename to llvm/utils/mlgo-utils/combine_training_corpus.py diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py similarity index 100% rename from llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py rename to llvm/utils/mlgo-utils/extract_ir.py diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py similarity index 100% rename from llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py rename to llvm/utils/mlgo-utils/make_corpus.py diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index b618c74c19da1..db8a92fd25de6 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -5210,8 +5210,8 @@ py_binary( py_binary( name = "extract_ir", srcs = [ + "utils/mlgo-utils/extract_ir.py", "utils/mlgo-utils/mlgo/__init__.py", - "utils/mlgo-utils/mlgo/corpus/extract_ir.py", "utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py", 
"utils/mlgo-utils/mlgo/corpus/flags.py", ], @@ -5221,8 +5221,8 @@ py_binary( py_binary( name = "combine_training_corpus", srcs = [ + "utils/mlgo-utils/combine_training_corpus.py", "utils/mlgo-utils/mlgo/__init__.py", - "utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py", "utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py", "utils/mlgo-utils/mlgo/corpus/flags.py", ], @@ -5232,8 +5232,8 @@ py_binary( py_binary( name = "make_corpus", srcs = [ + "utils/mlgo-utils/make_corpus.py", "utils/mlgo-utils/mlgo/__init__.py", - "utils/mlgo-utils/mlgo/corpus/make_corpus.py", "utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py", ], imports = ["utils/mlgo-utils"], From ee72352a0ec409fe3f88bb09136261923c24d8ba Mon Sep 17 00:00:00 2001 From: Vincent Lee Date: Fri, 4 Jul 2025 11:13:02 -0700 Subject: [PATCH 2/7] Add wrapper instead of moving --- .../mlgo-utils/combine_training_corpus.py | 57 +----- llvm/utils/mlgo-utils/extract_ir.py | 189 +----------------- llvm/utils/mlgo-utils/make_corpus.py | 58 +----- .../mlgo/corpus/combine_training_corpus.py | 52 +++++ .../mlgo-utils/mlgo/corpus/extract_ir.py | 184 +++++++++++++++++ .../mlgo-utils/mlgo/corpus/make_corpus.py | 53 +++++ 6 files changed, 310 insertions(+), 283 deletions(-) mode change 100644 => 100755 llvm/utils/mlgo-utils/combine_training_corpus.py mode change 100644 => 100755 llvm/utils/mlgo-utils/extract_ir.py mode change 100644 => 100755 llvm/utils/mlgo-utils/make_corpus.py create mode 100644 llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py create mode 100644 llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py create mode 100644 llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py old mode 100644 new mode 100755 index 9884d6696a43f..7a1d870ad7e38 --- a/llvm/utils/mlgo-utils/combine_training_corpus.py +++ b/llvm/utils/mlgo-utils/combine_training_corpus.py @@ -1,52 +1,9 @@ -# Part of the LLVM Project, 
under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -r"""Combine multiple training corpus into a single training corpus. +#!/usr/bin/env python3 -Currently only support the case that multiple corpus share the same -configurables except the "modules" field. +import re +import sys +from mlgo.corpus.combine_training_corpus import parse_args_and_run +if __name__ == '__main__': + sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) + sys.exit(parse_args_and_run()) -Usage: we'd like to combine training corpus corpus1 and corpus2 into -combinedcorpus; we first structure the files as follows: - -combinedcorpus -combinedcorpus/corpus1 -combinedcorpus/corpus2 - -Running this script with - -python3 \ -compiler_opt/tools/combine_training_corpus.py \ - --root_dir=$PATH_TO_combinedcorpus - -generates combinedcorpus/corpus_description.json file. In this way corpus1 -and corpus2 are combined into combinedcorpus. -""" - -import argparse -import logging - -from mlgo.corpus import combine_training_corpus_lib -from mlgo.corpus import flags - - -def parse_args_and_run(): - parser = argparse.ArgumentParser( - description="A tool for combining multiple training corpora" - ) - parser.add_argument( - "--root_dir", type=str, help="The root dir of module paths to combine." 
- ) - flags.add_verbosity_arguments(parser) - args = parser.parse_args() - main(args) - - -def main(args): - logging.basicConfig(level=args.verbosity) - - combine_training_corpus_lib.combine_corpus(args.root_dir) - - -if __name__ == "__main__": - parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py old mode 100644 new mode 100755 index 3101cef196b4a..589a5c50af726 --- a/llvm/utils/mlgo-utils/extract_ir.py +++ b/llvm/utils/mlgo-utils/extract_ir.py @@ -1,184 +1,9 @@ -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -"""Extract IR for training. +#!/usr/bin/env python3 -Extract IR for training, either from a compile_commands.json file produced by -cmake, or a linker parameter list file. +import re +import sys +from mlgo.corpus.extract_ir import parse_args_and_run +if __name__ == '__main__': + sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) + sys.exit(parse_args_and_run()) -Only run with -'python compiler_opt/tools/extract_ir.py ...' - -The compilation is assumed to have been performed with clang, using --fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all) - -In a distributed ThinLTO case, the compilation is assumed to have been performed -specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt. - -In a local ThinLTO case, the compilation is assumedto have been performed -specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files - -To change the logging verbosity, set the --verbosity flag to the desired level. -Setting it to a specific level will enable all messages at that level and -higher. Exact values can be found by invoking the script with --help. 
-""" - -import argparse -import json -import logging - -from mlgo.corpus import extract_ir_lib -from mlgo.corpus import flags - - -def parse_args_and_run(): - parser = argparse.ArgumentParser( - description="A tool for making a corpus from build artifacts" - ) - parser.add_argument( - "--input", - type=str, - help="Input file or directory - either compile_commands.json, a linker " - "parameter list, or a path to a directory containing object files.", - ) - parser.add_argument( - "--input_type", - type=str, - help="Input file type - JSON, LLD params, directory, or bazel aquery.", - choices=["json", "params", "directory", "bazel_aquery"], - default="json", - nargs="?", - ) - parser.add_argument("--output_dir", type=str, help="Output directory") - parser.add_argument( - "--num_workers", - type=int, - help="Number of parallel works for objcopy. `None` for maximum available.", - default=None, - nargs="?", - ) - parser.add_argument( - "--llvm_objcopy_path", - type=str, - help="Path to llvm-objcopy", - default="llvm-objcopy", - nargs="?", - ) - parser.add_argument( - "--obj_base_dir", - type=str, - help="Base directory for object files. Defaults to current working dir.", - default="", - nargs="?", - ) - parser.add_argument( - "--cmd_filter", - type=str, - help="Include only those modules with a command line matching this regular " - "expression. Set it to None to not perform any filtering. Note that the " - "regular expression is applied independently for each separate command line " - "option. For example, ^-Oz$ will match Oz built binaries. This does not work " - "with thinlto_build=lld.", - default=None, - nargs="?", - ) - parser.add_argument( - "--thinlto_build", - type=str, - help="Set if the build was performed with either 'distributed' or 'local' " - "ThinLTO. This ensures the thinlto.bc files are also copied. 
The build is " - "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in " - "the distributed case or -Wl,--save-temps=import and " - "-Wl,--thinlto-emit-index-files passed in the local case", - choices=["distributed", "local"], - default=None, - nargs="?", - ) - parser.add_argument( - "--cmd_section_name", - type=str, - help="The section name passed to llvm-objcopy. For ELF object files, the " - "default .llvmcmd is correct. For Mach-O object files, one should use " - "something like __LLVM,__cmdline", - default=".llvmcmd", - nargs="?", - ) - parser.add_argument( - "--bitcode_section_name", - type=str, - help="The section name passed to llvm-objcopy. For ELF object files, the " - "default .llvmbc is correct. For Mach-O object files, one should use " - "__LLVM,__bitcode", - default=".llvmbc", - nargs="?", - ) - flags.add_verbosity_arguments(parser) - args = parser.parse_args() - main(args) - - -def main(args): - logging.basicConfig(level=args.verbosity) - - objs = [] - if args.input is not None and args.thinlto_build == "local": - raise ValueError("--thinlto_build=local cannot be run with --input") - if args.input is None: - if args.thinlto_build != "local": - raise ValueError("--input or --thinlto_build=local must be provided") - objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir) - elif args.input_type == "json": - with open(args.input, encoding="utf-8") as f: - objs = extract_ir_lib.load_from_compile_commands( - json.load(f), args.output_dir - ) - elif args.input_type == "params": - if not args.obj_base_dir: - logging.info( - "-obj_base_dir is unspecified, assuming current directory. " - "If no objects are found, use this option to specify the root " - "directory for the object file paths in the input file." 
- ) - with open(args.input, encoding="utf-8") as f: - objs = extract_ir_lib.load_from_lld_params( - [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir - ) - elif args.input_type == "directory": - logging.warning( - "Using the directory input is only recommended if the build system " - "your project uses does not support any structured output that " - "ml-compiler-opt understands. If your build system provides a " - "structured compilation database, use that instead" - ) - objs = extract_ir_lib.load_from_directory(args.input, args.output_dir) - elif args.input_type == "bazel_aquery": - with open(args.input, encoding="utf-8") as aquery_json_handle: - objs = extract_ir_lib.load_bazel_aquery( - json.load(aquery_json_handle), args.obj_base_dir, args.output_dir - ) - else: - logging.error("Unknown input type: %s", args.input_type) - - relative_output_paths = extract_ir_lib.run_extraction( - objs, - args.num_workers, - args.llvm_objcopy_path, - args.cmd_filter, - args.thinlto_build, - args.cmd_section_name, - args.bitcode_section_name, - ) - - extract_ir_lib.write_corpus_manifest( - args.thinlto_build, relative_output_paths, args.output_dir - ) - - logging.info( - "Converted %d files out of %d", - len(objs) - relative_output_paths.count(None), - len(objs), - ) - - -if __name__ == "__main__": - parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py old mode 100644 new mode 100755 index 221486e16c6e0..5b4a9bef486ff --- a/llvm/utils/mlgo-utils/make_corpus.py +++ b/llvm/utils/mlgo-utils/make_corpus.py @@ -1,53 +1,9 @@ -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -"""Tool for making a corpus from arbitrary bitcode. 
+#!/usr/bin/env python3 -To create a corpus from a set of bitcode files in an input directory, run -the following command: +import re +import sys +from mlgo.corpus.make_corpus import parse_args_and_run +if __name__ == '__main__': + sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) + sys.exit(parse_args_and_run()) -PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \ - --input_dir= \ - --output_dir= \ - --default_args="" -""" - -import argparse -import logging - -from mlgo.corpus import make_corpus_lib - - -def parse_args_and_run(): - parser = argparse.ArgumentParser( - description="A tool for making a corpus from arbitrary bitcode" - ) - parser.add_argument("--input_dir", type=str, help="The input directory.") - parser.add_argument("--output_dir", type=str, help="The output directory.") - parser.add_argument( - "--default_args", - type=str, - help="The compiler flags to compile with when using downstream tooling.", - default="", - nargs="?", - ) - args = parser.parse_args() - main(args) - - -def main(args): - logging.warning( - "Using this tool does not guarantee that the bitcode is taken at " - "the correct stage for consumption during model training. Make " - "sure to validate assumptions about where the bitcode is coming " - "from before using it in production." 
- ) - relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir) - make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir) - make_corpus_lib.write_corpus_manifest( - relative_paths, args.output_dir, args.default_args.split() - ) - - -if __name__ == "__main__": - parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py new file mode 100644 index 0000000000000..9884d6696a43f --- /dev/null +++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py @@ -0,0 +1,52 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +r"""Combine multiple training corpus into a single training corpus. + +Currently only support the case that multiple corpus share the same +configurables except the "modules" field. + +Usage: we'd like to combine training corpus corpus1 and corpus2 into +combinedcorpus; we first structure the files as follows: + +combinedcorpus +combinedcorpus/corpus1 +combinedcorpus/corpus2 + +Running this script with + +python3 \ +compiler_opt/tools/combine_training_corpus.py \ + --root_dir=$PATH_TO_combinedcorpus + +generates combinedcorpus/corpus_description.json file. In this way corpus1 +and corpus2 are combined into combinedcorpus. +""" + +import argparse +import logging + +from mlgo.corpus import combine_training_corpus_lib +from mlgo.corpus import flags + + +def parse_args_and_run(): + parser = argparse.ArgumentParser( + description="A tool for combining multiple training corpora" + ) + parser.add_argument( + "--root_dir", type=str, help="The root dir of module paths to combine." 
+ ) + flags.add_verbosity_arguments(parser) + args = parser.parse_args() + main(args) + + +def main(args): + logging.basicConfig(level=args.verbosity) + + combine_training_corpus_lib.combine_corpus(args.root_dir) + + +if __name__ == "__main__": + parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py new file mode 100644 index 0000000000000..3101cef196b4a --- /dev/null +++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py @@ -0,0 +1,184 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Extract IR for training. + +Extract IR for training, either from a compile_commands.json file produced by +cmake, or a linker parameter list file. + +Only run with +'python compiler_opt/tools/extract_ir.py ...' + +The compilation is assumed to have been performed with clang, using +-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all) + +In a distributed ThinLTO case, the compilation is assumed to have been performed +specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt. + +In a local ThinLTO case, the compilation is assumedto have been performed +specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files + +To change the logging verbosity, set the --verbosity flag to the desired level. +Setting it to a specific level will enable all messages at that level and +higher. Exact values can be found by invoking the script with --help. 
+""" + +import argparse +import json +import logging + +from mlgo.corpus import extract_ir_lib +from mlgo.corpus import flags + + +def parse_args_and_run(): + parser = argparse.ArgumentParser( + description="A tool for making a corpus from build artifacts" + ) + parser.add_argument( + "--input", + type=str, + help="Input file or directory - either compile_commands.json, a linker " + "parameter list, or a path to a directory containing object files.", + ) + parser.add_argument( + "--input_type", + type=str, + help="Input file type - JSON, LLD params, directory, or bazel aquery.", + choices=["json", "params", "directory", "bazel_aquery"], + default="json", + nargs="?", + ) + parser.add_argument("--output_dir", type=str, help="Output directory") + parser.add_argument( + "--num_workers", + type=int, + help="Number of parallel works for objcopy. `None` for maximum available.", + default=None, + nargs="?", + ) + parser.add_argument( + "--llvm_objcopy_path", + type=str, + help="Path to llvm-objcopy", + default="llvm-objcopy", + nargs="?", + ) + parser.add_argument( + "--obj_base_dir", + type=str, + help="Base directory for object files. Defaults to current working dir.", + default="", + nargs="?", + ) + parser.add_argument( + "--cmd_filter", + type=str, + help="Include only those modules with a command line matching this regular " + "expression. Set it to None to not perform any filtering. Note that the " + "regular expression is applied independently for each separate command line " + "option. For example, ^-Oz$ will match Oz built binaries. This does not work " + "with thinlto_build=lld.", + default=None, + nargs="?", + ) + parser.add_argument( + "--thinlto_build", + type=str, + help="Set if the build was performed with either 'distributed' or 'local' " + "ThinLTO. This ensures the thinlto.bc files are also copied. 
The build is " + "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in " + "the distributed case or -Wl,--save-temps=import and " + "-Wl,--thinlto-emit-index-files passed in the local case", + choices=["distributed", "local"], + default=None, + nargs="?", + ) + parser.add_argument( + "--cmd_section_name", + type=str, + help="The section name passed to llvm-objcopy. For ELF object files, the " + "default .llvmcmd is correct. For Mach-O object files, one should use " + "something like __LLVM,__cmdline", + default=".llvmcmd", + nargs="?", + ) + parser.add_argument( + "--bitcode_section_name", + type=str, + help="The section name passed to llvm-objcopy. For ELF object files, the " + "default .llvmbc is correct. For Mach-O object files, one should use " + "__LLVM,__bitcode", + default=".llvmbc", + nargs="?", + ) + flags.add_verbosity_arguments(parser) + args = parser.parse_args() + main(args) + + +def main(args): + logging.basicConfig(level=args.verbosity) + + objs = [] + if args.input is not None and args.thinlto_build == "local": + raise ValueError("--thinlto_build=local cannot be run with --input") + if args.input is None: + if args.thinlto_build != "local": + raise ValueError("--input or --thinlto_build=local must be provided") + objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir) + elif args.input_type == "json": + with open(args.input, encoding="utf-8") as f: + objs = extract_ir_lib.load_from_compile_commands( + json.load(f), args.output_dir + ) + elif args.input_type == "params": + if not args.obj_base_dir: + logging.info( + "-obj_base_dir is unspecified, assuming current directory. " + "If no objects are found, use this option to specify the root " + "directory for the object file paths in the input file." 
+ ) + with open(args.input, encoding="utf-8") as f: + objs = extract_ir_lib.load_from_lld_params( + [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir + ) + elif args.input_type == "directory": + logging.warning( + "Using the directory input is only recommended if the build system " + "your project uses does not support any structured output that " + "ml-compiler-opt understands. If your build system provides a " + "structured compilation database, use that instead" + ) + objs = extract_ir_lib.load_from_directory(args.input, args.output_dir) + elif args.input_type == "bazel_aquery": + with open(args.input, encoding="utf-8") as aquery_json_handle: + objs = extract_ir_lib.load_bazel_aquery( + json.load(aquery_json_handle), args.obj_base_dir, args.output_dir + ) + else: + logging.error("Unknown input type: %s", args.input_type) + + relative_output_paths = extract_ir_lib.run_extraction( + objs, + args.num_workers, + args.llvm_objcopy_path, + args.cmd_filter, + args.thinlto_build, + args.cmd_section_name, + args.bitcode_section_name, + ) + + extract_ir_lib.write_corpus_manifest( + args.thinlto_build, relative_output_paths, args.output_dir + ) + + logging.info( + "Converted %d files out of %d", + len(objs) - relative_output_paths.count(None), + len(objs), + ) + + +if __name__ == "__main__": + parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py new file mode 100644 index 0000000000000..221486e16c6e0 --- /dev/null +++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py @@ -0,0 +1,53 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Tool for making a corpus from arbitrary bitcode. + +To create a corpus from a set of bitcode files in an input directory, run +the following command: + +PYTHONPATH=$PYTHONPATH:. 
python3 ./compiler_opt/tools/make_corpus.py \ + --input_dir= \ + --output_dir= \ + --default_args="" +""" + +import argparse +import logging + +from mlgo.corpus import make_corpus_lib + + +def parse_args_and_run(): + parser = argparse.ArgumentParser( + description="A tool for making a corpus from arbitrary bitcode" + ) + parser.add_argument("--input_dir", type=str, help="The input directory.") + parser.add_argument("--output_dir", type=str, help="The output directory.") + parser.add_argument( + "--default_args", + type=str, + help="The compiler flags to compile with when using downstream tooling.", + default="", + nargs="?", + ) + args = parser.parse_args() + main(args) + + +def main(args): + logging.warning( + "Using this tool does not guarantee that the bitcode is taken at " + "the correct stage for consumption during model training. Make " + "sure to validate assumptions about where the bitcode is coming " + "from before using it in production." + ) + relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir) + make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir) + make_corpus_lib.write_corpus_manifest( + relative_paths, args.output_dir, args.default_args.split() + ) + + +if __name__ == "__main__": + parse_args_and_run() From d51bd453fe228504f5cf06db2836798860399880 Mon Sep 17 00:00:00 2001 From: Vincent Lee Date: Fri, 4 Jul 2025 11:20:34 -0700 Subject: [PATCH 3/7] format --- llvm/utils/mlgo-utils/combine_training_corpus.py | 2 +- llvm/utils/mlgo-utils/extract_ir.py | 2 +- llvm/utils/mlgo-utils/make_corpus.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py index 7a1d870ad7e38..563801091f2d2 100755 --- a/llvm/utils/mlgo-utils/combine_training_corpus.py +++ b/llvm/utils/mlgo-utils/combine_training_corpus.py @@ -3,7 +3,7 @@ import re import sys from mlgo.corpus.combine_training_corpus import 
parse_args_and_run + if __name__ == '__main__': sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) sys.exit(parse_args_and_run()) - diff --git a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py index 589a5c50af726..1ed7d2a13f43b 100755 --- a/llvm/utils/mlgo-utils/extract_ir.py +++ b/llvm/utils/mlgo-utils/extract_ir.py @@ -3,7 +3,7 @@ import re import sys from mlgo.corpus.extract_ir import parse_args_and_run + if __name__ == '__main__': sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) sys.exit(parse_args_and_run()) - diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py index 5b4a9bef486ff..3e1a4fcca8cb6 100755 --- a/llvm/utils/mlgo-utils/make_corpus.py +++ b/llvm/utils/mlgo-utils/make_corpus.py @@ -3,7 +3,7 @@ import re import sys from mlgo.corpus.make_corpus import parse_args_and_run + if __name__ == '__main__': sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) sys.exit(parse_args_and_run()) - From 370f7ac40c13a84bafea103fd6b92357078e9c49 Mon Sep 17 00:00:00 2001 From: Vincent Lee Date: Mon, 7 Jul 2025 15:16:12 -0700 Subject: [PATCH 4/7] Use double quotes --- llvm/utils/mlgo-utils/combine_training_corpus.py | 4 ++-- llvm/utils/mlgo-utils/extract_ir.py | 4 ++-- llvm/utils/mlgo-utils/make_corpus.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py index 563801091f2d2..b8c247ecb181c 100755 --- a/llvm/utils/mlgo-utils/combine_training_corpus.py +++ b/llvm/utils/mlgo-utils/combine_training_corpus.py @@ -4,6 +4,6 @@ import sys from mlgo.corpus.combine_training_corpus import parse_args_and_run -if __name__ == '__main__': - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) +if __name__ == "__main__": + sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0]) sys.exit(parse_args_and_run()) diff --git 
a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py index 1ed7d2a13f43b..85f05b9a72ce8 100755 --- a/llvm/utils/mlgo-utils/extract_ir.py +++ b/llvm/utils/mlgo-utils/extract_ir.py @@ -4,6 +4,6 @@ import sys from mlgo.corpus.extract_ir import parse_args_and_run -if __name__ == '__main__': - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) +if __name__ == "__main__": + sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0]) sys.exit(parse_args_and_run()) diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py index 3e1a4fcca8cb6..725ac7f3461a0 100755 --- a/llvm/utils/mlgo-utils/make_corpus.py +++ b/llvm/utils/mlgo-utils/make_corpus.py @@ -4,6 +4,6 @@ import sys from mlgo.corpus.make_corpus import parse_args_and_run -if __name__ == '__main__': - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) +if __name__ == "__main__": + sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0]) sys.exit(parse_args_and_run()) From 7479f5a3bbd1569270742c32f79b45528033a7ca Mon Sep 17 00:00:00 2001 From: Vincent Lee Date: Wed, 9 Jul 2025 02:03:37 -0700 Subject: [PATCH 5/7] Use symlinks --- .../mlgo-utils/combine_training_corpus.py | 55 +++++- llvm/utils/mlgo-utils/extract_ir.py | 187 +++++++++++++++++- llvm/utils/mlgo-utils/make_corpus.py | 59 +++++- .../mlgo/corpus/combine_training_corpus.py | 53 +---- .../mlgo-utils/mlgo/corpus/extract_ir.py | 185 +---------------- .../mlgo-utils/mlgo/corpus/make_corpus.py | 54 +---- 6 files changed, 287 insertions(+), 306 deletions(-) mode change 100755 => 100644 llvm/utils/mlgo-utils/combine_training_corpus.py mode change 100755 => 100644 llvm/utils/mlgo-utils/extract_ir.py mode change 100755 => 100644 llvm/utils/mlgo-utils/make_corpus.py mode change 100644 => 120000 llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py mode change 100644 => 120000 llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py mode change 100644 => 120000 
llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py old mode 100755 new mode 100644 index b8c247ecb181c..9884d6696a43f --- a/llvm/utils/mlgo-utils/combine_training_corpus.py +++ b/llvm/utils/mlgo-utils/combine_training_corpus.py @@ -1,9 +1,52 @@ -#!/usr/bin/env python3 +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +r"""Combine multiple training corpus into a single training corpus. + +Currently only support the case that multiple corpus share the same +configurables except the "modules" field. + +Usage: we'd like to combine training corpus corpus1 and corpus2 into +combinedcorpus; we first structure the files as follows: + +combinedcorpus +combinedcorpus/corpus1 +combinedcorpus/corpus2 + +Running this script with + +python3 \ +compiler_opt/tools/combine_training_corpus.py \ + --root_dir=$PATH_TO_combinedcorpus + +generates combinedcorpus/corpus_description.json file. In this way corpus1 +and corpus2 are combined into combinedcorpus. +""" + +import argparse +import logging + +from mlgo.corpus import combine_training_corpus_lib +from mlgo.corpus import flags + + +def parse_args_and_run(): + parser = argparse.ArgumentParser( + description="A tool for combining multiple training corpora" + ) + parser.add_argument( + "--root_dir", type=str, help="The root dir of module paths to combine." 
+ ) + flags.add_verbosity_arguments(parser) + args = parser.parse_args() + main(args) + + +def main(args): + logging.basicConfig(level=args.verbosity) + + combine_training_corpus_lib.combine_corpus(args.root_dir) -import re -import sys -from mlgo.corpus.combine_training_corpus import parse_args_and_run if __name__ == "__main__": - sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0]) - sys.exit(parse_args_and_run()) + parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py old mode 100755 new mode 100644 index 85f05b9a72ce8..3101cef196b4a --- a/llvm/utils/mlgo-utils/extract_ir.py +++ b/llvm/utils/mlgo-utils/extract_ir.py @@ -1,9 +1,184 @@ -#!/usr/bin/env python3 +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Extract IR for training. + +Extract IR for training, either from a compile_commands.json file produced by +cmake, or a linker parameter list file. + +Only run with +'python compiler_opt/tools/extract_ir.py ...' + +The compilation is assumed to have been performed with clang, using +-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all) + +In a distributed ThinLTO case, the compilation is assumed to have been performed +specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt. + +In a local ThinLTO case, the compilation is assumedto have been performed +specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files + +To change the logging verbosity, set the --verbosity flag to the desired level. +Setting it to a specific level will enable all messages at that level and +higher. Exact values can be found by invoking the script with --help. 
+""" + +import argparse +import json +import logging + +from mlgo.corpus import extract_ir_lib +from mlgo.corpus import flags + + +def parse_args_and_run(): + parser = argparse.ArgumentParser( + description="A tool for making a corpus from build artifacts" + ) + parser.add_argument( + "--input", + type=str, + help="Input file or directory - either compile_commands.json, a linker " + "parameter list, or a path to a directory containing object files.", + ) + parser.add_argument( + "--input_type", + type=str, + help="Input file type - JSON, LLD params, directory, or bazel aquery.", + choices=["json", "params", "directory", "bazel_aquery"], + default="json", + nargs="?", + ) + parser.add_argument("--output_dir", type=str, help="Output directory") + parser.add_argument( + "--num_workers", + type=int, + help="Number of parallel works for objcopy. `None` for maximum available.", + default=None, + nargs="?", + ) + parser.add_argument( + "--llvm_objcopy_path", + type=str, + help="Path to llvm-objcopy", + default="llvm-objcopy", + nargs="?", + ) + parser.add_argument( + "--obj_base_dir", + type=str, + help="Base directory for object files. Defaults to current working dir.", + default="", + nargs="?", + ) + parser.add_argument( + "--cmd_filter", + type=str, + help="Include only those modules with a command line matching this regular " + "expression. Set it to None to not perform any filtering. Note that the " + "regular expression is applied independently for each separate command line " + "option. For example, ^-Oz$ will match Oz built binaries. This does not work " + "with thinlto_build=lld.", + default=None, + nargs="?", + ) + parser.add_argument( + "--thinlto_build", + type=str, + help="Set if the build was performed with either 'distributed' or 'local' " + "ThinLTO. This ensures the thinlto.bc files are also copied. 
The build is " + "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in " + "the distributed case or -Wl,--save-temps=import and " + "-Wl,--thinlto-emit-index-files passed in the local case", + choices=["distributed", "local"], + default=None, + nargs="?", + ) + parser.add_argument( + "--cmd_section_name", + type=str, + help="The section name passed to llvm-objcopy. For ELF object files, the " + "default .llvmcmd is correct. For Mach-O object files, one should use " + "something like __LLVM,__cmdline", + default=".llvmcmd", + nargs="?", + ) + parser.add_argument( + "--bitcode_section_name", + type=str, + help="The section name passed to llvm-objcopy. For ELF object files, the " + "default .llvmbc is correct. For Mach-O object files, one should use " + "__LLVM,__bitcode", + default=".llvmbc", + nargs="?", + ) + flags.add_verbosity_arguments(parser) + args = parser.parse_args() + main(args) + + +def main(args): + logging.basicConfig(level=args.verbosity) + + objs = [] + if args.input is not None and args.thinlto_build == "local": + raise ValueError("--thinlto_build=local cannot be run with --input") + if args.input is None: + if args.thinlto_build != "local": + raise ValueError("--input or --thinlto_build=local must be provided") + objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir) + elif args.input_type == "json": + with open(args.input, encoding="utf-8") as f: + objs = extract_ir_lib.load_from_compile_commands( + json.load(f), args.output_dir + ) + elif args.input_type == "params": + if not args.obj_base_dir: + logging.info( + "-obj_base_dir is unspecified, assuming current directory. " + "If no objects are found, use this option to specify the root " + "directory for the object file paths in the input file." 
+ ) + with open(args.input, encoding="utf-8") as f: + objs = extract_ir_lib.load_from_lld_params( + [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir + ) + elif args.input_type == "directory": + logging.warning( + "Using the directory input is only recommended if the build system " + "your project uses does not support any structured output that " + "ml-compiler-opt understands. If your build system provides a " + "structured compilation database, use that instead" + ) + objs = extract_ir_lib.load_from_directory(args.input, args.output_dir) + elif args.input_type == "bazel_aquery": + with open(args.input, encoding="utf-8") as aquery_json_handle: + objs = extract_ir_lib.load_bazel_aquery( + json.load(aquery_json_handle), args.obj_base_dir, args.output_dir + ) + else: + logging.error("Unknown input type: %s", args.input_type) + + relative_output_paths = extract_ir_lib.run_extraction( + objs, + args.num_workers, + args.llvm_objcopy_path, + args.cmd_filter, + args.thinlto_build, + args.cmd_section_name, + args.bitcode_section_name, + ) + + extract_ir_lib.write_corpus_manifest( + args.thinlto_build, relative_output_paths, args.output_dir + ) + + logging.info( + "Converted %d files out of %d", + len(objs) - relative_output_paths.count(None), + len(objs), + ) -import re -import sys -from mlgo.corpus.extract_ir import parse_args_and_run if __name__ == "__main__": - sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0]) - sys.exit(parse_args_and_run()) + parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py old mode 100755 new mode 100644 index 725ac7f3461a0..92aab4d969d4d --- a/llvm/utils/mlgo-utils/make_corpus.py +++ b/llvm/utils/mlgo-utils/make_corpus.py @@ -1,9 +1,58 @@ -#!/usr/bin/env python3 +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Tool for making a corpus from arbitrary bitcode. -import re +To create a corpus from a set of bitcode files in an input directory, run +the following command: + +PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \ + --input_dir= \ + --output_dir= \ + --default_args="" +""" + +import argparse +import logging import sys -from mlgo.corpus.make_corpus import parse_args_and_run +import pathlib +print(pathlib.Path(__file__).parent.parent.parent) + +sys.path.insert(0, pathlib.Path(__file__).parent.parent.parent) + +from mlgo.corpus import make_corpus_lib + + +def parse_args_and_run(): + parser = argparse.ArgumentParser( + description="A tool for making a corpus from arbitrary bitcode" + ) + parser.add_argument("--input_dir", type=str, help="The input directory.") + parser.add_argument("--output_dir", type=str, help="The output directory.") + parser.add_argument( + "--default_args", + type=str, + help="The compiler flags to compile with when using downstream tooling.", + default="", + nargs="?", + ) + args = parser.parse_args() + main(args) + + +def main(args): + logging.warning( + "Using this tool does not guarantee that the bitcode is taken at " + "the correct stage for consumption during model training. Make " + "sure to validate assumptions about where the bitcode is coming " + "from before using it in production." 
+ ) + relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir) + make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir) + make_corpus_lib.write_corpus_manifest( + relative_paths, args.output_dir, args.default_args.split() + ) + if __name__ == "__main__": - sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0]) - sys.exit(parse_args_and_run()) + parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py deleted file mode 100644 index 9884d6696a43f..0000000000000 --- a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py +++ /dev/null @@ -1,52 +0,0 @@ -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -r"""Combine multiple training corpus into a single training corpus. - -Currently only support the case that multiple corpus share the same -configurables except the "modules" field. - -Usage: we'd like to combine training corpus corpus1 and corpus2 into -combinedcorpus; we first structure the files as follows: - -combinedcorpus -combinedcorpus/corpus1 -combinedcorpus/corpus2 - -Running this script with - -python3 \ -compiler_opt/tools/combine_training_corpus.py \ - --root_dir=$PATH_TO_combinedcorpus - -generates combinedcorpus/corpus_description.json file. In this way corpus1 -and corpus2 are combined into combinedcorpus. -""" - -import argparse -import logging - -from mlgo.corpus import combine_training_corpus_lib -from mlgo.corpus import flags - - -def parse_args_and_run(): - parser = argparse.ArgumentParser( - description="A tool for combining multiple training corpora" - ) - parser.add_argument( - "--root_dir", type=str, help="The root dir of module paths to combine." 
- ) - flags.add_verbosity_arguments(parser) - args = parser.parse_args() - main(args) - - -def main(args): - logging.basicConfig(level=args.verbosity) - - combine_training_corpus_lib.combine_corpus(args.root_dir) - - -if __name__ == "__main__": - parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py new file mode 120000 index 0000000000000..5a6885a6d1fa2 --- /dev/null +++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py @@ -0,0 +1 @@ +../../combine_training_corpus.py \ No newline at end of file diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py deleted file mode 100644 index 3101cef196b4a..0000000000000 --- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py +++ /dev/null @@ -1,184 +0,0 @@ -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -"""Extract IR for training. - -Extract IR for training, either from a compile_commands.json file produced by -cmake, or a linker parameter list file. - -Only run with -'python compiler_opt/tools/extract_ir.py ...' - -The compilation is assumed to have been performed with clang, using --fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all) - -In a distributed ThinLTO case, the compilation is assumed to have been performed -specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt. - -In a local ThinLTO case, the compilation is assumedto have been performed -specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files - -To change the logging verbosity, set the --verbosity flag to the desired level. -Setting it to a specific level will enable all messages at that level and -higher. Exact values can be found by invoking the script with --help. 
-""" - -import argparse -import json -import logging - -from mlgo.corpus import extract_ir_lib -from mlgo.corpus import flags - - -def parse_args_and_run(): - parser = argparse.ArgumentParser( - description="A tool for making a corpus from build artifacts" - ) - parser.add_argument( - "--input", - type=str, - help="Input file or directory - either compile_commands.json, a linker " - "parameter list, or a path to a directory containing object files.", - ) - parser.add_argument( - "--input_type", - type=str, - help="Input file type - JSON, LLD params, directory, or bazel aquery.", - choices=["json", "params", "directory", "bazel_aquery"], - default="json", - nargs="?", - ) - parser.add_argument("--output_dir", type=str, help="Output directory") - parser.add_argument( - "--num_workers", - type=int, - help="Number of parallel works for objcopy. `None` for maximum available.", - default=None, - nargs="?", - ) - parser.add_argument( - "--llvm_objcopy_path", - type=str, - help="Path to llvm-objcopy", - default="llvm-objcopy", - nargs="?", - ) - parser.add_argument( - "--obj_base_dir", - type=str, - help="Base directory for object files. Defaults to current working dir.", - default="", - nargs="?", - ) - parser.add_argument( - "--cmd_filter", - type=str, - help="Include only those modules with a command line matching this regular " - "expression. Set it to None to not perform any filtering. Note that the " - "regular expression is applied independently for each separate command line " - "option. For example, ^-Oz$ will match Oz built binaries. This does not work " - "with thinlto_build=lld.", - default=None, - nargs="?", - ) - parser.add_argument( - "--thinlto_build", - type=str, - help="Set if the build was performed with either 'distributed' or 'local' " - "ThinLTO. This ensures the thinlto.bc files are also copied. 
The build is " - "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in " - "the distributed case or -Wl,--save-temps=import and " - "-Wl,--thinlto-emit-index-files passed in the local case", - choices=["distributed", "local"], - default=None, - nargs="?", - ) - parser.add_argument( - "--cmd_section_name", - type=str, - help="The section name passed to llvm-objcopy. For ELF object files, the " - "default .llvmcmd is correct. For Mach-O object files, one should use " - "something like __LLVM,__cmdline", - default=".llvmcmd", - nargs="?", - ) - parser.add_argument( - "--bitcode_section_name", - type=str, - help="The section name passed to llvm-objcopy. For ELF object files, the " - "default .llvmbc is correct. For Mach-O object files, one should use " - "__LLVM,__bitcode", - default=".llvmbc", - nargs="?", - ) - flags.add_verbosity_arguments(parser) - args = parser.parse_args() - main(args) - - -def main(args): - logging.basicConfig(level=args.verbosity) - - objs = [] - if args.input is not None and args.thinlto_build == "local": - raise ValueError("--thinlto_build=local cannot be run with --input") - if args.input is None: - if args.thinlto_build != "local": - raise ValueError("--input or --thinlto_build=local must be provided") - objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir) - elif args.input_type == "json": - with open(args.input, encoding="utf-8") as f: - objs = extract_ir_lib.load_from_compile_commands( - json.load(f), args.output_dir - ) - elif args.input_type == "params": - if not args.obj_base_dir: - logging.info( - "-obj_base_dir is unspecified, assuming current directory. " - "If no objects are found, use this option to specify the root " - "directory for the object file paths in the input file." 
- ) - with open(args.input, encoding="utf-8") as f: - objs = extract_ir_lib.load_from_lld_params( - [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir - ) - elif args.input_type == "directory": - logging.warning( - "Using the directory input is only recommended if the build system " - "your project uses does not support any structured output that " - "ml-compiler-opt understands. If your build system provides a " - "structured compilation database, use that instead" - ) - objs = extract_ir_lib.load_from_directory(args.input, args.output_dir) - elif args.input_type == "bazel_aquery": - with open(args.input, encoding="utf-8") as aquery_json_handle: - objs = extract_ir_lib.load_bazel_aquery( - json.load(aquery_json_handle), args.obj_base_dir, args.output_dir - ) - else: - logging.error("Unknown input type: %s", args.input_type) - - relative_output_paths = extract_ir_lib.run_extraction( - objs, - args.num_workers, - args.llvm_objcopy_path, - args.cmd_filter, - args.thinlto_build, - args.cmd_section_name, - args.bitcode_section_name, - ) - - extract_ir_lib.write_corpus_manifest( - args.thinlto_build, relative_output_paths, args.output_dir - ) - - logging.info( - "Converted %d files out of %d", - len(objs) - relative_output_paths.count(None), - len(objs), - ) - - -if __name__ == "__main__": - parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py new file mode 120000 index 0000000000000..ce3baa062b3e1 --- /dev/null +++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py @@ -0,0 +1 @@ +../../extract_ir.py \ No newline at end of file diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py deleted file mode 100644 index 221486e16c6e0..0000000000000 --- a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py +++ /dev/null @@ -1,53 +0,0 @@ -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -"""Tool for making a corpus from arbitrary bitcode. - -To create a corpus from a set of bitcode files in an input directory, run -the following command: - -PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \ - --input_dir= \ - --output_dir= \ - --default_args="" -""" - -import argparse -import logging - -from mlgo.corpus import make_corpus_lib - - -def parse_args_and_run(): - parser = argparse.ArgumentParser( - description="A tool for making a corpus from arbitrary bitcode" - ) - parser.add_argument("--input_dir", type=str, help="The input directory.") - parser.add_argument("--output_dir", type=str, help="The output directory.") - parser.add_argument( - "--default_args", - type=str, - help="The compiler flags to compile with when using downstream tooling.", - default="", - nargs="?", - ) - args = parser.parse_args() - main(args) - - -def main(args): - logging.warning( - "Using this tool does not guarantee that the bitcode is taken at " - "the correct stage for consumption during model training. Make " - "sure to validate assumptions about where the bitcode is coming " - "from before using it in production." 
- ) - relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir) - make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir) - make_corpus_lib.write_corpus_manifest( - relative_paths, args.output_dir, args.default_args.split() - ) - - -if __name__ == "__main__": - parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py new file mode 120000 index 0000000000000..7ea4447a76efc --- /dev/null +++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py @@ -0,0 +1 @@ +../../make_corpus.py \ No newline at end of file From 9ea9dce4d154712b728ef874cd9aa915605e10ae Mon Sep 17 00:00:00 2001 From: Vincent Lee Date: Wed, 9 Jul 2025 02:11:04 -0700 Subject: [PATCH 6/7] Remove testing code --- llvm/utils/mlgo-utils/make_corpus.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py index 92aab4d969d4d..221486e16c6e0 100644 --- a/llvm/utils/mlgo-utils/make_corpus.py +++ b/llvm/utils/mlgo-utils/make_corpus.py @@ -14,11 +14,6 @@ import argparse import logging -import sys -import pathlib -print(pathlib.Path(__file__).parent.parent.parent) - -sys.path.insert(0, pathlib.Path(__file__).parent.parent.parent) from mlgo.corpus import make_corpus_lib From 2142d8dd28c34d041acdcab9ad6af20ed4244d5e Mon Sep 17 00:00:00 2001 From: Vincent Lee Date: Wed, 9 Jul 2025 15:13:09 -0700 Subject: [PATCH 7/7] Put symlinks in root --- .../mlgo-utils/combine_training_corpus.py | 53 +---- llvm/utils/mlgo-utils/extract_ir.py | 185 +----------------- llvm/utils/mlgo-utils/make_corpus.py | 54 +---- .../mlgo/corpus/combine_training_corpus.py | 53 ++++- .../mlgo-utils/mlgo/corpus/extract_ir.py | 185 +++++++++++++++++- .../mlgo-utils/mlgo/corpus/make_corpus.py | 54 ++++- 6 files changed, 292 insertions(+), 292 deletions(-) mode change 100644 => 120000 llvm/utils/mlgo-utils/combine_training_corpus.py mode change 100644 => 120000 
llvm/utils/mlgo-utils/extract_ir.py mode change 100644 => 120000 llvm/utils/mlgo-utils/make_corpus.py mode change 120000 => 100644 llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py mode change 120000 => 100644 llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py mode change 120000 => 100644 llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py deleted file mode 100644 index 9884d6696a43f..0000000000000 --- a/llvm/utils/mlgo-utils/combine_training_corpus.py +++ /dev/null @@ -1,52 +0,0 @@ -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -r"""Combine multiple training corpus into a single training corpus. - -Currently only support the case that multiple corpus share the same -configurables except the "modules" field. - -Usage: we'd like to combine training corpus corpus1 and corpus2 into -combinedcorpus; we first structure the files as follows: - -combinedcorpus -combinedcorpus/corpus1 -combinedcorpus/corpus2 - -Running this script with - -python3 \ -compiler_opt/tools/combine_training_corpus.py \ - --root_dir=$PATH_TO_combinedcorpus - -generates combinedcorpus/corpus_description.json file. In this way corpus1 -and corpus2 are combined into combinedcorpus. -""" - -import argparse -import logging - -from mlgo.corpus import combine_training_corpus_lib -from mlgo.corpus import flags - - -def parse_args_and_run(): - parser = argparse.ArgumentParser( - description="A tool for combining multiple training corpora" - ) - parser.add_argument( - "--root_dir", type=str, help="The root dir of module paths to combine." 
- ) - flags.add_verbosity_arguments(parser) - args = parser.parse_args() - main(args) - - -def main(args): - logging.basicConfig(level=args.verbosity) - - combine_training_corpus_lib.combine_corpus(args.root_dir) - - -if __name__ == "__main__": - parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py new file mode 120000 index 0000000000000..d86f4ab284901 --- /dev/null +++ b/llvm/utils/mlgo-utils/combine_training_corpus.py @@ -0,0 +1 @@ +mlgo/corpus/combine_training_corpus.py \ No newline at end of file diff --git a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py deleted file mode 100644 index 3101cef196b4a..0000000000000 --- a/llvm/utils/mlgo-utils/extract_ir.py +++ /dev/null @@ -1,184 +0,0 @@ -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -"""Extract IR for training. - -Extract IR for training, either from a compile_commands.json file produced by -cmake, or a linker parameter list file. - -Only run with -'python compiler_opt/tools/extract_ir.py ...' - -The compilation is assumed to have been performed with clang, using --fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all) - -In a distributed ThinLTO case, the compilation is assumed to have been performed -specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt. - -In a local ThinLTO case, the compilation is assumedto have been performed -specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files - -To change the logging verbosity, set the --verbosity flag to the desired level. -Setting it to a specific level will enable all messages at that level and -higher. Exact values can be found by invoking the script with --help. 
-""" - -import argparse -import json -import logging - -from mlgo.corpus import extract_ir_lib -from mlgo.corpus import flags - - -def parse_args_and_run(): - parser = argparse.ArgumentParser( - description="A tool for making a corpus from build artifacts" - ) - parser.add_argument( - "--input", - type=str, - help="Input file or directory - either compile_commands.json, a linker " - "parameter list, or a path to a directory containing object files.", - ) - parser.add_argument( - "--input_type", - type=str, - help="Input file type - JSON, LLD params, directory, or bazel aquery.", - choices=["json", "params", "directory", "bazel_aquery"], - default="json", - nargs="?", - ) - parser.add_argument("--output_dir", type=str, help="Output directory") - parser.add_argument( - "--num_workers", - type=int, - help="Number of parallel works for objcopy. `None` for maximum available.", - default=None, - nargs="?", - ) - parser.add_argument( - "--llvm_objcopy_path", - type=str, - help="Path to llvm-objcopy", - default="llvm-objcopy", - nargs="?", - ) - parser.add_argument( - "--obj_base_dir", - type=str, - help="Base directory for object files. Defaults to current working dir.", - default="", - nargs="?", - ) - parser.add_argument( - "--cmd_filter", - type=str, - help="Include only those modules with a command line matching this regular " - "expression. Set it to None to not perform any filtering. Note that the " - "regular expression is applied independently for each separate command line " - "option. For example, ^-Oz$ will match Oz built binaries. This does not work " - "with thinlto_build=lld.", - default=None, - nargs="?", - ) - parser.add_argument( - "--thinlto_build", - type=str, - help="Set if the build was performed with either 'distributed' or 'local' " - "ThinLTO. This ensures the thinlto.bc files are also copied. 
The build is " - "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in " - "the distributed case or -Wl,--save-temps=import and " - "-Wl,--thinlto-emit-index-files passed in the local case", - choices=["distributed", "local"], - default=None, - nargs="?", - ) - parser.add_argument( - "--cmd_section_name", - type=str, - help="The section name passed to llvm-objcopy. For ELF object files, the " - "default .llvmcmd is correct. For Mach-O object files, one should use " - "something like __LLVM,__cmdline", - default=".llvmcmd", - nargs="?", - ) - parser.add_argument( - "--bitcode_section_name", - type=str, - help="The section name passed to llvm-objcopy. For ELF object files, the " - "default .llvmbc is correct. For Mach-O object files, one should use " - "__LLVM,__bitcode", - default=".llvmbc", - nargs="?", - ) - flags.add_verbosity_arguments(parser) - args = parser.parse_args() - main(args) - - -def main(args): - logging.basicConfig(level=args.verbosity) - - objs = [] - if args.input is not None and args.thinlto_build == "local": - raise ValueError("--thinlto_build=local cannot be run with --input") - if args.input is None: - if args.thinlto_build != "local": - raise ValueError("--input or --thinlto_build=local must be provided") - objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir) - elif args.input_type == "json": - with open(args.input, encoding="utf-8") as f: - objs = extract_ir_lib.load_from_compile_commands( - json.load(f), args.output_dir - ) - elif args.input_type == "params": - if not args.obj_base_dir: - logging.info( - "-obj_base_dir is unspecified, assuming current directory. " - "If no objects are found, use this option to specify the root " - "directory for the object file paths in the input file." 
- ) - with open(args.input, encoding="utf-8") as f: - objs = extract_ir_lib.load_from_lld_params( - [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir - ) - elif args.input_type == "directory": - logging.warning( - "Using the directory input is only recommended if the build system " - "your project uses does not support any structured output that " - "ml-compiler-opt understands. If your build system provides a " - "structured compilation database, use that instead" - ) - objs = extract_ir_lib.load_from_directory(args.input, args.output_dir) - elif args.input_type == "bazel_aquery": - with open(args.input, encoding="utf-8") as aquery_json_handle: - objs = extract_ir_lib.load_bazel_aquery( - json.load(aquery_json_handle), args.obj_base_dir, args.output_dir - ) - else: - logging.error("Unknown input type: %s", args.input_type) - - relative_output_paths = extract_ir_lib.run_extraction( - objs, - args.num_workers, - args.llvm_objcopy_path, - args.cmd_filter, - args.thinlto_build, - args.cmd_section_name, - args.bitcode_section_name, - ) - - extract_ir_lib.write_corpus_manifest( - args.thinlto_build, relative_output_paths, args.output_dir - ) - - logging.info( - "Converted %d files out of %d", - len(objs) - relative_output_paths.count(None), - len(objs), - ) - - -if __name__ == "__main__": - parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py new file mode 120000 index 0000000000000..c9b96abad0647 --- /dev/null +++ b/llvm/utils/mlgo-utils/extract_ir.py @@ -0,0 +1 @@ +mlgo/corpus/extract_ir.py \ No newline at end of file diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py deleted file mode 100644 index 221486e16c6e0..0000000000000 --- a/llvm/utils/mlgo-utils/make_corpus.py +++ /dev/null @@ -1,53 +0,0 @@ -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -"""Tool for making a corpus from arbitrary bitcode. - -To create a corpus from a set of bitcode files in an input directory, run -the following command: - -PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \ - --input_dir= \ - --output_dir= \ - --default_args="" -""" - -import argparse -import logging - -from mlgo.corpus import make_corpus_lib - - -def parse_args_and_run(): - parser = argparse.ArgumentParser( - description="A tool for making a corpus from arbitrary bitcode" - ) - parser.add_argument("--input_dir", type=str, help="The input directory.") - parser.add_argument("--output_dir", type=str, help="The output directory.") - parser.add_argument( - "--default_args", - type=str, - help="The compiler flags to compile with when using downstream tooling.", - default="", - nargs="?", - ) - args = parser.parse_args() - main(args) - - -def main(args): - logging.warning( - "Using this tool does not guarantee that the bitcode is taken at " - "the correct stage for consumption during model training. Make " - "sure to validate assumptions about where the bitcode is coming " - "from before using it in production." 
- ) - relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir) - make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir) - make_corpus_lib.write_corpus_manifest( - relative_paths, args.output_dir, args.default_args.split() - ) - - -if __name__ == "__main__": - parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py new file mode 120000 index 0000000000000..3e34a693dd2cc --- /dev/null +++ b/llvm/utils/mlgo-utils/make_corpus.py @@ -0,0 +1 @@ +mlgo/corpus/make_corpus.py \ No newline at end of file diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py deleted file mode 120000 index 5a6885a6d1fa2..0000000000000 --- a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py +++ /dev/null @@ -1 +0,0 @@ -../../combine_training_corpus.py \ No newline at end of file diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py new file mode 100644 index 0000000000000..9884d6696a43f --- /dev/null +++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py @@ -0,0 +1,52 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +r"""Combine multiple training corpus into a single training corpus. + +Currently only support the case that multiple corpus share the same +configurables except the "modules" field. 
+ +Usage: we'd like to combine training corpus corpus1 and corpus2 into +combinedcorpus; we first structure the files as follows: + +combinedcorpus +combinedcorpus/corpus1 +combinedcorpus/corpus2 + +Running this script with + +python3 \ +compiler_opt/tools/combine_training_corpus.py \ + --root_dir=$PATH_TO_combinedcorpus + +generates combinedcorpus/corpus_description.json file. In this way corpus1 +and corpus2 are combined into combinedcorpus. +""" + +import argparse +import logging + +from mlgo.corpus import combine_training_corpus_lib +from mlgo.corpus import flags + + +def parse_args_and_run(): + parser = argparse.ArgumentParser( + description="A tool for combining multiple training corpora" + ) + parser.add_argument( + "--root_dir", type=str, help="The root dir of module paths to combine." + ) + flags.add_verbosity_arguments(parser) + args = parser.parse_args() + main(args) + + +def main(args): + logging.basicConfig(level=args.verbosity) + + combine_training_corpus_lib.combine_corpus(args.root_dir) + + +if __name__ == "__main__": + parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py deleted file mode 120000 index ce3baa062b3e1..0000000000000 --- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py +++ /dev/null @@ -1 +0,0 @@ -../../extract_ir.py \ No newline at end of file diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py new file mode 100644 index 0000000000000..3101cef196b4a --- /dev/null +++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py @@ -0,0 +1,184 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Extract IR for training. + +Extract IR for training, either from a compile_commands.json file produced by +cmake, or a linker parameter list file. 
+ +Only run with +'python compiler_opt/tools/extract_ir.py ...' + +The compilation is assumed to have been performed with clang, using +-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all) + +In a distributed ThinLTO case, the compilation is assumed to have been performed +specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt. + +In a local ThinLTO case, the compilation is assumed to have been performed +specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files + +To change the logging verbosity, set the --verbosity flag to the desired level. +Setting it to a specific level will enable all messages at that level and +higher. Exact values can be found by invoking the script with --help. +""" + +import argparse +import json +import logging + +from mlgo.corpus import extract_ir_lib +from mlgo.corpus import flags + + +def parse_args_and_run(): + parser = argparse.ArgumentParser( + description="A tool for making a corpus from build artifacts" + ) + parser.add_argument( + "--input", + type=str, + help="Input file or directory - either compile_commands.json, a linker " + "parameter list, or a path to a directory containing object files.", + ) + parser.add_argument( + "--input_type", + type=str, + help="Input file type - JSON, LLD params, directory, or bazel aquery.", + choices=["json", "params", "directory", "bazel_aquery"], + default="json", + nargs="?", + ) + parser.add_argument("--output_dir", type=str, help="Output directory") + parser.add_argument( + "--num_workers", + type=int, + help="Number of parallel works for objcopy. `None` for maximum available.", + default=None, + nargs="?", + ) + parser.add_argument( + "--llvm_objcopy_path", + type=str, + help="Path to llvm-objcopy", + default="llvm-objcopy", + nargs="?", + ) + parser.add_argument( + "--obj_base_dir", + type=str, + help="Base directory for object files.
Defaults to current working dir.", + default="", + nargs="?", + ) + parser.add_argument( + "--cmd_filter", + type=str, + help="Include only those modules with a command line matching this regular " + "expression. Set it to None to not perform any filtering. Note that the " + "regular expression is applied independently for each separate command line " + "option. For example, ^-Oz$ will match Oz built binaries. This does not work " + "with thinlto_build=lld.", + default=None, + nargs="?", + ) + parser.add_argument( + "--thinlto_build", + type=str, + help="Set if the build was performed with either 'distributed' or 'local' " + "ThinLTO. This ensures the thinlto.bc files are also copied. The build is " + "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in " + "the distributed case or -Wl,--save-temps=import and " + "-Wl,--thinlto-emit-index-files passed in the local case", + choices=["distributed", "local"], + default=None, + nargs="?", + ) + parser.add_argument( + "--cmd_section_name", + type=str, + help="The section name passed to llvm-objcopy. For ELF object files, the " + "default .llvmcmd is correct. For Mach-O object files, one should use " + "something like __LLVM,__cmdline", + default=".llvmcmd", + nargs="?", + ) + parser.add_argument( + "--bitcode_section_name", + type=str, + help="The section name passed to llvm-objcopy. For ELF object files, the " + "default .llvmbc is correct. 
For Mach-O object files, one should use " + "__LLVM,__bitcode", + default=".llvmbc", + nargs="?", + ) + flags.add_verbosity_arguments(parser) + args = parser.parse_args() + main(args) + + +def main(args): + logging.basicConfig(level=args.verbosity) + + objs = [] + if args.input is not None and args.thinlto_build == "local": + raise ValueError("--thinlto_build=local cannot be run with --input") + if args.input is None: + if args.thinlto_build != "local": + raise ValueError("--input or --thinlto_build=local must be provided") + objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir) + elif args.input_type == "json": + with open(args.input, encoding="utf-8") as f: + objs = extract_ir_lib.load_from_compile_commands( + json.load(f), args.output_dir + ) + elif args.input_type == "params": + if not args.obj_base_dir: + logging.info( + "-obj_base_dir is unspecified, assuming current directory. " + "If no objects are found, use this option to specify the root " + "directory for the object file paths in the input file." + ) + with open(args.input, encoding="utf-8") as f: + objs = extract_ir_lib.load_from_lld_params( + [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir + ) + elif args.input_type == "directory": + logging.warning( + "Using the directory input is only recommended if the build system " + "your project uses does not support any structured output that " + "ml-compiler-opt understands. 
If your build system provides a " + "structured compilation database, use that instead" + ) + objs = extract_ir_lib.load_from_directory(args.input, args.output_dir) + elif args.input_type == "bazel_aquery": + with open(args.input, encoding="utf-8") as aquery_json_handle: + objs = extract_ir_lib.load_bazel_aquery( + json.load(aquery_json_handle), args.obj_base_dir, args.output_dir + ) + else: + logging.error("Unknown input type: %s", args.input_type) + + relative_output_paths = extract_ir_lib.run_extraction( + objs, + args.num_workers, + args.llvm_objcopy_path, + args.cmd_filter, + args.thinlto_build, + args.cmd_section_name, + args.bitcode_section_name, + ) + + extract_ir_lib.write_corpus_manifest( + args.thinlto_build, relative_output_paths, args.output_dir + ) + + logging.info( + "Converted %d files out of %d", + len(objs) - relative_output_paths.count(None), + len(objs), + ) + + +if __name__ == "__main__": + parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py deleted file mode 120000 index 7ea4447a76efc..0000000000000 --- a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py +++ /dev/null @@ -1 +0,0 @@ -../../make_corpus.py \ No newline at end of file diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py new file mode 100644 index 0000000000000..221486e16c6e0 --- /dev/null +++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py @@ -0,0 +1,53 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Tool for making a corpus from arbitrary bitcode. + +To create a corpus from a set of bitcode files in an input directory, run +the following command: + +PYTHONPATH=$PYTHONPATH:. 
python3 ./compiler_opt/tools/make_corpus.py \ + --input_dir=<path to input directory> \ + --output_dir=<path to output directory> \ + --default_args="<list of space separated flags>" +""" + +import argparse +import logging + +from mlgo.corpus import make_corpus_lib + + +def parse_args_and_run(): + parser = argparse.ArgumentParser( + description="A tool for making a corpus from arbitrary bitcode" + ) + parser.add_argument("--input_dir", type=str, help="The input directory.") + parser.add_argument("--output_dir", type=str, help="The output directory.") + parser.add_argument( + "--default_args", + type=str, + help="The compiler flags to compile with when using downstream tooling.", + default="", + nargs="?", + ) + args = parser.parse_args() + main(args) + + +def main(args): + logging.warning( + "Using this tool does not guarantee that the bitcode is taken at " + "the correct stage for consumption during model training. Make " + "sure to validate assumptions about where the bitcode is coming " + "from before using it in production." + ) + relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir) + make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir) + make_corpus_lib.write_corpus_manifest( + relative_paths, args.output_dir, args.default_args.split() + ) + + +if __name__ == "__main__": + parse_args_and_run()