From 497d8f76d2df3dbd396c8b734b96ce81b12dcea7 Mon Sep 17 00:00:00 2001
From: Vincent Lee <leevince@fb.com>
Date: Thu, 3 Jul 2025 18:22:55 -0700
Subject: [PATCH 1/7] [mlgo-utils] Hoist entry scripts out to the correct
 directory

---
 .../mlgo-utils/{mlgo/corpus => }/combine_training_corpus.py | 0
 llvm/utils/mlgo-utils/{mlgo/corpus => }/extract_ir.py       | 0
 llvm/utils/mlgo-utils/{mlgo/corpus => }/make_corpus.py      | 0
 utils/bazel/llvm-project-overlay/llvm/BUILD.bazel           | 6 +++---
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename llvm/utils/mlgo-utils/{mlgo/corpus => }/combine_training_corpus.py (100%)
 rename llvm/utils/mlgo-utils/{mlgo/corpus => }/extract_ir.py (100%)
 rename llvm/utils/mlgo-utils/{mlgo/corpus => }/make_corpus.py (100%)

diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py
similarity index 100%
rename from llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
rename to llvm/utils/mlgo-utils/combine_training_corpus.py
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py
similarity index 100%
rename from llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
rename to llvm/utils/mlgo-utils/extract_ir.py
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py
similarity index 100%
rename from llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
rename to llvm/utils/mlgo-utils/make_corpus.py
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index b618c74c19da1..db8a92fd25de6 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -5210,8 +5210,8 @@ py_binary(
 py_binary(
     name = "extract_ir",
     srcs = [
+        "utils/mlgo-utils/extract_ir.py",
         "utils/mlgo-utils/mlgo/__init__.py",
-        "utils/mlgo-utils/mlgo/corpus/extract_ir.py",
         "utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py",
         "utils/mlgo-utils/mlgo/corpus/flags.py",
     ],
@@ -5221,8 +5221,8 @@ py_binary(
 py_binary(
     name = "combine_training_corpus",
     srcs = [
+        "utils/mlgo-utils/combine_training_corpus.py",
         "utils/mlgo-utils/mlgo/__init__.py",
-        "utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py",
         "utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py",
         "utils/mlgo-utils/mlgo/corpus/flags.py",
     ],
@@ -5232,8 +5232,8 @@ py_binary(
 py_binary(
     name = "make_corpus",
     srcs = [
+        "utils/mlgo-utils/make_corpus.py",
         "utils/mlgo-utils/mlgo/__init__.py",
-        "utils/mlgo-utils/mlgo/corpus/make_corpus.py",
         "utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py",
     ],
     imports = ["utils/mlgo-utils"],

From ee72352a0ec409fe3f88bb09136261923c24d8ba Mon Sep 17 00:00:00 2001
From: Vincent Lee <leevince@fb.com>
Date: Fri, 4 Jul 2025 11:13:02 -0700
Subject: [PATCH 2/7] Add wrapper scripts instead of moving the entry points

---
 .../mlgo-utils/combine_training_corpus.py     |  57 +-----
 llvm/utils/mlgo-utils/extract_ir.py           | 189 +-----------------
 llvm/utils/mlgo-utils/make_corpus.py          |  58 +-----
 .../mlgo/corpus/combine_training_corpus.py    |  52 +++++
 .../mlgo-utils/mlgo/corpus/extract_ir.py      | 184 +++++++++++++++++
 .../mlgo-utils/mlgo/corpus/make_corpus.py     |  53 +++++
 6 files changed, 310 insertions(+), 283 deletions(-)
 mode change 100644 => 100755 llvm/utils/mlgo-utils/combine_training_corpus.py
 mode change 100644 => 100755 llvm/utils/mlgo-utils/extract_ir.py
 mode change 100644 => 100755 llvm/utils/mlgo-utils/make_corpus.py
 create mode 100644 llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
 create mode 100644 llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
 create mode 100644 llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py

diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py
old mode 100644
new mode 100755
index 9884d6696a43f..7a1d870ad7e38
--- a/llvm/utils/mlgo-utils/combine_training_corpus.py
+++ b/llvm/utils/mlgo-utils/combine_training_corpus.py
@@ -1,52 +1,9 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-r"""Combine multiple training corpus into a single training corpus.
+#!/usr/bin/env python3
 
-Currently only support the case that multiple corpus share the same
-configurables except the "modules" field.
+import re
+import sys
+from mlgo.corpus.combine_training_corpus import parse_args_and_run
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+    sys.exit(parse_args_and_run())
 
-Usage: we'd like to combine training corpus corpus1 and corpus2 into
-combinedcorpus; we first structure the files as follows:
-
-combinedcorpus
-combinedcorpus/corpus1
-combinedcorpus/corpus2
-
-Running this script with
-
-python3 \
-compiler_opt/tools/combine_training_corpus.py \
-  --root_dir=$PATH_TO_combinedcorpus
-
-generates combinedcorpus/corpus_description.json file. In this way corpus1
-and corpus2 are combined into combinedcorpus.
-"""
-
-import argparse
-import logging
-
-from mlgo.corpus import combine_training_corpus_lib
-from mlgo.corpus import flags
-
-
-def parse_args_and_run():
-    parser = argparse.ArgumentParser(
-        description="A tool for combining multiple training corpora"
-    )
-    parser.add_argument(
-        "--root_dir", type=str, help="The root dir of module paths to combine."
-    )
-    flags.add_verbosity_arguments(parser)
-    args = parser.parse_args()
-    main(args)
-
-
-def main(args):
-    logging.basicConfig(level=args.verbosity)
-
-    combine_training_corpus_lib.combine_corpus(args.root_dir)
-
-
-if __name__ == "__main__":
-    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py
old mode 100644
new mode 100755
index 3101cef196b4a..589a5c50af726
--- a/llvm/utils/mlgo-utils/extract_ir.py
+++ b/llvm/utils/mlgo-utils/extract_ir.py
@@ -1,184 +1,9 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-"""Extract IR for training.
+#!/usr/bin/env python3
 
-Extract IR for training, either from a compile_commands.json file produced by
-cmake, or a linker parameter list file.
+import re
+import sys
+from mlgo.corpus.extract_ir import parse_args_and_run
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+    sys.exit(parse_args_and_run())
 
-Only run with
-'python compiler_opt/tools/extract_ir.py ...'
-
-The compilation is assumed to have been performed with clang, using
--fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
-
-In a distributed ThinLTO case, the compilation is assumed to have been performed
-specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
-
-In a local ThinLTO case, the compilation is assumedto have been performed
-specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
-
-To change the logging verbosity, set the --verbosity flag to the desired level.
-Setting it to a specific level will enable all messages at that level and
-higher. Exact values can be found by invoking the script with --help.
-"""
-
-import argparse
-import json
-import logging
-
-from mlgo.corpus import extract_ir_lib
-from mlgo.corpus import flags
-
-
-def parse_args_and_run():
-    parser = argparse.ArgumentParser(
-        description="A tool for making a corpus from build artifacts"
-    )
-    parser.add_argument(
-        "--input",
-        type=str,
-        help="Input file or directory - either compile_commands.json, a linker "
-        "parameter list, or a path to a directory containing object files.",
-    )
-    parser.add_argument(
-        "--input_type",
-        type=str,
-        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
-        choices=["json", "params", "directory", "bazel_aquery"],
-        default="json",
-        nargs="?",
-    )
-    parser.add_argument("--output_dir", type=str, help="Output directory")
-    parser.add_argument(
-        "--num_workers",
-        type=int,
-        help="Number of parallel works for objcopy. `None` for maximum available.",
-        default=None,
-        nargs="?",
-    )
-    parser.add_argument(
-        "--llvm_objcopy_path",
-        type=str,
-        help="Path to llvm-objcopy",
-        default="llvm-objcopy",
-        nargs="?",
-    )
-    parser.add_argument(
-        "--obj_base_dir",
-        type=str,
-        help="Base directory for object files. Defaults to current working dir.",
-        default="",
-        nargs="?",
-    )
-    parser.add_argument(
-        "--cmd_filter",
-        type=str,
-        help="Include only those modules with a command line matching this regular "
-        "expression. Set it to None to not perform any filtering. Note that the "
-        "regular expression is applied independently for each separate command line "
-        "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
-        "with thinlto_build=lld.",
-        default=None,
-        nargs="?",
-    )
-    parser.add_argument(
-        "--thinlto_build",
-        type=str,
-        help="Set if the build was performed with either 'distributed' or 'local' "
-        "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
-        "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
-        "the distributed case or -Wl,--save-temps=import and "
-        "-Wl,--thinlto-emit-index-files passed in the local case",
-        choices=["distributed", "local"],
-        default=None,
-        nargs="?",
-    )
-    parser.add_argument(
-        "--cmd_section_name",
-        type=str,
-        help="The section name passed to llvm-objcopy. For ELF object files, the "
-        "default .llvmcmd is correct. For Mach-O object files, one should use "
-        "something like __LLVM,__cmdline",
-        default=".llvmcmd",
-        nargs="?",
-    )
-    parser.add_argument(
-        "--bitcode_section_name",
-        type=str,
-        help="The section name passed to llvm-objcopy. For ELF object files, the "
-        "default .llvmbc is correct. For Mach-O object files, one should use "
-        "__LLVM,__bitcode",
-        default=".llvmbc",
-        nargs="?",
-    )
-    flags.add_verbosity_arguments(parser)
-    args = parser.parse_args()
-    main(args)
-
-
-def main(args):
-    logging.basicConfig(level=args.verbosity)
-
-    objs = []
-    if args.input is not None and args.thinlto_build == "local":
-        raise ValueError("--thinlto_build=local cannot be run with --input")
-    if args.input is None:
-        if args.thinlto_build != "local":
-            raise ValueError("--input or --thinlto_build=local must be provided")
-        objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
-    elif args.input_type == "json":
-        with open(args.input, encoding="utf-8") as f:
-            objs = extract_ir_lib.load_from_compile_commands(
-                json.load(f), args.output_dir
-            )
-    elif args.input_type == "params":
-        if not args.obj_base_dir:
-            logging.info(
-                "-obj_base_dir is unspecified, assuming current directory. "
-                "If no objects are found, use this option to specify the root "
-                "directory for the object file paths in the input file."
-            )
-        with open(args.input, encoding="utf-8") as f:
-            objs = extract_ir_lib.load_from_lld_params(
-                [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir
-            )
-    elif args.input_type == "directory":
-        logging.warning(
-            "Using the directory input is only recommended if the build system "
-            "your project uses does not support any structured output that "
-            "ml-compiler-opt understands. If your build system provides a "
-            "structured compilation database, use that instead"
-        )
-        objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
-    elif args.input_type == "bazel_aquery":
-        with open(args.input, encoding="utf-8") as aquery_json_handle:
-            objs = extract_ir_lib.load_bazel_aquery(
-                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
-            )
-    else:
-        logging.error("Unknown input type: %s", args.input_type)
-
-    relative_output_paths = extract_ir_lib.run_extraction(
-        objs,
-        args.num_workers,
-        args.llvm_objcopy_path,
-        args.cmd_filter,
-        args.thinlto_build,
-        args.cmd_section_name,
-        args.bitcode_section_name,
-    )
-
-    extract_ir_lib.write_corpus_manifest(
-        args.thinlto_build, relative_output_paths, args.output_dir
-    )
-
-    logging.info(
-        "Converted %d files out of %d",
-        len(objs) - relative_output_paths.count(None),
-        len(objs),
-    )
-
-
-if __name__ == "__main__":
-    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py
old mode 100644
new mode 100755
index 221486e16c6e0..5b4a9bef486ff
--- a/llvm/utils/mlgo-utils/make_corpus.py
+++ b/llvm/utils/mlgo-utils/make_corpus.py
@@ -1,53 +1,9 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-"""Tool for making a corpus from arbitrary bitcode.
+#!/usr/bin/env python3
 
-To create a corpus from a set of bitcode files in an input directory, run
-the following command:
+import re
+import sys
+from mlgo.corpus.make_corpus import parse_args_and_run
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+    sys.exit(parse_args_and_run())
 
-PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
-  --input_dir=<path to input directory> \
-  --output_dir=<path to output directory> \
-  --default_args="<list of space separated flags>"
-"""
-
-import argparse
-import logging
-
-from mlgo.corpus import make_corpus_lib
-
-
-def parse_args_and_run():
-    parser = argparse.ArgumentParser(
-        description="A tool for making a corpus from arbitrary bitcode"
-    )
-    parser.add_argument("--input_dir", type=str, help="The input directory.")
-    parser.add_argument("--output_dir", type=str, help="The output directory.")
-    parser.add_argument(
-        "--default_args",
-        type=str,
-        help="The compiler flags to compile with when using downstream tooling.",
-        default="",
-        nargs="?",
-    )
-    args = parser.parse_args()
-    main(args)
-
-
-def main(args):
-    logging.warning(
-        "Using this tool does not guarantee that the bitcode is taken at "
-        "the correct stage for consumption during model training. Make "
-        "sure to validate assumptions about where the bitcode is coming "
-        "from before using it in production."
-    )
-    relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir)
-    make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir)
-    make_corpus_lib.write_corpus_manifest(
-        relative_paths, args.output_dir, args.default_args.split()
-    )
-
-
-if __name__ == "__main__":
-    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
new file mode 100644
index 0000000000000..9884d6696a43f
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
@@ -0,0 +1,52 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+r"""Combine multiple training corpus into a single training corpus.
+
+Currently only support the case that multiple corpus share the same
+configurables except the "modules" field.
+
+Usage: we'd like to combine training corpus corpus1 and corpus2 into
+combinedcorpus; we first structure the files as follows:
+
+combinedcorpus
+combinedcorpus/corpus1
+combinedcorpus/corpus2
+
+Running this script with
+
+python3 \
+compiler_opt/tools/combine_training_corpus.py \
+  --root_dir=$PATH_TO_combinedcorpus
+
+generates combinedcorpus/corpus_description.json file. In this way corpus1
+and corpus2 are combined into combinedcorpus.
+"""
+
+import argparse
+import logging
+
+from mlgo.corpus import combine_training_corpus_lib
+from mlgo.corpus import flags
+
+
+def parse_args_and_run():
+    parser = argparse.ArgumentParser(
+        description="A tool for combining multiple training corpora"
+    )
+    parser.add_argument(
+        "--root_dir", type=str, help="The root dir of module paths to combine."
+    )
+    flags.add_verbosity_arguments(parser)
+    args = parser.parse_args()
+    main(args)
+
+
+def main(args):
+    logging.basicConfig(level=args.verbosity)
+
+    combine_training_corpus_lib.combine_corpus(args.root_dir)
+
+
+if __name__ == "__main__":
+    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
new file mode 100644
index 0000000000000..3101cef196b4a
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
@@ -0,0 +1,184 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Extract IR for training.
+
+Extract IR for training, either from a compile_commands.json file produced by
+cmake, or a linker parameter list file.
+
+Only run with
+'python compiler_opt/tools/extract_ir.py ...'
+
+The compilation is assumed to have been performed with clang, using
+-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
+
+In a distributed ThinLTO case, the compilation is assumed to have been performed
+specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
+
+In a local ThinLTO case, the compilation is assumedto have been performed
+specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
+
+To change the logging verbosity, set the --verbosity flag to the desired level.
+Setting it to a specific level will enable all messages at that level and
+higher. Exact values can be found by invoking the script with --help.
+"""
+
+import argparse
+import json
+import logging
+
+from mlgo.corpus import extract_ir_lib
+from mlgo.corpus import flags
+
+
+def parse_args_and_run():
+    parser = argparse.ArgumentParser(
+        description="A tool for making a corpus from build artifacts"
+    )
+    parser.add_argument(
+        "--input",
+        type=str,
+        help="Input file or directory - either compile_commands.json, a linker "
+        "parameter list, or a path to a directory containing object files.",
+    )
+    parser.add_argument(
+        "--input_type",
+        type=str,
+        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
+        choices=["json", "params", "directory", "bazel_aquery"],
+        default="json",
+        nargs="?",
+    )
+    parser.add_argument("--output_dir", type=str, help="Output directory")
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        help="Number of parallel works for objcopy. `None` for maximum available.",
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--llvm_objcopy_path",
+        type=str,
+        help="Path to llvm-objcopy",
+        default="llvm-objcopy",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--obj_base_dir",
+        type=str,
+        help="Base directory for object files. Defaults to current working dir.",
+        default="",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--cmd_filter",
+        type=str,
+        help="Include only those modules with a command line matching this regular "
+        "expression. Set it to None to not perform any filtering. Note that the "
+        "regular expression is applied independently for each separate command line "
+        "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
+        "with thinlto_build=lld.",
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--thinlto_build",
+        type=str,
+        help="Set if the build was performed with either 'distributed' or 'local' "
+        "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
+        "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
+        "the distributed case or -Wl,--save-temps=import and "
+        "-Wl,--thinlto-emit-index-files passed in the local case",
+        choices=["distributed", "local"],
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--cmd_section_name",
+        type=str,
+        help="The section name passed to llvm-objcopy. For ELF object files, the "
+        "default .llvmcmd is correct. For Mach-O object files, one should use "
+        "something like __LLVM,__cmdline",
+        default=".llvmcmd",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--bitcode_section_name",
+        type=str,
+        help="The section name passed to llvm-objcopy. For ELF object files, the "
+        "default .llvmbc is correct. For Mach-O object files, one should use "
+        "__LLVM,__bitcode",
+        default=".llvmbc",
+        nargs="?",
+    )
+    flags.add_verbosity_arguments(parser)
+    args = parser.parse_args()
+    main(args)
+
+
+def main(args):
+    logging.basicConfig(level=args.verbosity)
+
+    objs = []
+    if args.input is not None and args.thinlto_build == "local":
+        raise ValueError("--thinlto_build=local cannot be run with --input")
+    if args.input is None:
+        if args.thinlto_build != "local":
+            raise ValueError("--input or --thinlto_build=local must be provided")
+        objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
+    elif args.input_type == "json":
+        with open(args.input, encoding="utf-8") as f:
+            objs = extract_ir_lib.load_from_compile_commands(
+                json.load(f), args.output_dir
+            )
+    elif args.input_type == "params":
+        if not args.obj_base_dir:
+            logging.info(
+                "-obj_base_dir is unspecified, assuming current directory. "
+                "If no objects are found, use this option to specify the root "
+                "directory for the object file paths in the input file."
+            )
+        with open(args.input, encoding="utf-8") as f:
+            objs = extract_ir_lib.load_from_lld_params(
+                [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir
+            )
+    elif args.input_type == "directory":
+        logging.warning(
+            "Using the directory input is only recommended if the build system "
+            "your project uses does not support any structured output that "
+            "ml-compiler-opt understands. If your build system provides a "
+            "structured compilation database, use that instead"
+        )
+        objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
+    elif args.input_type == "bazel_aquery":
+        with open(args.input, encoding="utf-8") as aquery_json_handle:
+            objs = extract_ir_lib.load_bazel_aquery(
+                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
+            )
+    else:
+        logging.error("Unknown input type: %s", args.input_type)
+
+    relative_output_paths = extract_ir_lib.run_extraction(
+        objs,
+        args.num_workers,
+        args.llvm_objcopy_path,
+        args.cmd_filter,
+        args.thinlto_build,
+        args.cmd_section_name,
+        args.bitcode_section_name,
+    )
+
+    extract_ir_lib.write_corpus_manifest(
+        args.thinlto_build, relative_output_paths, args.output_dir
+    )
+
+    logging.info(
+        "Converted %d files out of %d",
+        len(objs) - relative_output_paths.count(None),
+        len(objs),
+    )
+
+
+if __name__ == "__main__":
+    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
new file mode 100644
index 0000000000000..221486e16c6e0
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
@@ -0,0 +1,53 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Tool for making a corpus from arbitrary bitcode.
+
+To create a corpus from a set of bitcode files in an input directory, run
+the following command:
+
+PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
+  --input_dir=<path to input directory> \
+  --output_dir=<path to output directory> \
+  --default_args="<list of space separated flags>"
+"""
+
+import argparse
+import logging
+
+from mlgo.corpus import make_corpus_lib
+
+
+def parse_args_and_run():
+    parser = argparse.ArgumentParser(
+        description="A tool for making a corpus from arbitrary bitcode"
+    )
+    parser.add_argument("--input_dir", type=str, help="The input directory.")
+    parser.add_argument("--output_dir", type=str, help="The output directory.")
+    parser.add_argument(
+        "--default_args",
+        type=str,
+        help="The compiler flags to compile with when using downstream tooling.",
+        default="",
+        nargs="?",
+    )
+    args = parser.parse_args()
+    main(args)
+
+
+def main(args):
+    logging.warning(
+        "Using this tool does not guarantee that the bitcode is taken at "
+        "the correct stage for consumption during model training. Make "
+        "sure to validate assumptions about where the bitcode is coming "
+        "from before using it in production."
+    )
+    relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir)
+    make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir)
+    make_corpus_lib.write_corpus_manifest(
+        relative_paths, args.output_dir, args.default_args.split()
+    )
+
+
+if __name__ == "__main__":
+    parse_args_and_run()

From d51bd453fe228504f5cf06db2836798860399880 Mon Sep 17 00:00:00 2001
From: Vincent Lee <leevince@fb.com>
Date: Fri, 4 Jul 2025 11:20:34 -0700
Subject: [PATCH 3/7] Format the wrapper scripts

---
 llvm/utils/mlgo-utils/combine_training_corpus.py | 2 +-
 llvm/utils/mlgo-utils/extract_ir.py              | 2 +-
 llvm/utils/mlgo-utils/make_corpus.py             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py
index 7a1d870ad7e38..563801091f2d2 100755
--- a/llvm/utils/mlgo-utils/combine_training_corpus.py
+++ b/llvm/utils/mlgo-utils/combine_training_corpus.py
@@ -3,7 +3,7 @@
 import re
 import sys
 from mlgo.corpus.combine_training_corpus import parse_args_and_run
+
 if __name__ == '__main__':
     sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
     sys.exit(parse_args_and_run())
-
diff --git a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py
index 589a5c50af726..1ed7d2a13f43b 100755
--- a/llvm/utils/mlgo-utils/extract_ir.py
+++ b/llvm/utils/mlgo-utils/extract_ir.py
@@ -3,7 +3,7 @@
 import re
 import sys
 from mlgo.corpus.extract_ir import parse_args_and_run
+
 if __name__ == '__main__':
     sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
     sys.exit(parse_args_and_run())
-
diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py
index 5b4a9bef486ff..3e1a4fcca8cb6 100755
--- a/llvm/utils/mlgo-utils/make_corpus.py
+++ b/llvm/utils/mlgo-utils/make_corpus.py
@@ -3,7 +3,7 @@
 import re
 import sys
 from mlgo.corpus.make_corpus import parse_args_and_run
+
 if __name__ == '__main__':
     sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
     sys.exit(parse_args_and_run())
-

From 370f7ac40c13a84bafea103fd6b92357078e9c49 Mon Sep 17 00:00:00 2001
From: Vincent Lee <leevince@fb.com>
Date: Mon, 7 Jul 2025 15:16:12 -0700
Subject: [PATCH 4/7] Use double quotes

---
 llvm/utils/mlgo-utils/combine_training_corpus.py | 4 ++--
 llvm/utils/mlgo-utils/extract_ir.py              | 4 ++--
 llvm/utils/mlgo-utils/make_corpus.py             | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py
index 563801091f2d2..b8c247ecb181c 100755
--- a/llvm/utils/mlgo-utils/combine_training_corpus.py
+++ b/llvm/utils/mlgo-utils/combine_training_corpus.py
@@ -4,6 +4,6 @@
 import sys
 from mlgo.corpus.combine_training_corpus import parse_args_and_run
 
-if __name__ == '__main__':
-    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+if __name__ == "__main__":
+    sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0])
     sys.exit(parse_args_and_run())
diff --git a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py
index 1ed7d2a13f43b..85f05b9a72ce8 100755
--- a/llvm/utils/mlgo-utils/extract_ir.py
+++ b/llvm/utils/mlgo-utils/extract_ir.py
@@ -4,6 +4,6 @@
 import sys
 from mlgo.corpus.extract_ir import parse_args_and_run
 
-if __name__ == '__main__':
-    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+if __name__ == "__main__":
+    sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0])
     sys.exit(parse_args_and_run())
diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py
index 3e1a4fcca8cb6..725ac7f3461a0 100755
--- a/llvm/utils/mlgo-utils/make_corpus.py
+++ b/llvm/utils/mlgo-utils/make_corpus.py
@@ -4,6 +4,6 @@
 import sys
 from mlgo.corpus.make_corpus import parse_args_and_run
 
-if __name__ == '__main__':
-    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+if __name__ == "__main__":
+    sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0])
     sys.exit(parse_args_and_run())

From 7479f5a3bbd1569270742c32f79b45528033a7ca Mon Sep 17 00:00:00 2001
From: Vincent Lee <leevince@fb.com>
Date: Wed, 9 Jul 2025 02:03:37 -0700
Subject: [PATCH 5/7] Use symlinks instead of wrapper scripts

---
 .../mlgo-utils/combine_training_corpus.py     |  55 +++++-
 llvm/utils/mlgo-utils/extract_ir.py           | 187 +++++++++++++++++-
 llvm/utils/mlgo-utils/make_corpus.py          |  59 +++++-
 .../mlgo/corpus/combine_training_corpus.py    |  53 +----
 .../mlgo-utils/mlgo/corpus/extract_ir.py      | 185 +----------------
 .../mlgo-utils/mlgo/corpus/make_corpus.py     |  54 +----
 6 files changed, 287 insertions(+), 306 deletions(-)
 mode change 100755 => 100644 llvm/utils/mlgo-utils/combine_training_corpus.py
 mode change 100755 => 100644 llvm/utils/mlgo-utils/extract_ir.py
 mode change 100755 => 100644 llvm/utils/mlgo-utils/make_corpus.py
 mode change 100644 => 120000 llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
 mode change 100644 => 120000 llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
 mode change 100644 => 120000 llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py

diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py
old mode 100755
new mode 100644
index b8c247ecb181c..9884d6696a43f
--- a/llvm/utils/mlgo-utils/combine_training_corpus.py
+++ b/llvm/utils/mlgo-utils/combine_training_corpus.py
@@ -1,9 +1,52 @@
-#!/usr/bin/env python3
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+r"""Combine multiple training corpus into a single training corpus.
+
+Currently only support the case that multiple corpus share the same
+configurables except the "modules" field.
+
+Usage: we'd like to combine training corpus corpus1 and corpus2 into
+combinedcorpus; we first structure the files as follows:
+
+combinedcorpus
+combinedcorpus/corpus1
+combinedcorpus/corpus2
+
+Running this script with
+
+python3 \
+compiler_opt/tools/combine_training_corpus.py \
+  --root_dir=$PATH_TO_combinedcorpus
+
+generates combinedcorpus/corpus_description.json file. In this way corpus1
+and corpus2 are combined into combinedcorpus.
+"""
+
+import argparse
+import logging
+
+from mlgo.corpus import combine_training_corpus_lib
+from mlgo.corpus import flags
+
+
+def parse_args_and_run():
+    parser = argparse.ArgumentParser(
+        description="A tool for combining multiple training corpora"
+    )
+    parser.add_argument(
+        "--root_dir", type=str, help="The root dir of module paths to combine."
+    )
+    flags.add_verbosity_arguments(parser)
+    args = parser.parse_args()
+    main(args)
+
+
+def main(args):
+    logging.basicConfig(level=args.verbosity)
+
+    combine_training_corpus_lib.combine_corpus(args.root_dir)
 
-import re
-import sys
-from mlgo.corpus.combine_training_corpus import parse_args_and_run
 
 if __name__ == "__main__":
-    sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0])
-    sys.exit(parse_args_and_run())
+    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py
old mode 100755
new mode 100644
index 85f05b9a72ce8..3101cef196b4a
--- a/llvm/utils/mlgo-utils/extract_ir.py
+++ b/llvm/utils/mlgo-utils/extract_ir.py
@@ -1,9 +1,184 @@
-#!/usr/bin/env python3
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Extract IR for training.
+
+Extract IR for training, either from a compile_commands.json file produced by
+cmake, or a linker parameter list file.
+
+Only run with
+'python compiler_opt/tools/extract_ir.py ...'
+
+The compilation is assumed to have been performed with clang, using
+-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
+
+In a distributed ThinLTO case, the compilation is assumed to have been performed
+specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
+
+In a local ThinLTO case, the compilation is assumedto have been performed
+specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
+
+To change the logging verbosity, set the --verbosity flag to the desired level.
+Setting it to a specific level will enable all messages at that level and
+higher. Exact values can be found by invoking the script with --help.
+"""
+
+import argparse
+import json
+import logging
+
+from mlgo.corpus import extract_ir_lib
+from mlgo.corpus import flags
+
+
+def parse_args_and_run():
+    parser = argparse.ArgumentParser(
+        description="A tool for making a corpus from build artifacts"
+    )
+    parser.add_argument(
+        "--input",
+        type=str,
+        help="Input file or directory - either compile_commands.json, a linker "
+        "parameter list, or a path to a directory containing object files.",
+    )
+    parser.add_argument(
+        "--input_type",
+        type=str,
+        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
+        choices=["json", "params", "directory", "bazel_aquery"],
+        default="json",
+        nargs="?",
+    )
+    parser.add_argument("--output_dir", type=str, help="Output directory")
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        help="Number of parallel works for objcopy. `None` for maximum available.",
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--llvm_objcopy_path",
+        type=str,
+        help="Path to llvm-objcopy",
+        default="llvm-objcopy",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--obj_base_dir",
+        type=str,
+        help="Base directory for object files. Defaults to current working dir.",
+        default="",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--cmd_filter",
+        type=str,
+        help="Include only those modules with a command line matching this regular "
+        "expression. Set it to None to not perform any filtering. Note that the "
+        "regular expression is applied independently for each separate command line "
+        "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
+        "with thinlto_build=lld.",
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--thinlto_build",
+        type=str,
+        help="Set if the build was performed with either 'distributed' or 'local' "
+        "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
+        "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
+        "the distributed case or -Wl,--save-temps=import and "
+        "-Wl,--thinlto-emit-index-files passed in the local case",
+        choices=["distributed", "local"],
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--cmd_section_name",
+        type=str,
+        help="The section name passed to llvm-objcopy. For ELF object files, the "
+        "default .llvmcmd is correct. For Mach-O object files, one should use "
+        "something like __LLVM,__cmdline",
+        default=".llvmcmd",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--bitcode_section_name",
+        type=str,
+        help="The section name passed to llvm-objcopy. For ELF object files, the "
+        "default .llvmbc is correct. For Mach-O object files, one should use "
+        "__LLVM,__bitcode",
+        default=".llvmbc",
+        nargs="?",
+    )
+    flags.add_verbosity_arguments(parser)
+    args = parser.parse_args()
+    main(args)
+
+
+def main(args):
+    logging.basicConfig(level=args.verbosity)
+
+    objs = []
+    if args.input is not None and args.thinlto_build == "local":
+        raise ValueError("--thinlto_build=local cannot be run with --input")
+    if args.input is None:
+        if args.thinlto_build != "local":
+            raise ValueError("--input or --thinlto_build=local must be provided")
+        objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
+    elif args.input_type == "json":
+        with open(args.input, encoding="utf-8") as f:
+            objs = extract_ir_lib.load_from_compile_commands(
+                json.load(f), args.output_dir
+            )
+    elif args.input_type == "params":
+        if not args.obj_base_dir:
+            logging.info(
+                "-obj_base_dir is unspecified, assuming current directory. "
+                "If no objects are found, use this option to specify the root "
+                "directory for the object file paths in the input file."
+            )
+        with open(args.input, encoding="utf-8") as f:
+            objs = extract_ir_lib.load_from_lld_params(
+                [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir
+            )
+    elif args.input_type == "directory":
+        logging.warning(
+            "Using the directory input is only recommended if the build system "
+            "your project uses does not support any structured output that "
+            "ml-compiler-opt understands. If your build system provides a "
+            "structured compilation database, use that instead"
+        )
+        objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
+    elif args.input_type == "bazel_aquery":
+        with open(args.input, encoding="utf-8") as aquery_json_handle:
+            objs = extract_ir_lib.load_bazel_aquery(
+                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
+            )
+    else:
+        logging.error("Unknown input type: %s", args.input_type)
+
+    relative_output_paths = extract_ir_lib.run_extraction(
+        objs,
+        args.num_workers,
+        args.llvm_objcopy_path,
+        args.cmd_filter,
+        args.thinlto_build,
+        args.cmd_section_name,
+        args.bitcode_section_name,
+    )
+
+    extract_ir_lib.write_corpus_manifest(
+        args.thinlto_build, relative_output_paths, args.output_dir
+    )
+
+    logging.info(
+        "Converted %d files out of %d",
+        len(objs) - relative_output_paths.count(None),
+        len(objs),
+    )
 
-import re
-import sys
-from mlgo.corpus.extract_ir import parse_args_and_run
 
 if __name__ == "__main__":
-    sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0])
-    sys.exit(parse_args_and_run())
+    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py
old mode 100755
new mode 100644
index 725ac7f3461a0..92aab4d969d4d
--- a/llvm/utils/mlgo-utils/make_corpus.py
+++ b/llvm/utils/mlgo-utils/make_corpus.py
@@ -1,9 +1,58 @@
-#!/usr/bin/env python3
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Tool for making a corpus from arbitrary bitcode.
 
-import re
+To create a corpus from a set of bitcode files in an input directory, run
+the following command:
+
+PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
+  --input_dir=<path to input directory> \
+  --output_dir=<path to output directory> \
+  --default_args="<list of space separated flags>"
+"""
+
+import argparse
+import logging
 import sys
-from mlgo.corpus.make_corpus import parse_args_and_run
+import pathlib
+print(pathlib.Path(__file__).parent.parent.parent)
+
+sys.path.insert(0, pathlib.Path(__file__).parent.parent.parent)
+
+from mlgo.corpus import make_corpus_lib
+
+
+def parse_args_and_run():
+    parser = argparse.ArgumentParser(
+        description="A tool for making a corpus from arbitrary bitcode"
+    )
+    parser.add_argument("--input_dir", type=str, help="The input directory.")
+    parser.add_argument("--output_dir", type=str, help="The output directory.")
+    parser.add_argument(
+        "--default_args",
+        type=str,
+        help="The compiler flags to compile with when using downstream tooling.",
+        default="",
+        nargs="?",
+    )
+    args = parser.parse_args()
+    main(args)
+
+
+def main(args):
+    logging.warning(
+        "Using this tool does not guarantee that the bitcode is taken at "
+        "the correct stage for consumption during model training. Make "
+        "sure to validate assumptions about where the bitcode is coming "
+        "from before using it in production."
+    )
+    relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir)
+    make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir)
+    make_corpus_lib.write_corpus_manifest(
+        relative_paths, args.output_dir, args.default_args.split()
+    )
+
 
 if __name__ == "__main__":
-    sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0])
-    sys.exit(parse_args_and_run())
+    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
deleted file mode 100644
index 9884d6696a43f..0000000000000
--- a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-r"""Combine multiple training corpus into a single training corpus.
-
-Currently only support the case that multiple corpus share the same
-configurables except the "modules" field.
-
-Usage: we'd like to combine training corpus corpus1 and corpus2 into
-combinedcorpus; we first structure the files as follows:
-
-combinedcorpus
-combinedcorpus/corpus1
-combinedcorpus/corpus2
-
-Running this script with
-
-python3 \
-compiler_opt/tools/combine_training_corpus.py \
-  --root_dir=$PATH_TO_combinedcorpus
-
-generates combinedcorpus/corpus_description.json file. In this way corpus1
-and corpus2 are combined into combinedcorpus.
-"""
-
-import argparse
-import logging
-
-from mlgo.corpus import combine_training_corpus_lib
-from mlgo.corpus import flags
-
-
-def parse_args_and_run():
-    parser = argparse.ArgumentParser(
-        description="A tool for combining multiple training corpora"
-    )
-    parser.add_argument(
-        "--root_dir", type=str, help="The root dir of module paths to combine."
-    )
-    flags.add_verbosity_arguments(parser)
-    args = parser.parse_args()
-    main(args)
-
-
-def main(args):
-    logging.basicConfig(level=args.verbosity)
-
-    combine_training_corpus_lib.combine_corpus(args.root_dir)
-
-
-if __name__ == "__main__":
-    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
new file mode 120000
index 0000000000000..5a6885a6d1fa2
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
@@ -0,0 +1 @@
+../../combine_training_corpus.py
\ No newline at end of file
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
deleted file mode 100644
index 3101cef196b4a..0000000000000
--- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
+++ /dev/null
@@ -1,184 +0,0 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-"""Extract IR for training.
-
-Extract IR for training, either from a compile_commands.json file produced by
-cmake, or a linker parameter list file.
-
-Only run with
-'python compiler_opt/tools/extract_ir.py ...'
-
-The compilation is assumed to have been performed with clang, using
--fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
-
-In a distributed ThinLTO case, the compilation is assumed to have been performed
-specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
-
-In a local ThinLTO case, the compilation is assumedto have been performed
-specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
-
-To change the logging verbosity, set the --verbosity flag to the desired level.
-Setting it to a specific level will enable all messages at that level and
-higher. Exact values can be found by invoking the script with --help.
-"""
-
-import argparse
-import json
-import logging
-
-from mlgo.corpus import extract_ir_lib
-from mlgo.corpus import flags
-
-
-def parse_args_and_run():
-    parser = argparse.ArgumentParser(
-        description="A tool for making a corpus from build artifacts"
-    )
-    parser.add_argument(
-        "--input",
-        type=str,
-        help="Input file or directory - either compile_commands.json, a linker "
-        "parameter list, or a path to a directory containing object files.",
-    )
-    parser.add_argument(
-        "--input_type",
-        type=str,
-        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
-        choices=["json", "params", "directory", "bazel_aquery"],
-        default="json",
-        nargs="?",
-    )
-    parser.add_argument("--output_dir", type=str, help="Output directory")
-    parser.add_argument(
-        "--num_workers",
-        type=int,
-        help="Number of parallel works for objcopy. `None` for maximum available.",
-        default=None,
-        nargs="?",
-    )
-    parser.add_argument(
-        "--llvm_objcopy_path",
-        type=str,
-        help="Path to llvm-objcopy",
-        default="llvm-objcopy",
-        nargs="?",
-    )
-    parser.add_argument(
-        "--obj_base_dir",
-        type=str,
-        help="Base directory for object files. Defaults to current working dir.",
-        default="",
-        nargs="?",
-    )
-    parser.add_argument(
-        "--cmd_filter",
-        type=str,
-        help="Include only those modules with a command line matching this regular "
-        "expression. Set it to None to not perform any filtering. Note that the "
-        "regular expression is applied independently for each separate command line "
-        "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
-        "with thinlto_build=lld.",
-        default=None,
-        nargs="?",
-    )
-    parser.add_argument(
-        "--thinlto_build",
-        type=str,
-        help="Set if the build was performed with either 'distributed' or 'local' "
-        "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
-        "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
-        "the distributed case or -Wl,--save-temps=import and "
-        "-Wl,--thinlto-emit-index-files passed in the local case",
-        choices=["distributed", "local"],
-        default=None,
-        nargs="?",
-    )
-    parser.add_argument(
-        "--cmd_section_name",
-        type=str,
-        help="The section name passed to llvm-objcopy. For ELF object files, the "
-        "default .llvmcmd is correct. For Mach-O object files, one should use "
-        "something like __LLVM,__cmdline",
-        default=".llvmcmd",
-        nargs="?",
-    )
-    parser.add_argument(
-        "--bitcode_section_name",
-        type=str,
-        help="The section name passed to llvm-objcopy. For ELF object files, the "
-        "default .llvmbc is correct. For Mach-O object files, one should use "
-        "__LLVM,__bitcode",
-        default=".llvmbc",
-        nargs="?",
-    )
-    flags.add_verbosity_arguments(parser)
-    args = parser.parse_args()
-    main(args)
-
-
-def main(args):
-    logging.basicConfig(level=args.verbosity)
-
-    objs = []
-    if args.input is not None and args.thinlto_build == "local":
-        raise ValueError("--thinlto_build=local cannot be run with --input")
-    if args.input is None:
-        if args.thinlto_build != "local":
-            raise ValueError("--input or --thinlto_build=local must be provided")
-        objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
-    elif args.input_type == "json":
-        with open(args.input, encoding="utf-8") as f:
-            objs = extract_ir_lib.load_from_compile_commands(
-                json.load(f), args.output_dir
-            )
-    elif args.input_type == "params":
-        if not args.obj_base_dir:
-            logging.info(
-                "-obj_base_dir is unspecified, assuming current directory. "
-                "If no objects are found, use this option to specify the root "
-                "directory for the object file paths in the input file."
-            )
-        with open(args.input, encoding="utf-8") as f:
-            objs = extract_ir_lib.load_from_lld_params(
-                [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir
-            )
-    elif args.input_type == "directory":
-        logging.warning(
-            "Using the directory input is only recommended if the build system "
-            "your project uses does not support any structured output that "
-            "ml-compiler-opt understands. If your build system provides a "
-            "structured compilation database, use that instead"
-        )
-        objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
-    elif args.input_type == "bazel_aquery":
-        with open(args.input, encoding="utf-8") as aquery_json_handle:
-            objs = extract_ir_lib.load_bazel_aquery(
-                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
-            )
-    else:
-        logging.error("Unknown input type: %s", args.input_type)
-
-    relative_output_paths = extract_ir_lib.run_extraction(
-        objs,
-        args.num_workers,
-        args.llvm_objcopy_path,
-        args.cmd_filter,
-        args.thinlto_build,
-        args.cmd_section_name,
-        args.bitcode_section_name,
-    )
-
-    extract_ir_lib.write_corpus_manifest(
-        args.thinlto_build, relative_output_paths, args.output_dir
-    )
-
-    logging.info(
-        "Converted %d files out of %d",
-        len(objs) - relative_output_paths.count(None),
-        len(objs),
-    )
-
-
-if __name__ == "__main__":
-    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
new file mode 120000
index 0000000000000..ce3baa062b3e1
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
@@ -0,0 +1 @@
+../../extract_ir.py
\ No newline at end of file
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
deleted file mode 100644
index 221486e16c6e0..0000000000000
--- a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-"""Tool for making a corpus from arbitrary bitcode.
-
-To create a corpus from a set of bitcode files in an input directory, run
-the following command:
-
-PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
-  --input_dir=<path to input directory> \
-  --output_dir=<path to output directory> \
-  --default_args="<list of space separated flags>"
-"""
-
-import argparse
-import logging
-
-from mlgo.corpus import make_corpus_lib
-
-
-def parse_args_and_run():
-    parser = argparse.ArgumentParser(
-        description="A tool for making a corpus from arbitrary bitcode"
-    )
-    parser.add_argument("--input_dir", type=str, help="The input directory.")
-    parser.add_argument("--output_dir", type=str, help="The output directory.")
-    parser.add_argument(
-        "--default_args",
-        type=str,
-        help="The compiler flags to compile with when using downstream tooling.",
-        default="",
-        nargs="?",
-    )
-    args = parser.parse_args()
-    main(args)
-
-
-def main(args):
-    logging.warning(
-        "Using this tool does not guarantee that the bitcode is taken at "
-        "the correct stage for consumption during model training. Make "
-        "sure to validate assumptions about where the bitcode is coming "
-        "from before using it in production."
-    )
-    relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir)
-    make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir)
-    make_corpus_lib.write_corpus_manifest(
-        relative_paths, args.output_dir, args.default_args.split()
-    )
-
-
-if __name__ == "__main__":
-    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
new file mode 120000
index 0000000000000..7ea4447a76efc
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
@@ -0,0 +1 @@
+../../make_corpus.py
\ No newline at end of file

From 9ea9dce4d154712b728ef874cd9aa915605e10ae Mon Sep 17 00:00:00 2001
From: Vincent Lee <leevince@fb.com>
Date: Wed, 9 Jul 2025 02:11:04 -0700
Subject: [PATCH 6/7] Remove testing code

---
 llvm/utils/mlgo-utils/make_corpus.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py
index 92aab4d969d4d..221486e16c6e0 100644
--- a/llvm/utils/mlgo-utils/make_corpus.py
+++ b/llvm/utils/mlgo-utils/make_corpus.py
@@ -14,11 +14,6 @@
 
 import argparse
 import logging
-import sys
-import pathlib
-print(pathlib.Path(__file__).parent.parent.parent)
-
-sys.path.insert(0, pathlib.Path(__file__).parent.parent.parent)
 
 from mlgo.corpus import make_corpus_lib
 

From 2142d8dd28c34d041acdcab9ad6af20ed4244d5e Mon Sep 17 00:00:00 2001
From: Vincent Lee <leevince@fb.com>
Date: Wed, 9 Jul 2025 15:13:09 -0700
Subject: [PATCH 7/7] Put symlinks in root

---
 .../mlgo-utils/combine_training_corpus.py     |  53 +----
 llvm/utils/mlgo-utils/extract_ir.py           | 185 +-----------------
 llvm/utils/mlgo-utils/make_corpus.py          |  54 +----
 .../mlgo/corpus/combine_training_corpus.py    |  53 ++++-
 .../mlgo-utils/mlgo/corpus/extract_ir.py      | 185 +++++++++++++++++-
 .../mlgo-utils/mlgo/corpus/make_corpus.py     |  54 ++++-
 6 files changed, 292 insertions(+), 292 deletions(-)
 mode change 100644 => 120000 llvm/utils/mlgo-utils/combine_training_corpus.py
 mode change 100644 => 120000 llvm/utils/mlgo-utils/extract_ir.py
 mode change 100644 => 120000 llvm/utils/mlgo-utils/make_corpus.py
 mode change 120000 => 100644 llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
 mode change 120000 => 100644 llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
 mode change 120000 => 100644 llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py

diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py
deleted file mode 100644
index 9884d6696a43f..0000000000000
--- a/llvm/utils/mlgo-utils/combine_training_corpus.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-r"""Combine multiple training corpus into a single training corpus.
-
-Currently only support the case that multiple corpus share the same
-configurables except the "modules" field.
-
-Usage: we'd like to combine training corpus corpus1 and corpus2 into
-combinedcorpus; we first structure the files as follows:
-
-combinedcorpus
-combinedcorpus/corpus1
-combinedcorpus/corpus2
-
-Running this script with
-
-python3 \
-compiler_opt/tools/combine_training_corpus.py \
-  --root_dir=$PATH_TO_combinedcorpus
-
-generates combinedcorpus/corpus_description.json file. In this way corpus1
-and corpus2 are combined into combinedcorpus.
-"""
-
-import argparse
-import logging
-
-from mlgo.corpus import combine_training_corpus_lib
-from mlgo.corpus import flags
-
-
-def parse_args_and_run():
-    parser = argparse.ArgumentParser(
-        description="A tool for combining multiple training corpora"
-    )
-    parser.add_argument(
-        "--root_dir", type=str, help="The root dir of module paths to combine."
-    )
-    flags.add_verbosity_arguments(parser)
-    args = parser.parse_args()
-    main(args)
-
-
-def main(args):
-    logging.basicConfig(level=args.verbosity)
-
-    combine_training_corpus_lib.combine_corpus(args.root_dir)
-
-
-if __name__ == "__main__":
-    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py
new file mode 120000
index 0000000000000..d86f4ab284901
--- /dev/null
+++ b/llvm/utils/mlgo-utils/combine_training_corpus.py
@@ -0,0 +1 @@
+mlgo/corpus/combine_training_corpus.py
\ No newline at end of file
diff --git a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py
deleted file mode 100644
index 3101cef196b4a..0000000000000
--- a/llvm/utils/mlgo-utils/extract_ir.py
+++ /dev/null
@@ -1,184 +0,0 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-"""Extract IR for training.
-
-Extract IR for training, either from a compile_commands.json file produced by
-cmake, or a linker parameter list file.
-
-Only run with
-'python compiler_opt/tools/extract_ir.py ...'
-
-The compilation is assumed to have been performed with clang, using
--fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
-
-In a distributed ThinLTO case, the compilation is assumed to have been performed
-specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
-
-In a local ThinLTO case, the compilation is assumedto have been performed
-specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
-
-To change the logging verbosity, set the --verbosity flag to the desired level.
-Setting it to a specific level will enable all messages at that level and
-higher. Exact values can be found by invoking the script with --help.
-"""
-
-import argparse
-import json
-import logging
-
-from mlgo.corpus import extract_ir_lib
-from mlgo.corpus import flags
-
-
-def parse_args_and_run():
-    parser = argparse.ArgumentParser(
-        description="A tool for making a corpus from build artifacts"
-    )
-    parser.add_argument(
-        "--input",
-        type=str,
-        help="Input file or directory - either compile_commands.json, a linker "
-        "parameter list, or a path to a directory containing object files.",
-    )
-    parser.add_argument(
-        "--input_type",
-        type=str,
-        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
-        choices=["json", "params", "directory", "bazel_aquery"],
-        default="json",
-        nargs="?",
-    )
-    parser.add_argument("--output_dir", type=str, help="Output directory")
-    parser.add_argument(
-        "--num_workers",
-        type=int,
-        help="Number of parallel works for objcopy. `None` for maximum available.",
-        default=None,
-        nargs="?",
-    )
-    parser.add_argument(
-        "--llvm_objcopy_path",
-        type=str,
-        help="Path to llvm-objcopy",
-        default="llvm-objcopy",
-        nargs="?",
-    )
-    parser.add_argument(
-        "--obj_base_dir",
-        type=str,
-        help="Base directory for object files. Defaults to current working dir.",
-        default="",
-        nargs="?",
-    )
-    parser.add_argument(
-        "--cmd_filter",
-        type=str,
-        help="Include only those modules with a command line matching this regular "
-        "expression. Set it to None to not perform any filtering. Note that the "
-        "regular expression is applied independently for each separate command line "
-        "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
-        "with thinlto_build=lld.",
-        default=None,
-        nargs="?",
-    )
-    parser.add_argument(
-        "--thinlto_build",
-        type=str,
-        help="Set if the build was performed with either 'distributed' or 'local' "
-        "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
-        "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
-        "the distributed case or -Wl,--save-temps=import and "
-        "-Wl,--thinlto-emit-index-files passed in the local case",
-        choices=["distributed", "local"],
-        default=None,
-        nargs="?",
-    )
-    parser.add_argument(
-        "--cmd_section_name",
-        type=str,
-        help="The section name passed to llvm-objcopy. For ELF object files, the "
-        "default .llvmcmd is correct. For Mach-O object files, one should use "
-        "something like __LLVM,__cmdline",
-        default=".llvmcmd",
-        nargs="?",
-    )
-    parser.add_argument(
-        "--bitcode_section_name",
-        type=str,
-        help="The section name passed to llvm-objcopy. For ELF object files, the "
-        "default .llvmbc is correct. For Mach-O object files, one should use "
-        "__LLVM,__bitcode",
-        default=".llvmbc",
-        nargs="?",
-    )
-    flags.add_verbosity_arguments(parser)
-    args = parser.parse_args()
-    main(args)
-
-
-def main(args):
-    logging.basicConfig(level=args.verbosity)
-
-    objs = []
-    if args.input is not None and args.thinlto_build == "local":
-        raise ValueError("--thinlto_build=local cannot be run with --input")
-    if args.input is None:
-        if args.thinlto_build != "local":
-            raise ValueError("--input or --thinlto_build=local must be provided")
-        objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
-    elif args.input_type == "json":
-        with open(args.input, encoding="utf-8") as f:
-            objs = extract_ir_lib.load_from_compile_commands(
-                json.load(f), args.output_dir
-            )
-    elif args.input_type == "params":
-        if not args.obj_base_dir:
-            logging.info(
-                "-obj_base_dir is unspecified, assuming current directory. "
-                "If no objects are found, use this option to specify the root "
-                "directory for the object file paths in the input file."
-            )
-        with open(args.input, encoding="utf-8") as f:
-            objs = extract_ir_lib.load_from_lld_params(
-                [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir
-            )
-    elif args.input_type == "directory":
-        logging.warning(
-            "Using the directory input is only recommended if the build system "
-            "your project uses does not support any structured output that "
-            "ml-compiler-opt understands. If your build system provides a "
-            "structured compilation database, use that instead"
-        )
-        objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
-    elif args.input_type == "bazel_aquery":
-        with open(args.input, encoding="utf-8") as aquery_json_handle:
-            objs = extract_ir_lib.load_bazel_aquery(
-                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
-            )
-    else:
-        logging.error("Unknown input type: %s", args.input_type)
-
-    relative_output_paths = extract_ir_lib.run_extraction(
-        objs,
-        args.num_workers,
-        args.llvm_objcopy_path,
-        args.cmd_filter,
-        args.thinlto_build,
-        args.cmd_section_name,
-        args.bitcode_section_name,
-    )
-
-    extract_ir_lib.write_corpus_manifest(
-        args.thinlto_build, relative_output_paths, args.output_dir
-    )
-
-    logging.info(
-        "Converted %d files out of %d",
-        len(objs) - relative_output_paths.count(None),
-        len(objs),
-    )
-
-
-if __name__ == "__main__":
-    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py
new file mode 120000
index 0000000000000..c9b96abad0647
--- /dev/null
+++ b/llvm/utils/mlgo-utils/extract_ir.py
@@ -0,0 +1 @@
+mlgo/corpus/extract_ir.py
\ No newline at end of file
diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py
deleted file mode 100644
index 221486e16c6e0..0000000000000
--- a/llvm/utils/mlgo-utils/make_corpus.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-"""Tool for making a corpus from arbitrary bitcode.
-
-To create a corpus from a set of bitcode files in an input directory, run
-the following command:
-
-PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
-  --input_dir=<path to input directory> \
-  --output_dir=<path to output directory> \
-  --default_args="<list of space separated flags>"
-"""
-
-import argparse
-import logging
-
-from mlgo.corpus import make_corpus_lib
-
-
-def parse_args_and_run():
-    parser = argparse.ArgumentParser(
-        description="A tool for making a corpus from arbitrary bitcode"
-    )
-    parser.add_argument("--input_dir", type=str, help="The input directory.")
-    parser.add_argument("--output_dir", type=str, help="The output directory.")
-    parser.add_argument(
-        "--default_args",
-        type=str,
-        help="The compiler flags to compile with when using downstream tooling.",
-        default="",
-        nargs="?",
-    )
-    args = parser.parse_args()
-    main(args)
-
-
-def main(args):
-    logging.warning(
-        "Using this tool does not guarantee that the bitcode is taken at "
-        "the correct stage for consumption during model training. Make "
-        "sure to validate assumptions about where the bitcode is coming "
-        "from before using it in production."
-    )
-    relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir)
-    make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir)
-    make_corpus_lib.write_corpus_manifest(
-        relative_paths, args.output_dir, args.default_args.split()
-    )
-
-
-if __name__ == "__main__":
-    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py
new file mode 120000
index 0000000000000..3e34a693dd2cc
--- /dev/null
+++ b/llvm/utils/mlgo-utils/make_corpus.py
@@ -0,0 +1 @@
+mlgo/corpus/make_corpus.py
\ No newline at end of file
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
deleted file mode 120000
index 5a6885a6d1fa2..0000000000000
--- a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
+++ /dev/null
@@ -1 +0,0 @@
-../../combine_training_corpus.py
\ No newline at end of file
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
new file mode 100644
index 0000000000000..9884d6696a43f
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
@@ -0,0 +1,52 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+r"""Combine multiple training corpus into a single training corpus.
+
+Currently only supports the case where multiple corpora share the same
+configurables except the "modules" field.
+
+Usage: we'd like to combine training corpus corpus1 and corpus2 into
+combinedcorpus; we first structure the files as follows:
+
+combinedcorpus
+combinedcorpus/corpus1
+combinedcorpus/corpus2
+
+Running this script with
+
+python3 \
+compiler_opt/tools/combine_training_corpus.py \
+  --root_dir=$PATH_TO_combinedcorpus
+
+generates combinedcorpus/corpus_description.json file. In this way corpus1
+and corpus2 are combined into combinedcorpus.
+"""
+
+import argparse
+import logging
+
+from mlgo.corpus import combine_training_corpus_lib
+from mlgo.corpus import flags
+
+
+def parse_args_and_run():
+    parser = argparse.ArgumentParser(
+        description="A tool for combining multiple training corpora"
+    )
+    parser.add_argument(
+        "--root_dir", type=str, help="The root dir of module paths to combine."
+    )
+    flags.add_verbosity_arguments(parser)
+    args = parser.parse_args()
+    main(args)
+
+
+def main(args):
+    logging.basicConfig(level=args.verbosity)
+
+    combine_training_corpus_lib.combine_corpus(args.root_dir)
+
+
+if __name__ == "__main__":
+    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
deleted file mode 120000
index ce3baa062b3e1..0000000000000
--- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
+++ /dev/null
@@ -1 +0,0 @@
-../../extract_ir.py
\ No newline at end of file
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
new file mode 100644
index 0000000000000..3101cef196b4a
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
@@ -0,0 +1,184 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Extract IR for training.
+
+Extract IR for training, either from a compile_commands.json file produced by
+cmake, or a linker parameter list file.
+
+Only run with
+'python compiler_opt/tools/extract_ir.py ...'
+
+The compilation is assumed to have been performed with clang, using
+-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
+
+In a distributed ThinLTO case, the compilation is assumed to have been performed
+specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
+
+In a local ThinLTO case, the compilation is assumed to have been performed
+specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
+
+To change the logging verbosity, set the --verbosity flag to the desired level.
+Setting it to a specific level will enable all messages at that level and
+higher. Exact values can be found by invoking the script with --help.
+"""
+
+import argparse
+import json
+import logging
+
+from mlgo.corpus import extract_ir_lib
+from mlgo.corpus import flags
+
+
+def parse_args_and_run():
+    parser = argparse.ArgumentParser(
+        description="A tool for making a corpus from build artifacts"
+    )
+    parser.add_argument(
+        "--input",
+        type=str,
+        help="Input file or directory - either compile_commands.json, a linker "
+        "parameter list, or a path to a directory containing object files.",
+    )
+    parser.add_argument(
+        "--input_type",
+        type=str,
+        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
+        choices=["json", "params", "directory", "bazel_aquery"],
+        default="json",
+        nargs="?",
+    )
+    parser.add_argument("--output_dir", type=str, help="Output directory")
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        help="Number of parallel works for objcopy. `None` for maximum available.",
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--llvm_objcopy_path",
+        type=str,
+        help="Path to llvm-objcopy",
+        default="llvm-objcopy",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--obj_base_dir",
+        type=str,
+        help="Base directory for object files. Defaults to current working dir.",
+        default="",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--cmd_filter",
+        type=str,
+        help="Include only those modules with a command line matching this regular "
+        "expression. Set it to None to not perform any filtering. Note that the "
+        "regular expression is applied independently for each separate command line "
+        "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
+        "with thinlto_build=lld.",
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--thinlto_build",
+        type=str,
+        help="Set if the build was performed with either 'distributed' or 'local' "
+        "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
+        "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
+        "the distributed case or -Wl,--save-temps=import and "
+        "-Wl,--thinlto-emit-index-files passed in the local case",
+        choices=["distributed", "local"],
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--cmd_section_name",
+        type=str,
+        help="The section name passed to llvm-objcopy. For ELF object files, the "
+        "default .llvmcmd is correct. For Mach-O object files, one should use "
+        "something like __LLVM,__cmdline",
+        default=".llvmcmd",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--bitcode_section_name",
+        type=str,
+        help="The section name passed to llvm-objcopy. For ELF object files, the "
+        "default .llvmbc is correct. For Mach-O object files, one should use "
+        "__LLVM,__bitcode",
+        default=".llvmbc",
+        nargs="?",
+    )
+    flags.add_verbosity_arguments(parser)
+    args = parser.parse_args()
+    main(args)
+
+
+def main(args):
+    logging.basicConfig(level=args.verbosity)
+
+    objs = []
+    if args.input is not None and args.thinlto_build == "local":
+        raise ValueError("--thinlto_build=local cannot be run with --input")
+    if args.input is None:
+        if args.thinlto_build != "local":
+            raise ValueError("--input or --thinlto_build=local must be provided")
+        objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
+    elif args.input_type == "json":
+        with open(args.input, encoding="utf-8") as f:
+            objs = extract_ir_lib.load_from_compile_commands(
+                json.load(f), args.output_dir
+            )
+    elif args.input_type == "params":
+        if not args.obj_base_dir:
+            logging.info(
+                "-obj_base_dir is unspecified, assuming current directory. "
+                "If no objects are found, use this option to specify the root "
+                "directory for the object file paths in the input file."
+            )
+        with open(args.input, encoding="utf-8") as f:
+            objs = extract_ir_lib.load_from_lld_params(
+                [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir
+            )
+    elif args.input_type == "directory":
+        logging.warning(
+            "Using the directory input is only recommended if the build system "
+            "your project uses does not support any structured output that "
+            "ml-compiler-opt understands. If your build system provides a "
+            "structured compilation database, use that instead"
+        )
+        objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
+    elif args.input_type == "bazel_aquery":
+        with open(args.input, encoding="utf-8") as aquery_json_handle:
+            objs = extract_ir_lib.load_bazel_aquery(
+                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
+            )
+    else:
+        logging.error("Unknown input type: %s", args.input_type)
+
+    relative_output_paths = extract_ir_lib.run_extraction(
+        objs,
+        args.num_workers,
+        args.llvm_objcopy_path,
+        args.cmd_filter,
+        args.thinlto_build,
+        args.cmd_section_name,
+        args.bitcode_section_name,
+    )
+
+    extract_ir_lib.write_corpus_manifest(
+        args.thinlto_build, relative_output_paths, args.output_dir
+    )
+
+    logging.info(
+        "Converted %d files out of %d",
+        len(objs) - relative_output_paths.count(None),
+        len(objs),
+    )
+
+
+if __name__ == "__main__":
+    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
deleted file mode 120000
index 7ea4447a76efc..0000000000000
--- a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
+++ /dev/null
@@ -1 +0,0 @@
-../../make_corpus.py
\ No newline at end of file
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
new file mode 100644
index 0000000000000..221486e16c6e0
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
@@ -0,0 +1,53 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Tool for making a corpus from arbitrary bitcode.
+
+To create a corpus from a set of bitcode files in an input directory, run
+the following command:
+
+PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
+  --input_dir=<path to input directory> \
+  --output_dir=<path to output directory> \
+  --default_args="<list of space separated flags>"
+"""
+
+import argparse
+import logging
+
+from mlgo.corpus import make_corpus_lib
+
+
+def parse_args_and_run():
+    parser = argparse.ArgumentParser(
+        description="A tool for making a corpus from arbitrary bitcode"
+    )
+    parser.add_argument("--input_dir", type=str, help="The input directory.")
+    parser.add_argument("--output_dir", type=str, help="The output directory.")
+    parser.add_argument(
+        "--default_args",
+        type=str,
+        help="The compiler flags to compile with when using downstream tooling.",
+        default="",
+        nargs="?",
+    )
+    args = parser.parse_args()
+    main(args)
+
+
+def main(args):
+    logging.warning(
+        "Using this tool does not guarantee that the bitcode is taken at "
+        "the correct stage for consumption during model training. Make "
+        "sure to validate assumptions about where the bitcode is coming "
+        "from before using it in production."
+    )
+    relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir)
+    make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir)
+    make_corpus_lib.write_corpus_manifest(
+        relative_paths, args.output_dir, args.default_args.split()
+    )
+
+
+if __name__ == "__main__":
+    parse_args_and_run()