Commit d532ab7

Merge branch 'main' into bf/add-gemma-qwen

2 parents: 3d3bdec + c5b4d30
16 files changed: +1083 −103 lines

.github/scripts/generate_vllm_benchmark_matrix.py

Lines changed: 84 additions & 50 deletions
@@ -12,10 +12,12 @@
 logging.basicConfig(level=logging.INFO)
 # These are H100 runners from https://github.com/pytorch-labs/pytorch-gha-infra/blob/main/multi-tenant/inventory/manual_inventory
 # while ROCm runners are provided by AMD
-RUNNERS_MAPPING = {
+TP_TO_RUNNER_MAPPING = {
     1: [
+        "linux.aws.a100",
         "linux.aws.h100",
         "linux.rocm.gpu.mi300.2",  # No single ROCm GPU?
+        "linux.24xl.spr-metal",
     ],
     # NB: There is no 2xH100 runner at the moment, so let's use the next one
     # in the list here which is 4xH100
@@ -26,13 +28,27 @@
     4: [
         "linux.aws.h100.4",
         "linux.rocm.gpu.mi300.4",
+        # TODO (huydhn): Enable this when Intel's runners are ready
+        # "intel-cpu-emr",
     ],
     8: [
         "linux.aws.h100.8",
         "linux.rocm.gpu.mi300.8",
     ],
 }
 
+# This mapping is needed to find out the platform of a runner
+RUNNER_TO_PLATFORM_MAPPING = {
+    "linux.aws.a100": "cuda",
+    "linux.aws.h100": "cuda",
+    "linux.aws.h100.4": "cuda",
+    "linux.aws.h100.8": "cuda",
+    "linux.rocm.gpu.mi300.2": "rocm",
+    "linux.rocm.gpu.mi300.4": "rocm",
+    "linux.rocm.gpu.mi300.8": "rocm",
+    "linux.24xl.spr-metal": "cpu",
+}
+
 # All the different names vLLM uses to refer to their benchmark configs
 VLLM_BENCHMARK_CONFIGS_PARAMETER = set(
     [
@@ -76,10 +92,11 @@ def parse_args() -> Any:
         help="the comma-separated list of models to benchmark",
     )
     parser.add_argument(
-        "--gpus",
+        "--runners",
         type=str,
         default="",
-        help="the comma-separated list of GPUs to benchmark",
+        help="the comma-separated list of runners to run the benchmark",
+        required=True,
     )
 
     return parser.parse_args()
@@ -107,59 +124,76 @@ def set_output(name: str, val: Any) -> None:
 
 
 def generate_benchmark_matrix(
-    benchmark_configs_dir: str, models: List[str], gpus: List[str]
+    benchmark_configs_dir: str, models: List[str], runners: List[str]
 ) -> Dict[str, Any]:
     """
     Parse all the JSON files in the vLLM benchmark configs directory to get the
-    model name and tensor parallel size (aka number of GPUs)
+    model name and tensor parallel size (aka number of GPUs or CPU NUMA nodes)
     """
-    get_all_models = True if not models else False
-    use_all_gpus = True if not gpus else False
-
     benchmark_matrix: Dict[str, Any] = {
         "include": [],
     }
 
-    for file in glob.glob(f"{benchmark_configs_dir}/*.json"):
-        with open(file) as f:
-            try:
-                configs = json.load(f)
-            except json.JSONDecodeError as e:
-                warning(f"Failed to load {file}: {e}")
-                continue
-
-        for config in configs:
-            param = list(VLLM_BENCHMARK_CONFIGS_PARAMETER & set(config.keys()))
-            assert len(param) == 1
-
-            benchmark_config = config[param[0]]
-            if "model" not in benchmark_config:
-                warning(f"Model name is not set in {benchmark_config}, skipping...")
-                continue
-            model = benchmark_config["model"].lower()
-
-            # Dedup
-            if model in models:
-                continue
-            if get_all_models:
-                models.append(model)
-
-            if "tensor_parallel_size" in benchmark_config:
-                tp = benchmark_config["tensor_parallel_size"]
-            elif "tp" in benchmark_config:
-                tp = benchmark_config["tp"]
-            else:
-                tp = 8
-            assert tp in RUNNERS_MAPPING
-
-            for runner in RUNNERS_MAPPING[tp]:
-                found_runner = False
-                for gpu in gpus:
-                    if gpu.lower() in runner:
-                        found_runner = True
-                        break
-
-                if found_runner or use_all_gpus:
+    platforms = set()
+    if not runners:
+        use_all_runners = True
+        platforms = set(RUNNER_TO_PLATFORM_MAPPING.values())
+    else:
+        use_all_runners = False
+        for k, v in RUNNER_TO_PLATFORM_MAPPING.items():
+            for r in runners:
+                if r.lower() in k:
+                    platforms.add(v)
+
+    # Gather all possible benchmarks
+    for platform in sorted(platforms):
+        selected_models = []
+        for file in glob.glob(f"{benchmark_configs_dir}/{platform}/*.json"):
+            with open(file) as f:
+                try:
+                    configs = json.load(f)
+                except json.JSONDecodeError as e:
+                    warning(f"Failed to load {file}: {e}")
+                    continue
+
+            for config in configs:
+                param = list(VLLM_BENCHMARK_CONFIGS_PARAMETER & set(config.keys()))
+                assert len(param) == 1
+
+                benchmark_config = config[param[0]]
+                if "model" not in benchmark_config:
+                    warning(f"Model name is not set in {benchmark_config}, skipping...")
+                    continue
+                model = benchmark_config["model"].lower()
+
+                # Dedup
+                if model in selected_models:
+                    continue
+                # and only keep the models that were requested, if any
+                if models and model not in models:
+                    continue
+                selected_models.append(model)
+
+                if "tensor_parallel_size" in benchmark_config:
+                    tp = benchmark_config["tensor_parallel_size"]
+                elif "tp" in benchmark_config:
+                    tp = benchmark_config["tp"]
+                else:
+                    tp = 8
+                assert tp in TP_TO_RUNNER_MAPPING
+
+                for runner in TP_TO_RUNNER_MAPPING[tp]:
+                    # Skip runners on the wrong platform
+                    if (
+                        runner not in RUNNER_TO_PLATFORM_MAPPING
+                        or RUNNER_TO_PLATFORM_MAPPING[runner] != platform
+                    ):
+                        continue
+
+                    found_runner = any(r and r.lower() in runner for r in runners)
+                    if not found_runner and not use_all_runners:
+                        continue
+
                     benchmark_matrix["include"].append(
                         {
                             "runner": runner,
@@ -175,11 +209,11 @@ def generate_benchmark_matrix(
 def main() -> None:
     args = parse_args()
     models = [m.strip().lower() for m in args.models.split(",") if m.strip()]
-    gpus = [m.strip().lower() for m in args.gpus.split(",") if m.strip()]
+    runners = [m.strip().lower() for m in args.runners.split(",") if m.strip()]
     benchmark_matrix = generate_benchmark_matrix(
         args.benchmark_configs_dir,
         models,
-        gpus,
+        runners,
     )
     set_output("benchmark_matrix", benchmark_matrix)
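The core of this change is that runner selection now goes through platforms: the requested --runners substrings are first resolved to platforms (cuda, rocm, cpu) via RUNNER_TO_PLATFORM_MAPPING, and only the benchmark config subdirectories for those platforms are scanned. Below is a minimal standalone sketch of that resolution step; the mapping mirrors the diff, while the sample runner substrings are hypothetical inputs.

# Minimal sketch of the platform-resolution pass introduced above.
# The mapping mirrors RUNNER_TO_PLATFORM_MAPPING from the diff; the
# example runner substrings ("h100", "mi300") are hypothetical.
RUNNER_TO_PLATFORM_MAPPING = {
    "linux.aws.a100": "cuda",
    "linux.aws.h100": "cuda",
    "linux.aws.h100.4": "cuda",
    "linux.aws.h100.8": "cuda",
    "linux.rocm.gpu.mi300.2": "rocm",
    "linux.rocm.gpu.mi300.4": "rocm",
    "linux.rocm.gpu.mi300.8": "rocm",
    "linux.24xl.spr-metal": "cpu",
}


def resolve_platforms(runners: list) -> list:
    # An empty list selects every platform, like use_all_runners above
    if not runners:
        return sorted(set(RUNNER_TO_PLATFORM_MAPPING.values()))
    platforms = set()
    for runner_name, platform in RUNNER_TO_PLATFORM_MAPPING.items():
        # Substring match, so "h100" matches all four H100 runner pools
        if any(r.lower() in runner_name for r in runners):
            platforms.add(platform)
    return sorted(platforms)


print(resolve_platforms(["h100", "mi300"]))  # ['cuda', 'rocm']
print(resolve_platforms([]))                 # ['cpu', 'cuda', 'rocm']

Because matching is substring-based, a full name like "linux.aws.h100.8" selects only that runner pool, while a short one like "h100" fans out to every H100 variant.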

.github/scripts/setup_vllm_benchmark.py

Lines changed: 13 additions & 2 deletions
@@ -61,17 +61,27 @@ def parse_args() -> Any:
         help="the list of models to benchmark",
         required=True,
     )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="",
+        help="device for the runner",
+        required=True,
+    )
 
     return parser.parse_args()
 
 
 def setup_benchmark_configs(
-    from_benchmark_configs_dir: str, to_benchmark_configs_dir: str, models: List[str]
+    from_benchmark_configs_dir: str,
+    to_benchmark_configs_dir: str,
+    models: List[str],
+    device: str,
 ) -> None:
     """
     Set up the benchmark configs to run on this runner
     """
-    for file in glob.glob(f"{from_benchmark_configs_dir}/*.json"):
+    for file in glob.glob(f"{from_benchmark_configs_dir}/{device}/*.json"):
         filename = os.path.basename(file)
         benchmark_configs = []
 
@@ -108,6 +118,7 @@ def main() -> None:
         args.from_benchmark_configs_dir,
         args.to_benchmark_configs_dir,
         args.models.split(","),
+        args.device,
     )
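Paired with the matrix change above, the setup script now discovers configs under a per-device subdirectory, which implies the vLLM benchmark configs are laid out as cuda/*.json, rocm/*.json, cpu/*.json, and so on. A small sketch of the new discovery behavior follows; the directory layout and file names here are illustrative assumptions, not taken from the workflow.

# Sketch of the device-scoped config discovery added in this commit.
# The cuda/rocm/cpu subfolder layout is an assumption for illustration.
import glob
import os


def list_device_configs(from_dir: str, device: str) -> list:
    # Mirrors the new glob in setup_benchmark_configs: only JSON files
    # under the requested device's subdirectory are picked up.
    return sorted(
        os.path.basename(f) for f in glob.glob(f"{from_dir}/{device}/*.json")
    )


# Hypothetical layout: benchmarks/cuda/latency-tests.json,
# benchmarks/rocm/latency-tests.json, benchmarks/cpu/serving-tests.json
print(list_device_configs("benchmarks", "cuda"))

On the runner side, the platform would be passed through the new required --device flag alongside the existing --models flag.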
