Commit 3e03ddf

Fix the logic to calculate the number of workers based on the TPU version. (#51227)
The calculation of the number of workers was incorrect: it did not take the correct number of cores per chip into account.

Signed-off-by: Quinn <qinyiyan@google.com>
1 parent 1d1b1b0 commit 3e03ddf

File tree

3 files changed: +66, -41 lines changed


python/ray/_private/accelerators/tpu.py

Lines changed: 48 additions & 3 deletions
@@ -43,6 +43,22 @@
 TPU_HOST_BOUNDS_ENV_VAR = "TPU_HOST_BOUNDS"
 TPU_SINGLE_HOST_BOUNDS = "1,1,1"
 
+# By default TPU VMs come with 4 chips per host and 2 tensorcores per chip.
+# For more details: https://cloud.google.com/tpu/docs/system-architecture-tpu-vm
+DEFAULT_TPU_NUM_CHIPS_PER_HOST = 4
+DEFAULT_TPU_NUM_CORES_PER_CHIP = 2
+
+# Accelerators that are 4 chips per host: v2, v3, v4, v5p
+# Accelerators that are 8 chips per host: v5e, v6e
+SINGLE_HOST_8_CHIPS_TPU_TYPES = ("v5litepod", "v6e")
+
+# Accelerators that are 2 cores per chip: v2, v3, v4, v5p
+# Accelerators that are 1 core per chip: v5e, v6e
+SINGLE_CORE_TPU_TYPES = ("v5litepod", "v6e")
+
+# The valid TPU types.
+VALID_TPU_TYPES = ("v2", "v3", "v4", "v5p", "v5litepod", "v6e")
+
 
 def _get_tpu_metadata(key: str) -> Optional[str]:
     """Poll and get TPU metadata."""
@@ -67,6 +83,29 @@ def _get_tpu_metadata(key: str) -> Optional[str]:
     return None
 
 
+def _accelerator_type_check(accelerator_type: str):
+    if not accelerator_type.startswith(VALID_TPU_TYPES):
+        raise ValueError(
+            f"Invalid accelerator type: {accelerator_type}. Must start with one of: {VALID_TPU_TYPES}"
+        )
+
+
+def get_num_tpu_visible_chips_per_host(accelerator_type: str) -> int:
+    _accelerator_type_check(accelerator_type)
+    if accelerator_type.startswith(SINGLE_HOST_8_CHIPS_TPU_TYPES):
+        return 8
+
+    return DEFAULT_TPU_NUM_CHIPS_PER_HOST
+
+
+def get_tpu_cores_per_chip(accelerator_type: str) -> int:
+    _accelerator_type_check(accelerator_type)
+    if accelerator_type.startswith(SINGLE_CORE_TPU_TYPES):
+        return 1
+
+    return DEFAULT_TPU_NUM_CORES_PER_CHIP
+
+
 class TPUAcceleratorManager(AcceleratorManager):
     """Google TPU accelerators."""
 
@@ -273,10 +312,16 @@ def _get_current_node_tpu_worker_id() -> Optional[int]:
     def get_num_workers_in_current_tpu_pod() -> Optional[int]:
         """Return the total number of workers in a TPU pod."""
         tpu_pod_type = TPUAcceleratorManager._get_current_node_tpu_pod_type()
-        cores_per_host = TPUAcceleratorManager.get_current_node_num_accelerators()
+        chips_per_host = TPUAcceleratorManager.get_current_node_num_accelerators()
+        cores_per_chip = get_tpu_cores_per_chip(tpu_pod_type)  # Hard-coded map.
+        cores_per_host = chips_per_host * cores_per_chip
         if tpu_pod_type and cores_per_host > 0:
-            num_chips_or_cores = int(tpu_pod_type.split("-")[1])
-            return num_chips_or_cores // cores_per_host
+            num_cores = int(tpu_pod_type.split("-")[1])
+            num_workers = num_cores // cores_per_host
+            # If the chip count doesn't fill a full host, a sub-host is still treated as a host.
+            if num_cores % cores_per_host != 0:
+                num_workers += 1
+            return num_workers
         else:
            logging.debug("Could not get num workers in TPU pod.")
            return None
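
For reference, the corrected math boils down to a ceiling division of the pod's core count by the cores available per host. The sketch below is a standalone illustration, not the Ray API itself: the per-generation tables mirror the constants added in tpu.py above, while the function name is made up for the example.

import math

# Per-generation host geometry, mirroring the constants added in tpu.py.
CHIPS_PER_HOST = {"v2": 4, "v3": 4, "v4": 4, "v5p": 4, "v5litepod": 8, "v6e": 8}
CORES_PER_CHIP = {"v2": 2, "v3": 2, "v4": 2, "v5p": 2, "v5litepod": 1, "v6e": 1}


def num_workers_in_pod(pod_type: str) -> int:
    """Illustrative only: pod_type looks like 'v4-16', where the suffix is a core count."""
    generation, num_cores = pod_type.split("-")[0], int(pod_type.split("-")[1])
    cores_per_host = CHIPS_PER_HOST[generation] * CORES_PER_CHIP[generation]
    # A partially filled host still runs one worker, hence the ceiling division.
    return math.ceil(num_cores / cores_per_host)


assert num_workers_in_pod("v3-32") == 4   # 32 cores / (4 chips * 2 cores) = 4 hosts
assert num_workers_in_pod("v6e-16") == 2  # 16 cores / (8 chips * 1 core) = 2 hosts
assert num_workers_in_pod("v2-4") == 1    # a sub-host slice still counts as one worker

The old code divided the core count by the per-host accelerator (chip) count directly, which over-counted workers on generations with 2 cores per chip; the updated test below reflects this (v4-16 previously expected 4 workers, now 2).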

python/ray/autoscaler/_private/gcp/config.py

Lines changed: 4 additions & 37 deletions
@@ -17,6 +17,7 @@
 from googleapiclient import discovery, errors
 
 from ray._private.accelerators import TPUAcceleratorManager
+from ray._private.accelerators import tpu
 from ray.autoscaler._private.gcp.node import MAX_POLLS, POLL_INTERVAL, GCPNodeType
 from ray.autoscaler._private.util import check_legacy_fields
 
@@ -51,11 +52,6 @@
 # NOTE: iam.serviceAccountUser allows the Head Node to create worker nodes
 # with ServiceAccounts.
 
-# By default TPU VMs come with 4 chips per host and 2 tensorcores per chip.
-# For more details: https://cloud.google.com/tpu/docs/system-architecture-tpu-vm
-DEFAULT_TPU_NUM_CHIPS_PER_HOST = 4
-DEFAULT_TPU_CORES_PER_CHIP = 2
-
 
 def tpu_accelerator_config_to_type(accelerator_config: dict) -> str:
     """Convert a provided accelerator_config to accelerator_type.
@@ -75,17 +71,14 @@ def tpu_accelerator_config_to_type(accelerator_config: dict) -> str:
     # Reduce e.g. "2x2x2" to 8
     chip_dimensions = [int(chip_count) for chip_count in topology.split("x")]
     num_chips = reduce(lambda x, y: x * y, chip_dimensions)
-    num_cores = num_chips * DEFAULT_TPU_CORES_PER_CHIP
 
     # V5LitePod is rendered as "V5LITE_POD" in accelerator configuration but
     # accelerator type uses a format like "v5litepod-{cores}", so we need
     # to manually convert the string here.
     if generation == "v5lite_pod":
         generation = "v5litepod"
-        num_cores = num_chips
 
-    if generation == "v6e":
-        num_cores = num_chips
+    num_cores = tpu.get_tpu_cores_per_chip(generation) * num_chips
 
     return f"{generation}-{num_cores}"
 
@@ -136,39 +129,13 @@ def _validate_tpu_config(node: dict):
         )
 
 
-def _get_num_tpu_visible_chips_per_host(accelerator_type: str) -> int:
-    if accelerator_type == "v5litepod-8":
-        return 8
-
-    # All V6e configurations have 8 chips per host
-    if accelerator_type.startswith("v6e"):
-        return 8
-
-    return DEFAULT_TPU_NUM_CHIPS_PER_HOST
-
-
-def _get_tpu_cores_per_chip(accelerator_type: str) -> int:
-    # accelerator_type is in the form v{generateion}-{cores}
-    accelerator_type = accelerator_type.split("-")[0]
-
-    # V5Litepods have 1 core per chip
-    if accelerator_type == "v5litepod":
-        return 1
-
-    # V6es have 1 core per chip
-    if accelerator_type == "v6e":
-        return 1
-
-    return DEFAULT_TPU_CORES_PER_CHIP
-
-
 def _get_num_tpu_chips(node: dict) -> int:
     chips = 0
     if "acceleratorType" in node:
         accelerator_type = node["acceleratorType"]
         # `acceleratorType` is typically v{generation}-{cores}
         cores = int(accelerator_type.split("-")[1])
-        chips = cores / _get_tpu_cores_per_chip(accelerator_type)
+        chips = cores / tpu.get_tpu_cores_per_chip(accelerator_type)
     if "acceleratorConfig" in node:
         topology = node["acceleratorConfig"]["topology"]
         # `topology` is typically {chips}x{chips}x{chips}
@@ -185,7 +152,7 @@ def _is_single_host_tpu(node: dict) -> bool:
         accelerator_type = node["acceleratorType"]
     else:
         accelerator_type = tpu_accelerator_config_to_type(node["acceleratorConfig"])
-    return _get_num_tpu_chips(node) == _get_num_tpu_visible_chips_per_host(
+    return _get_num_tpu_chips(node) <= tpu.get_num_tpu_visible_chips_per_host(
        accelerator_type
    )
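
To see how the de-duplicated helpers feed the autoscaler path, here is a rough sketch of what tpu_accelerator_config_to_type now computes, under the same assumptions as the diff; the cores-per-chip table, the function name, and the example topologies are illustrative rather than the module's actual layout.

from functools import reduce

# 1 core per chip for v5litepod/v6e, 2 for the older generations (see tpu.py above).
CORES_PER_CHIP = {"v2": 2, "v3": 2, "v4": 2, "v5p": 2, "v5litepod": 1, "v6e": 1}


def accelerator_config_to_type(generation: str, topology: str) -> str:
    # Reduce e.g. "2x2x2" to 8 chips.
    num_chips = reduce(lambda x, y: x * y, (int(d) for d in topology.split("x")))
    # The accelerator-type suffix counts cores, so scale chips by cores per chip.
    num_cores = CORES_PER_CHIP[generation] * num_chips
    return f"{generation}-{num_cores}"


print(accelerator_config_to_type("v4", "2x2x2"))  # v4-16: 8 chips * 2 cores/chip
print(accelerator_config_to_type("v6e", "2x4"))   # v6e-8: 8 chips * 1 core/chip

The single-host check then compares the node's chip count against get_num_tpu_visible_chips_per_host; switching the comparison from == to <= lets slices smaller than a full host still be classified as single-host.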

python/ray/tests/accelerators/test_tpu.py

Lines changed: 14 additions & 1 deletion
@@ -282,8 +282,21 @@ def test_empty_get_current_pod_name_returns_none():
 @pytest.mark.parametrize(
     "test_case",
     [
-        (4, "v4-16", 4),
+        # (number_chips_per_host, accl_type, expected_worker_count)
+        (4, "v2-4", 1),
+        (4, "v3-32", 4),
+        (4, "v4-8", 1),
+        (4, "v4-16", 2),
+        (8, "v5litepod-4", 1),
+        (8, "v5litepod-8", 1),
+        (8, "v5litepod-16", 2),
+        (8, "v5litepod-32", 4),
+        (4, "v5p-4", 1),
+        (4, "v5p-8", 1),
+        (4, "v5p-16", 2),
+        (8, "v6e-4", 1),
         (8, "v6e-8", 1),
+        (8, "v6e-16", 2),
     ],
 )
 @patch("glob.glob")
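
As a quick sanity check on two of the new expected values (plain arithmetic, mirroring the formula sketched earlier):

import math

# v4-16: 16 cores over 4 chips/host * 2 cores/chip = 8 cores/host -> 2 workers.
assert math.ceil(16 / (4 * 2)) == 2
# v5litepod-8: 8 cores over 8 chips/host * 1 core/chip -> exactly 1 full host.
assert math.ceil(8 / (8 * 1)) == 1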
