* Support automating single slice GKE TPU cluster via Job API.

jax authors · jax authors · commit a53e99a8c19d · 2024-03-16T11:20:36.000-07:00
* Refactor the GCE and GKE clusters to inherit from a shared base, since they
supply the same information.
* This shared base supports both multislice and single-slice topologies.

PiperOrigin-RevId: 616440923
diff --git a/jax/_src/clusters/__init__.py b/jax/_src/clusters/__init__.py
@@ -23,5 +23,4 @@
 from .ompi_cluster import OmpiCluster
 from .slurm_cluster import SlurmCluster
 from .cloud_tpu_cluster import GkeTpuCluster
-from .cloud_tpu_cluster import MultisliceGceTpuCluster
-from .cloud_tpu_cluster import SingleSliceGceTpuCluster
+from .cloud_tpu_cluster import GceTpuCluster
diff --git a/jax/_src/clusters/cloud_tpu_cluster.py b/jax/_src/clusters/cloud_tpu_cluster.py
@@ -14,17 +14,22 @@
 
 from __future__ import annotations
 
+import logging
 import os
 import re
 import socket
 import time
 from jax._src import clusters
 from jax._src.cloud_tpu_init import running_in_cloud_tpu_vm
 
+logger = logging.getLogger(__name__)
+
 # We use an arbitrarily chosen port for the coordinator since we cannot
 # rely on communication to choose one in real time.
 coordinator_port = '8476'
 
+metadata_response_code_success = 200
+
 def get_metadata(key):
   import requests  # pytype: disable=import-error
   import time  # pytype: disable=import-error
@@ -47,11 +52,11 @@ def get_metadata(key):
 
   if api_resp is None:
     raise RuntimeError(f"Getting metadata['{key}'] failed for 6 tries")
-  return api_resp.text
+  return api_resp.text, api_resp.status_code
 
 def get_tpu_env_value(key):
   def get_tpu_env_value_from_metadata(key):
-    tpu_env_data = get_metadata('tpu-env')
+    tpu_env_data = get_metadata('tpu-env')[0]
     key_value_pairs = tpu_env_data.split('\n')
     for key_value_pair in key_value_pairs:
       # Typical line is MEGASCALE_NUM_SLICES: '2'
@@ -65,54 +70,44 @@ def get_tpu_env_value_from_metadata(key):
   value = os.environ.get(key, None)
   return value if value is not None else get_tpu_env_value_from_metadata(key)
 
-def is_gce_env():
-  worker_number_string = get_metadata('agent-worker-number')
-  try:
-    worker_number = int(worker_number_string)
-    return True
-  except:
-    return False
-
-def is_multislice_gce_env():
-  return is_gce_env() and get_tpu_env_value('MEGASCALE_COORDINATOR_ADDRESS') is not None
-
-def is_gke_env():
-  return os.environ.get("TPU_WORKER_HOSTNAMES", None) is not None
+def has_megascale_address():
+  return get_tpu_env_value('MEGASCALE_COORDINATOR_ADDRESS') is not None
 
-def get_gce_worker_endpoints() -> str:
-  return get_metadata('worker-network-endpoints').split(',')
+class BaseTpuCluster(clusters.ClusterEnv):
+  """Abstract cluster that supports both single-slice and multislice TPU environments.
 
-class SingleSliceGceTpuCluster(clusters.ClusterEnv):
-  @classmethod
-  def is_env_present(cls) -> bool:
-    return running_in_cloud_tpu_vm and is_gce_env() and not is_multislice_gce_env()
-
-  @classmethod
-  def get_coordinator_address(cls) -> str:
-    return f"{get_gce_worker_endpoints()[0].split(':')[2]}:{coordinator_port}"
-
-  @classmethod
-  def get_process_count(cls) -> int:
-    return len(get_gce_worker_endpoints())
-
-  @classmethod
-  def get_process_id(cls) -> int:
-    return int(get_metadata('agent-worker-number'))
-
-  @classmethod
-  def get_local_process_id(cls) -> int | None:
-    return None
+  If MEGASCALE_COORDINATOR_ADDRESS is not set, we assume single slice topology.
+  Concrete extensions of this class must implement methods for generating a list
+  of within-slice workers and a within-slice process ID.
+  `get_coordinator_address` must return the address of the host with
+  process ID 0 (as returned by `get_process_id`), since the coordinator service
+  is started on the host with process ID = 0.
+  """
 
-class MultisliceGceTpuCluster(clusters.ClusterEnv):
   @classmethod
   def is_env_present(cls) -> bool:
-    return running_in_cloud_tpu_vm and is_multislice_gce_env()
+    """Override this method to return True if the environment is present."""
+    return False
 
   @classmethod
   def get_coordinator_address(cls) -> str:
-    coordinator_address = get_tpu_env_value('MEGASCALE_COORDINATOR_ADDRESS')
+    if has_megascale_address():
+      # For both GCE via QueuedResources and GKE via JobSet, the
+      # Megascale coordinator address is set as the host with process id = 0,
+      # so it can be used as the jax distributed system coordinator.
+      coordinator_address = get_tpu_env_value('MEGASCALE_COORDINATOR_ADDRESS')
+    else:
+      # For both GCE (QueuedResources and TPUVM create) and GKE via Job API,
+      # the worker lists are sorted by process ID so the first one can
+      # be used as the jax distributed system coordinator.
+      coordinator_address = cls._get_worker_list_in_slice()[0]
     coordinator_address = coordinator_address.split(':')[0]
+    logger.debug("TPU Cluster using coordinator address: %s", coordinator_address)
+    cls.wait_for_coordinator(coordinator_address)
+    return f'{coordinator_address}:{coordinator_port}'
 
+  @classmethod
+  def wait_for_coordinator(cls, coordinator_address):
     # The coordinator may not be up before the other hosts try to
     # communicate with it. We check for its existence with retries.
     coordinator_found = False
@@ -126,51 +121,92 @@ def get_coordinator_address(cls) -> str:
         print(f"Failed to recognize coordinator address {coordinator_address} on attempt {lookup_attempt}, retrying...")
         lookup_attempt += 1
         time.sleep(5)
-
     if not coordinator_found:
       raise RuntimeError(f"Failed to recognize coordinator address {coordinator_address}")
 
-    # Use a different port for the jax coordinator than the MXLA coordinator,
-    # which is set to 8080 in multislice GCE.
-    return f'{coordinator_address}:{coordinator_port}'
-
   @classmethod
   def get_process_count(cls) -> int:
-    processes_per_slice = cls._get_process_count_per_slice()
-    num_slices = int(get_tpu_env_value('MEGASCALE_NUM_SLICES'))
-    return processes_per_slice * num_slices
+    processes_per_slice = len(cls._get_worker_list_in_slice())
+    num_slices = cls._get_num_slices()
+    total_process_count = processes_per_slice * num_slices
+    logger.debug("Total process count of %s = %s processes per slice and %s slices", total_process_count, processes_per_slice, num_slices)
+    return total_process_count
 
   @classmethod
   def get_process_id(cls) -> int:
     process_id_in_slice = cls._get_process_id_in_slice()
-    slice_id = int(get_tpu_env_value('MEGASCALE_SLICE_ID'))
-    processes_per_slice = cls._get_process_count_per_slice()
-    return process_id_in_slice + slice_id * processes_per_slice
+    slice_id = cls._get_slice_id()
+    processes_per_slice = len(cls._get_worker_list_in_slice())
+    process_id = process_id_in_slice + slice_id * processes_per_slice
+    logger.debug("Process ID of %s generated by within-slice id %s and slice id %s", process_id, process_id_in_slice, slice_id)
+    return process_id
 
-  @classmethod
-  def get_local_process_id(cls) -> int | None:
-    return None
+  @staticmethod
+  def _get_num_slices() -> int:
+    if has_megascale_address():
+      return int(get_tpu_env_value('MEGASCALE_NUM_SLICES'))
+    else:
+      return 1
 
   @staticmethod
-  def _get_process_count_per_slice() -> int:
-    return len(get_gce_worker_endpoints())
+  def _get_slice_id() -> int:
+    if has_megascale_address():
+      return int(get_tpu_env_value('MEGASCALE_SLICE_ID'))
+    else:
+      return 0
 
   @staticmethod
   def _get_process_id_in_slice() -> int:
-    return int(get_metadata('agent-worker-number'))
+    """Returns a process ID that is unique within slice."""
+    raise NotImplementedError()
 
-class GkeTpuCluster(MultisliceGceTpuCluster):
-  # This class handles both single and multislice GKE as the environment
-  # variables are set the same in both cases.
+  @staticmethod
+  def _get_worker_list_in_slice() -> list[str]:
+    """Returns a list of worker endpoints/hostnames within slice."""
+    raise NotImplementedError()
+
+class GceTpuCluster(BaseTpuCluster):
   @classmethod
   def is_env_present(cls) -> bool:
-    return running_in_cloud_tpu_vm and is_gke_env()
+    if not running_in_cloud_tpu_vm:
+      logger.debug("Did not detect cloud TPU VM")
+      return False
+    metadata_response, metadata_code = get_metadata('agent-worker-number')
+    if metadata_code == metadata_response_code_success:
+      logger.debug("Gce Tpu Cluster detected for Jax Distributed System")
+      return True
+    else:
+      logger.debug("Did not detect Gce Tpu Cluster since agent-worker-number is not set in metadata")
+      logger.debug("Metadata code: %s", metadata_code)
+      logger.debug("Metadata response: %s", metadata_response)
+      return False
 
   @staticmethod
-  def _get_process_count_per_slice() -> int:
-    tpu_worker_hostnames = str(os.environ.get('TPU_WORKER_HOSTNAMES', None))
-    return len(tpu_worker_hostnames.split(','))
+  def _get_process_id_in_slice() -> int:
+    return int(get_metadata('agent-worker-number')[0])
+
+  @staticmethod
+  def _get_worker_list_in_slice() -> list[str]:
+    workers = get_metadata('worker-network-endpoints')[0].split(',')
+    return [worker.split(':')[2] for worker in workers]
+
+class GkeTpuCluster(BaseTpuCluster):
+  @classmethod
+  def is_env_present(cls) -> bool:
+    if running_in_cloud_tpu_vm and os.environ.get("TPU_WORKER_HOSTNAMES") is not None:
+      logger.debug("Gke Tpu Cluster detected for Jax Distributed System")
+      return True
+    else:
+      if not running_in_cloud_tpu_vm:
+        logger.debug("Did not detect cloud TPU VM")
+      else:
+        logger.debug("Did not detect TPU GKE cluster since TPU_WORKER_HOSTNAMES is not set")
+      return False
 
   @staticmethod
   def _get_process_id_in_slice() -> int:
     return int(str(os.environ.get('TPU_WORKER_ID')))
+
+  @staticmethod
+  def _get_worker_list_in_slice() -> list[str]:
+    return str(os.environ.get('TPU_WORKER_HOSTNAMES', None)).split(',')