Skip to content

Commit 825320e

Browse files
authored
Support run trainer locally (#111)
* Support run trainer locally * nit * update on feedback
1 parent e100715 commit 825320e

File tree

3 files changed

+103
-15
lines changed

3 files changed

+103
-15
lines changed

README.md

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -169,33 +169,69 @@ Finally, each model may also provide a GPU "original" version that illustrates
169169
and attributes where this model code came from, if any. This also helps to
170170
show case what changes we have done to make it performant on TPU. The original
171171
version is not expected to be run.
172-
173172
## Contributing
174173

175174
Contributions are welcome! Please feel free to submit a pull request.
176175

177176
When developing, use `pip install -e '.[dev]'` to install dev dependencies such
178177
as linter and formatter.
179178

180-
How to run tests:
179+
### How to run tests:
181180

182181
```sh
183182
pytest
184183
```
185184

186-
How to run some of the tests, and re-run them whenever you change a file:
185+
### How to run some of the tests, and re-run them whenever you change a file:
187186

188187
```sh
189188
tp -i test ... # replace with path to tests/directories
190189
```
191190

192-
How to format:
191+
192+
### How to run HuggingFace transformer models
193+
Torchprime supports running HuggingFace models by taking advantage of `tp run`.
194+
To use huggingface models, you can clone
195+
[huggingface/transformers](https://github.com/huggingface/transformers) under
196+
torchprime and name it as `local_transformers`. This allows you to pick any
197+
branch or make code modifications in transformers for experimentation:
198+
```
199+
git clone https://github.com/huggingface/transformers.git local_transformers
200+
```
201+
If the local HuggingFace transformers repo doesn't exist, torchprime will automatically clone
202+
the repo and build the Docker image for the experiment. To switch to HuggingFace models,
203+
add the `--use-hf` flag to the `tp run` command:
204+
```
205+
tp run --use-hf torchprime/hf_models/train.py
206+
```
207+
208+
### How to run inside the docker container locally
209+
You can also run locally with Docker, without XPK. When running inside the docker
210+
container, it will use the same dependencies and build process as used in the
211+
XPK approach, improving the hermeticity and reliability.
212+
```
213+
tp docker-run torchprime/torch_xla_models/train.py
214+
```
215+
This will run the TorchPrime docker image locally. You can also add `--use-hf`
216+
to run a HuggingFace model locally.
217+
```
218+
tp docker-run --use-hf torchprime/hf_models/train.py
219+
```
220+
221+
### How to run locally without XPK:
222+
```
223+
tp docker-run torchprime/torch_xla_models/train.py
224+
```
225+
This will run the TorchPrime docker image locally. You can also add `--use-hf`
226+
to run a HuggingFace model locally.
227+
228+
### How to format:
193229

194230
```sh
195231
ruff format
196232
```
197233

198-
How to lint:
234+
### How to lint:
199235

200236
```sh
201237
ruff check [--fix]

torchprime/launcher/buildpush.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
def buildpush(
1616
torchprime_project_id,
1717
torchprime_docker_url=None,
18-
torchprime_docker_tag=None,
18+
push_docker=True,
1919
*,
2020
build_arg=None,
2121
) -> str:
@@ -36,7 +36,7 @@ def buildpush(
3636

3737
# Determine Docker tag
3838
default_tag = f"{datetime_str}-{random_chars}"
39-
docker_tag = torchprime_docker_tag if torchprime_docker_tag else default_tag
39+
docker_tag = default_tag
4040

4141
# Determine Docker URL
4242
default_url = f"gcr.io/{torchprime_project_id}/torchprime-{user}:{docker_tag}"
@@ -62,7 +62,8 @@ def buildpush(
6262
_run(
6363
f"{sudo_cmd} docker tag {docker_tag} {docker_url}",
6464
)
65-
_run(f"{sudo_cmd} docker push {docker_url}")
65+
if push_docker:
66+
_run(f"{sudo_cmd} docker push {docker_url}")
6667
except subprocess.CalledProcessError as e:
6768
print(f"Error running command: {e}")
6869
exit(e.returncode)
@@ -83,9 +84,10 @@ def _run(command):
8384
# Read environment variables or use defaults
8485
torchprime_project_id = os.getenv("TORCHPRIME_PROJECT_ID", "tpu-pytorch")
8586
torchprime_docker_url = os.getenv("TORCHPRIME_DOCKER_URL", None)
86-
torchprime_docker_tag = os.getenv("TORCHPRIME_DOCKER_TAG", None)
87+
push_docker_str = os.getenv("TORCHPRIME_PUSH_DOCKER", "true")
88+
push_docker = push_docker_str.lower() in ("true", "1", "yes", "y")
8789
buildpush(
8890
torchprime_project_id,
8991
torchprime_docker_url,
90-
torchprime_docker_tag,
92+
push_docker,
9193
)

torchprime/launcher/cli.py

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,13 @@
2323
import torchprime.launcher.doctor
2424
from torchprime.launcher.buildpush import buildpush
2525

26+
_DOCKER_ENV_FORWARD_LIST = [
27+
"HF_TOKEN",
28+
"XLA_IR_DEBUG",
29+
"XLA_HLO_DEBUG",
30+
"LIBTPU_INIT_ARGS",
31+
]
32+
2633

2734
@dataclass_json
2835
@dataclass
@@ -195,6 +202,53 @@ def create_and_activate_gcloud(gcloud_config_name, config: Config):
195202
)
196203

197204

205+
@cli.command(
206+
name="docker-run",
207+
context_settings=dict(
208+
ignore_unknown_options=True,
209+
),
210+
)
211+
@click.argument("args", nargs=-1, type=click.UNPROCESSED)
212+
@click.option("--use-hf", is_flag=True, help="Use HuggingFace transformer")
213+
def docker_run(args, use_hf: bool):
214+
"""
215+
Runs the provided training command locally for quick testing.
216+
"""
217+
config = read_config()
218+
219+
click.echo(get_project_dir().absolute())
220+
221+
# Build docker image.
222+
build_arg = "USE_TRANSFORMERS=true" if use_hf else None
223+
docker_project = config.docker_project
224+
if docker_project is None:
225+
docker_project = config.project
226+
docker_url = buildpush(docker_project, push_docker=False, build_arg=build_arg)
227+
# Forward a bunch of important env vars.
228+
env_forwarding = [
229+
arg for env_var in _DOCKER_ENV_FORWARD_LIST for arg in forward_env(env_var)
230+
]
231+
command = [
232+
"python",
233+
] + list(args)
234+
docker_command = [
235+
"docker",
236+
"run",
237+
"-i",
238+
*env_forwarding,
239+
"--privileged",
240+
"--net",
241+
"host",
242+
"--rm",
243+
"-v",
244+
f"{os.getcwd()}:/workspace",
245+
"-w",
246+
"/workspace",
247+
docker_url,
248+
] + command
249+
subprocess.run(docker_command, check=True)
250+
251+
198252
@cli.command(
199253
context_settings=dict(
200254
ignore_unknown_options=True,
@@ -255,12 +309,8 @@ def run(
255309

256310
# Forward a bunch of important env vars.
257311
env_forwarding = [
258-
*forward_env("HF_TOKEN"), # HuggingFace token
259-
*forward_env("XLA_IR_DEBUG"), # torch_xla debugging flag
260-
*forward_env("XLA_HLO_DEBUG"), # torch_xla debugging flag
261-
*forward_env("LIBTPU_INIT_ARGS"), # XLA flags
312+
arg for env_var in _DOCKER_ENV_FORWARD_LIST for arg in forward_env(env_var)
262313
]
263-
264314
# Pass artifact dir and jobset name as env vars.
265315
artifact_arg = [
266316
"--env",

0 commit comments

Comments
 (0)