
Commit 6fbe4e4

Documentation for Training LLMs with PyTorch DDP (#299)
2 parents 1660770 + 1004ab9 commit 6fbe4e4

File tree: 17 files changed, +425 −27 lines

ads/jobs/builders/runtimes/pytorch_runtime.py

Lines changed: 2 additions & 1 deletion
@@ -205,7 +205,8 @@ def run(self, dsc_job, **kwargs):
             if not envs:
                 envs = {}
             # Huggingface accelerate requires machine rank
-            envs["RANK"] = str(i)
+            # Here we use NODE_RANK to store the machine rank
+            envs["NODE_RANK"] = str(i)
             envs["WORLD_SIZE"] = str(replicas)
             if main_run:
                 envs["MAIN_JOB_RUN_OCID"] = main_run.id

ads/jobs/templates/driver_pytorch.py

Lines changed: 1 addition & 1 deletion
@@ -694,7 +694,7 @@ def __init__(self, code_dir: str = driver_utils.DEFAULT_CODE_DIR) -> None:
         # --multi_gpu will be set automatically if there is more than 1 GPU
         # self.multi_gpu = bool(self.node_count > 1 or self.gpu_count > 1)
         self.num_machines = self.node_count
-        self.machine_rank = os.environ["RANK"]
+        self.machine_rank = os.environ["NODE_RANK"]
         # Total number of processes across all nodes
         # Here we assume all nodes are having the same shape
         self.num_processes = (self.gpu_count if self.gpu_count else 1) * self.node_count

ads/jobs/templates/driver_utils.py

Lines changed: 5 additions & 1 deletion
@@ -276,7 +276,7 @@ def copy_inputs(mappings: dict = None):
             return
 
         for src, dest in mappings.items():
-            logger.debug("Copying %s to %s", src, dest)
+            logger.debug("Copying %s to %s", src, os.path.abspath(dest))
             # Create the dest dir if one does not exist.
             if str(dest).endswith("/"):
                 dest_dir = dest
@@ -439,6 +439,10 @@ def install_pip_packages(self, packages: str = None):
             packages = os.environ.get(CONST_ENV_PIP_PKG)
         if not packages:
             return self
+        # The package requirement may contain special characters like '>'.
+        # Here we wrap each package requirement in single quotes to make sure it can be installed correctly.
+        package_list = shlex.split(packages)
+        packages = " ".join([f"'{package}'" for package in package_list])
         self.run_command(
             f"pip install {packages}", conda_prefix=self.conda_prefix, check=True
         )

docs/source/user_guide/jobs/data_science_job.rst

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@ is available on `Data Science AI Sample GitHub Repository <https://github.com/or
 For more details, see :doc:`infra_and_runtime` configurations.
 You can also :doc:`run_notebook`, :doc:`run_script` and :doc:`run_git`.
 
+.. _yaml:
 
 YAML
 ====

docs/source/user_guide/jobs/index.rst

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ Data Science Jobs
    ../jobs/run_script
    ../jobs/run_container
    ../jobs/run_git
+   ../jobs/run_pytorch_ddp
    ../cli/opctl/_template/jobs
    ../cli/opctl/_template/monitoring
    ../cli/opctl/localdev/local_jobs

docs/source/user_guide/jobs/infra_and_runtime.rst

Lines changed: 3 additions & 0 deletions
@@ -253,6 +253,9 @@ Here are a few more examples:
 
 .. include:: ../jobs/tabs/runtime_args.rst
 
+
+.. _conda_environment:
+
 Conda Environment
 -----------------
 
docs/source/user_guide/jobs/run_python.rst

Lines changed: 0 additions & 10 deletions
@@ -14,16 +14,6 @@ Here is an example to define and run a job using :py:class:`~ads.jobs.PythonRunt
 
 .. include:: ../jobs/tabs/python_runtime.rst
 
-.. code-block:: python
-
-    # Create the job on OCI Data Science
-    job.create()
-    # Start a job run
-    run = job.run()
-    # Stream the job run outputs
-    run.watch()
-
-
 The :py:class:`~ads.jobs.PythonRuntime` uses an driver script from ADS for the job run.
 It performs additional operations before and after invoking your code.
 You can examine the driver script by downloading the job artifact from the OCI Console.
Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
Train PyTorch Models
********************

.. versionadded:: 2.8.8

The :py:class:`~ads.jobs.PyTorchDistributedRuntime` is designed for training PyTorch models, including large language models (LLMs), with multiple GPUs across multiple nodes. If you develop your training code to be compatible with `torchrun <https://pytorch.org/docs/stable/elastic/run.html>`_, `DeepSpeed <https://www.deepspeed.ai/>`_, or `Accelerate <https://huggingface.co/docs/accelerate/index>`_, you can run it using OCI Data Science Jobs with zero code change. For multi-node training, ADS launches multiple job runs, each corresponding to one node.

See `Distributed Data Parallel in PyTorch <https://pytorch.org/tutorials/beginner/ddp_series_intro.html>`_ for a series of tutorials on PyTorch distributed training.

.. admonition:: Prerequisite
  :class: note

  You need oracle-ads\>=2.8.8 to create a job with :py:class:`~ads.jobs.PyTorchDistributedRuntime`.

  You also need to specify a conda environment with PyTorch\>=1.10 and oracle-ads\>=2.6.8 for the job. See :ref:`Conda Environment <conda_environment>` for how to specify the conda environment for a job.

  We recommend using the ``pytorch20_p39_gpu_v1`` service conda environment and adding additional packages as needed.

  You need to specify a subnet ID and allow ingress traffic within the subnet.

Torchrun Example
================

Here is an example of training a GPT model using source code directly from the official PyTorch Examples GitHub repository. See the `Training "Real-World" models with DDP <https://pytorch.org/tutorials/intermediate/ddp_series_minGPT.html>`_ tutorial for a walkthrough of the source code.

.. include:: ../jobs/tabs/pytorch_ddp_torchrun.rst

.. include:: ../jobs/tabs/run_job.rst

Source Code
===========

The source code location can be specified as a Git repository, a local path, or a remote URI supported by
`fsspec <https://filesystem-spec.readthedocs.io/en/latest/>`_.

You can use the :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_git` method to specify the source code ``url`` of a Git repository. You can optionally specify the ``branch`` or ``commit`` for checking out the source code.

For a public repository, we recommend the "http://" or "https://" URL.
Authentication may be required for the SSH URL even if the repository is public.

To use a private repository, you must first save an SSH key to
`OCI Vault <https://docs.oracle.com/en-us/iaas/Content/KeyManagement/Concepts/keyoverview.htm>`_ as a secret,
and provide the ``secret_ocid`` when calling :py:meth:`~ads.jobs.GitPythonRuntime.with_source`.
For more information about creating and using secrets,
see `Managing Secret with Vault <https://docs.oracle.com/en-us/iaas/Content/KeyManagement/Tasks/managingsecrets.htm>`_.
For a repository on GitHub, you can set up a
`GitHub Deploy Key <https://docs.github.com/en/developers/overview/managing-deploy-keys#deploy-keys>`_ as the secret.

.. admonition:: Git Version for Private Repository
  :class: note

  Git version 2.3+ is required to use a private repository.

Alternatively, you can use the :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_source` method to specify the source code as a local path or a remote URI supported by
`fsspec <https://filesystem-spec.readthedocs.io/en/latest/>`_.
For example, you can specify files on OCI object storage using a URI like
``oci://bucket@namespace/path/to/prefix``. ADS will use the authentication method configured by
:py:meth:`ads.set_auth()` to fetch the files and upload them as the job artifact. The source code can be a single file, a compressed file/archive (zip/tar), or a folder.

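
For illustration, here is a minimal sketch of the two options; the repository URL, commit hash, and object storage prefix are placeholders:

.. code-block:: python

    from ads.jobs import PyTorchDistributedRuntime

    # Option 1: check out the source code from a (public) Git repository at a specific commit.
    runtime = (
        PyTorchDistributedRuntime()
        .with_service_conda("pytorch20_p39_gpu_v1")
        .with_git(
            url="https://github.com/your-org/your-training-repo.git",
            commit="<commit_hash>",
        )
    )

    # Option 2: fetch the source code from OCI object storage
    # (a single file, a zip/tar archive, or a folder under the prefix).
    runtime = (
        PyTorchDistributedRuntime()
        .with_service_conda("pytorch20_p39_gpu_v1")
        .with_source("oci://bucket@namespace/path/to/prefix")
    )
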
Working Directory
=================

The default working directory depends on how the source code is specified:

* When the source code is specified as a Git repository URL, the default working directory is the root of the Git repository.
* When the source code is a single file (script), the default working directory is the directory containing the file.
* When the source code is specified as a local or remote directory, the default working directory is the directory containing the source code directory.

The working directory of your workload can be configured with :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_working_dir`. See :ref:`Python Runtime Working Directory <runtime_working_dir>` for more details.

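
For example, continuing the ``runtime`` object sketched above, and assuming the workload lives in a hypothetical subdirectory of the source code:

.. code-block:: python

    # Run the workload from a subdirectory of the source code.
    # "examples/mingpt" is a hypothetical relative path.
    runtime = runtime.with_working_dir("examples/mingpt")
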
Input Data
==========

You can specify the input (training) data for the job using the :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_inputs` method, which takes a dictionary mapping each "source" to a "destination". The "source" can be an OCI object storage URI, or an HTTP or FTP URL. The "destination" is the local path in a job run. If the "destination" is specified as a relative path, it is relative to the working directory.

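
Continuing the ``runtime`` sketch, a minimal example with placeholder bucket, namespace, and paths:

.. code-block:: python

    # Copy the training data into the job run before the workload starts.
    # Relative destinations are resolved against the working directory.
    runtime = runtime.with_inputs({
        "oci://bucket@namespace/datasets/train.jsonl": "data/train.jsonl",
        "https://example.com/vocab.json": "data/vocab.json",
    })
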
Outputs
=======

You can specify the output data to be copied to object storage by using the :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_output` method.
It allows you to specify the output path in the job run (``output_path``) and a remote URI (``output_uri``).
Files in the ``output_path`` are copied to the remote output URI after the job run finishes successfully.
Note that the ``output_path`` should be a path relative to the working directory.

The OCI object storage location can be specified in the format ``oci://bucket_name@namespace/path/to/dir``.
Please make sure you configure the IAM policy to allow the job run dynamic group to use object storage.

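
A sketch with placeholder locations, continuing the same ``runtime`` object:

.. code-block:: python

    # Copy everything under "outputs" (relative to the working directory)
    # to object storage after the job run finishes successfully.
    runtime = runtime.with_output(
        "outputs",
        "oci://bucket_name@namespace/path/to/dir",
    )
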
Number of nodes
===============

The :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_replica` method lets you specify the number of nodes for the training job.

Command
=======

The command to start your workload is specified by using the :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_command` method.

For ``torchrun``, ADS will set ``--nnode``, ``--nproc_per_node``, ``--rdzv_backend`` and ``--rdzv_endpoint`` automatically. You do not need to specify them in the command unless you would like to override the values. The default ``rdzv_backend`` is ``c10d``, and the default port for ``rdzv_endpoint`` is 29400.

If your workload uses DeepSpeed, you also need to set ``use_deepspeed`` to ``True`` when specifying the command. For DeepSpeed, ADS will generate the hostfile automatically and set up the SSH configuration.

For ``accelerate launch``, you can add your config YAML to the source code and specify it using the ``--config_file`` argument. In your config, please use ``LOCAL_MACHINE`` as the compute environment. The same config file will be used by all nodes in a multi-node workload. ADS will set ``--num_processes``, ``--num_machines``, ``--machine_rank``, ``--main_process_ip`` and ``--main_process_port`` automatically. For these arguments, ADS will override the values from your config YAML. If you would like to use your own values, you need to specify them as command arguments. The default ``main_process_port`` is 29400.

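
A minimal sketch putting these pieces together; the script name and arguments are placeholders, and the commented-out line shows the ``use_deepspeed`` flag mentioned above:

.. code-block:: python

    # Two nodes; ADS fills in --nnode, --nproc_per_node,
    # --rdzv_backend and --rdzv_endpoint automatically.
    runtime = (
        runtime
        .with_replica(2)
        .with_command("torchrun train.py --epochs 10")
    )

    # For a DeepSpeed workload, set use_deepspeed=True so that ADS
    # generates the hostfile and SSH configuration:
    # runtime = runtime.with_command("deepspeed train.py --deepspeed ds_config.json", use_deepspeed=True)
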
Additional dependencies
=======================

The :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_dependency` method lets you specify additional dependencies to be installed into the conda environment before starting your workload (see the sketch below):

* ``pip_req`` specifies the path of a ``requirements.txt`` file in your source code.
* ``pip_pkg`` specifies the packages to be installed as a string.

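
A sketch of both options, assuming a ``requirements.txt`` at the root of the source code; the package pins are illustrative:

.. code-block:: python

    # Install from a requirements.txt shipped with the source code.
    runtime = runtime.with_dependency(pip_req="requirements.txt")

    # Or install packages given as a single string; quote requirements
    # that contain special characters such as '>'.
    runtime = runtime.with_dependency(pip_pkg="'transformers>=4.31.0' sentencepiece")
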
Python Paths
============

The working directory is added to the Python paths automatically.
You can call :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_python_path` to add additional Python paths as needed.
The paths should be relative to the working directory.

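
For example, assuming a hypothetical ``src`` subdirectory under the working directory:

.. code-block:: python

    # Make modules under "src" importable in addition to the working directory.
    runtime = runtime.with_python_path("src")
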
Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
.. tabs::

  .. code-tab:: python
    :caption: Python

    from ads.jobs import Job, DataScienceJob, PyTorchDistributedRuntime

    job = (
        Job(name="LLAMA2-Fine-Tuning")
        .with_infrastructure(
            DataScienceJob()
            .with_log_group_id("<log_group_ocid>")
            .with_log_id("<log_ocid>")
            .with_compartment_id("<compartment_ocid>")
            .with_project_id("<project_ocid>")
            .with_subnet_id("<subnet_ocid>")
            .with_shape_name("VM.GPU.A10.1")
            .with_block_storage_size(256)
        )
        .with_runtime(
            PyTorchDistributedRuntime()
            # Specify the service conda environment by slug name.
            .with_service_conda("pytorch20_p39_gpu_v1")
            .with_git(
                url="https://github.com/facebookresearch/llama-recipes.git",
                commit="03faba661f079ee1ecaeb66deaa6bdec920a7bab"
            )
            .with_dependency(
                pip_pkg=" ".join([
                    "'accelerate>=0.21.0'",
                    "appdirs",
                    "loralib",
                    "bitsandbytes==0.39.1",
                    "black",
                    "'black[jupyter]'",
                    "datasets",
                    "fire",
                    "'git+https://github.com/huggingface/peft.git'",
                    "'transformers>=4.31.0'",
                    "sentencepiece",
                    "py7zr",
                    "scipy",
                    "optimum"
                ])
            )
            .with_output("/home/datascience/outputs", "oci://bucket@namespace/outputs/$JOB_RUN_OCID")
            .with_command(" ".join([
                "torchrun llama_finetuning.py",
                "--enable_fsdp",
                "--pure_bf16",
                "--batch_size_training 1",
                "--micro_batch_size 1",
                "--model_name $MODEL_NAME",
                "--dist_checkpoint_root_folder /home/datascience/outputs",
                "--dist_checkpoint_folder fine-tuned"
            ]))
            .with_replica(2)
            .with_environment_variable(
                MODEL_NAME="meta-llama/Llama-2-7b-hf",
                HUGGING_FACE_HUB_TOKEN="<access_token>",
                LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/opt/conda/lib",
            )
        )
    )

  .. code-tab:: yaml
    :caption: YAML

    kind: job
    apiVersion: v1.0
    spec:
      name: LLAMA2-Fine-Tuning
      infrastructure:
        kind: infrastructure
        spec:
          blockStorageSize: 256
          compartmentId: "<compartment_ocid>"
          logGroupId: "<log_group_id>"
          logId: "<log_id>"
          projectId: "<project_id>"
          subnetId: "<subnet_id>"
          shapeName: VM.GPU.A10.2
        type: dataScienceJob
      runtime:
        kind: runtime
        type: pyTorchDistributed
        spec:
          git:
            url: https://github.com/facebookresearch/llama-recipes.git
            commit: 03faba661f079ee1ecaeb66deaa6bdec920a7bab
          command: >-
            torchrun llama_finetuning.py
            --enable_fsdp
            --pure_bf16
            --batch_size_training 1
            --micro_batch_size 1
            --model_name $MODEL_NAME
            --dist_checkpoint_root_folder /home/datascience/outputs
            --dist_checkpoint_folder fine-tuned
          replicas: 2
          conda:
            type: service
            slug: pytorch20_p39_gpu_v1
          dependencies:
            pipPackages: >-
              'accelerate>=0.21.0'
              appdirs
              loralib
              bitsandbytes==0.39.1
              black
              'black[jupyter]'
              datasets
              fire
              'git+https://github.com/huggingface/peft.git'
              'transformers>=4.31.0'
              sentencepiece
              py7zr
              scipy
              optimum
          outputDir: /home/datascience/outputs
          outputUri: oci://bucket@namespace/outputs/$JOB_RUN_OCID
          env:
            - name: MODEL_NAME
              value: meta-llama/Llama-2-7b-hf
            - name: HUGGING_FACE_HUB_TOKEN
              value: "<access_token>"
            - name: LD_LIBRARY_PATH
              value: /usr/local/nvidia/lib:/usr/local/nvidia/lib64:/opt/conda/lib
