From 7ef1b6549969a9a86c35fe03903457ad4cfa5322 Mon Sep 17 00:00:00 2001
From: manu-sj <152865565+manu-sj@users.noreply.github.com>
Date: Thu, 3 Jul 2025 09:42:07 +0200
Subject: [PATCH] [HWORKS-2190][APPEND] Updating job configuration to include
file, pyfiles, archives and jars (#478)
* updating docs for jobs configs to include files, pyFiles, jars and archives
* updating based on review comments
* updating documentation for notebooks and python Jobs
---
docs/user_guides/projects/jobs/notebook_job.md | 1 +
docs/user_guides/projects/jobs/pyspark_job.md | 6 +++++-
docs/user_guides/projects/jobs/python_job.md | 1 +
docs/user_guides/projects/jobs/spark_job.md | 7 ++++++-
4 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/docs/user_guides/projects/jobs/notebook_job.md b/docs/user_guides/projects/jobs/notebook_job.md
index 364b5900e..7c724bcb7 100644
--- a/docs/user_guides/projects/jobs/notebook_job.md
+++ b/docs/user_guides/projects/jobs/notebook_job.md
@@ -179,6 +179,7 @@ The following table describes the JSON payload returned by `jobs_api.get_configu
| `resourceConfig.gpus` | number (int) | Number of GPUs to be allocated | `0` |
| `logRedirection` | boolean | Whether logs are redirected | `true` |
| `jobType` | string | Type of job | `"PYTHON"` |
+| `files` | string | HDFS path(s) to files to be provided to the Notebook Job. Multiple files can be included in a single string, separated by commas. Example: `"hdfs:///Project//Resources/file1.py,hdfs:///Project//Resources/file2.txt"` | `null` |
## Accessing project data
diff --git a/docs/user_guides/projects/jobs/pyspark_job.md b/docs/user_guides/projects/jobs/pyspark_job.md
index c0cb7e804..e329312f3 100644
--- a/docs/user_guides/projects/jobs/pyspark_job.md
+++ b/docs/user_guides/projects/jobs/pyspark_job.md
@@ -217,7 +217,7 @@ The following table describes the JSON payload returned by `jobs_api.get_configu
| Field | Type | Description | Default |
| ------------------------------------------ | -------------- |-----------------------------------------------------| -------------------------- |
| `type` | string | Type of the job configuration | `"sparkJobConfiguration"` |
-| `appPath` | string | Project path to script (e.g `Resources/foo.py`) | `null` |
+| `appPath` | string | Project path to script (e.g `Resources/foo.py`) | `null` |
| `environmentName` | string | Name of the project spark environment | `"spark-feature-pipeline"` |
| `spark.driver.cores` | number (float) | Number of CPU cores allocated for the driver | `1.0` |
| `spark.driver.memory` | number (int) | Memory allocated for the driver (in MB) | `2048` |
@@ -229,6 +229,10 @@ The following table describes the JSON payload returned by `jobs_api.get_configu
| `spark.dynamicAllocation.maxExecutors` | number (int) | Maximum number of executors with dynamic allocation | `2` |
| `spark.dynamicAllocation.initialExecutors` | number (int) | Initial number of executors with dynamic allocation | `1` |
| `spark.blacklist.enabled` | boolean | Whether executor/node blacklisting is enabled | `false` |
+| `files` | string | HDFS path(s) to files to be provided to the Spark application. Multiple files can be included in a single string, separated by commas. Example: `"hdfs:///Project//Resources/file1.py,hdfs:///Project//Resources/file2.txt"` | `null` |
+| `pyFiles` | string | HDFS path(s) to Python files to be provided to the Spark application. These will be added to the `PYTHONPATH` so they can be imported as modules. Multiple files can be included in a single string, separated by commas. Example: `"hdfs:///Project//Resources/module1.py,hdfs:///Project//Resources/module2.py"` | `null` |
+| `jars` | string | HDFS path(s) to JAR files to be provided to the Spark application. These will be added to the classpath. Multiple files can be included in a single string, separated by commas. Example: `"hdfs:///Project//Resources/lib1.jar,hdfs:///Project//Resources/lib2.jar"` | `null` |
+| `archives` | string | HDFS path(s) to archive files to be provided to the Spark application. Multiple files can be included in a single string, separated by commas. Example: `"hdfs:///Project//Resources/archive1.zip,hdfs:///Project//Resources/archive2.tar.gz"` | `null` |
## Accessing project data
diff --git a/docs/user_guides/projects/jobs/python_job.md b/docs/user_guides/projects/jobs/python_job.md
index 420e38e49..0fa2a9e9f 100644
--- a/docs/user_guides/projects/jobs/python_job.md
+++ b/docs/user_guides/projects/jobs/python_job.md
@@ -177,6 +177,7 @@ The following table describes the JSON payload returned by `jobs_api.get_configu
| `resourceConfig.gpus` | number (int) | Number of GPUs to be allocated | `0` |
| `logRedirection` | boolean | Whether logs are redirected | `true` |
| `jobType` | string | Type of job | `"PYTHON"` |
+| `files` | string | HDFS path(s) to files to be provided to the Python Job. Multiple files can be included in a single string, separated by commas. Example: `"hdfs:///Project//Resources/file1.py,hdfs:///Project//Resources/file2.txt"` | `null` |
## Accessing project data
diff --git a/docs/user_guides/projects/jobs/spark_job.md b/docs/user_guides/projects/jobs/spark_job.md
index 6d0f0510b..6345d5a65 100644
--- a/docs/user_guides/projects/jobs/spark_job.md
+++ b/docs/user_guides/projects/jobs/spark_job.md
@@ -230,7 +230,12 @@ The following table describes the JSON payload returned by `jobs_api.get_configu
| `spark.dynamicAllocation.minExecutors` | number (int) | Minimum number of executors with dynamic allocation | `1` |
| `spark.dynamicAllocation.maxExecutors` | number (int) | Maximum number of executors with dynamic allocation | `2` |
| `spark.dynamicAllocation.initialExecutors` | number (int) | Initial number of executors with dynamic allocation | `1` |
-| `spark.blacklist.enabled` | boolean | Whether executor/node blacklisting is enabled | `false` |
+| `spark.blacklist.enabled` | boolean | Whether executor/node blacklisting is enabled | `false` |
+| `files` | string | HDFS path(s) to files to be provided to the Spark application. Multiple files can be included in a single string, separated by commas. Example: `"hdfs:///Project//Resources/file1.py,hdfs:///Project//Resources/file2.txt"` | `null` |
+| `pyFiles` | string | HDFS path(s) to Python files to be provided to the Spark application. These will be added to the `PYTHONPATH` so they can be imported as modules. Multiple files can be included in a single string, separated by commas. Example: `"hdfs:///Project//Resources/module1.py,hdfs:///Project//Resources/module2.py"` | `null` |
+| `jars` | string | HDFS path(s) to JAR files to be provided to the Spark application. These will be added to the classpath. Multiple files can be included in a single string, separated by commas. Example: `"hdfs:///Project//Resources/lib1.jar,hdfs:///Project//Resources/lib2.jar"` | `null` |
+| `archives` | string | HDFS path(s) to archive files to be provided to the Spark application. Multiple files can be included in a single string, separated by commas. Example: `"hdfs:///Project//Resources/archive1.zip,hdfs:///Project//Resources/archive2.tar.gz"` | `null` |
+
## Accessing project data