
Commit 00a8adb

sryza authored and gengliangwang committed
[SPARK-52663][SDP] Introduce name field to pipeline spec
### What changes were proposed in this pull request?

The Declarative Pipelines SPIP included a "name" field in the pipeline spec, but we left that out in the earlier implementation. This adds it in. The name field is required. This matches behavior for similar systems, like dbt.

### Why are the changes needed?

See above.

### Does this PR introduce _any_ user-facing change?

Yes, but only to unreleased code.

### How was this patch tested?

Updated existing tests, and added tests for the proper error when the name is missing.

### Was this patch authored or co-authored using generative AI tooling?

Closes #51353 from sryza/pipeline-name.

Authored-by: Sandy Ryza <sandy.ryza@databricks.com>
Signed-off-by: Gengliang Wang <gengliang@apache.org>
1 parent c2942b7 commit 00a8adb

6 files changed: +62, -8 lines changed

docs/declarative-pipelines-programming-guide.md

Lines changed: 2 additions & 1 deletion

````diff
@@ -75,6 +75,7 @@ A YAML-formatted pipeline spec file contains the top-level configuration for the
 An example pipeline spec file:
 
 ```yaml
+name: my_pipeline
 definitions:
   - glob:
       include: transformations/**/*.py
@@ -99,7 +100,7 @@ The `spark-pipelines` command line interface (CLI) is the primary way to execute
 
 ### `spark-pipelines init`
 
-`spark-pipelines init` generates a simple pipeline project, including a spec file and example definitions.
+`spark-pipelines init --name my_pipeline` generates a simple pipeline project, inside a directory named "my_pipeline", including a spec file and example definitions.
 
 ### `spark-pipelines run`
````

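For reference, the `name` field documented above is read back when the spec file is loaded. Below is a minimal sketch of that round trip, assuming `load_pipeline_spec` is importable from `pyspark.pipelines.cli`, as the updated tests further down in this commit suggest:

```python
# Sketch: write the documented example spec to a temp file and load it back.
# load_pipeline_spec and the .name / .definitions attributes mirror the
# updated test_cli.py tests in this commit.
import tempfile
from pathlib import Path

from pyspark.pipelines.cli import load_pipeline_spec

spec_yaml = """
name: my_pipeline
definitions:
  - glob:
      include: transformations/**/*.py
"""

with tempfile.NamedTemporaryFile(mode="w", suffix=".yml") as tmpfile:
    tmpfile.write(spec_yaml)
    tmpfile.flush()
    spec = load_pipeline_spec(Path(tmpfile.name))
    assert spec.name == "my_pipeline"
    assert spec.definitions[0].include == "transformations/**/*.py"
```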
python/pyspark/errors/error-conditions.json

Lines changed: 5 additions & 0 deletions

```diff
@@ -884,6 +884,11 @@
       "No pipeline.yaml or pipeline.yml file provided in arguments or found in directory `<dir_path>` or readable ancestor directories."
     ]
   },
+  "PIPELINE_SPEC_MISSING_REQUIRED_FIELD": {
+    "message": [
+      "Pipeline spec missing required field `<field_name>`."
+    ]
+  },
   "PIPELINE_SPEC_UNEXPECTED_FIELD": {
     "message": [
       "Pipeline spec field `<field_name>` is unexpected."
```

python/pyspark/pipelines/cli.py

Lines changed: 13 additions & 1 deletion

```diff
@@ -61,12 +61,14 @@ class DefinitionsGlob:
 class PipelineSpec:
     """Spec for a pipeline.
 
+    :param name: The name of the pipeline.
     :param catalog: The default catalog to use for the pipeline.
     :param database: The default database to use for the pipeline.
     :param configuration: A dictionary of Spark configuration properties to set for the pipeline.
    :param definitions: A list of glob patterns for finding pipeline definitions files.
     """
 
+    name: str
     catalog: Optional[str]
     database: Optional[str]
     configuration: Mapping[str, str]
@@ -110,13 +112,23 @@ def load_pipeline_spec(spec_path: Path) -> PipelineSpec:
 
 
 def unpack_pipeline_spec(spec_data: Mapping[str, Any]) -> PipelineSpec:
+    ALLOWED_FIELDS = {"name", "catalog", "database", "schema", "configuration", "definitions"}
+    REQUIRED_FIELDS = ["name"]
     for key in spec_data.keys():
-        if key not in ["catalog", "database", "schema", "configuration", "definitions"]:
+        if key not in ALLOWED_FIELDS:
             raise PySparkException(
                 errorClass="PIPELINE_SPEC_UNEXPECTED_FIELD", messageParameters={"field_name": key}
             )
 
+    for key in REQUIRED_FIELDS:
+        if key not in spec_data:
+            raise PySparkException(
+                errorClass="PIPELINE_SPEC_MISSING_REQUIRED_FIELD",
+                messageParameters={"field_name": key},
+            )
+
     return PipelineSpec(
+        name=spec_data["name"],
         catalog=spec_data.get("catalog"),
         database=spec_data.get("database", spec_data.get("schema")),
         configuration=validate_str_dict(spec_data.get("configuration", {}), "configuration"),
```

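To illustrate the new validation path, here is a small sketch of how `unpack_pipeline_spec` behaves with and without the required `name` field. It only uses calls that appear in this commit's diff; the error accessors (`getCondition`, `getMessageParameters`) are the same ones the updated tests below use:

```python
# Sketch of the new required-field validation in unpack_pipeline_spec.
from pyspark.errors import PySparkException
from pyspark.pipelines.cli import unpack_pipeline_spec

# A spec containing the required "name" field unpacks normally.
spec = unpack_pipeline_spec({"name": "my_pipeline", "catalog": "my_catalog"})
assert spec.name == "my_pipeline"
assert spec.catalog == "my_catalog"

# A spec without "name" now raises PIPELINE_SPEC_MISSING_REQUIRED_FIELD.
try:
    unpack_pipeline_spec({"catalog": "my_catalog"})
except PySparkException as e:
    assert e.getCondition() == "PIPELINE_SPEC_MISSING_REQUIRED_FIELD"
    assert e.getMessageParameters() == {"field_name": "name"}
```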
python/pyspark/pipelines/init_cli.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -18,6 +18,7 @@
 from pathlib import Path
 
 SPEC = """
+name: {{ name }}
 definitions:
   - glob:
       include: transformations/**/*.py
@@ -49,7 +50,7 @@ def init(name: str) -> None:
     # Write the spec file to the project directory
     spec_file = project_dir / "pipeline.yml"
     with open(spec_file, "w") as f:
-        f.write(SPEC)
+        f.write(SPEC.replace("{{ name }}", name))
 
     # Create the transformations directory
     transformations_dir = project_dir / "transformations"
```

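End to end, the generated project now carries the chosen name through to the loaded spec. A rough sketch of that flow, assuming `init` scaffolds the project under the current working directory (as the docs change describes) and that `find_pipeline_spec` and `load_pipeline_spec` are importable from `pyspark.pipelines.cli`:

```python
# Sketch of the init flow: scaffold a project, then load its spec and check the name.
import os
import tempfile
from pathlib import Path

from pyspark.pipelines.cli import find_pipeline_spec, load_pipeline_spec
from pyspark.pipelines.init_cli import init

original_cwd = Path.cwd()
with tempfile.TemporaryDirectory() as temp_dir:
    os.chdir(temp_dir)
    try:
        # Assumed behavior per this commit: creates my_pipeline/pipeline.yml with
        # the name substituted in, plus a transformations/ directory of examples.
        init("my_pipeline")
        os.chdir(Path(temp_dir) / "my_pipeline")
        spec = load_pipeline_spec(find_pipeline_spec(Path.cwd()))
        assert spec.name == "my_pipeline"
    finally:
        os.chdir(original_cwd)
```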
python/pyspark/pipelines/tests/test_cli.py

Lines changed: 39 additions & 5 deletions

```diff
@@ -50,6 +50,7 @@ def test_load_pipeline_spec(self):
             tmpfile.write(
                 """
                 {
+                    "name": "test_pipeline",
                     "catalog": "test_catalog",
                     "database": "test_database",
                     "configuration": {
@@ -64,17 +65,44 @@ def test_load_pipeline_spec(self):
             )
             tmpfile.flush()
             spec = load_pipeline_spec(Path(tmpfile.name))
+            assert spec.name == "test_pipeline"
             assert spec.catalog == "test_catalog"
             assert spec.database == "test_database"
             assert spec.configuration == {"key1": "value1", "key2": "value2"}
             assert len(spec.definitions) == 1
             assert spec.definitions[0].include == "test_include"
 
+    def test_load_pipeline_spec_name_is_required(self):
+        with tempfile.NamedTemporaryFile(mode="w") as tmpfile:
+            tmpfile.write(
+                """
+                {
+                    "catalog": "test_catalog",
+                    "database": "test_database",
+                    "configuration": {
+                        "key1": "value1",
+                        "key2": "value2"
+                    },
+                    "definitions": [
+                        {"glob": {"include": "test_include"}}
+                    ]
+                }
+                """
+            )
+            tmpfile.flush()
+            with self.assertRaises(PySparkException) as context:
+                load_pipeline_spec(Path(tmpfile.name))
+            self.assertEqual(
+                context.exception.getCondition(), "PIPELINE_SPEC_MISSING_REQUIRED_FIELD"
+            )
+            self.assertEqual(context.exception.getMessageParameters(), {"field_name": "name"})
+
     def test_load_pipeline_spec_schema_fallback(self):
         with tempfile.NamedTemporaryFile(mode="w") as tmpfile:
             tmpfile.write(
                 """
                 {
+                    "name": "test_pipeline",
                     "catalog": "test_catalog",
                     "schema": "test_database",
                     "configuration": {
@@ -120,20 +148,22 @@ def test_load_pipeline_spec_invalid(self):
                 )
 
     def test_unpack_empty_pipeline_spec(self):
-        empty_spec = PipelineSpec(catalog=None, database=None, configuration={}, definitions=[])
-        self.assertEqual(unpack_pipeline_spec({}), empty_spec)
+        empty_spec = PipelineSpec(
+            name="test_pipeline", catalog=None, database=None, configuration={}, definitions=[]
+        )
+        self.assertEqual(unpack_pipeline_spec({"name": "test_pipeline"}), empty_spec)
 
     def test_unpack_pipeline_spec_bad_configuration(self):
         with self.assertRaises(TypeError) as context:
-            unpack_pipeline_spec({"configuration": "not_a_dict"})
+            unpack_pipeline_spec({"name": "test_pipeline", "configuration": "not_a_dict"})
         self.assertIn("should be a dict", str(context.exception))
 
         with self.assertRaises(TypeError) as context:
-            unpack_pipeline_spec({"configuration": {"key": {}}})
+            unpack_pipeline_spec({"name": "test_pipeline", "configuration": {"key": {}}})
         self.assertIn("key", str(context.exception))
 
         with self.assertRaises(TypeError) as context:
-            unpack_pipeline_spec({"configuration": {1: "something"}})
+            unpack_pipeline_spec({"name": "test_pipeline", "configuration": {1: "something"}})
         self.assertIn("int", str(context.exception))
 
     def test_find_pipeline_spec_in_current_directory(self):
@@ -205,6 +235,7 @@ def test_find_pipeline_spec_in_parent_directory(self):
 
     def test_register_definitions(self):
         spec = PipelineSpec(
+            name="test_pipeline",
             catalog=None,
             database=None,
             configuration={},
@@ -247,6 +278,7 @@ def mv2():
     def test_register_definitions_file_raises_error(self):
         """Errors raised while executing definitions code should make it to the outer context."""
         spec = PipelineSpec(
+            name="test_pipeline",
             catalog=None,
             database=None,
             configuration={},
@@ -264,6 +296,7 @@ def test_register_definitions_file_raises_error(self):
 
     def test_register_definitions_unsupported_file_extension_matches_glob(self):
         spec = PipelineSpec(
+            name="test_pipeline",
             catalog=None,
             database=None,
             configuration={},
@@ -317,6 +350,7 @@ def test_python_import_current_directory(self):
                 inner_dir1 / "pipeline.yaml",
                 registry,
                 PipelineSpec(
+                    name="test_pipeline",
                     catalog=None,
                     database=None,
                     configuration={},
```

python/pyspark/pipelines/tests/test_init_cli.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -50,6 +50,7 @@ def test_init(self):
             with change_dir(Path(temp_dir) / project_name):
                 spec_path = find_pipeline_spec(Path.cwd())
                 spec = load_pipeline_spec(spec_path)
+                assert spec.name == project_name
                 registry = LocalGraphElementRegistry()
                 register_definitions(spec_path, registry, spec)
                 self.assertEqual(len(registry.datasets), 1)
```
