Skip to content

Commit a142a9e

Browse files
authored
feat(sdk): add dataflow and datajob entity (#13551)
1 parent f335093 commit a142a9e

19 files changed

+1909
-1
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
from datahub.metadata.urns import TagUrn
2+
from datahub.sdk import DataFlow, DataHubClient
3+
4+
# Build the client through DataHubClient.from_env().
client = DataHubClient.from_env()

# Tags attached to the example pipeline.
pipeline_tags = [TagUrn(name="production"), TagUrn(name="data_engineering")]

# Describe the Airflow pipeline as a DataFlow entity.
dataflow = DataFlow(
    platform="airflow",
    name="example_dataflow",
    tags=pipeline_tags,
    description="airflow pipeline for production",
)

# Upsert the dataflow entity through the client.
client.entities.upsert(dataflow)
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from datahub.metadata.urns import TagUrn
2+
from datahub.sdk import DataFlow, DataHubClient, DataJob
3+
4+
# Build the client through DataHubClient.from_env().
client = DataHubClient.from_env()

# Tags attached to the example flow.
flow_tags = [TagUrn(name="tag1"), TagUrn(name="tag2")]

dataflow = DataFlow(
    name="example_dag",
    platform="airflow",
    platform_instance="PROD",
    tags=flow_tags,
    description="example dataflow",
)

# Only the flow is handed over here: the datajob will inherit the platform
# and platform instance from the flow.
datajob = DataJob(flow=dataflow, name="example_datajob")

# Upsert the flow first, then the job that references it.
for entity in (dataflow, datajob):
    client.entities.upsert(entity)

metadata-ingestion/src/datahub/sdk/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,12 @@
1919
TagUrn,
2020
)
2121
from datahub.sdk.container import Container
22+
from datahub.sdk.dataflow import DataFlow
23+
from datahub.sdk.datajob import DataJob
2224
from datahub.sdk.dataset import Dataset
2325
from datahub.sdk.main_client import DataHubClient
26+
from datahub.sdk.mlmodel import MLModel
27+
from datahub.sdk.mlmodelgroup import MLModelGroup
2428
from datahub.sdk.search_filters import Filter, FilterDsl
2529

2630
# We want to print out the warning if people do `from datahub.sdk import X`.

metadata-ingestion/src/datahub/sdk/_all_entities.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from typing import Dict, List, Type
22

33
from datahub.sdk.container import Container
4+
from datahub.sdk.dataflow import DataFlow
5+
from datahub.sdk.datajob import DataJob
46
from datahub.sdk.dataset import Dataset
57
from datahub.sdk.entity import Entity
68
from datahub.sdk.mlmodel import MLModel
@@ -12,6 +14,8 @@
1214
Dataset,
1315
MLModel,
1416
MLModelGroup,
17+
DataFlow,
18+
DataJob,
1519
]
1620

1721
ENTITY_CLASSES: Dict[str, Type[Entity]] = {

metadata-ingestion/src/datahub/sdk/_shared.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
ContainerUrn,
3030
CorpGroupUrn,
3131
CorpUserUrn,
32+
DataFlowUrn,
3233
DataJobUrn,
3334
DataPlatformInstanceUrn,
3435
DataPlatformUrn,
@@ -47,10 +48,10 @@
4748

4849
if TYPE_CHECKING:
4950
from datahub.sdk.container import Container
50-
5151
UrnOrStr: TypeAlias = Union[Urn, str]
5252
DatasetUrnOrStr: TypeAlias = Union[str, DatasetUrn]
5353
DatajobUrnOrStr: TypeAlias = Union[str, DataJobUrn]
54+
DataflowUrnOrStr: TypeAlias = Union[str, DataFlowUrn]
5455

5556
ActorUrn: TypeAlias = Union[CorpUserUrn, CorpGroupUrn]
5657

Lines changed: 302 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,302 @@
1+
from __future__ import annotations
2+
3+
import warnings
4+
from datetime import datetime
5+
from typing import Dict, Optional, Type, Union
6+
7+
from typing_extensions import Self
8+
9+
import datahub.metadata.schema_classes as models
10+
from datahub.cli.cli_utils import first_non_null
11+
from datahub.emitter.mce_builder import DEFAULT_ENV
12+
from datahub.errors import (
13+
IngestionAttributionWarning,
14+
)
15+
from datahub.metadata.urns import DataFlowUrn, Urn
16+
from datahub.sdk._attribution import is_ingestion_attribution
17+
from datahub.sdk._shared import (
18+
DomainInputType,
19+
HasContainer,
20+
HasDomain,
21+
HasInstitutionalMemory,
22+
HasOwnership,
23+
HasPlatformInstance,
24+
HasSubtype,
25+
HasTags,
26+
HasTerms,
27+
LinksInputType,
28+
OwnersInputType,
29+
ParentContainerInputType,
30+
TagsInputType,
31+
TermsInputType,
32+
make_time_stamp,
33+
parse_time_stamp,
34+
)
35+
from datahub.sdk.entity import Entity, ExtraAspectsType
36+
from datahub.utilities.sentinels import Unset, unset
37+
38+
39+
class DataFlow(
    HasPlatformInstance,
    HasSubtype,
    HasOwnership,
    HasContainer,
    HasInstitutionalMemory,
    HasTags,
    HasTerms,
    HasDomain,
    Entity,
):
    """Represents a dataflow in DataHub.

    A dataflow represents a pipeline on an orchestration platform, such as an
    Airflow DAG. It is identified by its platform (the URN's orchestrator),
    its name (the URN's flow id) and its environment.

    This class provides methods for managing dataflow metadata including the
    description, display name, external URL, custom properties and
    timestamps, plus standard aspects like ownership, tags, and terms.
    """

    __slots__ = ()

    @classmethod
    def get_urn_type(cls) -> Type[DataFlowUrn]:
        """Get the URN type for dataflows.

        Returns:
            The DataFlowUrn class.
        """
        return DataFlowUrn

    def __init__(
        self,
        *,
        # Identity.
        name: str,
        platform: str,
        display_name: Optional[str] = None,
        platform_instance: Optional[str] = None,
        env: str = DEFAULT_ENV,
        # Dataflow properties.
        description: Optional[str] = None,
        external_url: Optional[str] = None,
        custom_properties: Optional[Dict[str, str]] = None,
        created: Optional[datetime] = None,
        last_modified: Optional[datetime] = None,
        # Standard aspects.
        subtype: Optional[str] = None,
        owners: Optional[OwnersInputType] = None,
        links: Optional[LinksInputType] = None,
        tags: Optional[TagsInputType] = None,
        terms: Optional[TermsInputType] = None,
        domain: Optional[DomainInputType] = None,
        parent_container: ParentContainerInputType | Unset = unset,
        extra_aspects: ExtraAspectsType = None,
    ):
        """Initialize a new DataFlow instance.

        Args:
            name: The name of the dataflow; used as the flow id of the URN.
            platform: The platform this dataflow belongs to (e.g. "airflow");
                used as the orchestrator of the URN.
            display_name: Optional display name for the dataflow. When omitted,
                the name is used as the display name.
            platform_instance: Optional platform instance identifier.
            env: The environment this dataflow belongs to (default: DEFAULT_ENV).
            description: Optional description of the dataflow.
            external_url: Optional URL to external documentation or source.
            custom_properties: Optional dictionary of custom properties.
            created: Optional creation timestamp.
            last_modified: Optional last modification timestamp.
            subtype: Optional subtype of the dataflow.
            owners: Optional list of owners.
            links: Optional list of links.
            tags: Optional list of tags.
            terms: Optional list of glossary terms.
            domain: Optional domain this dataflow belongs to.
            parent_container: Optional parent container. The container is only
                set when this argument is passed explicitly.
            extra_aspects: Optional list of additional aspects.
        """
        urn = DataFlowUrn.create_from_ids(
            orchestrator=platform,
            flow_id=name,
            env=env,
            platform_instance=platform_instance,
        )
        super().__init__(urn)
        self._set_extra_aspects(extra_aspects)

        self._set_platform_instance(urn.orchestrator, platform_instance)

        # Make sure a DataFlowInfo aspect exists (an instance supplied through
        # extra_aspects is kept), falling back to the name as display name.
        self._setdefault_aspect(models.DataFlowInfoClass(name=display_name or name))
        self._ensure_dataflow_props().env = env

        # Optional fields are only written when the caller supplied them.
        if description is not None:
            self.set_description(description)
        if display_name is not None:
            self.set_display_name(display_name)
        if external_url is not None:
            self.set_external_url(external_url)
        if custom_properties is not None:
            self.set_custom_properties(custom_properties)
        if created is not None:
            self.set_created(created)
        if last_modified is not None:
            self.set_last_modified(last_modified)
        if subtype is not None:
            self.set_subtype(subtype)
        if owners is not None:
            self.set_owners(owners)
        if links is not None:
            self.set_links(links)
        if tags is not None:
            self.set_tags(tags)
        if terms is not None:
            self.set_terms(terms)
        if domain is not None:
            self.set_domain(domain)
        # `unset` (rather than None) marks "not provided" for the container.
        if parent_container is not unset:
            self._set_container(parent_container)

    @classmethod
    def _new_from_graph(cls, urn: Urn, current_aspects: models.AspectBag) -> Self:
        """Build a DataFlow for an existing URN and load its current aspects.

        Args:
            urn: The URN of the dataflow; must be a DataFlowUrn.
            current_aspects: The aspects to initialize the entity from.

        Returns:
            The entity initialized from the given aspects.
        """
        assert isinstance(urn, DataFlowUrn)
        entity = cls(
            platform=urn.orchestrator,
            name=urn.flow_id,
            # Carry the URN's cluster over as the env. Without it, __init__
            # falls back to DEFAULT_ENV and the rebuilt URN could differ from
            # the one that was requested.
            env=urn.cluster,
        )
        return entity._init_from_graph(current_aspects)

    @property
    def urn(self) -> DataFlowUrn:
        """Get the URN of the dataflow."""
        return self._urn  # type: ignore

    def _ensure_dataflow_props(self) -> models.DataFlowInfoClass:
        """Return the DataFlowInfo aspect, creating it first if it is missing."""
        props = self._get_aspect(models.DataFlowInfoClass)
        if props is None:
            # Use name from URN as fallback
            props = models.DataFlowInfoClass(name=self.urn.flow_id)
            self._set_aspect(props)
        return props

    def _get_editable_props(self) -> Optional[models.EditableDataFlowPropertiesClass]:
        """Return the editable properties aspect, or None if it is not set."""
        return self._get_aspect(models.EditableDataFlowPropertiesClass)

    def _ensure_editable_props(self) -> models.EditableDataFlowPropertiesClass:
        """Return the editable properties aspect, creating it if it is missing."""
        # Note that most of the fields in this aspect are not used.
        # The only one that's relevant for us is the description.
        return self._setdefault_aspect(models.EditableDataFlowPropertiesClass())

    @property
    def description(self) -> Optional[str]:
        """Get the description of the dataflow.

        The description from the editable properties takes precedence over
        the one stored in the dataflow info.

        Returns:
            The description if set, None otherwise.
        """
        editable_props = self._get_editable_props()
        return first_non_null(
            [
                editable_props.description if editable_props is not None else None,
                self._ensure_dataflow_props().description,
            ]
        )

    def set_description(self, description: str) -> None:
        """Set the description of the dataflow.

        Args:
            description: The description to set.

        Note:
            If called during ingestion, this will warn if overwriting
            a non-ingestion description.
        """
        if is_ingestion_attribution():
            editable_props = self._get_editable_props()
            if editable_props is not None and editable_props.description is not None:
                warnings.warn(
                    "Overwriting non-ingestion description from ingestion is an anti-pattern.",
                    category=IngestionAttributionWarning,
                    stacklevel=2,
                )
                # Force the ingestion description to show up.
                editable_props.description = None

            self._ensure_dataflow_props().description = description
        else:
            # Outside of ingestion, the description goes to the editable aspect.
            self._ensure_editable_props().description = description

    @property
    def name(self) -> str:
        """Get the name of the dataflow.

        Returns:
            The name of the dataflow, taken from the flow id of the URN.
        """
        return self.urn.flow_id

    @property
    def display_name(self) -> Optional[str]:
        """Get the display name of the dataflow.

        Returns:
            The display name stored in the dataflow info.
        """
        return self._ensure_dataflow_props().name

    def set_display_name(self, display_name: str) -> None:
        """Set the display name of the dataflow.

        Args:
            display_name: The display name to set.
        """
        self._ensure_dataflow_props().name = display_name

    @property
    def external_url(self) -> Optional[str]:
        """Get the external URL of the dataflow.

        Returns:
            The external URL if set, None otherwise.
        """
        return self._ensure_dataflow_props().externalUrl

    def set_external_url(self, external_url: str) -> None:
        """Set the external URL of the dataflow.

        Args:
            external_url: The external URL to set.
        """
        self._ensure_dataflow_props().externalUrl = external_url

    @property
    def custom_properties(self) -> Dict[str, str]:
        """Get the custom properties of the dataflow.

        Returns:
            Dictionary of custom properties.
        """
        return self._ensure_dataflow_props().customProperties

    def set_custom_properties(self, custom_properties: Dict[str, str]) -> None:
        """Set the custom properties of the dataflow.

        Args:
            custom_properties: Dictionary of custom properties to set.
        """
        self._ensure_dataflow_props().customProperties = custom_properties

    @property
    def created(self) -> Optional[datetime]:
        """Get the creation timestamp of the dataflow.

        Returns:
            The creation timestamp if set, None otherwise.
        """
        return parse_time_stamp(self._ensure_dataflow_props().created)

    def set_created(self, created: datetime) -> None:
        """Set the creation timestamp of the dataflow.

        Args:
            created: The creation timestamp to set.
        """
        self._ensure_dataflow_props().created = make_time_stamp(created)

    @property
    def last_modified(self) -> Optional[datetime]:
        """Get the last modification timestamp of the dataflow.

        Returns:
            The last modification timestamp if set, None otherwise.
        """
        return parse_time_stamp(self._ensure_dataflow_props().lastModified)

    def set_last_modified(self, last_modified: datetime) -> None:
        """Set the last modification timestamp of the dataflow.

        Args:
            last_modified: The last modification timestamp to set.
        """
        self._ensure_dataflow_props().lastModified = make_time_stamp(last_modified)

    @property
    def env(self) -> Optional[Union[str, models.FabricTypeClass]]:
        """Get the environment of the dataflow."""
        return self._ensure_dataflow_props().env

0 commit comments

Comments
 (0)