Skip to content

Commit 3a5bf5a

Browse files
committed
feat(search): lineage search
* remove aggregation from limited entities query * prevent slow responses from large page sizes (prevent 10k queries) * parallelize upstream/downstream 1-hop search * implement pagination to remove 1k upstream/downstream limit * add configuration options to application.yaml
1 parent 822be78 commit 3a5bf5a

File tree

77 files changed

+2644
-474
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

77 files changed

+2644
-474
lines changed

datahub-web-react/src/app/lineageV2/useSearchAcrossLineage.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ export default function useSearchAcrossLineage(
7373
direction,
7474
types: type === EntityType.SchemaField ? [EntityType.SchemaField] : undefined,
7575
start: 0,
76-
count: 10000,
76+
count: 2000,
7777
orFilters: [
7878
{
7979
and: [

datahub-web-react/src/graphql/lineage.graphql

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -900,7 +900,7 @@ query getSearchAcrossLineageCounts(
900900
urn: $urn
901901
query: "*"
902902
start: 0
903-
count: 10000
903+
count: 2000
904904
filters: [{ field: "degree", value: "1", values: ["1"] }]
905905
direction: UPSTREAM
906906
lineageFlags: { startTimeMillis: $startTimeMillis, endTimeMillis: $endTimeMillis }
@@ -918,7 +918,7 @@ query getSearchAcrossLineageCounts(
918918
urn: $urn
919919
query: "*"
920920
start: 0
921-
count: 10000
921+
count: 2000
922922
filters: [{ field: "degree", value: "1", values: ["1"] }]
923923
direction: DOWNSTREAM
924924
lineageFlags: { startTimeMillis: $startTimeMillis, endTimeMillis: $endTimeMillis }

docker/datahub-gms/env/docker.env

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ MCE_CONSUMER_ENABLED=true
2727
PE_CONSUMER_ENABLED=true
2828
UI_INGESTION_ENABLED=true
2929
THEME_V2_DEFAULT=true
30+
ELASTICSEARCH_LIMIT_RESULTS_STRICT=true
3031

3132
# Uncomment to disable Metadata Service Authentication
3233
# METADATA_SERVICE_AUTH_ENABLED=false

docker/quickstart/docker-compose-m1.quickstart.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ services:
8585
- ELASTICSEARCH_HOST=elasticsearch
8686
- ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
8787
- ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
88+
- ELASTICSEARCH_LIMIT_RESULTS_STRICT=true
8889
- ELASTICSEARCH_PORT=9200
8990
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
9091
- ENTITY_SERVICE_ENABLE_RETENTION=true

docker/quickstart/docker-compose.quickstart.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ services:
8585
- ELASTICSEARCH_HOST=elasticsearch
8686
- ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
8787
- ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
88+
- ELASTICSEARCH_LIMIT_RESULTS_STRICT=true
8889
- ELASTICSEARCH_PORT=9200
8990
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
9091
- ENTITY_SERVICE_ENABLE_RETENTION=true

docs/api/restli/restli-overview.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1365,7 +1365,7 @@ curl -X POST 'http://localhost:8080/aspects?action=getTimeseriesAspectValues' \
13651365
13661366
{
13671367
"value":{
1368-
"limit":10000,
1368+
"limit":2000,
13691369
"aspectName":"datasetProfile",
13701370
"endTimeMillis":1627455600000,
13711371
"startTimeMillis":1625122800000,

docs/deploy/environment-vars.md

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -51,25 +51,27 @@ DataHub works.
5151

5252
## Search
5353

54-
| Variable | Default | Unit/Type | Components | Description |
55-
| --------------------------------------------------- | ------------------- | --------- | --------------------------------------------------------------- | ------------------------------------------------------------------------ |
56-
| `INDEX_PREFIX` | `` | string | [`GMS`, `MAE Consumer`, `Elasticsearch Setup`, `System Update`] | Prefix Elasticsearch indices with the given string. |
57-
| `ELASTICSEARCH_NUM_SHARDS_PER_INDEX` | 1 | integer | [`System Update`] | Default number of shards per Elasticsearch index. |
58-
| `ELASTICSEARCH_NUM_REPLICAS_PER_INDEX` | 1 | integer | [`System Update`] | Default number of replica per Elasticsearch index. |
59-
| `ELASTICSEARCH_BUILD_INDICES_RETENTION_VALUE` | 60 | integer | [`System Update`] | Number of units for the retention of Elasticsearch clone/backup indices. |
60-
| `ELASTICSEARCH_BUILD_INDICES_RETENTION_UNIT` | DAYS | string | [`System Update`] | Unit for the retention of Elasticsearch clone/backup indices. |
61-
| `ELASTICSEARCH_QUERY_EXACT_MATCH_EXCLUSIVE` | `false` | boolean | [`GMS`] | Only return exact matches when using quotes. |
62-
| `ELASTICSEARCH_QUERY_EXACT_MATCH_WITH_PREFIX` | `true` | boolean | [`GMS`] | Include prefix match in exact match results. |
63-
| `ELASTICSEARCH_QUERY_EXACT_MATCH_FACTOR` | 10.0 | float | [`GMS`] | Multiply by this number on true exact match. |
64-
| `ELASTICSEARCH_QUERY_EXACT_MATCH_PREFIX_FACTOR` | 1.6 | float | [`GMS`] | Multiply by this number when prefix match. |
65-
| `ELASTICSEARCH_QUERY_EXACT_MATCH_CASE_FACTOR` | 0.7 | float | [`GMS`] | Multiply by this number when case insensitive match. |
66-
| `ELASTICSEARCH_QUERY_EXACT_MATCH_ENABLE_STRUCTURED` | `true` | boolean | [`GMS`] | When using structured query, also include exact matches. |
67-
| `ELASTICSEARCH_QUERY_PARTIAL_URN_FACTOR` | 0.5 | float | [`GMS`] | Multiply by this number when partial token match on URN) |
68-
| `ELASTICSEARCH_QUERY_PARTIAL_FACTOR` | 0.4 | float | [`GMS`] | Multiply by this number when partial token match on non-URN field. |
69-
| `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED` | `true` | boolean | [`GMS`] | Enable search query and ranking customization configuration. |
70-
| `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE` | `search_config.yml` | string | [`GMS`] | The location of the search customization configuration. |
71-
| `ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX` | `false` | boolean | [`System Update`] | Enable reindexing on Elasticsearch schema changes. |
72-
| `ENABLE_STRUCTURED_PROPERTIES_SYSTEM_UPDATE` | `false` | boolean | [`System Update`] | Enable reindexing to remove hard deleted structured properties. |
54+
| Variable | Default | Unit/Type | Components | Description |
55+
| --------------------------------------------------- | ------------------- | --------- | --------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------ |
56+
| `INDEX_PREFIX` | `` | string | [`GMS`, `MAE Consumer`, `Elasticsearch Setup`, `System Update`] | Prefix Elasticsearch indices with the given string. |
57+
| `ELASTICSEARCH_NUM_SHARDS_PER_INDEX` | 1 | integer | [`System Update`] | Default number of shards per Elasticsearch index. |
58+
| `ELASTICSEARCH_NUM_REPLICAS_PER_INDEX` | 1 | integer | [`System Update`] | Default number of replica per Elasticsearch index. |
59+
| `ELASTICSEARCH_BUILD_INDICES_RETENTION_VALUE` | 60 | integer | [`System Update`] | Number of units for the retention of Elasticsearch clone/backup indices. |
60+
| `ELASTICSEARCH_BUILD_INDICES_RETENTION_UNIT` | DAYS | string | [`System Update`] | Unit for the retention of Elasticsearch clone/backup indices. |
61+
| `ELASTICSEARCH_QUERY_EXACT_MATCH_EXCLUSIVE` | `false` | boolean | [`GMS`] | Only return exact matches when using quotes. |
62+
| `ELASTICSEARCH_QUERY_EXACT_MATCH_WITH_PREFIX` | `true` | boolean | [`GMS`] | Include prefix match in exact match results. |
63+
| `ELASTICSEARCH_QUERY_EXACT_MATCH_FACTOR` | 10.0 | float | [`GMS`] | Multiply by this number on true exact match. |
64+
| `ELASTICSEARCH_QUERY_EXACT_MATCH_PREFIX_FACTOR` | 1.6 | float | [`GMS`] | Multiply by this number when prefix match. |
65+
| `ELASTICSEARCH_QUERY_EXACT_MATCH_CASE_FACTOR` | 0.7 | float | [`GMS`] | Multiply by this number when case insensitive match. |
66+
| `ELASTICSEARCH_QUERY_EXACT_MATCH_ENABLE_STRUCTURED` | `true` | boolean | [`GMS`] | When using structured query, also include exact matches. |
67+
| `ELASTICSEARCH_QUERY_PARTIAL_URN_FACTOR`            | 0.5                 | float     | [`GMS`]                                                         | Multiply by this number when partial token match on URN.                                                                 |
68+
| `ELASTICSEARCH_QUERY_PARTIAL_FACTOR` | 0.4 | float | [`GMS`] | Multiply by this number when partial token match on non-URN field. |
69+
| `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED` | `true` | boolean | [`GMS`] | Enable search query and ranking customization configuration. |
70+
| `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE` | `search_config.yml` | string | [`GMS`] | The location of the search customization configuration. |
71+
| `ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX` | `false` | boolean | [`System Update`] | Enable reindexing on Elasticsearch schema changes. |
72+
| `ENABLE_STRUCTURED_PROPERTIES_SYSTEM_UPDATE` | `false` | boolean | [`System Update`] | Enable reindexing to remove hard deleted structured properties. |
73+
| `ELASTICSEARCH_LIMIT_RESULTS_MAX`                   | 5000                | integer   | [`GMS`]                                                         | Maximum search results per page.                                                                                         |
74+
| `ELASTICSEARCH_LIMIT_RESULTS_STRICT`                | `false`             | boolean   | [`GMS`]                                                         | If `false`, reduce the page size to the maximum rather than throw an exception if the request exceeds the maximum value. |
7375

7476
## Entities and Versions
7577

docs/how/updating-datahub.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
77
### Breaking Changes
88
9+
- Maximum search results per page (new: 5000, old: 10000) can be configured with environment variable `ELASTICSEARCH_LIMIT_RESULTS_MAX`
10+
- Maximum lineage visualization hops (new: 20, old: 1000) can be configured with environment variable `ELASTICSEARCH_SEARCH_GRAPH_LINEAGE_MAX_HOPS`
11+
912
### Known Issues
1013
1114
### Potential Downtime

docs/modeling/metadata-model.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ curl -X POST 'http://localhost:8080/aspects?action=getTimeseriesAspectValues' \
187187
188188
{
189189
"value":{
190-
"limit":10000,
190+
"limit":2000,
191191
"aspectName":"datasetProfile",
192192
"endTimeMillis":1627455600000,
193193
"startTimeMillis":1625122800000,

metadata-ingestion/src/datahub/cli/delete_cli.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@ def references(urn: str, dry_run: bool, force: bool) -> None:
231231
default=3000,
232232
type=int,
233233
help="Batch size when querying for entities to un-soft delete."
234-
"Maximum 10000. Large batch sizes may cause timeouts.",
234+
"Maximum 5000. Large batch sizes may cause timeouts.",
235235
)
236236
def undo_by_filter(
237237
urn: Optional[str], platform: Optional[str], batch_size: int
@@ -336,7 +336,7 @@ def undo_by_filter(
336336
default=3000,
337337
type=int,
338338
help="Batch size when querying for entities to delete."
339-
"Maximum 10000. Large batch sizes may cause timeouts.",
339+
"Maximum 5000. Large batch sizes may cause timeouts.",
340340
)
341341
@click.option(
342342
"-n",
@@ -654,8 +654,8 @@ def _validate_user_aspect_flags(
654654
def _validate_batch_size(batch_size: int) -> None:
655655
if batch_size <= 0:
656656
raise click.UsageError("Batch size must be a positive integer.")
657-
elif batch_size > 10000:
658-
raise click.UsageError("Batch size cannot exceed 10,000.")
657+
elif batch_size > 5000:
658+
raise click.UsageError("Batch size cannot exceed 5,000.")
659659

660660

661661
def _delete_one_urn(

metadata-ingestion/src/datahub/ingestion/graph/client.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -806,7 +806,7 @@ def get_container_urns_by_filter(
806806
"input": search_query,
807807
"entity": "container",
808808
"start": 0,
809-
"count": 10000,
809+
"count": 5000,
810810
"filter": {"or": container_filters},
811811
}
812812
results: Dict = self._post_generic(url, search_body)
@@ -901,7 +901,7 @@ def get_urns_by_filter(
901901
query: Optional[str] = None,
902902
container: Optional[str] = None,
903903
status: Optional[RemovedStatusFilter] = RemovedStatusFilter.NOT_SOFT_DELETED,
904-
batch_size: int = 10000,
904+
batch_size: int = 5000,
905905
extraFilters: Optional[List[RawSearchFilterRule]] = None,
906906
extra_or_filters: Optional[RawSearchFilter] = None,
907907
) -> Iterable[str]:
@@ -993,7 +993,7 @@ def get_results_by_filter(
993993
query: Optional[str] = None,
994994
container: Optional[str] = None,
995995
status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
996-
batch_size: int = 10000,
996+
batch_size: int = 5000,
997997
extra_and_filters: Optional[List[RawSearchFilterRule]] = None,
998998
extra_or_filters: Optional[RawSearchFilter] = None,
999999
extra_source_fields: Optional[List[str]] = None,

metadata-io/build.gradle

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,10 @@ dependencies {
7272

7373
testImplementation project(':test-models')
7474
testImplementation project(path: ':test-models', configuration: 'testDataTemplate')
75-
testImplementation project(':datahub-graphql-core')
75+
testFixturesApi project(':datahub-graphql-core')
7676
testImplementation project(path: ':metadata-integration:java:datahub-client', configuration: 'shadow')
7777
testImplementation project(':metadata-service:auth-impl')
78+
testFixturesApi project(':metadata-service:configuration')
7879
testImplementation project(':li-utils')
7980
testImplementation externalDependency.testng
8081
testImplementation externalDependency.h2
@@ -96,7 +97,7 @@ dependencies {
9697
testImplementation externalDependency.testContainersOpenSearch
9798
testImplementation externalDependency.testContainersCassandra
9899
testImplementation externalDependency.lombok
99-
testImplementation externalDependency.springBootTest
100+
testFixturesApi externalDependency.springBootTest
100101
testImplementation spec.product.pegasus.restliServer
101102
testImplementation externalDependency.ebeanTest
102103
testImplementation externalDependency.opentelemetrySdk

0 commit comments

Comments
 (0)