Skip to content

Commit bb3768a

Browse files
authored
Improve creating UC catalogs (#2898)
## Changes Ran into a couple improvements when manually testing #2744: - We request the catalog location also when the catalog already exists. Solved by checking if a catalog exists before requesting the storage location - Multiple loops over the storage locations are not supported as the iterator is empty after first loop. Solved by emptying the external locations in a list. - More consistent: - Logging - Matching storage locations ### Linked issues Resolves #2879 ### Functionality - [x] modified existing command: `databricks labs ucx create-ucx-catalog/create-catalogs-schemas` ### Tests - [x] manually tested - [x] added unit tests
1 parent 3983fbe commit bb3768a

File tree

3 files changed

+72
-38
lines changed

3 files changed

+72
-38
lines changed

src/databricks/labs/ucx/hive_metastore/catalog_schema.py

Lines changed: 22 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import collections
22
import logging
33
from dataclasses import replace
4-
import fnmatch
54
from pathlib import PurePath
65

76
from databricks.labs.blueprint.tui import Prompts
@@ -30,7 +29,7 @@ def __init__(
3029
):
3130
self._ws = ws
3231
self._table_mapping = table_mapping
33-
self._external_locations = self._ws.external_locations.list()
32+
self._external_locations = list(self._ws.external_locations.list())
3433
self._principal_grants = principal_grants
3534
self._backend = sql_backend
3635
self._hive_grants_crawler = grants_crawler
@@ -45,32 +44,19 @@ def create_ucx_catalog(self, prompts: Prompts, *, properties: dict[str, str] | N
4544
properties : (dict[str, str] | None), default None
4645
The properties to pass to the catalog. If None, no properties are passed.
4746
"""
48-
try:
49-
self._create_catalog_validate(self._ucx_catalog, prompts, properties=properties)
50-
except BadRequest as e:
51-
if "already exists" in str(e):
52-
logger.warning(f"Catalog '{self._ucx_catalog}' already exists. Skipping.")
53-
return
54-
raise
47+
self._create_catalog_validate(self._ucx_catalog, prompts, properties=properties)
5548

5649
def create_all_catalogs_schemas(self, prompts: Prompts) -> None:
5750
candidate_catalogs, candidate_schemas = self._get_missing_catalogs_schemas()
5851
for candidate_catalog in candidate_catalogs:
59-
try:
60-
self._create_catalog_validate(candidate_catalog, prompts, properties=None)
61-
except BadRequest as e:
62-
if "already exists" in str(e):
63-
logger.warning(f"Catalog '{candidate_catalog}' already exists. Skipping.")
64-
continue
52+
self._create_catalog_validate(candidate_catalog, prompts, properties=None)
6553
for candidate_catalog, schemas in candidate_schemas.items():
6654
for candidate_schema in schemas:
6755
try:
6856
self._create_schema(candidate_catalog, candidate_schema)
6957
except BadRequest as e:
7058
if "already exists" in str(e):
71-
logger.warning(
72-
f"Schema '{candidate_schema}' in catalog '{candidate_catalog}' already exists. Skipping."
73-
)
59+
logger.warning(f"Skipping already existing schema: {candidate_catalog}.{candidate_schema}")
7460
continue
7561
self._apply_from_legacy_table_acls()
7662
self._update_principal_acl()
@@ -141,19 +127,28 @@ def _get_database_source_target_mapping(self) -> dict[str, list[SchemaInfo]]:
141127
src_trg_schema_mapping[table_mapping.src_schema].append(schema)
142128
return src_trg_schema_mapping
143129

144-
def _create_catalog_validate(self, catalog: str, prompts: Prompts, *, properties: dict[str, str] | None) -> None:
145-
logger.info(f"Validating UC catalog: {catalog}")
130+
def _create_catalog_validate(
131+
self, catalog_name: str, prompts: Prompts, *, properties: dict[str, str] | None
132+
) -> None:
133+
try:
134+
catalog = self._ws.catalogs.get(catalog_name)
135+
except NotFound:
136+
catalog = None
137+
if catalog:
138+
logger.warning(f"Skipping already existing catalog: {catalog_name}")
139+
return
140+
logger.info(f"Validating UC catalog: {catalog_name}")
146141
attempts = 3
147142
while True:
148143
catalog_storage = prompts.question(
149-
f"Please provide storage location url for catalog: {catalog}", default="metastore"
144+
f"Please provide storage location url for catalog: {catalog_name}", default="metastore"
150145
)
151146
if self._validate_location(catalog_storage):
152147
break
153148
attempts -= 1
154149
if attempts == 0:
155-
raise NotFound(f"Failed to validate location for {catalog} catalog")
156-
self._create_catalog(catalog, catalog_storage, properties=properties)
150+
raise NotFound(f"Failed to validate location for catalog: {catalog_name}")
151+
self._create_catalog(catalog_name, catalog_storage, properties=properties)
157152

158153
def _list_existing(self) -> tuple[set[str], dict[str, set[str]]]:
159154
"""generate a list of existing UC catalogs and schema."""
@@ -203,19 +198,18 @@ def _get_missing_catalogs_schemas(self) -> tuple[set[str], dict[str, set[str]]]:
203198
target_schemas[catalog] = target_schemas[catalog] - schemas
204199
return target_catalogs, target_schemas
205200

206-
def _validate_location(self, location: str):
201+
def _validate_location(self, location: str) -> bool:
207202
if location == "metastore":
208203
return True
209204
try:
210205
PurePath(location)
211206
except ValueError:
212-
logger.error(f"Invalid location path {location}")
207+
logger.error(f"Invalid location path: {location}")
213208
return False
214209
for external_location in self._external_locations:
215-
if location == external_location.url:
216-
return True
217-
if external_location.url is not None and fnmatch.fnmatch(location, external_location.url + '*'):
210+
if external_location.url is not None and location.startswith(external_location.url):
218211
return True
212+
logger.warning(f"No matching external location found for: {location}")
219213
return False
220214

221215
def _create_catalog(self, catalog: str, catalog_storage: str, *, properties: dict[str, str] | None) -> None:

tests/unit/hive_metastore/test_catalog_schema.py

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,23 @@
1717
def prepare_test(ws, backend: MockBackend | None = None) -> CatalogSchema:
1818
ws.catalogs.list.return_value = [CatalogInfo(name="catalog1")]
1919

20+
def get_catalog(catalog_name: str) -> CatalogInfo:
21+
if catalog_name == "catalog1":
22+
return CatalogInfo(name="catalog1")
23+
raise NotFound(f"Catalog: {catalog_name}")
24+
25+
ws.catalogs.get.side_effect = get_catalog
26+
2027
def raise_catalog_exists(catalog: str, *_, **__) -> None:
2128
if catalog == "catalog1":
2229
raise BadRequest("Catalog 'catalog1' already exists")
2330

2431
ws.catalogs.create.side_effect = raise_catalog_exists
2532
ws.schemas.list.return_value = [SchemaInfo(name="schema1")]
26-
ws.external_locations.list.return_value = [ExternalLocationInfo(url="s3://foo/bar")]
33+
ws.external_locations.list.return_value = [
34+
ExternalLocationInfo(url="s3://foo/bar"),
35+
ExternalLocationInfo(url="abfss://container@storageaccount.dfs.core.windows.net"),
36+
]
2737
if backend is None:
2838
backend = MockBackend()
2939
installation = MockInstallation(
@@ -133,8 +143,8 @@ def test_create_ucx_catalog_creates_ucx_catalog() -> None:
133143

134144
def test_create_ucx_catalog_skips_when_ucx_catalogs_exists(caplog) -> None:
135145
ws = create_autospec(WorkspaceClient)
136-
mock_prompts = MockPrompts({"Please provide storage location url for catalog: ucx": "metastore"})
137146
catalog_schema = prepare_test(ws)
147+
ws.catalogs.get.side_effect = lambda catalog_name: CatalogInfo(name=catalog_name)
138148

139149
def raise_catalog_exists(catalog: str, *_, **__) -> None:
140150
if catalog == "ucx":
@@ -143,12 +153,20 @@ def raise_catalog_exists(catalog: str, *_, **__) -> None:
143153
ws.catalogs.create.side_effect = raise_catalog_exists
144154

145155
with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.hive_metastore.catalog_schema"):
146-
catalog_schema.create_ucx_catalog(mock_prompts)
147-
assert "Catalog 'ucx' already exists. Skipping." in caplog.text
156+
catalog_schema.create_ucx_catalog(MockPrompts({}))
157+
assert "Skipping already existing catalog: ucx" in caplog.text
148158

149159

150-
@pytest.mark.parametrize("location", ["s3://foo/bar", "s3://foo/bar/test", "s3://foo/bar/test/baz"])
151-
def test_create_all_catalogs_schemas_creates_catalogs(location: str):
160+
@pytest.mark.parametrize(
161+
"location",
162+
[
163+
"s3://foo/bar",
164+
"s3://foo/bar/test",
165+
"s3://foo/bar/test/baz",
166+
"abfss://container@storageaccount.dfs.core.windows.net",
167+
],
168+
)
169+
def test_create_all_catalogs_schemas_creates_catalogs(location: str) -> None:
152170
"""Catalog 2-4 should be created; catalog 1 already exists."""
153171
ws = create_autospec(WorkspaceClient)
154172
mock_prompts = MockPrompts({"Please provide storage location url for catalog: *": location})
@@ -164,6 +182,28 @@ def test_create_all_catalogs_schemas_creates_catalogs(location: str):
164182
ws.catalogs.create.assert_has_calls(calls, any_order=True)
165183

166184

185+
def test_create_all_catalogs_schemas_creates_catalogs_with_different_locations() -> None:
186+
"""Catalog 2-4 should be created; catalog 1 already exists."""
187+
ws = create_autospec(WorkspaceClient)
188+
mock_prompts = MockPrompts(
189+
{
190+
"Please provide storage location url for catalog: catalog2": "s3://foo/bar",
191+
"Please provide storage location url for catalog: catalog3": "s3://foo/bar/test",
192+
"Please provide storage location url for catalog: catalog4": "s3://foo/bar/test/baz",
193+
}
194+
)
195+
196+
catalog_schema = prepare_test(ws)
197+
catalog_schema.create_all_catalogs_schemas(mock_prompts)
198+
199+
calls = [
200+
call("catalog2", storage_root="s3://foo/bar", comment="Created by UCX", properties=None),
201+
call("catalog3", storage_root="s3://foo/bar/test", comment="Created by UCX", properties=None),
202+
call("catalog4", storage_root="s3://foo/bar/test/baz", comment="Created by UCX", properties=None),
203+
]
204+
ws.catalogs.create.assert_has_calls(calls, any_order=True)
205+
206+
167207
@pytest.mark.parametrize(
168208
"catalog,schema",
169209
[("catalog1", "schema2"), ("catalog1", "schema3"), ("catalog2", "schema2"), ("catalog3", "schema3")],

tests/unit/test_cli.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -804,8 +804,8 @@ def test_create_catalogs_schemas_handles_existing(ws, caplog) -> None:
804804
create_catalogs_schemas(ws, prompts, ctx=WorkspaceContext(ws))
805805
ws.catalogs.list.assert_called_once()
806806

807-
assert "Catalog 'test' already exists. Skipping." in caplog.messages
808-
assert "Schema 'test' in catalog 'test' already exists. Skipping." in caplog.messages
807+
assert "Skipping already existing catalog: test" in caplog.messages
808+
assert "Skipping already existing schema: test.test" in caplog.messages
809809

810810

811811
def test_cluster_remap(ws, caplog):
@@ -887,12 +887,12 @@ def test_assign_metastore_logs_account_id_and_assigns_metastore(caplog, acc_clie
887887
acc_client.metastore_assignments.create.assert_called_once()
888888

889889

890-
def test_create_ucx_catalog_calls_create_catalog(ws) -> None:
890+
def test_create_ucx_catalog_calls_get_catalog(ws) -> None:
891891
prompts = MockPrompts({"Please provide storage location url for catalog: .*": "metastore"})
892892

893893
create_ucx_catalog(ws, prompts, ctx=WorkspaceContext(ws))
894894

895-
ws.catalogs.create.assert_called_once()
895+
ws.catalogs.get.assert_called_once()
896896

897897

898898
def test_create_ucx_catalog_creates_history_schema_and_table(ws, mock_backend) -> None:

0 commit comments

Comments
 (0)