1
1
import collections
2
2
import logging
3
+ import re
4
+ from dataclasses import dataclass , replace
5
+ from functools import cached_property
6
+ from typing import ClassVar
7
+ from packaging .version import Version , InvalidVersion
8
+
3
9
4
10
from databricks .labs .blueprint .installation import Installation
11
+ from databricks .labs .blueprint .tui import Prompts
5
12
from databricks .sdk import WorkspaceClient
6
13
from databricks .sdk .errors import AlreadyExists , NotFound , BadRequest
7
14
from databricks .sdk .service .catalog import (
14
21
)
15
22
16
23
from databricks .labs .ucx .account .workspaces import WorkspaceInfo
24
+ from databricks .labs .ucx .assessment .secrets import SecretsMixin
17
25
from databricks .labs .ucx .config import WorkspaceConfig
18
26
from databricks .labs .ucx .hive_metastore import ExternalLocations
19
27
20
28
21
29
logger = logging .getLogger (__name__ )
22
30
23
31
32
+ @dataclass
33
+ class ExternalHmsInfo :
34
+ """
35
+ This is a dataclass that represents the external Hive Metastore connection information.
36
+ It supports non glue external metastores.
37
+ """
38
+
39
+ database_type : str
40
+ host : str
41
+ port : str
42
+ database : str
43
+ user : str | None
44
+ password : str | None
45
+ version : str | None
46
+
47
+ def as_dict (self ) -> dict [str , str ]:
48
+ return {
49
+ "database" : self .database ,
50
+ "db_type" : self .database_type ,
51
+ "host" : self .host ,
52
+ "port" : self .port ,
53
+ }
54
+
55
+
24
56
class HiveMetastoreFederationEnabler :
25
57
def __init__ (self , installation : Installation ):
26
58
self ._installation = installation
@@ -31,61 +63,174 @@ def enable(self):
31
63
self ._installation .save (config )
32
64
33
65
34
- class HiveMetastoreFederation :
66
+ class HiveMetastoreFederation ( SecretsMixin ) :
35
67
def __init__(
    self,
    ws: WorkspaceClient,
    external_locations: ExternalLocations,
    workspace_info: WorkspaceInfo,
    config: WorkspaceConfig,
    *,
    enable_hms_federation: bool = False,
):
    """Store the collaborators used to set up Hive Metastore federation.

    Args:
        ws: workspace client used for the connection, catalog, external location and grant calls.
        external_locations: source of the external locations turned into authorized paths.
        workspace_info: provides the current workspace name, used as the default connection name.
        config: workspace configuration; its `spark_conf` is inspected for an external Hive Metastore.
        enable_hms_federation: keyword-only switch; `create_from_cli` refuses to run while it is False.
    """
    self._config = config
    self._enable_hms_federation = enable_hms_federation
    self._workspace_info = workspace_info
    self._external_locations = external_locations
    self._ws = ws
81
+
82
# Supported databases and versions for HMS Federation.
# Keys are the database type taken from the JDBC URL (e.g. "mysql"); values are the
# accepted Hive Metastore versions in "major.minor" form, as compared in `_external_hms`.
supported_database_versions: ClassVar[dict[str, list[str]]] = {
    "mysql": ["2.3", "0.13"],
}
46
86
47
- def register_internal_hms_as_federated_catalog (self ) -> CatalogInfo :
87
def create_from_cli(self, prompts: Prompts) -> None:
    """Interactively create the Hive Metastore connection and register its federated catalog.

    The user is asked for the connection/catalog name (defaulting to the current workspace
    name). When a supported external Hive Metastore was identified and the user confirms,
    an external connection is created; otherwise the internal one is used.

    Raises:
        RuntimeWarning: if HMS federation has not been enabled.
    """
    if not self._enable_hms_federation:
        raise RuntimeWarning('Run `databricks labs ucx enable-hms-federation` to enable HMS Federation')

    default_name = self._workspace_info.current()
    name = prompts.question('Enter the name of the Hive Metastore connection and catalog', default=default_name)

    # Only ask about the external metastore when one was actually detected.
    external_hms = self._external_hms
    if external_hms and prompts.confirm(
        f'A supported external Hive Metastore connection was identified: {external_hms.database_type}. '
        f'Use this connection?'
    ):
        connection_info = self._get_or_create_ext_connection(name, external_hms)
    else:
        connection_info = self._get_or_create_int_connection(name)

    assert connection_info.name is not None
    self._register_federated_catalog(connection_info)
105
+
106
@cached_property
def _external_hms(self) -> ExternalHmsInfo | None:
    """Detect a supported external Hive Metastore from the configured Spark settings.

    Reads the JDBC connection URL and the Hive Metastore version from
    `self._config.spark_conf`, and checks the database type and the major.minor
    version against `supported_database_versions`. User name and password are taken
    from the JDBC URL when present there, otherwise from the corresponding Spark
    configuration keys. The result is computed once per instance (cached property).

    Returns:
        The connection details, or None when no Spark config, JDBC URL or version is
        available, when the version or the JDBC URL cannot be parsed, or when the
        database type / version is not supported. The reason is logged at info level.
    """
    spark_config = self._config.spark_conf
    if not spark_config:
        logger.info('Spark config not found')
        return None
    jdbc_url = self._get_value_from_config_key(spark_config, 'spark.hadoop.javax.jdo.option.ConnectionURL')
    if not jdbc_url:
        logger.info('JDBC URL not found')
        return None
    version_value = self._get_value_from_config_key(spark_config, 'spark.sql.hive.metastore.version')
    if not version_value:
        logger.info('Hive Metastore version not found')
        return None
    try:
        version = Version(version_value)
    except InvalidVersion:
        logger.info('Hive Metastore version is not valid')
        return None
    try:
        connection_details = self._split_jdbc_url(jdbc_url)
    except ValueError:
        # A URL that cannot be parsed is just another "not supported" case: fall back to
        # None instead of crashing the callers. The URL itself is deliberately not logged
        # because it may embed credentials.
        logger.info('Hive Metastore JDBC URL is not supported')
        return None
    major_minor_version = f"{version.major}.{version.minor}"
    external_hms = replace(connection_details, version=major_minor_version)
    supported_versions = self.supported_database_versions.get(external_hms.database_type)
    if not supported_versions:
        logger.info(f'Unsupported Hive Metastore: {external_hms.database_type}')
        return None
    if major_minor_version not in supported_versions:
        logger.info(f'Unsupported Hive Metastore Version: {external_hms.database_type} - {version}')
        return None

    # Credentials missing from the JDBC URL are looked up in the Spark configuration.
    if not external_hms.user:
        external_hms = replace(
            external_hms,
            user=self._get_value_from_config_key(spark_config, 'spark.hadoop.javax.jdo.option.ConnectionUserName'),
        )
    if not external_hms.password:
        external_hms = replace(
            external_hms,
            password=self._get_value_from_config_key(
                spark_config, 'spark.hadoop.javax.jdo.option.ConnectionPassword'
            ),
        )
    return external_hms
148
+
149
@classmethod
def _split_jdbc_url(cls, jdbc_url: str) -> ExternalHmsInfo:
    """Split a `jdbc:<type>://<host>:<port>/<database>[?<query>]` URL into its components.

    The `user` and `password` query parameters are recognised wherever they appear in
    the query string (any order, with or without other parameters around them). Their
    values are taken verbatim, without percent-decoding; when a parameter is repeated
    the first non-empty value wins. All other query parameters are ignored.

    Returns:
        An ExternalHmsInfo whose `user` / `password` are None when absent from the URL
        and whose `version` is always None.

    Raises:
        ValueError: if the URL does not have the shape described above.
    """
    # Capture the whole query string and pick the credentials out of it afterwards,
    # instead of hard-coding a "user first, password second" layout in the pattern.
    pattern = re.compile(
        r'jdbc:(?P<db_type>[a-zA-Z0-9]+)://(?P<host>[^:/]+):(?P<port>\d+)/(?P<database>[^?]+)(?:\?(?P<query>.*))?'
    )
    match = pattern.match(jdbc_url)
    if not match:
        # NOTE(review): the message echoes the full URL, which may embed credentials.
        raise ValueError(f'Unsupported JDBC URL: {jdbc_url}')

    credentials: dict[str, str] = {}
    for parameter in (match.group('query') or '').split('&'):
        key, _, value = parameter.partition('=')
        if key in ('user', 'password') and value:
            credentials.setdefault(key, value)

    return ExternalHmsInfo(
        match.group('db_type'),
        match.group('host'),
        match.group('port'),
        match.group('database'),
        credentials.get('user'),
        credentials.get('password'),
        None,
    )
167
+
168
def _register_federated_catalog(
    self,
    connection_info,
) -> CatalogInfo:
    """Create the catalog for `connection_info`, or return the one that already exists.

    The catalog is named after the connection and is created with the authorized paths
    computed by `_get_authorized_paths`.

    Raises:
        BadRequest: re-raised when the failure is not `CATALOG_ALREADY_EXISTS`, or when
            the catalog is reported as existing but is not found in the catalog listing.
    """
    catalog_name = connection_info.name
    try:
        return self._ws.catalogs.create(
            name=catalog_name,
            connection_name=catalog_name,
            options={"authorized_paths": self._get_authorized_paths()},
        )
    except BadRequest as err:
        if err.error_code != 'CATALOG_ALREADY_EXISTS':
            raise err
        logger.info(f'Catalog {connection_info.name} already exists')
        # Look the existing catalog up by name; stops at the first match.
        existing_catalog = next(
            (candidate for candidate in self._ws.catalogs.list() if candidate.name == connection_info.name),
            None,
        )
        if existing_catalog is not None:
            return existing_catalog
        raise err
66
185
67
- def _get_or_create_connection (self , name : str ) -> ConnectionInfo :
186
def _get_or_create_int_connection(self, name: str) -> ConnectionInfo:
    """Create the connection to the internal (builtin) Hive Metastore, reusing an existing one.

    When a connection called `name` already exists, it is looked up and returned instead.
    """
    builtin_options = {"builtin": "true"}
    try:
        connection = self._ws.connections.create(
            name=name,
            connection_type=ConnectionType.HIVE_METASTORE,  # needs SDK change
            options=builtin_options,
        )
    except AlreadyExists:
        connection = self._get_existing_connection(name)
    return connection
195
+
196
def _get_existing_connection(self, name: str) -> ConnectionInfo:
    """Return the first listed connection called `name`.

    Raises:
        NotFound: if no listed connection has that name.
    """
    listed = self._ws.connections.list()
    existing = next((candidate for candidate in listed if candidate.name == name), None)
    if existing is None:
        raise NotFound(f'Connection {name} not found')
    return existing
79
201
202
def _get_or_create_ext_connection(self, name: str, external_hms: ExternalHmsInfo) -> ConnectionInfo:
    """Create the connection to an external Hive Metastore, reusing an existing one.

    The connection options are the mandatory fields from `external_hms.as_dict()` plus
    `user`, `password` and `version` for whichever of those are set. When a connection
    called `name` already exists, it is looked up and returned instead.
    """
    optional_fields = (
        ("user", external_hms.user),
        ("password", external_hms.password),
        ("version", external_hms.version),
    )
    options = external_hms.as_dict()
    # Unset (None or empty) optional fields are left out of the options entirely.
    options.update({key: value for key, value in optional_fields if value})
    try:
        connection = self._ws.connections.create(
            name=name,
            connection_type=ConnectionType.HIVE_METASTORE,  # needs SDK change
            options=options,
        )
    except AlreadyExists:
        connection = self._get_existing_connection(name)
    return connection
218
+
80
219
def _get_authorized_paths (self ) -> str :
81
220
existing = {}
82
- for external_location in self ._workspace_client .external_locations .list ():
221
+ for external_location in self ._ws .external_locations .list ():
83
222
existing [external_location .url ] = external_location
84
223
authorized_paths = []
85
- current_user = self ._workspace_client .current_user .me ()
224
+ current_user = self ._ws .current_user .me ()
86
225
if not current_user .user_name :
87
226
raise NotFound ('Current user not found' )
88
- for external_location_info in self ._external_locations .external_locations_with_root ():
227
+ # Get the external locations. If not using external HMS, include the root DBFS location.
228
+ if self ._external_hms is not None :
229
+ external_locations = self ._external_locations .external_locations_with_root ()
230
+ else :
231
+ external_locations = self ._external_locations .snapshot ()
232
+
233
+ for external_location_info in external_locations :
89
234
location = ExternalLocations .clean_location (external_location_info .location )
90
235
existing_location = existing .get (location )
91
236
if not existing_location :
@@ -103,11 +248,11 @@ def _add_missing_permissions_if_needed(self, location_name: str, current_user: s
103
248
grants = self ._location_grants (location_name )
104
249
if Privilege .CREATE_FOREIGN_SECURABLE not in grants [current_user ]:
105
250
change = PermissionsChange (principal = current_user , add = [Privilege .CREATE_FOREIGN_SECURABLE ])
106
- self ._workspace_client .grants .update (SecurableType .EXTERNAL_LOCATION , location_name , changes = [change ])
251
+ self ._ws .grants .update (SecurableType .EXTERNAL_LOCATION , location_name , changes = [change ])
107
252
108
253
def _location_grants (self , location_name : str ) -> dict [str , set [Privilege ]]:
109
254
grants : dict [str , set [Privilege ]] = collections .defaultdict (set )
110
- result = self ._workspace_client .grants .get (SecurableType .EXTERNAL_LOCATION , location_name )
255
+ result = self ._ws .grants .get (SecurableType .EXTERNAL_LOCATION , location_name )
111
256
if not result .privilege_assignments :
112
257
return grants
113
258
for assignment in result .privilege_assignments :
0 commit comments