4
4
import logging
5
5
import re
6
6
from dataclasses import dataclass
7
+ from functools import partial
7
8
9
+ from databricks .labs .blueprint .parallel import Threads
8
10
from databricks .sdk import WorkspaceClient
9
- from databricks .sdk .errors import BadRequest , NotFound
11
+ from databricks .sdk .errors import BadRequest , NotFound , ResourceConflict
10
12
from databricks .sdk .service .workspace import ImportFormat
11
13
12
14
from databricks .labs .ucx .account import WorkspaceInfo
13
- from databricks .labs .ucx .framework .crawlers import StatementExecutionBackend
15
+ from databricks .labs .ucx .framework .crawlers import SqlBackend
14
16
from databricks .labs .ucx .hive_metastore import TablesCrawler
15
17
from databricks .labs .ucx .hive_metastore .tables import Table
16
18
@@ -46,14 +48,21 @@ def as_hms_table_key(self):
46
48
return f"hive_metastore.{ self .src_schema } .{ self .src_table } "
47
49
48
50
51
@dataclass
class TableToMigrate:
    """Pairs a crawled Hive metastore table with the mapping rule that targets it."""

    # Source table as discovered by the assessment crawler (TablesCrawler.snapshot()).
    src: Table
    # Mapping rule naming the intended Unity Catalog destination for this table.
    rule: Rule
55
+
56
+
49
57
class TableMapping:
    """Saves/loads the HMS-to-UC table mapping file and marks objects to be skipped."""

    # Table/database property used to flag objects the migration must skip.
    UCX_SKIP_PROPERTY = "databricks.labs.ucx.skip"
51
59
52
- def __init__ (self , ws : WorkspaceClient , folder : str | None = None ):
60
+ def __init__ (self , ws : WorkspaceClient , backend : SqlBackend , folder : str | None = None ):
53
61
if not folder :
54
62
folder = f"/Users/{ ws .current_user .me ().user_name } /.ucx"
55
63
self ._ws = ws
56
64
self ._folder = folder
65
+ self ._backend = backend
57
66
self ._field_names = [_ .name for _ in dataclasses .fields (Rule )]
58
67
59
68
def current_tables (self , tables : TablesCrawler , workspace_name : str , catalog_name : str ):
@@ -75,11 +84,6 @@ def save(self, tables: TablesCrawler, workspace_info: WorkspaceInfo) -> str:
75
84
buffer .seek (0 )
76
85
return self ._overwrite_mapping (buffer )
77
86
78
- def _overwrite_mapping (self , buffer ) -> str :
79
- path = f"{ self ._folder } /mapping.csv"
80
- self ._ws .workspace .upload (path , buffer , overwrite = True , format = ImportFormat .AUTO )
81
- return path
82
-
83
87
def load (self ) -> list [Rule ]:
84
88
try :
85
89
rules = []
@@ -91,10 +95,12 @@ def load(self) -> list[Rule]:
91
95
msg = "Please run: databricks labs ucx table-mapping"
92
96
raise ValueError (msg ) from None
93
97
94
- def skip_table (self , backend : StatementExecutionBackend , schema : str , table : str ):
98
+ def skip_table (self , schema : str , table : str ):
95
99
# Marks a table to be skipped in the migration process by applying a table property
96
100
try :
97
- backend .execute (f"ALTER TABLE `{ schema } `.`{ table } ` SET TBLPROPERTIES('{ self .UCX_SKIP_PROPERTY } ' = true)" )
101
+ self ._backend .execute (
102
+ f"ALTER TABLE `{ schema } `.`{ table } ` SET TBLPROPERTIES('{ self .UCX_SKIP_PROPERTY } ' = true)"
103
+ )
98
104
except NotFound as nf :
99
105
if "[TABLE_OR_VIEW_NOT_FOUND]" in str (nf ):
100
106
logger .error (f"Failed to apply skip marker for Table { schema } .{ table } . Table not found." )
@@ -103,14 +109,96 @@ def skip_table(self, backend: StatementExecutionBackend, schema: str, table: str
103
109
except BadRequest as br :
104
110
logger .error (br )
105
111
106
- def skip_schema (self , backend : StatementExecutionBackend , schema : str ):
112
+ def skip_schema (self , schema : str ):
107
113
# Marks a schema to be skipped in the migration process by applying a table property
108
114
try :
109
- backend .execute (f"ALTER SCHEMA `{ schema } ` SET DBPROPERTIES('{ self .UCX_SKIP_PROPERTY } ' = true)" )
115
+ self . _backend .execute (f"ALTER SCHEMA `{ schema } ` SET DBPROPERTIES('{ self .UCX_SKIP_PROPERTY } ' = true)" )
110
116
except NotFound as nf :
111
117
if "[SCHEMA_NOT_FOUND]" in str (nf ):
112
118
logger .error (f"Failed to apply skip marker for Schema { schema } . Schema not found." )
113
119
else :
114
120
logger .error (nf )
115
121
except BadRequest as br :
116
122
logger .error (br )
123
+
124
+ def get_tables_to_migrate (self , tables_crawler : TablesCrawler ):
125
+ rules = self .load ()
126
+ # Getting all the source tables from the rules
127
+ databases_in_scope = self ._get_databases_in_scope ({rule .src_schema for rule in rules })
128
+ crawled_tables_keys = {crawled_table .key : crawled_table for crawled_table in tables_crawler .snapshot ()}
129
+ tasks = []
130
+ for rule in rules :
131
+ if rule .as_hms_table_key not in crawled_tables_keys :
132
+ logger .info (f"Table { rule .as_hms_table_key } in the mapping doesn't show up in assessment" )
133
+ continue
134
+ if rule .src_schema not in databases_in_scope :
135
+ logger .info (f"Table { rule .as_hms_table_key } is in a database that was marked to be skipped" )
136
+ continue
137
+ tasks .append (
138
+ partial (self ._get_table_in_scope_task , TableToMigrate (crawled_tables_keys [rule .as_hms_table_key ], rule ))
139
+ )
140
+
141
+ return Threads .strict ("checking all database properties" , tasks )
142
+
143
+ def _overwrite_mapping (self , buffer ) -> str :
144
+ path = f"{ self ._folder } /mapping.csv"
145
+ self ._ws .workspace .upload (path , buffer , overwrite = True , format = ImportFormat .AUTO )
146
+ return path
147
+
148
+ def _get_databases_in_scope (self , databases : set [str ]):
149
+ tasks = []
150
+ for database in databases :
151
+ tasks .append (partial (self ._get_database_in_scope_task , database ))
152
+ return Threads .strict ("checking databases for skip property" , tasks )
153
+
154
+ def _get_database_in_scope_task (self , database : str ) -> str | None :
155
+ describe = {}
156
+ for value in self ._backend .fetch (f"DESCRIBE SCHEMA EXTENDED { database } " ):
157
+ describe [value ["database_description_item" ]] = value ["database_description_value" ]
158
+ if self .UCX_SKIP_PROPERTY in TablesCrawler .parse_database_props (describe .get ("Properties" , "" ).lower ()):
159
+ logger .info (f"Database { database } is marked to be skipped" )
160
+ return None
161
+ return database
162
+
163
    def _get_table_in_scope_task(self, table_to_migrate: TableToMigrate) -> TableToMigrate | None:
        """Decide whether a mapped table still needs migration.

        Returns the pair unchanged when the table should be migrated, or None
        when the intended UC target already exists, the table carries the skip
        property, or it was already upgraded to an existing UC table.
        """
        table = table_to_migrate.src
        rule = table_to_migrate.rule

        if self._exists_in_uc(table, rule.as_uc_table_key):
            logger.info(f"The intended target for {table.key}, {rule.as_uc_table_key}, already exists.")
            return None
        result = self._backend.fetch(f"SHOW TBLPROPERTIES `{table.database}`.`{table.name}`")
        for value in result:
            if value["key"] == self.UCX_SKIP_PROPERTY:
                logger.info(f"{table.key} is marked to be skipped")
                return None
            if value["key"] == "upgraded_to":
                logger.info(f"{table.key} is set as upgraded to {value['value']}")
                if self._exists_in_uc(table, value["value"]):
                    logger.info(
                        f"The table {table.key} was previously upgraded to {value['value']}. "
                        f"To revert the table and allow it to be upgraded again use the CLI command:"
                        f"databricks labs ucx revert --schema {table.database} --table {table.name}"
                    )
                    return None
                # The recorded upgrade target no longer exists in UC: clear the
                # stale marker so this table can be migrated again.
                logger.info(f"The upgrade_to target for {table.key} is missing. Unsetting the upgrade_to property")
                self._backend.execute(table.sql_unset_upgraded_to())

        return table_to_migrate
188
+
189
+ def _exists_in_uc (self , src_table : Table , target_key : str ):
190
+ # Attempts to get the target table info from UC returns True if it exists.
191
+ try :
192
+ table_info = self ._ws .tables .get (target_key )
193
+ if not table_info .properties :
194
+ return True
195
+ upgraded_from = table_info .properties .get ("upgraded_from" )
196
+ if upgraded_from and upgraded_from != src_table .key :
197
+ msg = f"Expected to be migrated from { src_table .key } , but got { upgraded_from } . "
198
+ "You can skip this error using the CLI command: "
199
+ "databricks labs ucx skip "
200
+ f"--schema { src_table .database } --table { src_table .name } "
201
+ raise ResourceConflict (msg )
202
+ return True
203
+ except NotFound :
204
+ return False
0 commit comments