Skip to content

Commit 2992618

Browse files
zhmin
and
root
authored
fix: hive metadata extractor not work on postgresql (#394)
* fix: hive metadata extractor not work on postgresql

  Signed-off-by: zhmin <myzhmin@gmail.com>

* fix: add unit test hive metadata extractor patch

  Signed-off-by: zhmin <myzhmin@gmail.com>

Co-authored-by: root <root@master02.hicoretest.com>
1 parent a230c56 commit 2992618

File tree

2 files changed

+57
-10
lines changed

2 files changed

+57
-10
lines changed

databuilder/extractor/hive_table_metadata_extractor.py

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from pyhocon import ConfigFactory, ConfigTree
88
from typing import Iterator, Union, Dict, Any
9+
from sqlalchemy.engine.url import make_url
910

1011
from databuilder import Scoped
1112
from databuilder.extractor.table_metadata_constants import PARTITION_BADGE
@@ -56,6 +57,34 @@ class HiveTableMetadataExtractor(Extractor):
5657
ORDER by tbl_id, is_partition_col desc;
5758
"""
5859

60+
DEFAULT_POSTGRES_SQL_STATEMENT = """
61+
SELECT source.* FROM
62+
(SELECT t."TBL_ID" as tbl_id, d."NAME" as "schema", t."TBL_NAME" as name, t."TBL_TYPE",
63+
tp."PARAM_VALUE" as description, p."PKEY_NAME" as col_name, p."INTEGER_IDX" as col_sort_order,
64+
p."PKEY_TYPE" as col_type, p."PKEY_COMMENT" as col_description, 1 as "is_partition_col",
65+
CASE WHEN t."TBL_TYPE" = 'VIRTUAL_VIEW' THEN 1
66+
ELSE 0 END as "is_view"
67+
FROM "TBLS" t
68+
JOIN "DBS" d ON t."DB_ID" = d."DB_ID"
69+
JOIN "PARTITION_KEYS" p ON t."TBL_ID" = p."TBL_ID"
70+
LEFT JOIN "TABLE_PARAMS" tp ON (t."TBL_ID" = tp."TBL_ID" AND tp."PARAM_KEY"='comment')
71+
{where_clause_suffix}
72+
UNION
73+
SELECT t."TBL_ID" as tbl_id, d."NAME" as "schema", t."TBL_NAME" as name, t."TBL_TYPE",
74+
tp."PARAM_VALUE" as description, c."COLUMN_NAME" as col_name, c."INTEGER_IDX" as col_sort_order,
75+
c."TYPE_NAME" as col_type, c."COMMENT" as col_description, 0 as "is_partition_col",
76+
CASE WHEN t."TBL_TYPE" = 'VIRTUAL_VIEW' THEN 1
77+
ELSE 0 END as "is_view"
78+
FROM "TBLS" t
79+
JOIN "DBS" d ON t."DB_ID" = d."DB_ID"
80+
JOIN "SDS" s ON t."SD_ID" = s."SD_ID"
81+
JOIN "COLUMNS_V2" c ON s."CD_ID" = c."CD_ID"
82+
LEFT JOIN "TABLE_PARAMS" tp ON (t."TBL_ID" = tp."TBL_ID" AND tp."PARAM_KEY"='comment')
83+
{where_clause_suffix}
84+
) source
85+
ORDER by tbl_id, is_partition_col desc;
86+
"""
87+
5988
# CONFIG KEYS
6089
WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
6190
CLUSTER_KEY = 'cluster'
@@ -67,20 +96,28 @@ def init(self, conf: ConfigTree) -> None:
6796
conf = conf.with_fallback(HiveTableMetadataExtractor.DEFAULT_CONFIG)
6897
self._cluster = '{}'.format(conf.get_string(HiveTableMetadataExtractor.CLUSTER_KEY))
6998

70-
default_sql = HiveTableMetadataExtractor.DEFAULT_SQL_STATEMENT.format(
99+
self._alchemy_extractor = SQLAlchemyExtractor()
100+
101+
sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())
102+
default_sql = self._choose_default_sql_stm(sql_alch_conf).format(
71103
where_clause_suffix=conf.get_string(HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY))
72104

73105
self.sql_stmt = conf.get_string(HiveTableMetadataExtractor.EXTRACT_SQL, default=default_sql)
74106

75107
LOGGER.info('SQL for hive metastore: {}'.format(self.sql_stmt))
76108

77-
self._alchemy_extractor = SQLAlchemyExtractor()
78-
sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\
79-
.with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))
80-
109+
sql_alch_conf = sql_alch_conf.with_fallback(ConfigFactory.from_dict(
110+
{SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))
81111
self._alchemy_extractor.init(sql_alch_conf)
82112
self._extract_iter: Union[None, Iterator] = None
83113

114+
def _choose_default_sql_stm(self, conf: ConfigTree) -> str:
115+
url = make_url(conf.get_string(SQLAlchemyExtractor.CONN_STRING))
116+
if url.drivername.lower() in ['postgresql', 'postgres']:
117+
return HiveTableMetadataExtractor.DEFAULT_POSTGRES_SQL_STATEMENT
118+
else:
119+
return HiveTableMetadataExtractor.DEFAULT_SQL_STATEMENT
120+
84121
def extract(self) -> Union[TableMetadata, None]:
85122
if not self._extract_iter:
86123
self._extract_iter = self._get_extract_iter()

tests/unit/extractor/test_hive_table_metadata_extractor.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,19 @@ def test_extraction_with_empty_query_result(self) -> None:
2727
"""
2828
Test Extraction with empty result from query
2929
"""
30-
with patch.object(SQLAlchemyExtractor, '_get_connection'):
30+
with patch.object(SQLAlchemyExtractor, '_get_connection'), \
31+
patch.object(HiveTableMetadataExtractor, '_choose_default_sql_stm',
32+
return_value=HiveTableMetadataExtractor.DEFAULT_SQL_STATEMENT):
3133
extractor = HiveTableMetadataExtractor()
3234
extractor.init(self.conf)
3335

3436
results = extractor.extract()
3537
self.assertEqual(results, None)
3638

3739
def test_extraction_with_single_result(self) -> None:
38-
with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
40+
with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection, \
41+
patch.object(HiveTableMetadataExtractor, '_choose_default_sql_stm',
42+
return_value=HiveTableMetadataExtractor.DEFAULT_SQL_STATEMENT):
3943
connection = MagicMock()
4044
mock_connection.return_value = connection
4145
sql_execute = MagicMock()
@@ -101,7 +105,9 @@ def test_extraction_with_single_result(self) -> None:
101105
self.assertIsNone(extractor.extract())
102106

103107
def test_extraction_with_multiple_result(self) -> None:
104-
with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
108+
with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection, \
109+
patch.object(HiveTableMetadataExtractor, '_choose_default_sql_stm',
110+
return_value=HiveTableMetadataExtractor.DEFAULT_SQL_STATEMENT):
105111
connection = MagicMock()
106112
mock_connection.return_value = connection
107113
sql_execute = MagicMock()
@@ -240,7 +246,9 @@ def test_sql_statement(self) -> None:
240246
"""
241247
Test Extraction with empty result from query
242248
"""
243-
with patch.object(SQLAlchemyExtractor, '_get_connection'):
249+
with patch.object(SQLAlchemyExtractor, '_get_connection'), \
250+
patch.object(HiveTableMetadataExtractor, '_choose_default_sql_stm',
251+
return_value=HiveTableMetadataExtractor.DEFAULT_SQL_STATEMENT):
244252
extractor = HiveTableMetadataExtractor()
245253
extractor.init(self.conf)
246254
self.assertTrue(self.where_clause_suffix in extractor.sql_stmt)
@@ -250,7 +258,9 @@ def test_hive_sql_statement_with_custom_sql(self) -> None:
250258
Test Extraction by providing a custom sql
251259
:return:
252260
"""
253-
with patch.object(SQLAlchemyExtractor, '_get_connection'):
261+
with patch.object(SQLAlchemyExtractor, '_get_connection'), \
262+
patch.object(HiveTableMetadataExtractor, '_choose_default_sql_stm',
263+
return_value=HiveTableMetadataExtractor.DEFAULT_SQL_STATEMENT):
254264
config_dict = {
255265
HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY: self.where_clause_suffix,
256266
'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING):

0 commit comments

Comments (0)