Skip to content

Commit caaea07

Browse files
authored
[DPTOOLS-1924] Adding timeout on ColumnProvider call (#13)
* [DPTOOLS-1924] Adding timeout on ColumnProvider call * Update * Update
1 parent 9524509 commit caaea07

File tree

2 files changed

+23
-3
lines changed

2 files changed

+23
-3
lines changed

databuilder/transformer/sql_to_table_col_usage_transformer.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
from multiprocessing.pool import ThreadPool, TimeoutError
23

34
from pyhocon import ConfigTree # noqa: F401
45
from typing import Any, Optional, List, Iterable # noqa: F401
@@ -20,12 +21,17 @@ class SqlToTblColUsageTransformer(Transformer):
2021
Currently it collects usage at table level, so columns on the same table will be de-duped.
2122
In many cases, the "from" clause does not contain a schema, so it is fetched via a table name -> schema name mapping
2223
which it gets from Hive metastore. (Naming collision is disregarded as it needs column level to disambiguate)
24+
25+
Currently, ColumnUsageProvider can hang on certain SQL statements; as a short-term solution, processing of a
26+
statement times out after 10 seconds by default.
2327
"""
2428
# Config key
2529
DATABASE_NAME = 'database'
2630
CLUSTER_NAME = 'cluster'
2731
SQL_STATEMENT_ATTRIBUTE_NAME = 'sql_stmt_attribute_name'
2832
USER_EMAIL_ATTRIBUTE_NAME = 'user_email_attribute_name'
33+
COLUMN_EXTRACTION_TIMEOUT_SEC = 'column_extraction_timeout_seconds'
34+
LOG_ALL_EXTRACTION_FAILURES = 'log_all_extraction_failures'
2935

3036
total_counts = 0
3137
failure_counts = 0
@@ -38,6 +44,11 @@ def init(self, conf):
3844
self._sql_stmt_attr = conf.get_string(SqlToTblColUsageTransformer.SQL_STATEMENT_ATTRIBUTE_NAME)
3945
self._user_email_attr = conf.get_string(SqlToTblColUsageTransformer.USER_EMAIL_ATTRIBUTE_NAME)
4046
self._tbl_to_schema_mapping = self._create_schema_by_table_mapping()
47+
self._worker_pool = ThreadPool(processes=1)
48+
self._time_out_sec = conf.get_int(SqlToTblColUsageTransformer.COLUMN_EXTRACTION_TIMEOUT_SEC, 10)
49+
LOGGER.info('Column extraction timeout: {} seconds'.format(self._time_out_sec))
50+
self._log_all_extraction_failures = conf.get_bool(SqlToTblColUsageTransformer.LOG_ALL_EXTRACTION_FAILURES,
51+
False)
4152

4253
def transform(self, record):
4354
# type: (Any) -> Optional[TableColumnUsage]
@@ -48,11 +59,20 @@ def transform(self, record):
4859

4960
result = [] # type: List[ColumnReader]
5061
try:
51-
columns = ColumnUsageProvider.get_columns(query=stmt)
62+
columns = self._worker_pool.apply_async(ColumnUsageProvider.get_columns, (stmt,)).get(self._time_out_sec)
5263
# LOGGER.info('Statement: {} ---> columns: {}'.format(stmt, columns))
64+
except TimeoutError:
65+
SqlToTblColUsageTransformer.failure_counts += 1
66+
LOGGER.exception('Timed out while getting column usage from query: {}'.format(stmt))
67+
LOGGER.info('Killing the thread.')
68+
self._worker_pool.terminate()
69+
self._worker_pool = ThreadPool(processes=1)
70+
LOGGER.info('Killed the thread.')
71+
return None
5372
except Exception:
5473
SqlToTblColUsageTransformer.failure_counts += 1
55-
LOGGER.exception('Failed to get column usage from query: {}'.format(stmt))
74+
if self._log_all_extraction_failures:
75+
LOGGER.exception('Failed to get column usage from query: {}'.format(stmt))
5676
return None
5777

5878
# Dedupe is needed to make it table level. TODO: Remove this once we are at column level

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from setuptools import setup, find_packages
22

33

4-
__version__ = '1.0.2'
4+
__version__ = '1.0.3'
55

66
setup(
77
name='amundsen-databuilder',

0 commit comments

Comments
 (0)