Skip to content

Commit 5ed45f5

Browse files
author
Jesse
authored
Enable v3 retries by default (databricks#282)
Signed-off-by: Jesse Whitehouse <jesse.whitehouse@databricks.com>
1 parent 64924a6 commit 5ed45f5

File tree

5 files changed

+46
-13
lines changed

5 files changed

+46
-13
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919
- Caching metadata calls
2020
- Enable cloud fetch by default. To disable, set `use_cloud_fetch=False` when building `databricks.sql.client`.
2121
- Add integration tests for Databricks UC Volumes ingestion queries
22-
- Add `_retry_max_redirects` config
22+
- Retries:
23+
- Add `_retry_max_redirects` config
24+
- Set `_enable_v3_retries=True` and warn if users override it
2325

2426
## 2.9.3 (2023-08-24)
2527

src/databricks/sql/auth/retry.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ class CommandType(Enum):
3030
EXECUTE_STATEMENT = "ExecuteStatement"
3131
CLOSE_SESSION = "CloseSession"
3232
CLOSE_OPERATION = "CloseOperation"
33+
GET_OPERATION_STATUS = "GetOperationStatus"
3334
OTHER = "Other"
3435

3536
@classmethod
@@ -314,9 +315,9 @@ def should_retry(self, method: str, status_code: int) -> Tuple[bool, str]:
314315
2. The request received a 501 (Not Implemented) status code
315316
Because this request can never succeed.
316317
3. The request received a 404 (Not Found) code and the request CommandType
317-
was CloseSession or CloseOperation. This code indicates that the session
318-
or cursor was already closed. Further retries will always return the same
319-
code.
318+
was GetOperationStatus, CloseSession or CloseOperation. This code indicates
319+
that the command, session or cursor was already closed. Further retries will
320+
always return the same code.
320321
4. The request CommandType was ExecuteStatement and the HTTP code does not
321322
appear in the default status_forcelist or force_dangerous_codes list. By
322323
default, this means ExecuteStatement is only retried for codes 429 and 503.
@@ -343,6 +344,13 @@ def should_retry(self, method: str, status_code: int) -> Tuple[bool, str]:
343344
if not self._is_method_retryable(method): # type: ignore
344345
return False, "Only POST requests are retried"
345346

347+
# Request failed with 404 and was a GetOperationStatus. This is not recoverable. Don't retry.
348+
if status_code == 404 and self.command_type == CommandType.GET_OPERATION_STATUS:
349+
return (
350+
False,
351+
"GetOperationStatus received 404 code from Databricks. Operation was canceled.",
352+
)
353+
346354
# Request failed with 404 because CloseSession returns 404 if you repeat the request.
347355
if (
348356
status_code == 404

src/databricks/sql/thrift_backend.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def __init__(
129129
# (defaults to 900)
130130
# _enable_v3_retries
131131
# Whether to use the DatabricksRetryPolicy implemented in urllib3
132-
# (defaults to False)
132+
# (defaults to True)
133133
# _retry_max_redirects
134134
# An integer representing the maximum number of redirects to follow for a request.
135135
# This number must be <= _retry_stop_after_attempts_count.
@@ -185,7 +185,13 @@ def __init__(
185185
self._auth_provider = auth_provider
186186

187187
# Connector version 3 retry approach
188-
self.enable_v3_retries = kwargs.get("_enable_v3_retries", False)
188+
self.enable_v3_retries = kwargs.get("_enable_v3_retries", True)
189+
190+
if not self.enable_v3_retries:
191+
logger.warning(
192+
"Legacy retry behavior is enabled for this connection."
193+
" This behaviour is deprecated and will be removed in a future release."
194+
)
189195
self.force_dangerous_codes = kwargs.get("_retry_dangerous_codes", [])
190196

191197
additional_transport_args = {}
@@ -396,9 +402,6 @@ def attempt_request(attempt):
396402

397403
response = method(request)
398404

399-
# Calling `close()` here releases the active HTTP connection back to the pool
400-
self._transport.close()
401-
402405
# We need to call type(response) here because thrift doesn't implement __name__ attributes for thrift responses
403406
logger.debug(
404407
"Received response: {}(<REDACTED>)".format(type(response).__name__)
@@ -460,6 +463,10 @@ def attempt_request(attempt):
460463
error_message = ThriftBackend._extract_error_message_from_headers(
461464
getattr(self._transport, "headers", {})
462465
)
466+
finally:
467+
# Calling `close()` here releases the active HTTP connection back to the pool
468+
self._transport.close()
469+
463470
return RequestErrorInfo(
464471
error=error,
465472
error_message=error_message,

tests/e2e/common/retry_test_mixins.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,6 @@ class PySQLRetryTestsMixin:
119119

120120
# For testing purposes
121121
_retry_policy = {
122-
"_enable_v3_retries": True,
123122
"_retry_delay_min": 0.1,
124123
"_retry_delay_max": 5,
125124
"_retry_stop_after_attempts_count": 5,
@@ -424,3 +423,19 @@ def test_retry_max_redirects_exceeds_max_attempts_count_warns_user(self):
424423
expected_message_was_found = target in log
425424

426425
assert expected_message_was_found, "Did not find expected log messages"
426+
427+
def test_retry_legacy_behavior_warns_user(self):
428+
with self.assertLogs(
429+
"databricks.sql",
430+
level="WARN",
431+
) as cm:
432+
with self.connection(
433+
extra_params={**self._retry_policy, "_enable_v3_retries": False}
434+
):
435+
expected_message_was_found = False
436+
for log in cm.output:
437+
if expected_message_was_found:
438+
break
439+
target = "Legacy retry behavior is enabled for this connection."
440+
expected_message_was_found = target in log
441+
assert expected_message_was_found, "Did not find expected log messages"

tests/e2e/test_driver.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131

3232
from tests.e2e.common.uc_volume_tests import PySQLUCVolumeTestSuiteMixin
3333

34+
from databricks.sql.exc import SessionAlreadyClosedError
35+
3436
log = logging.getLogger(__name__)
3537

3638
unsafe_logger = logging.getLogger("databricks.sql.unsafe")
@@ -699,10 +701,9 @@ def test_close_connection_closes_cursors(self):
699701
conn.close()
700702

701703
# When connection closes, any cursor operations should no longer exist at the server
702-
with self.assertRaises(thrift.Thrift.TApplicationException) as cm:
704+
with self.assertRaises(SessionAlreadyClosedError) as cm:
703705
op_status_at_server = ars.thrift_backend._client.GetOperationStatus(status_request)
704-
if hasattr(cm, "exception"):
705-
assert "RESOURCE_DOES_NOT_EXIST" in cm.exception.message
706+
706707

707708

708709
def test_closing_a_closed_connection_doesnt_fail(self):

0 commit comments

Comments
 (0)