Skip to content

Commit 4dde766

Browse files
authored
[Cosmos] Fix Service Request Retry Policy (#41804)
* fix request service retry policy * Update CHANGELOG.md * Update _service_request_retry_policy.py
1 parent f5bb254 commit 4dde766

File tree

4 files changed

+73
-7
lines changed

4 files changed

+73
-7
lines changed

sdk/cosmos/azure-cosmos/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
#### Bugs Fixed
1111
* Fixed bug where replacing manual throughput using `ThroughputProperties` would not work. See [PR 41564](https://github.com/Azure/azure-sdk-for-python/pull/41564)
12+
* Fixed bug where constantly raising Service Request Error Exceptions would cause the Service Request Retry Policy to indefinitely retry the request during a query or when a request was sent without a request object. See [PR 41804](https://github.com/Azure/azure-sdk-for-python/pull/41804)
1213

1314
#### Other Changes
1415

sdk/cosmos/azure-cosmos/azure/cosmos/_service_request_retry_policy.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def __init__(self, connection_policy, global_endpoint_manager, pk_range_wrapper,
3030
else:
3131
self.total_retries = len(self.global_endpoint_manager.location_cache.write_regional_routing_contexts)
3232

33-
def ShouldRetry(self):
33+
def ShouldRetry(self): # pylint: disable=too-many-return-statements
3434
"""Returns true if the request should retry based on preferred regions and retries already done.
3535
3636
:returns: a boolean stating whether the request should be retried
@@ -87,6 +87,11 @@ def ShouldRetry(self):
8787
location_endpoint = self.resolve_next_region_service_endpoint()
8888

8989
self.request.route_to_location(location_endpoint)
90+
return True
91+
# Check if the next retry about to be done is safe
92+
if (self.failover_retry_count + 1) >= self.total_retries:
93+
return False
94+
self.failover_retry_count += 1
9095
return True
9196

9297
# This function prepares the request to go to the second endpoint in the same region

sdk/cosmos/azure-cosmos/tests/test_service_retry_policies.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,28 +54,57 @@ def test_service_request_retry_policy(self):
5454
self.REGION3: self.REGIONAL_ENDPOINT}
5555
original_location_cache.read_regional_routing_contexts = [self.REGIONAL_ENDPOINT, self.REGIONAL_ENDPOINT,
5656
self.REGIONAL_ENDPOINT]
57+
58+
expected_counter = len(original_location_cache.read_regional_routing_contexts)
5759
try:
5860
# Mock the function to return the ServiceRequestException we retry
5961
mf = self.MockExecuteServiceRequestException()
6062
_retry_utility.ExecuteFunction = mf
6163
container.read_item(created_item['id'], created_item['pk'])
6264
pytest.fail("Exception was not raised.")
6365
except ServiceRequestError:
64-
assert mf.counter == 3
66+
assert mf.counter == expected_counter
67+
finally:
68+
_retry_utility.ExecuteFunction = self.original_execute_function
69+
70+
# Now we test with a query operation, iterating through items sends request without request object
71+
# retry policy should eventually raise an exception as it should stop retrying with a max retry attempt
72+
# equal to the available read region locations
73+
74+
# Change the location cache to have 3 preferred read regions and 3 available read endpoints by location
75+
original_location_cache = mock_client.client_connection._global_endpoint_manager.location_cache
76+
original_location_cache.account_read_locations = [self.REGION1, self.REGION2, self.REGION3]
77+
original_location_cache.available_read_regional_endpoints_by_locations = {
78+
self.REGION1: self.REGIONAL_ENDPOINT,
79+
self.REGION2: self.REGIONAL_ENDPOINT,
80+
self.REGION3: self.REGIONAL_ENDPOINT}
81+
original_location_cache.read_regional_routing_contexts = [self.REGIONAL_ENDPOINT, self.REGIONAL_ENDPOINT,
82+
self.REGIONAL_ENDPOINT]
83+
84+
expected_counter = len(original_location_cache.read_regional_routing_contexts)
85+
try:
86+
# Mock the function to return the ServiceRequestException we retry
87+
mf = self.MockExecuteServiceRequestException()
88+
_retry_utility.ExecuteFunction = mf
89+
items = list(container.query_items(query="SELECT * FROM c", partition_key=created_item['pk']))
90+
pytest.fail("Exception was not raised.")
91+
except ServiceRequestError:
92+
assert mf.counter == expected_counter
6593
finally:
6694
_retry_utility.ExecuteFunction = self.original_execute_function
6795

6896
# Now we change the location cache to have only 1 preferred read region
6997
original_location_cache.account_read_locations = [self.REGION1]
7098
original_location_cache.read_regional_routing_contexts = [self.REGIONAL_ENDPOINT]
99+
expected_counter = len(original_location_cache.read_regional_routing_contexts)
71100
try:
72101
# Reset the function to reset the counter
73102
mf = self.MockExecuteServiceRequestException()
74103
_retry_utility.ExecuteFunction = mf
75104
container.read_item(created_item['id'], created_item['pk'])
76105
pytest.fail("Exception was not raised.")
77106
except ServiceRequestError:
78-
assert mf.counter == 1
107+
assert mf.counter == expected_counter
79108
finally:
80109
_retry_utility.ExecuteFunction = self.original_execute_function
81110

@@ -84,6 +113,7 @@ def test_service_request_retry_policy(self):
84113
original_location_cache.write_regional_routing_contexts = [self.REGIONAL_ENDPOINT, self.REGIONAL_ENDPOINT]
85114
original_location_cache.available_write_regional_endpoints_by_locations = {self.REGION1: self.REGIONAL_ENDPOINT,
86115
self.REGION2: self.REGIONAL_ENDPOINT}
116+
expected_counter = len(original_location_cache.write_regional_routing_contexts)
87117
try:
88118
# Reset the function to reset the counter
89119
mf = self.MockExecuteServiceRequestException()
@@ -92,7 +122,7 @@ def test_service_request_retry_policy(self):
92122
pytest.fail("Exception was not raised.")
93123
except ServiceRequestError:
94124
# Should retry twice in each region
95-
assert mf.counter == 2
125+
assert mf.counter == expected_counter
96126
finally:
97127
_retry_utility.ExecuteFunction = self.original_execute_function
98128

sdk/cosmos/azure-cosmos/tests/test_service_retry_policies_async.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,28 +65,57 @@ async def test_service_request_retry_policy_async(self):
6565
self.REGION3: self.REGIONAL_ENDPOINT}
6666
original_location_cache.read_regional_routing_contexts = [self.REGIONAL_ENDPOINT, self.REGIONAL_ENDPOINT,
6767
self.REGIONAL_ENDPOINT]
68+
expected_counter = len(original_location_cache.read_regional_routing_contexts)
6869
try:
6970
# Mock the function to return the ServiceRequestException we retry
7071
mf = self.MockExecuteServiceRequestException()
7172
_retry_utility_async.ExecuteFunctionAsync = mf
7273
await container.read_item(created_item['id'], created_item['pk'])
7374
pytest.fail("Exception was not raised.")
7475
except ServiceRequestError:
75-
assert mf.counter == 3
76+
assert mf.counter == expected_counter
77+
finally:
78+
_retry_utility_async.ExecuteFunctionAsync = self.original_execute_function
79+
80+
# Now we test with a query operation, iterating through items sends request without request object
81+
# retry policy should eventually raise an exception as it should stop retrying with a max retry attempt
82+
# equal to the available read region locations
83+
84+
# Change the location cache to have 3 preferred read regions and 3 available read endpoints by location
85+
original_location_cache = mock_client.client_connection._global_endpoint_manager.location_cache
86+
original_location_cache.account_read_locations = [self.REGION1, self.REGION2, self.REGION3]
87+
original_location_cache.available_read_regional_endpoints_by_locations = {
88+
self.REGION1: self.REGIONAL_ENDPOINT,
89+
self.REGION2: self.REGIONAL_ENDPOINT,
90+
self.REGION3: self.REGIONAL_ENDPOINT}
91+
original_location_cache.read_regional_routing_contexts = [self.REGIONAL_ENDPOINT, self.REGIONAL_ENDPOINT,
92+
self.REGIONAL_ENDPOINT]
93+
expected_counter = len(original_location_cache.read_regional_routing_contexts)
94+
95+
try:
96+
# Mock the function to return the ServiceRequestException we retry
97+
mf = self.MockExecuteServiceRequestException()
98+
_retry_utility_async.ExecuteFunctionAsync = mf
99+
items = [item async for item in container.query_items(query="SELECT * FROM c",
100+
partition_key=created_item['pk'])]
101+
pytest.fail("Exception was not raised.")
102+
except ServiceRequestError:
103+
assert mf.counter == expected_counter
76104
finally:
77105
_retry_utility_async.ExecuteFunctionAsync = self.original_execute_function
78106

79107
# Now we change the location cache to have only 1 preferred read region
80108
original_location_cache.account_read_locations = [self.REGION1]
81109
original_location_cache.read_regional_routing_contexts = [self.REGIONAL_ENDPOINT]
110+
expected_counter = len(original_location_cache.read_regional_routing_contexts)
82111
try:
83112
# Reset the function to reset the counter
84113
mf = self.MockExecuteServiceRequestException()
85114
_retry_utility_async.ExecuteFunctionAsync = mf
86115
await container.read_item(created_item['id'], created_item['pk'])
87116
pytest.fail("Exception was not raised.")
88117
except ServiceRequestError:
89-
assert mf.counter == 1
118+
assert mf.counter == expected_counter
90119
finally:
91120
_retry_utility_async.ExecuteFunctionAsync = self.original_execute_function
92121

@@ -96,14 +125,15 @@ async def test_service_request_retry_policy_async(self):
96125
original_location_cache.available_write_regional_endpoints_by_locations = {
97126
self.REGION1: self.REGIONAL_ENDPOINT,
98127
self.REGION2: self.REGIONAL_ENDPOINT}
128+
expected_counter = len(original_location_cache.write_regional_routing_contexts)
99129
try:
100130
# Reset the function to reset the counter
101131
mf = self.MockExecuteServiceRequestException()
102132
_retry_utility_async.ExecuteFunctionAsync = mf
103133
await container.create_item({"id": str(uuid.uuid4()), "pk": str(uuid.uuid4())})
104134
pytest.fail("Exception was not raised.")
105135
except ServiceRequestError:
106-
assert mf.counter == 2
136+
assert mf.counter == expected_counter
107137
finally:
108138
_retry_utility_async.ExecuteFunctionAsync = self.original_execute_function
109139

0 commit comments

Comments
 (0)