From 0934ad768b7f01c5cf174f6c6f40ba4117baaecc Mon Sep 17 00:00:00 2001 From: Hui Song Date: Mon, 2 Jun 2025 16:13:44 -0400 Subject: [PATCH 1/2] fix potential race condition in activation disabling --- src/aap_eda/api/views/activation.py | 6 +-- src/aap_eda/core/enums.py | 1 + ...64_alter_activationrequestqueue_request.py | 26 +++++++++++ .../services/activation/activation_manager.py | 23 +++++----- src/aap_eda/tasks/activation_request_queue.py | 1 + src/aap_eda/tasks/orchestrator.py | 23 +++++++--- .../services/activation/test_manager.py | 43 +++++++++++++------ 7 files changed, 88 insertions(+), 35 deletions(-) create mode 100644 src/aap_eda/core/migrations/0064_alter_activationrequestqueue_request.py diff --git a/src/aap_eda/api/views/activation.py b/src/aap_eda/api/views/activation.py index b164c135f..702a8348f 100644 --- a/src/aap_eda/api/views/activation.py +++ b/src/aap_eda/api/views/activation.py @@ -444,15 +444,11 @@ def disable(self, request, pk): # Redis must be available in order to perform the delete. self.redis_is_available() - activation.status = ActivationStatus.STOPPING - activation.is_enabled = False - activation.save( - update_fields=["is_enabled", "status", "modified_at"] - ) stop_rulebook_process( process_parent_type=ProcessParentType.ACTIVATION, process_parent_id=activation.id, request_id=request.headers.get("x-request-id"), + disable=True, ) logger.info( diff --git a/src/aap_eda/core/enums.py b/src/aap_eda/core/enums.py index ce597e90e..590491618 100644 --- a/src/aap_eda/core/enums.py +++ b/src/aap_eda/core/enums.py @@ -123,6 +123,7 @@ class ActivationRequest(DjangoStrEnum): STOP = "stop" RESTART = "restart" DELETE = "delete" + DISABLE = "disable" AUTO_START = "auto_start" diff --git a/src/aap_eda/core/migrations/0064_alter_activationrequestqueue_request.py b/src/aap_eda/core/migrations/0064_alter_activationrequestqueue_request.py new file mode 100644 index 000000000..8c85ab914 --- /dev/null +++ b/src/aap_eda/core/migrations/0064_alter_activationrequestqueue_request.py @@ -0,0 +1,26 @@ +# Generated by Django 4.2.16 on 2025-06-02 20:26 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0063_remove_decisionenvironment_credential_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="activationrequestqueue", + name="request", + field=models.TextField( + choices=[ + ("start", "start"), + ("stop", "stop"), + ("restart", "restart"), + ("delete", "delete"), + ("disable", "disable"), + ("auto_start", "auto_start"), + ] + ), + ), + ] diff --git a/src/aap_eda/services/activation/activation_manager.py b/src/aap_eda/services/activation/activation_manager.py index bdbbea298..9d0bdf022 100644 --- a/src/aap_eda/services/activation/activation_manager.py +++ b/src/aap_eda/services/activation/activation_manager.py @@ -99,6 +99,12 @@ def _reset_failure_count(self): self.db_instance.failure_count = 0 self.db_instance.save(update_fields=["failure_count", "modified_at"]) + @run_with_lock + def _reset_is_enabled(self): + """Reset the is_enabled of the activation.""" + self.db_instance.is_enabled = False + self.db_instance.save(update_fields=["is_enabled", "modified_at"]) + @run_with_lock def _increase_restart_count(self): """Increase the restart count of the activation.""" @@ -330,15 +336,8 @@ def _is_in_status(self, status: ActivationStatus) -> bool: if latest_instance.status != status: return False - if latest_instance.status == status and ( - container_status is not None and container_status.status == status - ): - return True - - if latest_instance.status == status and ( - container_status is not None and container_status.status != status - ): - return False + if container_status is not None: + return container_status.status == status if ( status != ActivationStatus.RUNNING @@ -696,7 +695,7 @@ def start(self, is_restart: bool = False): if is_restart: self._increase_restart_count() - def stop(self): + def stop(self, disable: bool = False): """User requested stop.""" LOGGER.info( "Stop operation requested for activation " @@ -717,6 +716,8 @@ def stop(self): if self._is_already_stopped(): msg = f"Activation {self.db_instance.id} is already stopped." LOGGER.info(msg) + if disable: + self._reset_is_enabled() return except exceptions.ActivationInstanceNotFound: LOGGER.error( @@ -729,6 +730,8 @@ def stop(self): try: if self.db_instance.status != ActivationStatus.ERROR: self.set_status(ActivationStatus.STOPPING) + if disable: + self._reset_is_enabled() self._stop_instance() except engine_exceptions.ContainerEngineError as exc: diff --git a/src/aap_eda/tasks/activation_request_queue.py b/src/aap_eda/tasks/activation_request_queue.py index 495eaba0d..0d090bdfc 100644 --- a/src/aap_eda/tasks/activation_request_queue.py +++ b/src/aap_eda/tasks/activation_request_queue.py @@ -104,6 +104,7 @@ def _arbitrate( if ( request.request == ActivationRequest.STOP + or request.request == ActivationRequest.DISABLE or request.request == ActivationRequest.DELETE ): while qualified_requests: diff --git a/src/aap_eda/tasks/orchestrator.py b/src/aap_eda/tasks/orchestrator.py index 235575ead..4252999c4 100644 --- a/src/aap_eda/tasks/orchestrator.py +++ b/src/aap_eda/tasks/orchestrator.py @@ -168,6 +168,8 @@ def _run_request( ) elif request.request == ActivationRequest.STOP: manager.stop() + elif request.request == ActivationRequest.DISABLE: + manager.stop(disable=True) elif request.request == ActivationRequest.RESTART: manager.restart() elif request.request == ActivationRequest.DELETE: @@ -506,14 +508,23 @@ def stop_rulebook_process( process_parent_type: ProcessParentType, process_parent_id: int, request_id: str = "", + disable: bool = False, ) -> None: """Create a request to stop the activation with the given id.""" - requests_queue.push( - process_parent_type, - process_parent_id, - ActivationRequest.STOP, - request_id, - ) + if disable: + requests_queue.push( + process_parent_type, + process_parent_id, + ActivationRequest.DISABLE, + request_id, + ) + else: + requests_queue.push( + process_parent_type, + process_parent_id, + ActivationRequest.STOP, + request_id, + ) def delete_rulebook_process( diff --git a/tests/integration/services/activation/test_manager.py b/tests/integration/services/activation/test_manager.py index 76e0a52d6..f0673eecd 100644 --- a/tests/integration/services/activation/test_manager.py +++ b/tests/integration/services/activation/test_manager.py @@ -477,11 +477,25 @@ def test_stop_deleted_activation( assert "does not exist" in eda_caplog.text +@pytest.mark.parametrize( + ("end_status", "expected"), + [ + (enums.ActivationStatus.ERROR, True), + (enums.ActivationStatus.STOPPED, True), + (enums.ActivationStatus.FAILED, True), + (enums.ActivationStatus.COMPLETED, True), + (enums.ActivationStatus.PENDING, False), + (enums.ActivationStatus.STARTING, False), + (enums.ActivationStatus.RUNNING, False), + ], +) @pytest.mark.django_db def test_stop_already_stopped( activation_with_instance: models.Activation, container_engine_mock: MagicMock, eda_caplog: LogCaptureFixture, + end_status: enums.ActivationStatus, + expected: bool, ): """Test stop verb when activation is stopped.""" activation_manager = ActivationManager( @@ -489,18 +503,17 @@ def test_stop_already_stopped( db_instance=activation_with_instance, ) - activation_with_instance.status = enums.ActivationStatus.STOPPED - activation_with_instance.latest_instance.status = ( - enums.ActivationStatus.STOPPED - ) + activation_with_instance.status = end_status + activation_with_instance.latest_instance.status = end_status activation_with_instance.latest_instance.save( update_fields=["status"], ) activation_with_instance.save(update_fields=["status"]) - activation_manager.stop() + activation_manager.stop(disable=True) assert "Stopping" in eda_caplog.text - assert "already stopped" in eda_caplog.text + stopped = "already stopped" in eda_caplog.text + assert stopped == expected assert not container_engine_mock.cleanup.called @@ -535,23 +548,25 @@ def test_stop_running( running_activation.status = before_status running_activation.status_message = before_msg running_activation.save(update_fields=["status", "status_message"]) + running_activation.latest_instance.status = before_status + running_activation.latest_instance.save(update_fields=["status"]) activation_manager = ActivationManager( container_engine=container_engine_mock, db_instance=running_activation, ) + container_engine_mock.get_status.return_value = None activation_manager.stop() assert "Stopping" in eda_caplog.text - assert "Cleanup operation requested" in eda_caplog.text - assert "Activation stopped." in eda_caplog.text assert running_activation.status == after_status - assert ( - running_activation.latest_instance.status - == enums.ActivationStatus.STOPPED - ) - assert running_activation.latest_instance.activation_pod_id is None + assert running_activation.latest_instance.status == after_status assert running_activation.status_message == after_msg - assert container_engine_mock.cleanup.called + + if before_status == enums.ActivationStatus.RUNNING: + assert "Cleanup operation requested" in eda_caplog.text + assert "Activation stopped." in eda_caplog.text + assert running_activation.latest_instance.activation_pod_id is None + assert container_engine_mock.cleanup.called @pytest.mark.django_db From 276567f17af8d8f7a1e1cda334774db8043549cc Mon Sep 17 00:00:00 2001 From: Hui Song Date: Tue, 3 Jun 2025 13:59:54 -0400 Subject: [PATCH 2/2] rework on the status updating --- src/aap_eda/api/views/activation.py | 11 +- src/aap_eda/core/enums.py | 1 - ...64_alter_activationrequestqueue_request.py | 26 ---- .../services/activation/activation_manager.py | 25 ++-- src/aap_eda/tasks/activation_request_queue.py | 1 - src/aap_eda/tasks/orchestrator.py | 23 +--- .../services/activation/test_manager.py | 112 +++++++++++++----- 7 files changed, 109 insertions(+), 90 deletions(-) delete mode 100644 src/aap_eda/core/migrations/0064_alter_activationrequestqueue_request.py diff --git a/src/aap_eda/api/views/activation.py b/src/aap_eda/api/views/activation.py index 702a8348f..7db6c90a3 100644 --- a/src/aap_eda/api/views/activation.py +++ b/src/aap_eda/api/views/activation.py @@ -444,11 +444,20 @@ def disable(self, request, pk): # Redis must be available in order to perform the delete. self.redis_is_available() + if activation.status in [ + ActivationStatus.STARTING, + ActivationStatus.RUNNING, + ]: + activation.status = ActivationStatus.STOPPING + + activation.is_enabled = False + activation.save( + update_fields=["is_enabled", "status", "modified_at"] + ) stop_rulebook_process( process_parent_type=ProcessParentType.ACTIVATION, process_parent_id=activation.id, request_id=request.headers.get("x-request-id"), - disable=True, ) logger.info( diff --git a/src/aap_eda/core/enums.py b/src/aap_eda/core/enums.py index 590491618..ce597e90e 100644 --- a/src/aap_eda/core/enums.py +++ b/src/aap_eda/core/enums.py @@ -123,7 +123,6 @@ class ActivationRequest(DjangoStrEnum): STOP = "stop" RESTART = "restart" DELETE = "delete" - DISABLE = "disable" AUTO_START = "auto_start" diff --git a/src/aap_eda/core/migrations/0064_alter_activationrequestqueue_request.py b/src/aap_eda/core/migrations/0064_alter_activationrequestqueue_request.py deleted file mode 100644 index 8c85ab914..000000000 --- a/src/aap_eda/core/migrations/0064_alter_activationrequestqueue_request.py +++ /dev/null @@ -1,26 +0,0 @@ -# Generated by Django 4.2.16 on 2025-06-02 20:26 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("core", "0063_remove_decisionenvironment_credential_and_more"), - ] - - operations = [ - migrations.AlterField( - model_name="activationrequestqueue", - name="request", - field=models.TextField( - choices=[ - ("start", "start"), - ("stop", "stop"), - ("restart", "restart"), - ("delete", "delete"), - ("disable", "disable"), - ("auto_start", "auto_start"), - ] - ), - ), - ] diff --git a/src/aap_eda/services/activation/activation_manager.py b/src/aap_eda/services/activation/activation_manager.py index 9d0bdf022..040e13a1c 100644 --- a/src/aap_eda/services/activation/activation_manager.py +++ b/src/aap_eda/services/activation/activation_manager.py @@ -99,12 +99,6 @@ def _reset_failure_count(self): self.db_instance.failure_count = 0 self.db_instance.save(update_fields=["failure_count", "modified_at"]) - @run_with_lock - def _reset_is_enabled(self): - """Reset the is_enabled of the activation.""" - self.db_instance.is_enabled = False - self.db_instance.save(update_fields=["is_enabled", "modified_at"]) - @run_with_lock def _increase_restart_count(self): """Increase the restart count of the activation.""" @@ -336,8 +330,15 @@ def _is_in_status(self, status: ActivationStatus) -> bool: if latest_instance.status != status: return False - if container_status is not None: - return container_status.status == status + if latest_instance.status == status and ( + container_status is not None and container_status.status == status + ): + return True + + if latest_instance.status == status and ( + container_status is not None and container_status.status != status + ): + return False if ( status != ActivationStatus.RUNNING @@ -695,7 +696,7 @@ def start(self, is_restart: bool = False): if is_restart: self._increase_restart_count() - def stop(self, disable: bool = False): + def stop(self): """User requested stop.""" LOGGER.info( "Stop operation requested for activation " @@ -716,8 +717,6 @@ def stop(self, disable: bool = False): if self._is_already_stopped(): msg = f"Activation {self.db_instance.id} is already stopped." LOGGER.info(msg) - if disable: - self._reset_is_enabled() return except exceptions.ActivationInstanceNotFound: LOGGER.error( @@ -728,10 +727,6 @@ def stop(self, disable: bool = False): return try: - if self.db_instance.status != ActivationStatus.ERROR: - self.set_status(ActivationStatus.STOPPING) - if disable: - self._reset_is_enabled() self._stop_instance() except engine_exceptions.ContainerEngineError as exc: diff --git a/src/aap_eda/tasks/activation_request_queue.py b/src/aap_eda/tasks/activation_request_queue.py index 0d090bdfc..495eaba0d 100644 --- a/src/aap_eda/tasks/activation_request_queue.py +++ b/src/aap_eda/tasks/activation_request_queue.py @@ -104,7 +104,6 @@ def _arbitrate( if ( request.request == ActivationRequest.STOP - or request.request == ActivationRequest.DISABLE or request.request == ActivationRequest.DELETE ): while qualified_requests: diff --git a/src/aap_eda/tasks/orchestrator.py b/src/aap_eda/tasks/orchestrator.py index 4252999c4..235575ead 100644 --- a/src/aap_eda/tasks/orchestrator.py +++ b/src/aap_eda/tasks/orchestrator.py @@ -168,8 +168,6 @@ def _run_request( ) elif request.request == ActivationRequest.STOP: manager.stop() - elif request.request == ActivationRequest.DISABLE: - manager.stop(disable=True) elif request.request == ActivationRequest.RESTART: manager.restart() elif request.request == ActivationRequest.DELETE: @@ -508,23 +506,14 @@ def stop_rulebook_process( process_parent_type: ProcessParentType, process_parent_id: int, request_id: str = "", - disable: bool = False, ) -> None: """Create a request to stop the activation with the given id.""" - if disable: - requests_queue.push( - process_parent_type, - process_parent_id, - ActivationRequest.DISABLE, - request_id, - ) - else: - requests_queue.push( - process_parent_type, - process_parent_id, - ActivationRequest.STOP, - request_id, - ) + requests_queue.push( + process_parent_type, + process_parent_id, + ActivationRequest.STOP, + request_id, + ) def delete_rulebook_process( diff --git a/tests/integration/services/activation/test_manager.py b/tests/integration/services/activation/test_manager.py index f0673eecd..d886e0051 100644 --- a/tests/integration/services/activation/test_manager.py +++ b/tests/integration/services/activation/test_manager.py @@ -477,25 +477,11 @@ def test_stop_deleted_activation( assert "does not exist" in eda_caplog.text -@pytest.mark.parametrize( - ("end_status", "expected"), - [ - (enums.ActivationStatus.ERROR, True), - (enums.ActivationStatus.STOPPED, True), - (enums.ActivationStatus.FAILED, True), - (enums.ActivationStatus.COMPLETED, True), - (enums.ActivationStatus.PENDING, False), - (enums.ActivationStatus.STARTING, False), - (enums.ActivationStatus.RUNNING, False), - ], -) @pytest.mark.django_db def test_stop_already_stopped( activation_with_instance: models.Activation, container_engine_mock: MagicMock, eda_caplog: LogCaptureFixture, - end_status: enums.ActivationStatus, - expected: bool, ): """Test stop verb when activation is stopped.""" activation_manager = ActivationManager( @@ -503,17 +489,18 @@ def test_stop_already_stopped( db_instance=activation_with_instance, ) - activation_with_instance.status = end_status - activation_with_instance.latest_instance.status = end_status + activation_with_instance.status = enums.ActivationStatus.STOPPED + activation_with_instance.latest_instance.status = ( + enums.ActivationStatus.STOPPED + ) activation_with_instance.latest_instance.save( update_fields=["status"], ) activation_with_instance.save(update_fields=["status"]) - activation_manager.stop(disable=True) + activation_manager.stop() assert "Stopping" in eda_caplog.text - stopped = "already stopped" in eda_caplog.text - assert stopped == expected + assert "already stopped" in eda_caplog.text assert not container_engine_mock.cleanup.called @@ -548,25 +535,23 @@ def test_stop_running( running_activation.status = before_status running_activation.status_message = before_msg running_activation.save(update_fields=["status", "status_message"]) - running_activation.latest_instance.status = before_status - running_activation.latest_instance.save(update_fields=["status"]) activation_manager = ActivationManager( container_engine=container_engine_mock, db_instance=running_activation, ) - container_engine_mock.get_status.return_value = None activation_manager.stop() assert "Stopping" in eda_caplog.text + assert "Cleanup operation requested" in eda_caplog.text + assert "Activation stopped." in eda_caplog.text assert running_activation.status == after_status - assert running_activation.latest_instance.status == after_status + assert ( + running_activation.latest_instance.status + == enums.ActivationStatus.STOPPED + ) + assert running_activation.latest_instance.activation_pod_id is None assert running_activation.status_message == after_msg - - if before_status == enums.ActivationStatus.RUNNING: - assert "Cleanup operation requested" in eda_caplog.text - assert "Activation stopped." in eda_caplog.text - assert running_activation.latest_instance.activation_pod_id is None - assert container_engine_mock.cleanup.called + assert container_engine_mock.cleanup.called @pytest.mark.django_db @@ -680,6 +665,75 @@ def test_stop_stopped_pod_running( assert running_activation.latest_instance.activation_pod_id is None +@pytest.mark.parametrize( + ("initial_status", "expected_status", "expected_message"), + [ + ( + enums.ActivationStatus.COMPLETED, + enums.ActivationStatus.COMPLETED, + "is already stopped.", + ), + ( + enums.ActivationStatus.ERROR, + enums.ActivationStatus.ERROR, + "is already stopped.", + ), + ( + enums.ActivationStatus.FAILED, + enums.ActivationStatus.FAILED, + "is already stopped.", + ), + ( + enums.ActivationStatus.STOPPED, + enums.ActivationStatus.STOPPED, + "is already stopped.", + ), + ( + enums.ActivationStatus.STOPPING, + enums.ActivationStatus.STOPPED, + "Activation stopped.", + ), + ( + enums.ActivationStatus.PENDING, + enums.ActivationStatus.STOPPED, + "Activation stopped.", + ), + ( + enums.ActivationStatus.UNRESPONSIVE, + enums.ActivationStatus.STOPPED, + "Activation stopped.", + ), + ], +) +@pytest.mark.django_db +def test_stop_with_different_statuses( + activation_with_instance: models.Activation, + container_engine_mock: MagicMock, + eda_caplog: LogCaptureFixture, + initial_status: enums.ActivationStatus, + expected_status: enums.ActivationStatus, + expected_message: str, +): + """Test stop with different initial statuses.""" + # Setup initial state + activation_with_instance.status = initial_status + activation_with_instance.save(update_fields=["status"]) + activation_with_instance.latest_instance.status = initial_status + activation_with_instance.latest_instance.save(update_fields=["status"]) + + activation_manager = ActivationManager( + activation_with_instance, container_engine_mock + ) + + # Execute + activation_manager.stop() + + # Assert final state + activation_with_instance.refresh_from_db() + assert activation_with_instance.status == expected_status + assert expected_message in eda_caplog.text + + @pytest.mark.django_db def test_delete_already_deleted( activation_with_instance: models.Activation,