Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 55 additions & 49 deletions keep/api/bl/maintenance_windows_bl.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,61 +59,67 @@ def check_if_alert_in_maintenance_windows(self, alert: AlertDto) -> bool:
env = celpy.Environment()

for maintenance_rule in self.maintenance_rules:
if alert.status in maintenance_rule.ignore_statuses:
self.logger.debug(
"Alert status is set to be ignored, ignoring maintenance windows",
extra={"tenant_id": self.tenant_id},
)
continue

if maintenance_rule.end_time.replace(tzinfo=datetime.UTC) <= datetime.datetime.now(datetime.UTC):
# this is wtf error, should not happen because of query in init
self.logger.error(
"Fetched maintenance window which already ended by mistake, should not happen!"
)
continue
try:
if alert.status in maintenance_rule.ignore_statuses:
self.logger.debug(
"Alert status is set to be ignored, ignoring maintenance windows",
extra={"tenant_id": self.tenant_id},
)
continue

cel_result = MaintenanceWindowsBl.evaluate_cel(maintenance_rule, alert, env, self.logger, extra)
if maintenance_rule.end_time.replace(tzinfo=datetime.UTC) <= datetime.datetime.now(datetime.UTC):
# this is wtf error, should not happen because of query in init
self.logger.error(
"Fetched maintenance window which already ended by mistake, should not happen!"
)
continue

if cel_result:
self.logger.info(
"Alert is in maintenance window",
extra={**extra, "maintenance_rule_id": maintenance_rule.id},
)
cel_result = MaintenanceWindowsBl.evaluate_cel(maintenance_rule, alert, env, self.logger, extra)

try:
audit = AlertAudit(
tenant_id=self.tenant_id,
fingerprint=alert.fingerprint,
user_id="Keep",
action=ActionType.MAINTENANCE.value,
description=(
f"Alert in maintenance due to rule `{maintenance_rule.name}`"
if not maintenance_rule.suppress
else f"Alert suppressed due to maintenance rule `{maintenance_rule.name}`"
),
)
self.session.add(audit)
self.session.commit()
except Exception:
self.logger.exception(
"Failed to write audit for alert maintenance window",
extra={
"tenant_id": self.tenant_id,
"fingerprint": alert.fingerprint,
},
if cel_result:
self.logger.info(
"Alert is in maintenance window",
extra={**extra, "maintenance_rule_id": maintenance_rule.id},
)

if maintenance_rule.suppress:
# If user chose to suppress the alert, let it in but override the status.
if MAINTENANCE_WINDOW_ALERT_STRATEGY == "recover_previous_status":
alert.previous_status = alert.status
alert.status = AlertStatus.MAINTENANCE.value
else:
alert.status = AlertStatus.SUPPRESSED.value
return False
try:
audit = AlertAudit(
tenant_id=self.tenant_id,
fingerprint=alert.fingerprint,
user_id="Keep",
action=ActionType.MAINTENANCE.value,
description=(
f"Alert in maintenance due to rule `{maintenance_rule.name}`"
if not maintenance_rule.suppress
else f"Alert suppressed due to maintenance rule `{maintenance_rule.name}`"
),
)
self.session.add(audit)
self.session.commit()
except Exception:
self.logger.exception(
"Failed to write audit for alert maintenance window",
extra={
"tenant_id": self.tenant_id,
"fingerprint": alert.fingerprint,
},
)

if maintenance_rule.suppress:
# If user chose to suppress the alert, let it in but override the status.
if MAINTENANCE_WINDOW_ALERT_STRATEGY == "recover_previous_status":
alert.previous_status = alert.status
alert.status = AlertStatus.MAINTENANCE.value
else:
alert.status = AlertStatus.SUPPRESSED.value
return False

return True
return True
except Exception:
self.logger.exception(
"Error while evaluating maintenance window CEL expression",
extra={**extra, "maintenance_rule_id": maintenance_rule.id},
)
self.logger.info("Alert is not in maintenance window", extra=extra)
return False

Expand Down
29 changes: 28 additions & 1 deletion tests/test_maintenance_windows_bl.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,4 +554,31 @@ def test_strategy_alert_execution_wf(
#THEN The WF is not executed if there is a resolved alert or executed 1 time if there are only firing alerts
n_executions = get_workflow_executions(SINGLE_TENANT_UUID, workflow.id)[0]

assert n_executions == executions
assert n_executions == executions


def test_maintenance_window_cel_evaluation_exception_handling(
    mock_session, active_maintenance_window_rule, alert_dto
):
    """
    Feature: Generic - exception handling in check_if_alert_in_maintenance_windows
    Scenario: A maintenance rule whose CEL expression cannot be evaluated must
              not make check_if_alert_in_maintenance_windows raise; the call
              is expected to return False so the caller can carry on with
              its normal flow.
    """

    # GIVEN an active maintenance window whose CEL query is broken on purpose
    # (the regex handed to matches() contains an unclosed "[").
    broken_cel_query = r'service.matches("(?i)^[10(\..*)?$")'
    active_maintenance_window_rule.cel_query = broken_cel_query

    # Stub query().filter().filter().filter().filter().all() so that it yields
    # only the broken rule.
    # NOTE(review): four chained filters presumably mirror the query built in
    # MaintenanceWindowsBl.__init__ - confirm if that query changes.
    stubbed_query = mock_session.query.return_value
    for _ in range(4):
        stubbed_query = stubbed_query.filter.return_value
    stubbed_query.all.return_value = [active_maintenance_window_rule]

    # WHEN the alert is checked against the maintenance windows
    windows_bl = MaintenanceWindowsBl(tenant_id="test-tenant", session=mock_session)
    outcome = windows_bl.check_if_alert_in_maintenance_windows(alert_dto)

    # THEN the result is exactly the boolean False (not merely a falsy value)
    assert outcome is False
Loading