Skip to content
2 changes: 2 additions & 0 deletions changelog.d/+fix-wrong-end-time.changed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Ensure that `end_time` is correct according to state: `None` for stateless,
less than datetime.max for closed.
111 changes: 108 additions & 3 deletions src/argus/incident/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
from operator import and_
from random import randint, choice
from typing import Optional
from urllib.parse import urljoin

from django.contrib.auth import get_user_model
Expand All @@ -14,6 +15,7 @@
from django.utils import timezone

from argus.util.datetime_utils import INFINITY_REPR, get_infinity_repr
from argus.util.exceptions import InconsistentDatabaseException
from .constants import Level
from .fields import DateTimeInfinityField
from .validators import validate_lowercase, validate_key
Expand Down Expand Up @@ -215,7 +217,28 @@ class Type(models.TextChoices):
Type.INCIDENT_CHANGE,
Type.STATELESS,
}
ALLOWED_TYPES_FOR_END_USERS = {Type.CLOSE, Type.REOPEN, Type.ACKNOWLEDGE, Type.OTHER}
ALLOWED_TYPES_FOR_END_USERS = {
Type.ACKNOWLEDGE,
Type.OTHER,
Type.CLOSE,
Type.REOPEN,
}
CLOSING_TYPES = {
Type.INCIDENT_END,
Type.CLOSE,
}
OPENING_TYPES = {
Type.INCIDENT_START,
Type.REOPEN,
}
STATE_TYPES = OPENING_TYPES | CLOSING_TYPES
SHARED_TYPES = {
Type.ACKNOWLEDGE,
Type.OTHER,
Type.INCIDENT_CHANGE,
}
STATELESS_TYPES = SHARED_TYPES | {Type.STATELESS}
STATEFUL_TYPES = SHARED_TYPES | STATE_TYPES

incident = models.ForeignKey(to="Incident", on_delete=models.PROTECT, related_name="events")
actor = models.ForeignKey(to=User, on_delete=models.PROTECT, related_name="caused_events")
Expand Down Expand Up @@ -334,8 +357,9 @@ def create_events(self, actor: User, event_type: Event.Type, timestamp=None, des

def close(self, actor: User, timestamp=None, description=""):
"Close incidents correctly and create the needed events"
timestamp = timestamp or timezone.now()
qs = self.open()
qs.update(end_time=timestamp or timezone.now())
qs.update(end_time=timestamp)
qs = self.all() # Reload changes from database
event_type = Event.Type.CLOSE
events = qs.create_events(actor, event_type, timestamp, description)
Expand Down Expand Up @@ -439,17 +463,36 @@ def tags(self):
def incident_relations(self):
return IncidentRelation.objects.filter(Q(incident1=self) | Q(incident2=self))

def all_opening_events(self):
open_events = Event.OPENING_TYPES
return self.events.filter(type__in=open_events).order_by("timestamp")

def all_reopen_events(self):
return self.events.filter(type=Event.Type.REOPEN).order_by("timestamp")

def all_closing_events(self):
close_events = Event.CLOSING_TYPES
return self.events.filter(type__in=close_events).order_by("timestamp")

@property
def start_event(self):
return self.events.filter(type=Event.Type.INCIDENT_START).order_by("timestamp").first()

@property
def reopen_event(self):
return self.all_reopen_events().last()

@property
def end_event(self):
return self.events.filter(type=Event.Type.INCIDENT_END).order_by("timestamp").first()

@property
def close_event(self):
return self.events.filter(type=Event.Type.CLOSE).order_by("timestamp").first()

@property
def last_close_or_end_event(self):
return self.events.filter(type__in=(Event.Type.CLOSE, Event.Type.INCIDENT_END)).order_by("timestamp").last()
return self.all_closing_events().last()

@property
def latest_change_event(self):
Expand All @@ -475,6 +518,68 @@ def acked(self):

return self.events.filter((acks_query & acks_not_expired_query) | ack_is_just_being_created).exists()

def event_already_exists(self, event_type):
return self.events.filter(type=event_type).exists()

def repair_end_time(self) -> Optional[bool]:
"""Repairs end_time if there is a mismatch between events and end_time

This can happen under race-conditions and because we still cannot use
the ``atomic``-decorator everwhere.

Returns:
* True if a repair needed to be made
* False if it was stateful and ok
* None if it was stateless and ok
"""
LOG.info("Incident %s: Detected potential mismatch of end_time and events", self.pk)

if not self.stateful:
# the vital part for statelessness is set correctly
LOG.info("Incident %s: No mismatch, correctly stateless", self.pk)
return

if self.stateless_event:
# Weird, stateless event without stateless end_time, fix
self.end_time = None
self.save()
LOG.warn("Mismatch between self %s end_time and event type: set stateless", self.pk)
return True

# Only stateful incidents from this point on

close_events = self.all_closing_events()
if not close_events.exists():
if self.open:
# Golden path for open incidents
LOG.info("Incident %s: No mismatch, correctly stateful and open", self.pk)
return False
else:
# missing close event. This is serious. 500 Internal Server Error serious.
message = "Incident %s was previously closed without the creation of an appropriate event"
LOG.error(message, self.pk)
raise InconsistentDatabaseException(message % self.pk)

# Only incidents with at least one close event from this point on

if not self.open:
# Golden path for closed incidents
LOG.info("Incident %s: No mismatch, correctly stateful and closed", self.pk)
return False

reopen_event = self.reopen_event
last_close_event = close_events.last()
if not reopen_event or reopen_event.timestamp < last_close_event.timestamp:
# end_time was not set when making closing event, fix
self.end_time = last_close_event.timestamp
self.save()
LOG.warn("Mismatch between self %s end_time and event type: set end_time to less than infinity", self.pk)
return True

# a reopen event correctly exists and the incident is correctly open
LOG.info("Incident %s: No mismatch, correctly stateful and reopened", self.pk)
return False

def is_acked_by(self, group: str) -> bool:
return group in self.acks.active().group_names()

Expand Down
68 changes: 43 additions & 25 deletions src/argus/incident/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from django.conf import settings
from django.contrib.auth import get_user_model
from django.db import IntegrityError
from django.http import HttpResponseRedirect
from django.utils import timezone

from django_filters import rest_framework as filters
Expand All @@ -23,6 +24,7 @@
from argus.drf.permissions import IsSuperuserOrReadOnly
from argus.filter import get_filter_backend
from argus.util.datetime_utils import INFINITY_REPR
from argus.util.exceptions import InconceivableException, SuccessfulRepairException
from argus.util.signals import bulk_changed

from .forms import AddSourceSystemForm
Expand Down Expand Up @@ -465,47 +467,63 @@ def perform_create(self, serializer: EventSerializer):
# is sent after the incident has been manually closed
if not user.is_source_system:
raise e
except (SuccessfulRepairException, InconceivableException):
# Do not save new event, it was redundant, redirect to incident
return HttpResponseRedirect(reverse("incident:incident-detail", kwargs={"pk": incident.pk}))
else:
# Only update incident if everything is valid; otherwise, just record the event
self.update_incident(serializer.validated_data, incident)
self.update_incident_end_time(serializer.validated_data, incident)

serializer.save(incident=incident, actor=user)

def validate_event_type_for_user(self, event_type: str, user: User):
if user.is_source_system:
if event_type not in Event.ALLOWED_TYPES_FOR_SOURCE_SYSTEMS:
self._raise_type_validation_error(f"A source system cannot post events of type '{event_type}'.")
self._abort_due_to_type_validation_error(f"A source system cannot post events of type '{event_type}'.")
else:
if event_type not in Event.ALLOWED_TYPES_FOR_END_USERS:
self._raise_type_validation_error(f"An end user cannot post events of type '{event_type}'.")
self._abort_due_to_type_validation_error(f"An end user cannot post events of type '{event_type}'.")

def validate_event_type_for_incident(self, event_type: str, incident: Incident):
def validate_incident_has_no_relation_to_event_type():
if incident.events.filter(type=event_type).exists():
self._raise_type_validation_error(f"The incident already has a related event of type '{event_type}'.")

if incident.stateful:
if event_type in {Event.Type.INCIDENT_START, Event.Type.INCIDENT_END}:
validate_incident_has_no_relation_to_event_type()
if event_type in {Event.Type.INCIDENT_END, Event.Type.CLOSE} and not incident.open:
self._raise_type_validation_error("The incident is already closed.")
elif event_type == Event.Type.REOPEN and incident.open:
self._raise_type_validation_error("The incident is already open.")
else:
if event_type == Event.Type.STATELESS:
validate_incident_has_no_relation_to_event_type()
elif event_type == Event.Type.INCIDENT_START:
self._raise_type_validation_error("Stateless incident cannot have an INCIDENT_START event.")
elif event_type in {Event.Type.INCIDENT_END, Event.Type.CLOSE, Event.Type.REOPEN}:
self._raise_type_validation_error("Cannot change the state of a stateless incident.")
def abort_due_to_too_many_events(incident, event_type):
error_msg = f"Incident #{incident.pk} can only have one event of type '{event_type}'."
LOG.warn(error_msg)
self._abort_due_to_type_validation_error(error_msg)

if event_type == Event.Type.ACKNOWLEDGE:
acks_endpoint = reverse("incident:incident-acks", args=[incident.pk], request=self.request)
self._raise_type_validation_error(
f"Acknowledgements of this incidents should be posted through {acks_endpoint}."
self._abort_due_to_type_validation_error(
f"Acknowledgement of an incident should be posted through {acks_endpoint}."
)

def update_incident(self, validated_data: dict, incident: Incident):
if incident.stateful:
if incident.event_already_exists(event_type):
if event_type == Event.Type.INCIDENT_START:
# Only ever 1
abort_due_to_too_many_events(incident, event_type)
if event_type == Event.Type.INCIDENT_END:
# Only ever 1, but might not have been saved correctly earlier
repaired = incident.repair_end_time()
if repaired:
raise SuccessfulRepairException("end_time mismatch repaired, see logs")
# should never happen, here for completeness
LOG.error("Something weird happened, see other logs")
raise InconceivableException("Found end_time mismatch was in error, see logs")
if event_type in Event.CLOSING_TYPES and not incident.open:
self._abort_due_to_type_validation_error("The incident is already closed.")
if event_type == Event.Type.REOPEN and incident.open:
self._abort_due_to_type_validation_error("The incident is already open.")

# type ok for stateful
return

# stateless from here
if event_type == Event.Type.STATELESS and incident.event_already_exists(event_type):
abort_due_to_too_many_events(incident, event_type)
if event_type in Event.STATE_TYPES:
self._abort_due_to_type_validation_error("Cannot change the state of a stateless incident.")

def update_incident_end_time(self, validated_data: dict, incident: Incident):
timestamp = validated_data["timestamp"]
event_type = validated_data["type"]
if event_type in {Event.Type.INCIDENT_END, Event.Type.CLOSE}:
Expand All @@ -516,7 +534,7 @@ def update_incident(self, validated_data: dict, incident: Incident):
incident.save()

@staticmethod
def _raise_type_validation_error(message: str):
def _abort_due_to_type_validation_error(message: str):
raise serializers.ValidationError({"type": message})


Expand Down
15 changes: 15 additions & 0 deletions src/argus/util/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
class SuccessfulRepairException(Exception):
# Something was repaired and we need to skip another action
pass


class InconceivableException(Exception):
# This should not happen, ever!
pass


class InconsistentDatabaseException(ValueError):
# Something is missing or doubled up in the database,
# or has been accidentally deleted
# It cannot be fixed automatically so we must bail out
pass
33 changes: 32 additions & 1 deletion tests/incident/test_incident.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from datetime import timedelta
from datetime import timedelta, datetime

from django.test import TestCase
from django.utils import timezone
Expand Down Expand Up @@ -89,3 +89,34 @@
def test_level_str_returns_name_of_level(self):
incident = StatefulIncidentFactory(level=1)
self.assertEqual(incident.pp_level(), "Critical")


class IncidentRepairEndTimeTests(TestCase):
def setup(self):
disconnect_signals()

def tearDown(self):
connect_signals()

def test_golden_path(self):
incident = StatefulIncidentFactory(level=1)
end_time = incident.end_time
self.assertFalse(incident.repair_end_time())
self.assertEqual(end_time, incident.end_time)

def test_stateless_event_on_stateful_incident_should_fix_end_time(self):

Check failure on line 107 in tests/incident/test_incident.py

View workflow job for this annotation

GitHub Actions / Test results

All 4 runs with error: test_stateless_event_on_stateful_incident_should_fix_end_time (tests.incident.test_incident.IncidentRepairEndTimeTests)

artifacts/test-reports-3.10/py310-django42/test-results.xml [took 0s] artifacts/test-reports-3.11/py311-django42/test-results.xml [took 0s] artifacts/test-reports-3.12/py312-django42/test-results.xml [took 0s] artifacts/test-reports-3.9/py39-django42/test-results.xml [took 0s]
Raw output
send_notification() got an unexpected keyword argument 'signal'
Traceback (most recent call last):
  File "/home/runner/work/Argus/Argus/tests/incident/test_incident.py", line 110, in test_stateless_event_on_stateful_incident_should_fix_end_time
    EventFactory(incident=incident, type=Event.Type.STATELESS)
  File "/home/runner/work/Argus/Argus/.tox/py39-django42/lib/python3.9/site-packages/factory/base.py", line 43, in __call__
    return cls.create(**kwargs)
  File "/home/runner/work/Argus/Argus/.tox/py39-django42/lib/python3.9/site-packages/factory/base.py", line 539, in create
    return cls._generate(enums.CREATE_STRATEGY, kwargs)
  File "/home/runner/work/Argus/Argus/.tox/py39-django42/lib/python3.9/site-packages/factory/django.py", line 122, in _generate
    return super()._generate(strategy, params)
  File "/home/runner/work/Argus/Argus/.tox/py39-django42/lib/python3.9/site-packages/factory/base.py", line 468, in _generate
    return step.build()
  File "/home/runner/work/Argus/Argus/.tox/py39-django42/lib/python3.9/site-packages/factory/builder.py", line 274, in build
    instance = self.factory_meta.instantiate(
  File "/home/runner/work/Argus/Argus/.tox/py39-django42/lib/python3.9/site-packages/factory/base.py", line 320, in instantiate
    return self.factory._create(model, *args, **kwargs)
  File "/home/runner/work/Argus/Argus/.tox/py39-django42/lib/python3.9/site-packages/factory/django.py", line 175, in _create
    return manager.create(*args, **kwargs)
  File "/home/runner/work/Argus/Argus/.tox/py39-django42/lib/python3.9/site-packages/django/db/models/manager.py", line 87, in manager_method
    return getattr(self.get_queryset(), name)(*args, **kwargs)
  File "/home/runner/work/Argus/Argus/.tox/py39-django42/lib/python3.9/site-packages/django/db/models/query.py", line 658, in create
    obj.save(force_insert=True, using=self.db)
  File "/home/runner/work/Argus/Argus/src/argus/incident/models.py", line 254, in save
    super().save(*args, **kwargs)
  File "/home/runner/work/Argus/Argus/.tox/py39-django42/lib/python3.9/site-packages/django/db/models/base.py", line 814, in save
    self.save_base(
  File "/home/runner/work/Argus/Argus/.tox/py39-django42/lib/python3.9/site-packages/django/db/models/base.py", line 892, in save_base
    post_save.send(
  File "/home/runner/work/Argus/Argus/.tox/py39-django42/lib/python3.9/site-packages/django/dispatch/dispatcher.py", line 176, in send
    return [
  File "/home/runner/work/Argus/Argus/.tox/py39-django42/lib/python3.9/site-packages/django/dispatch/dispatcher.py", line 177, in <listcomp>
    (receiver, receiver(signal=self, sender=sender, **named))
TypeError: send_notification() got an unexpected keyword argument 'signal'
incident = StatefulIncidentFactory(level=1)
end_time = incident.end_time
EventFactory(incident=incident, type=Event.Type.STATELESS)
self.asserTrue(incident.repair_end_time())
self.assertNotEqual(end_time, incident.end_time)
self.assertEqual(incident.end_time, None)

def test_closing_event_without_finite_end_time_and_reopen_event_should_fix_endt_time(self):

Check warning on line 115 in tests/incident/test_incident.py

View workflow job for this annotation

GitHub Actions / Test results

All 4 runs failed: test_closing_event_without_finite_end_time_and_reopen_event_should_fix_endt_time (tests.incident.test_incident.IncidentRepairEndTimeTests)

artifacts/test-reports-3.10/py310-django42/test-results.xml [took 0s] artifacts/test-reports-3.11/py311-django42/test-results.xml [took 0s] artifacts/test-reports-3.12/py312-django42/test-results.xml [took 0s] artifacts/test-reports-3.9/py39-django42/test-results.xml [took 0s]
Raw output
datet[28 chars]3, 59, 59, 999999, tzinfo=zoneinfo.ZoneInfo(key='Europe/Oslo')) != datet[28 chars]3, 59, 59, 999999)
Traceback (most recent call last):
  File "/home/runner/work/Argus/Argus/tests/incident/test_incident.py", line 118, in test_closing_event_without_finite_end_time_and_reopen_event_should_fix_endt_time
    self.assertEqual(incident.end_time, datetime.max)
AssertionError: datet[28 chars]3, 59, 59, 999999, tzinfo=zoneinfo.ZoneInfo(key='Europe/Oslo')) != datet[28 chars]3, 59, 59, 999999)
incident = StatefulIncidentFactory(level=1)
end_time = incident.end_time
self.assertEqual(incident.end_time, datetime.max)
EventFactory(incident=incident, type=Event.Type.INCIDENT_END)
self.asserTrue(incident.repair_end_time())
self.assertNotEqual(end_time, incident.end_time)
self.assertLessThan(incident.end_time, datetime.max)
Loading