Skip to content

Commit 8650f16

Browse files
authored
NAS-135832 / 25.10 / make PSU alerts proactive (#16485)
1 parent db388e6 commit 8650f16

File tree

1 file changed

+120
-40
lines changed
  • src/middlewared/middlewared/alert/source

1 file changed

+120
-40
lines changed

src/middlewared/middlewared/alert/source/sensors.py

Lines changed: 120 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,18 @@
33
# Licensed under the terms of the TrueNAS Enterprise License Agreement
44
# See the file LICENSE.IX for complete terms and conditions
55

6-
from middlewared.alert.base import AlertClass, AlertCategory, AlertLevel, AlertSource, Alert
6+
import time
7+
8+
from middlewared.alert.base import (
9+
AlertClass,
10+
AlertCategory,
11+
AlertLevel,
12+
AlertSource,
13+
Alert,
14+
UnavailableException,
15+
)
716
from middlewared.utils import ProductType
17+
from middlewared.utils.crypto import generate_token
818

919

1020
class SensorAlertClass(AlertClass):
@@ -19,17 +29,28 @@ class PowerSupplyAlertClass(AlertClass):
1929
category = AlertCategory.HARDWARE
2030
level = AlertLevel.CRITICAL
2131
title = "Power Supply Error"
22-
text = "%(psu)s is %(state)s showing: %(errors)s"
32+
text = (
33+
"%(psu)s is %(state)s showing: %(errors)s. Contact support. Incident ID: %(id)s"
34+
)
2335
products = (ProductType.ENTERPRISE,)
36+
proactive_support = True
37+
proactive_support_notify_gone = True
2438

2539

26-
class SensorsAlertSource(AlertSource):
40+
class PsuAlertSource(AlertSource):
41+
def __init__(self, middleware):
    """Set up the per-source state used to decide when a PSU alert is raised.

    :param middleware: passed straight through to the base class constructor.
    """
    super().__init__(middleware)
    # Identifier placed in the alert text as "Incident ID"; `check` generates
    # it lazily the first time an alert is actually emitted.
    self.incident_id = None
    # Grace period, in seconds, that `check` compares against.
    self._30mins = 30 * 60
    # Monotonic timestamp taken once, at construction time.
    # NOTE(review): the visible part of `check` only reads this value and
    # never refreshes it, so the 30 minute window appears to be measured from
    # source creation rather than from the first observed failure -- confirm.
    self.last_failure = time.monotonic()
2746

2847
async def should_alert(self):
29-
if (await self.middleware.call('system.dmidecode_info'))['system-product-name'].startswith('TRUENAS-R'):
48+
if (await self.middleware.call("system.dmidecode_info"))[
49+
"system-product-name"
50+
].startswith("TRUENAS-R"):
3051
# r-series
3152
return True
32-
elif await self.middleware.call('failover.hardware') == 'ECHOWARP':
53+
elif await self.middleware.call("failover.hardware") == "ECHOWARP":
3354
# m-series
3455
return True
3556

@@ -40,40 +61,99 @@ async def check(self):
4061
if not await self.should_alert():
4162
return alerts
4263

43-
for i in await self.middleware.call('ipmi.sensors.query'):
44-
if i['state'] != 'Nominal' and i['reading'] != 'N/A':
45-
if i['type'] == 'Power Supply' and i['event']:
46-
alerts.append(Alert(
47-
PowerSupplyAlertClass,
48-
{'psu': i['name'], 'state': i['state'], 'errors': ', '.join(i['event'])}
49-
))
50-
elif (alert := await self.produce_sensor_alert(i)) is not None:
51-
alerts.append(alert)
52-
64+
for i in await self.middleware.call("ipmi.sensors.query"):
65+
if (
66+
i["type"] == "Power Supply" and
67+
i["state"] != "Nominal" and
68+
i["reading"] != "N/A" and
69+
i["event"]
70+
):
71+
if time.monotonic() - self.last_failure > self._30mins:
72+
# we assume a PSU alert that has been around longer than
73+
# 30mins is justification for opening up a proactive ticket
74+
if self.incident_id is None:
75+
self.incident_id = generate_token(16, url_safe=True)
76+
alerts.append(
77+
Alert(
78+
PowerSupplyAlertClass,
79+
{
80+
"id": self.incident_id,
81+
"psu": i["name"],
82+
"state": i["state"],
83+
"errors": ", ".join(i["event"]),
84+
},
85+
)
86+
)
87+
else:
88+
raise UnavailableException()
5389
return alerts
5490

55-
async def produce_sensor_alert(self, sensor):
56-
reading = sensor['reading']
57-
for key in ('lower-non-recoverable', 'lower-critical', 'lower-non-critical'):
58-
if sensor[key] != 'N/A' and reading < sensor[key]:
59-
relative = 'below'
60-
level = 'recommended' if key == 'lower-non-critical' else 'critical'
61-
return Alert(SensorAlertClass, {
62-
'name': sensor['name'],
63-
'relative': relative,
64-
'level': level,
65-
'value': reading,
66-
'event': ', '.join(sensor['event'])
67-
})
68-
69-
for key in ('upper-non-recoverable', 'upper-critical', 'upper-non-critical'):
70-
if sensor[key] != 'N/A' and reading > sensor[key]:
71-
relative = 'above'
72-
level = 'recommended' if key == 'upper-non-critical' else 'critical'
73-
return Alert(SensorAlertClass, {
74-
'name': sensor['name'],
75-
'relative': relative,
76-
'level': level,
77-
'value': reading,
78-
'event': ', '.join(sensor['event'])
79-
})
91+
92+
class SensorsAlertSource(AlertSource):
    """Alert source for non-PSU IPMI sensors whose reading crossed a threshold.

    Power supply sensors are deliberately excluded here; they are filtered
    out by type in `check`.
    """

    # Threshold keys in the order they are tested; the first breached one wins.
    _LOWER_THRESHOLDS = (
        "lower-non-recoverable",
        "lower-critical",
        "lower-non-critical",
    )
    _UPPER_THRESHOLDS = (
        "upper-non-recoverable",
        "upper-critical",
        "upper-non-critical",
    )

    async def should_alert(self):
        """Return True when the platform is one this source reports on."""
        dmi = await self.middleware.call("system.dmidecode_info")
        if dmi["system-product-name"].startswith("TRUENAS-R"):
            # r-series
            return True

        # m-series
        return await self.middleware.call("failover.hardware") == "ECHOWARP"

    async def check(self):
        """Return a list with one `Alert` per sensor that is out of range.

        A sensor is considered only when it is not a power supply, its state
        is not "Nominal", it has a real reading and it reports at least one
        event. An empty list is returned on platforms where `should_alert`
        is False.
        """
        alerts = []
        if not await self.should_alert():
            return alerts

        for sensor in await self.middleware.call("ipmi.sensors.query"):
            if (
                sensor["type"] == "Power Supply"
                or sensor["state"] == "Nominal"
                or sensor["reading"] == "N/A"
                or not sensor["event"]
            ):
                continue

            # Collect instead of returning: returning from inside the loop
            # would hand back a bare Alert and silently drop every other
            # sensor that is also out of range.
            alert = self._threshold_alert(sensor)
            if alert is not None:
                alerts.append(alert)

        return alerts

    def _threshold_alert(self, sensor):
        """Build the alert for the first threshold `sensor` breaches, else None.

        Lower thresholds are tested before upper ones, each from most to
        least severe. "N/A" thresholds are ignored.
        """
        reading = sensor["reading"]
        for keys, relative in (
            (self._LOWER_THRESHOLDS, "below"),
            (self._UPPER_THRESHOLDS, "above"),
        ):
            for key in keys:
                limit = sensor[key]
                if limit == "N/A":
                    continue

                if relative == "below":
                    breached = reading < limit
                else:
                    breached = reading > limit
                if not breached:
                    continue

                # Only the "*-non-critical" thresholds are advisory.
                level = "recommended" if key.endswith("-non-critical") else "critical"
                return Alert(
                    SensorAlertClass,
                    {
                        "name": sensor["name"],
                        "relative": relative,
                        "level": level,
                        "value": reading,
                        "event": ", ".join(sensor["event"]),
                    },
                )

        return None

0 commit comments

Comments
 (0)