3
3
# Licensed under the terms of the TrueNAS Enterprise License Agreement
4
4
# See the file LICENSE.IX for complete terms and conditions
5
5
6
- from middlewared .alert .base import AlertClass , AlertCategory , AlertLevel , AlertSource , Alert
6
+ import time
7
+
8
+ from middlewared .alert .base import (
9
+ AlertClass ,
10
+ AlertCategory ,
11
+ AlertLevel ,
12
+ AlertSource ,
13
+ Alert ,
14
+ UnavailableException ,
15
+ )
7
16
from middlewared .utils import ProductType
17
+ from middlewared .utils .crypto import generate_token
8
18
9
19
10
20
class SensorAlertClass (AlertClass ):
@@ -19,17 +29,28 @@ class PowerSupplyAlertClass(AlertClass):
19
29
category = AlertCategory .HARDWARE
20
30
level = AlertLevel .CRITICAL
21
31
title = "Power Supply Error"
22
- text = "%(psu)s is %(state)s showing: %(errors)s"
32
+ text = (
33
+ "%(psu)s is %(state)s showing: %(errors)s. Contact support. Incident ID: %(id)s"
34
+ )
23
35
products = (ProductType .ENTERPRISE ,)
36
+ proactive_support = True
37
+ proactive_support_notify_gone = True
24
38
25
39
26
- class SensorsAlertSource (AlertSource ):
40
+ class PsuAlertSource (AlertSource ):
41
+ def __init__ (self , middleware ):
42
+ super ().__init__ (middleware )
43
+ self .last_failure = time .monotonic ()
44
+ self .incident_id = None
45
+ self ._30mins = 30 * 60
27
46
28
47
async def should_alert (self ):
29
- if (await self .middleware .call ('system.dmidecode_info' ))['system-product-name' ].startswith ('TRUENAS-R' ):
48
+ if (await self .middleware .call ("system.dmidecode_info" ))[
49
+ "system-product-name"
50
+ ].startswith ("TRUENAS-R" ):
30
51
# r-series
31
52
return True
32
- elif await self .middleware .call (' failover.hardware' ) == ' ECHOWARP' :
53
+ elif await self .middleware .call (" failover.hardware" ) == " ECHOWARP" :
33
54
# m-series
34
55
return True
35
56
@@ -40,40 +61,99 @@ async def check(self):
40
61
if not await self .should_alert ():
41
62
return alerts
42
63
43
- for i in await self .middleware .call ('ipmi.sensors.query' ):
44
- if i ['state' ] != 'Nominal' and i ['reading' ] != 'N/A' :
45
- if i ['type' ] == 'Power Supply' and i ['event' ]:
46
- alerts .append (Alert (
47
- PowerSupplyAlertClass ,
48
- {'psu' : i ['name' ], 'state' : i ['state' ], 'errors' : ', ' .join (i ['event' ])}
49
- ))
50
- elif (alert := await self .produce_sensor_alert (i )) is not None :
51
- alerts .append (alert )
52
-
64
+ for i in await self .middleware .call ("ipmi.sensors.query" ):
65
+ if (
66
+ i ["type" ] == "Power Supply" and
67
+ i ["state" ] != "Nominal" and
68
+ i ["reading" ] != "N/A" and
69
+ i ["event" ]
70
+ ):
71
+ if time .monotonic () - self .last_failure > self ._30mins :
72
+ # we assume a PSU alert that has been around longer than
73
+ # 30mins is justification for opening up a proactive ticket
74
+ if self .incident_id is None :
75
+ self .incident_id = generate_token (16 , url_safe = True )
76
+ alerts .append (
77
+ Alert (
78
+ PowerSupplyAlertClass ,
79
+ {
80
+ "id" : self .incident_id ,
81
+ "psu" : i ["name" ],
82
+ "state" : i ["state" ],
83
+ "errors" : ", " .join (i ["event" ]),
84
+ },
85
+ )
86
+ )
87
+ else :
88
+ raise UnavailableException ()
53
89
return alerts
54
90
55
- async def produce_sensor_alert (self , sensor ):
56
- reading = sensor ['reading' ]
57
- for key in ('lower-non-recoverable' , 'lower-critical' , 'lower-non-critical' ):
58
- if sensor [key ] != 'N/A' and reading < sensor [key ]:
59
- relative = 'below'
60
- level = 'recommended' if key == 'lower-non-critical' else 'critical'
61
- return Alert (SensorAlertClass , {
62
- 'name' : sensor ['name' ],
63
- 'relative' : relative ,
64
- 'level' : level ,
65
- 'value' : reading ,
66
- 'event' : ', ' .join (sensor ['event' ])
67
- })
68
-
69
- for key in ('upper-non-recoverable' , 'upper-critical' , 'upper-non-critical' ):
70
- if sensor [key ] != 'N/A' and reading > sensor [key ]:
71
- relative = 'above'
72
- level = 'recommended' if key == 'upper-non-critical' else 'critical'
73
- return Alert (SensorAlertClass , {
74
- 'name' : sensor ['name' ],
75
- 'relative' : relative ,
76
- 'level' : level ,
77
- 'value' : reading ,
78
- 'event' : ', ' .join (sensor ['event' ])
79
- })
91
+
92
class SensorsAlertSource(AlertSource):
    """Alert source for non-power-supply IPMI sensors that crossed a threshold.

    Every sensor returned by ``ipmi.sensors.query`` whose type is not
    "Power Supply", whose state is not "Nominal", whose reading is not "N/A"
    and which reports at least one event is compared against its lower and
    upper thresholds. At most one ``SensorAlertClass`` alert is produced per
    sensor.
    """

    async def should_alert(self):
        """Return True when sensor alerts apply to this hardware.

        True is returned when the DMI "system-product-name" starts with
        "TRUENAS-R", or when ``failover.hardware`` reports "ECHOWARP";
        otherwise False.
        """
        dmi = await self.middleware.call("system.dmidecode_info")
        if dmi["system-product-name"].startswith("TRUENAS-R"):
            # r-series
            return True
        elif await self.middleware.call("failover.hardware") == "ECHOWARP":
            # m-series
            return True

        return False

    async def check(self):
        """Return a list of alerts, one per sensor outside its thresholds.

        An empty list is returned when ``should_alert()`` is False or when no
        eligible sensor has crossed a threshold.
        """
        alerts = []
        if not await self.should_alert():
            return alerts

        for sensor in await self.middleware.call("ipmi.sensors.query"):
            if (
                sensor["type"] != "Power Supply"
                and sensor["state"] != "Nominal"
                and sensor["reading"] != "N/A"
                and sensor["event"]
            ):
                # Collect the alert instead of returning it: check() must
                # always hand back a list and must keep scanning the
                # remaining sensors after the first offending one.
                alert = self._produce_sensor_alert(sensor)
                if alert is not None:
                    alerts.append(alert)

        return alerts

    def _produce_sensor_alert(self, sensor):
        """Build the alert for the first threshold that `sensor` violates.

        Thresholds are evaluated in a fixed order: the three lower bounds
        (non-recoverable, critical, non-critical) followed by the three upper
        bounds in the same order. A threshold whose value is "N/A" is
        ignored. Returns None when no threshold is violated.
        """
        # (threshold key, direction reported to the user, severity wording)
        thresholds = (
            ("lower-non-recoverable", "below", "critical"),
            ("lower-critical", "below", "critical"),
            ("lower-non-critical", "below", "recommended"),
            ("upper-non-recoverable", "above", "critical"),
            ("upper-critical", "above", "critical"),
            ("upper-non-critical", "above", "recommended"),
        )

        reading = sensor["reading"]
        for key, relative, level in thresholds:
            limit = sensor[key]
            if limit == "N/A":
                continue

            if relative == "below":
                crossed = reading < limit
            else:
                crossed = reading > limit

            if crossed:
                return Alert(
                    SensorAlertClass,
                    {
                        "name": sensor["name"],
                        "relative": relative,
                        "level": level,
                        "value": reading,
                        "event": ", ".join(sensor["event"]),
                    },
                )

        return None
0 commit comments