2424
2525metrics = {
2626 # fmt: off
27+ "nvmecli" : Info (
28+ "nvmecli" ,
29+ "nvme-cli tool information" ,
30+ ["version" ], namespace = namespace , registry = registry ,
31+ ),
32+
33+ # Controller-specific (e.g. "nvme0") metrics
2734 "avail_spare" : Gauge (
2835 "available_spare_ratio" ,
2936 "Device available spare ratio" ,
30- ["device " ], namespace = namespace , registry = registry ,
37+ ["controller " ], namespace = namespace , registry = registry ,
3138 ),
3239 "controller_busy_time" : Counter (
3340 "controller_busy_time_seconds" ,
3441 "Device controller busy time in seconds" ,
35- ["device " ], namespace = namespace , registry = registry ,
42+ ["controller " ], namespace = namespace , registry = registry ,
3643 ),
3744 "controller_info" : Info (
3845 "controller" ,
4350 "critical_warning" : Gauge (
4451 "critical_warning" ,
4552 "Device critical warning bitmap field" ,
46- ["device " ], namespace = namespace , registry = registry ,
53+ ["controller " ], namespace = namespace , registry = registry ,
4754 ),
4855 "data_units_read" : Counter (
4956 "data_units_read_total" ,
5057 "Number of 512-byte data units read by host, reported in thousands" ,
51- ["device " ], namespace = namespace , registry = registry ,
58+ ["controller " ], namespace = namespace , registry = registry ,
5259 ),
5360 "data_units_written" : Counter (
5461 "data_units_written_total" ,
5562 "Number of 512-byte data units written by host, reported in thousands" ,
56- ["device " ], namespace = namespace , registry = registry ,
63+ ["controller " ], namespace = namespace , registry = registry ,
5764 ),
5865 "host_read_commands" : Counter (
5966 "host_read_commands_total" ,
6067 "Device read commands from host" ,
61- ["device " ], namespace = namespace , registry = registry ,
68+ ["controller " ], namespace = namespace , registry = registry ,
6269 ),
6370 "host_write_commands" : Counter (
6471 "host_write_commands_total" ,
6572 "Device write commands from host" ,
66- ["device " ], namespace = namespace , registry = registry ,
73+ ["controller " ], namespace = namespace , registry = registry ,
6774 ),
6875 "media_errors" : Counter (
6976 "media_errors_total" ,
7077 "Device media errors total" ,
71- ["device " ], namespace = namespace , registry = registry ,
78+ ["controller " ], namespace = namespace , registry = registry ,
7279 ),
7380 "num_err_log_entries" : Counter (
7481 "num_err_log_entries_total" ,
7582 "Device error log entry count" ,
76- ["device" ], namespace = namespace , registry = registry ,
77- ),
78- "nvmecli" : Info (
79- "nvmecli" ,
80- "nvme-cli tool information" ,
81- ["version" ], namespace = namespace , registry = registry ,
83+ ["controller" ], namespace = namespace , registry = registry ,
8284 ),
8385 "percent_used" : Gauge (
8486 "percentage_used_ratio" ,
8587 "Device percentage used ratio" ,
86- ["device" ], namespace = namespace , registry = registry ,
87- ),
88- "physical_size" : Gauge (
89- "physical_size_bytes" ,
90- "Device size in bytes" ,
91- ["device" ], namespace = namespace , registry = registry ,
88+ ["controller" ], namespace = namespace , registry = registry ,
9289 ),
9390 "power_cycles" : Counter (
9491 "power_cycles_total" ,
9592 "Device number of power cycles" ,
96- ["device " ], namespace = namespace , registry = registry ,
93+ ["controller " ], namespace = namespace , registry = registry ,
9794 ),
9895 "power_on_hours" : Counter (
9996 "power_on_hours_total" ,
10097 "Device power-on hours" ,
101- ["device" ], namespace = namespace , registry = registry ,
102- ),
103- "sector_size" : Gauge (
104- "sector_size_bytes" ,
105- "Device sector size in bytes" ,
106- ["device" ], namespace = namespace , registry = registry ,
98+ ["controller" ], namespace = namespace , registry = registry ,
10799 ),
108100 "spare_thresh" : Gauge (
109101 "available_spare_threshold_ratio" ,
110102 "Device available spare threshold ratio" ,
111- ["device " ], namespace = namespace , registry = registry ,
103+ ["controller " ], namespace = namespace , registry = registry ,
112104 ),
113105 "temperature" : Gauge (
114106 "temperature_celsius" ,
115107 "Device temperature in degrees Celsius" ,
116- ["device " ], namespace = namespace , registry = registry ,
108+ ["controller " ], namespace = namespace , registry = registry ,
117109 ),
118110 "unsafe_shutdowns" : Counter (
119111 "unsafe_shutdowns_total" ,
120112 "Device number of unsafe shutdowns" ,
113+ ["controller" ], namespace = namespace , registry = registry ,
114+ ),
115+
116+ # Namespace-specific (e.g. "nvme0n1") metrics
117+ "physical_size" : Gauge (
118+ "physical_size_bytes" ,
119+ "Device size in bytes" ,
120+ ["device" ], namespace = namespace , registry = registry ,
121+ ),
122+ "sector_size" : Gauge (
123+ "sector_size_bytes" ,
124+ "Device sector size in bytes" ,
121125 ["device" ], namespace = namespace , registry = registry ,
122126 ),
123127 "used_bytes" : Gauge (
@@ -164,8 +168,10 @@ def main():
164168 for device in device_list ["Devices" ]:
165169 for subsys in device ["Subsystems" ]:
166170 for ctrl in subsys ["Controllers" ]:
171+ ctrl_dev = ctrl ["Controller" ]
172+
167173 metrics ["controller_info" ].labels (
168- ctrl [ "Controller" ] ,
174+ ctrl_dev ,
169175 ctrl ["ModelNumber" ],
170176 ctrl ["Firmware" ],
171177 ctrl ["SerialNumber" ].strip (),
@@ -179,50 +185,43 @@ def main():
179185 metrics ["physical_size" ].labels (device_name ).set (ns ["PhysicalSize" ])
180186 metrics ["used_bytes" ].labels (device_name ).set (ns ["UsedBytes" ])
181187
182- # FIXME: The smart-log should only need to be fetched once per controller, not
183- # per namespace. However, in order to preserve legacy metric labels, fetch it
184- # per namespace anyway. Most consumer grade SSDs will only have one namespace.
185- smart_log = exec_nvme_json ("smart-log" , os .path .join ("/dev" , device_name ))
186-
187- # Various counters in the NVMe specification are 128-bit, which would have to
188- # discard resolution if converted to a JSON number (i.e., float64_t). Instead,
189- # nvme-cli marshals them as strings. As such, they need to be explicitly cast
190- # to int or float when using them in Counter metrics.
191- metrics ["data_units_read" ].labels (device_name ).inc (
192- int (smart_log ["data_units_read" ])
193- )
194- metrics ["data_units_written" ].labels (device_name ).inc (
195- int (smart_log ["data_units_written" ])
196- )
197- metrics ["host_read_commands" ].labels (device_name ).inc (
198- int (smart_log ["host_read_commands" ])
199- )
200- metrics ["host_write_commands" ].labels (device_name ).inc (
201- int (smart_log ["host_write_commands" ])
202- )
203- metrics ["avail_spare" ].labels (device_name ).set (smart_log ["avail_spare" ] / 100 )
204- metrics ["spare_thresh" ].labels (device_name ).set (smart_log ["spare_thresh" ] / 100 )
205- metrics ["percent_used" ].labels (device_name ).set (smart_log ["percent_used" ] / 100 )
206- metrics ["critical_warning" ].labels (device_name ).set (
207- smart_log ["critical_warning" ]["value" ]
208- )
209- metrics ["media_errors" ].labels (device_name ).inc (int (smart_log ["media_errors" ]))
210- metrics ["num_err_log_entries" ].labels (device_name ).inc (
211- int (smart_log ["num_err_log_entries" ])
212- )
213- metrics ["power_cycles" ].labels (device_name ).inc (int (smart_log ["power_cycles" ]))
214- metrics ["power_on_hours" ].labels (device_name ).inc (
215- int (smart_log ["power_on_hours" ])
216- )
217- metrics ["controller_busy_time" ].labels (device_name ).inc (
218- int (smart_log ["controller_busy_time" ])
219- )
220- metrics ["unsafe_shutdowns" ].labels (device_name ).inc (
221- int (smart_log ["unsafe_shutdowns" ])
222- )
223-
224- # NVMe reports temperature in kelvins; convert it to degrees Celsius.
225- metrics ["temperature" ].labels (device_name ).set (smart_log ["temperature" ] - 273 )
188+ # Most SSDs (perhaps _all_ consumer-grade SSDs) only contain a single namespace.
189+ # Fetch the device-global SMART log from the controller device, omitting any --namespace-id flag.
190+ smart_log = exec_nvme_json ("smart-log" , os .path .join ("/dev" , ctrl ["Controller" ]))
191+
192+ # Various counters in the NVMe specification are 128-bit, which would have to
193+ # discard resolution if converted to a JSON number (i.e., float64_t). Instead,
194+ # nvme-cli marshals them as strings. As such, they need to be explicitly cast to int
195+ # or float when using them in Counter metrics.
196+ metrics ["data_units_read" ].labels (ctrl_dev ).inc (int (smart_log ["data_units_read" ]))
197+ metrics ["data_units_written" ].labels (ctrl_dev ).inc (
198+ int (smart_log ["data_units_written" ])
199+ )
200+ metrics ["host_read_commands" ].labels (ctrl_dev ).inc (
201+ int (smart_log ["host_read_commands" ])
202+ )
203+ metrics ["host_write_commands" ].labels (ctrl_dev ).inc (
204+ int (smart_log ["host_write_commands" ])
205+ )
206+ metrics ["avail_spare" ].labels (ctrl_dev ).set (smart_log ["avail_spare" ] / 100 )
207+ metrics ["spare_thresh" ].labels (ctrl_dev ).set (smart_log ["spare_thresh" ] / 100 )
208+ metrics ["percent_used" ].labels (ctrl_dev ).set (smart_log ["percent_used" ] / 100 )
209+ metrics ["critical_warning" ].labels (ctrl_dev ).set (
210+ smart_log ["critical_warning" ]["value" ]
211+ )
212+ metrics ["media_errors" ].labels (ctrl_dev ).inc (int (smart_log ["media_errors" ]))
213+ metrics ["num_err_log_entries" ].labels (ctrl_dev ).inc (
214+ int (smart_log ["num_err_log_entries" ])
215+ )
216+ metrics ["power_cycles" ].labels (ctrl_dev ).inc (int (smart_log ["power_cycles" ]))
217+ metrics ["power_on_hours" ].labels (ctrl_dev ).inc (int (smart_log ["power_on_hours" ]))
218+ metrics ["controller_busy_time" ].labels (ctrl_dev ).inc (
219+ int (smart_log ["controller_busy_time" ])
220+ )
221+ metrics ["unsafe_shutdowns" ].labels (ctrl_dev ).inc (int (smart_log ["unsafe_shutdowns" ]))
222+
223+ # NVMe reports temperature in kelvins; convert it to degrees Celsius.
224+ metrics ["temperature" ].labels (ctrl_dev ).set (smart_log ["temperature" ] - 273 )
226225
227226
228227if __name__ == "__main__" :
0 commit comments