1
1
// SPDX-License-Identifier: GPL-2.0
2
2
/* Copyright (c) 2024, Intel Corporation. */
3
3
4
- #include "health.h"
5
4
#include "ice.h"
5
+ #include "ice_adminq_cmd.h" /* for enum ice_aqc_health_status_elem */
6
+ #include "health.h"
6
7
7
8
/*
 * Put (obj)->name into fmsg, using the stringified field name as the key.
 * There must be no whitespace between the macro name and '(' or the macro
 * becomes object-like.
 */
#define ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, obj, name) \
	devlink_fmsg_put(fmsg, #name, (obj)->name)
9
10
11
+ #define ICE_HEALTH_STATUS_DATA_SIZE 2
12
+
13
+ struct ice_health_status {
14
+ enum ice_aqc_health_status code ;
15
+ const char * description ;
16
+ const char * solution ;
17
+ const char * data_label [ICE_HEALTH_STATUS_DATA_SIZE ];
18
+ };
19
+
20
+ /*
21
+ * In addition to the health status codes provided below, the firmware might
22
+ * generate Health Status Codes that are not pertinent to the end-user.
23
+ * For instance, Health Code 0x1002 is triggered when the command fails.
24
+ * Such codes should be disregarded by the end-user.
25
+ * The lookup table below must be kept sorted by code.
26
+ */
27
+
28
/*
 * Strings shared by several lookup entries.
 *
 * They are arrays rather than const pointers: an array name is an address
 * constant, so it is a valid initializer for objects with static storage
 * duration on every compiler, whereas the value of a const pointer variable
 * is only accepted there as a compiler extension.
 */
static const char ice_common_port_solutions[] =
	"Check your cable connection. Change or replace the module or cable. Manually set speed and duplex.";
static const char ice_port_number_label[] = "Port Number";
static const char ice_update_nvm_solution[] = "Update to the latest NVM image.";
32
+
33
+ static const struct ice_health_status ice_health_status_lookup [] = {
34
+ {ICE_AQC_HEALTH_STATUS_ERR_UNKNOWN_MOD_STRICT , "An unsupported module was detected." ,
35
+ ice_common_port_solutions , {ice_port_number_label }},
36
+ {ICE_AQC_HEALTH_STATUS_ERR_MOD_TYPE , "Module type is not supported." ,
37
+ "Change or replace the module or cable." , {ice_port_number_label }},
38
+ {ICE_AQC_HEALTH_STATUS_ERR_MOD_QUAL , "Module is not qualified." ,
39
+ ice_common_port_solutions , {ice_port_number_label }},
40
+ {ICE_AQC_HEALTH_STATUS_ERR_MOD_COMM ,
41
+ "Device cannot communicate with the module." ,
42
+ "Check your cable connection. Change or replace the module or cable. Manually set speed and duplex." ,
43
+ {ice_port_number_label }},
44
+ {ICE_AQC_HEALTH_STATUS_ERR_MOD_CONFLICT , "Unresolved module conflict." ,
45
+ "Manually set speed/duplex or change the port option. If the problem persists, use a cable/module that is found in the supported modules and cables list for this device." ,
46
+ {ice_port_number_label }},
47
+ {ICE_AQC_HEALTH_STATUS_ERR_MOD_NOT_PRESENT , "Module is not present." ,
48
+ "Check that the module is inserted correctly. If the problem persists, use a cable/module that is found in the supported modules and cables list for this device." ,
49
+ {ice_port_number_label }},
50
+ {ICE_AQC_HEALTH_STATUS_INFO_MOD_UNDERUTILIZED , "Underutilized module." ,
51
+ "Change or replace the module or cable. Change the port option." ,
52
+ {ice_port_number_label }},
53
+ {ICE_AQC_HEALTH_STATUS_ERR_UNKNOWN_MOD_LENIENT , "An unsupported module was detected." ,
54
+ ice_common_port_solutions , {ice_port_number_label }},
55
+ {ICE_AQC_HEALTH_STATUS_ERR_INVALID_LINK_CFG , "Invalid link configuration." ,
56
+ NULL , {ice_port_number_label }},
57
+ {ICE_AQC_HEALTH_STATUS_ERR_PORT_ACCESS , "Port hardware access error." ,
58
+ ice_update_nvm_solution , {ice_port_number_label }},
59
+ {ICE_AQC_HEALTH_STATUS_ERR_PORT_UNREACHABLE , "A port is unreachable." ,
60
+ "Change the port option. Update to the latest NVM image." },
61
+ {ICE_AQC_HEALTH_STATUS_INFO_PORT_SPEED_MOD_LIMITED , "Port speed is limited due to module." ,
62
+ "Change the module or configure the port option to match the current module speed. Change the port option." ,
63
+ {ice_port_number_label }},
64
+ {ICE_AQC_HEALTH_STATUS_ERR_PARALLEL_FAULT ,
65
+ "All configured link modes were attempted but failed to establish link. The device will restart the process to establish link." ,
66
+ "Check link partner connection and configuration." ,
67
+ {ice_port_number_label }},
68
+ {ICE_AQC_HEALTH_STATUS_INFO_PORT_SPEED_PHY_LIMITED ,
69
+ "Port speed is limited by PHY capabilities." ,
70
+ "Change the module to align to port option." , {ice_port_number_label }},
71
+ {ICE_AQC_HEALTH_STATUS_ERR_NETLIST_TOPO , "LOM topology netlist is corrupted." ,
72
+ ice_update_nvm_solution , {ice_port_number_label }},
73
+ {ICE_AQC_HEALTH_STATUS_ERR_NETLIST , "Unrecoverable netlist error." ,
74
+ ice_update_nvm_solution , {ice_port_number_label }},
75
+ {ICE_AQC_HEALTH_STATUS_ERR_TOPO_CONFLICT , "Port topology conflict." ,
76
+ "Change the port option. Update to the latest NVM image." },
77
+ {ICE_AQC_HEALTH_STATUS_ERR_LINK_HW_ACCESS , "Unrecoverable hardware access error." ,
78
+ ice_update_nvm_solution , {ice_port_number_label }},
79
+ {ICE_AQC_HEALTH_STATUS_ERR_LINK_RUNTIME , "Unrecoverable runtime error." ,
80
+ ice_update_nvm_solution , {ice_port_number_label }},
81
+ {ICE_AQC_HEALTH_STATUS_ERR_DNL_INIT , "Link management engine failed to initialize." ,
82
+ ice_update_nvm_solution , {ice_port_number_label }},
83
+ {ICE_AQC_HEALTH_STATUS_ERR_PHY_FW_LOAD ,
84
+ "Failed to load the firmware image in the external PHY." ,
85
+ ice_update_nvm_solution , {ice_port_number_label }},
86
+ {ICE_AQC_HEALTH_STATUS_INFO_RECOVERY , "The device is in firmware recovery mode." ,
87
+ ice_update_nvm_solution , {"Extended Error" }},
88
+ {ICE_AQC_HEALTH_STATUS_ERR_FLASH_ACCESS , "The flash chip cannot be accessed." ,
89
+ "If issue persists, call customer support." , {"Access Type" }},
90
+ {ICE_AQC_HEALTH_STATUS_ERR_NVM_AUTH , "NVM authentication failed." ,
91
+ ice_update_nvm_solution },
92
+ {ICE_AQC_HEALTH_STATUS_ERR_OROM_AUTH , "Option ROM authentication failed." ,
93
+ ice_update_nvm_solution },
94
+ {ICE_AQC_HEALTH_STATUS_ERR_DDP_AUTH , "DDP package authentication failed." ,
95
+ "Update to latest base driver and DDP package." },
96
+ {ICE_AQC_HEALTH_STATUS_ERR_NVM_COMPAT , "NVM image is incompatible." ,
97
+ ice_update_nvm_solution },
98
+ {ICE_AQC_HEALTH_STATUS_ERR_OROM_COMPAT , "Option ROM is incompatible." ,
99
+ ice_update_nvm_solution , {"Expected PCI Device ID" , "Expected Module ID" }},
100
+ {ICE_AQC_HEALTH_STATUS_ERR_DCB_MIB ,
101
+ "Supplied MIB file is invalid. DCB reverted to default configuration." ,
102
+ "Disable FW-LLDP and check DCBx system configuration." ,
103
+ {ice_port_number_label , "MIB ID" }},
104
+ };
105
+
106
+ static int ice_health_status_lookup_compare (const void * a , const void * b )
107
+ {
108
+ return ((struct ice_health_status * )a )-> code - ((struct ice_health_status * )b )-> code ;
109
+ }
110
+
111
+ static const struct ice_health_status * ice_get_health_status (u16 code )
112
+ {
113
+ struct ice_health_status key = { .code = code };
114
+
115
+ return bsearch (& key , ice_health_status_lookup , ARRAY_SIZE (ice_health_status_lookup ),
116
+ sizeof (struct ice_health_status ), ice_health_status_lookup_compare );
117
+ }
118
+
119
+ static void ice_describe_status_code (struct devlink_fmsg * fmsg ,
120
+ struct ice_aqc_health_status_elem * hse )
121
+ {
122
+ static const char * const aux_label [] = { "Aux Data 1" , "Aux Data 2" };
123
+ const struct ice_health_status * health_code ;
124
+ u32 internal_data [2 ];
125
+ u16 status_code ;
126
+
127
+ status_code = le16_to_cpu (hse -> health_status_code );
128
+
129
+ devlink_fmsg_put (fmsg , "Syndrome" , status_code );
130
+ if (status_code ) {
131
+ internal_data [0 ] = le32_to_cpu (hse -> internal_data1 );
132
+ internal_data [1 ] = le32_to_cpu (hse -> internal_data2 );
133
+
134
+ health_code = ice_get_health_status (status_code );
135
+ if (!health_code )
136
+ return ;
137
+
138
+ devlink_fmsg_string_pair_put (fmsg , "Description" , health_code -> description );
139
+ if (health_code -> solution )
140
+ devlink_fmsg_string_pair_put (fmsg , "Possible Solution" ,
141
+ health_code -> solution );
142
+
143
+ for (size_t i = 0 ; i < ICE_HEALTH_STATUS_DATA_SIZE ; i ++ ) {
144
+ if (internal_data [i ] != ICE_AQC_HEALTH_STATUS_UNDEFINED_DATA )
145
+ devlink_fmsg_u32_pair_put (fmsg ,
146
+ health_code -> data_label [i ] ?
147
+ health_code -> data_label [i ] :
148
+ aux_label [i ],
149
+ internal_data [i ]);
150
+ }
151
+ }
152
+ }
153
+
154
+ static int
155
+ ice_port_reporter_diagnose (struct devlink_health_reporter * reporter , struct devlink_fmsg * fmsg ,
156
+ struct netlink_ext_ack * extack )
157
+ {
158
+ struct ice_pf * pf = devlink_health_reporter_priv (reporter );
159
+
160
+ ice_describe_status_code (fmsg , & pf -> health_reporters .port_status );
161
+ return 0 ;
162
+ }
163
+
164
+ static int
165
+ ice_port_reporter_dump (struct devlink_health_reporter * reporter , struct devlink_fmsg * fmsg ,
166
+ void * priv_ctx , struct netlink_ext_ack __always_unused * extack )
167
+ {
168
+ struct ice_pf * pf = devlink_health_reporter_priv (reporter );
169
+
170
+ ice_describe_status_code (fmsg , & pf -> health_reporters .port_status );
171
+ return 0 ;
172
+ }
173
+
174
+ static int
175
+ ice_fw_reporter_diagnose (struct devlink_health_reporter * reporter , struct devlink_fmsg * fmsg ,
176
+ struct netlink_ext_ack * extack )
177
+ {
178
+ struct ice_pf * pf = devlink_health_reporter_priv (reporter );
179
+
180
+ ice_describe_status_code (fmsg , & pf -> health_reporters .fw_status );
181
+ return 0 ;
182
+ }
183
+
184
+ static int
185
+ ice_fw_reporter_dump (struct devlink_health_reporter * reporter , struct devlink_fmsg * fmsg ,
186
+ void * priv_ctx , struct netlink_ext_ack * extack )
187
+ {
188
+ struct ice_pf * pf = devlink_health_reporter_priv (reporter );
189
+
190
+ ice_describe_status_code (fmsg , & pf -> health_reporters .fw_status );
191
+ return 0 ;
192
+ }
193
+
194
+ static void ice_config_health_events (struct ice_pf * pf , bool enable )
195
+ {
196
+ u8 enable_bits = 0 ;
197
+ int ret ;
198
+
199
+ if (enable )
200
+ enable_bits = ICE_AQC_HEALTH_STATUS_SET_PF_SPECIFIC_MASK |
201
+ ICE_AQC_HEALTH_STATUS_SET_GLOBAL_MASK ;
202
+
203
+ ret = ice_aq_set_health_status_cfg (& pf -> hw , enable_bits );
204
+ if (ret )
205
+ dev_err (ice_pf_to_dev (pf ), "Failed to %s firmware health events, err %d aq_err %s\n" ,
206
+ str_enable_disable (enable ), ret ,
207
+ ice_aq_str (pf -> hw .adminq .sq_last_status ));
208
+ }
209
+
210
+ /**
211
+ * ice_process_health_status_event - Process the health status event from FW
212
+ * @pf: pointer to the PF structure
213
+ * @event: event structure containing the Health Status Event opcode
214
+ *
215
+ * Decode the Health Status Events and print the associated messages
216
+ */
217
+ void ice_process_health_status_event (struct ice_pf * pf , struct ice_rq_event_info * event )
218
+ {
219
+ const struct ice_aqc_health_status_elem * health_info ;
220
+ u16 count ;
221
+
222
+ health_info = (struct ice_aqc_health_status_elem * )event -> msg_buf ;
223
+ count = le16_to_cpu (event -> desc .params .get_health_status .health_status_count );
224
+
225
+ if (count > (event -> buf_len / sizeof (* health_info ))) {
226
+ dev_err (ice_pf_to_dev (pf ), "Received a health status event with invalid element count\n" );
227
+ return ;
228
+ }
229
+
230
+ for (size_t i = 0 ; i < count ; i ++ ) {
231
+ const struct ice_health_status * health_code ;
232
+ u16 status_code ;
233
+
234
+ status_code = le16_to_cpu (health_info -> health_status_code );
235
+ health_code = ice_get_health_status (status_code );
236
+
237
+ if (health_code ) {
238
+ switch (le16_to_cpu (health_info -> event_source )) {
239
+ case ICE_AQC_HEALTH_STATUS_GLOBAL :
240
+ pf -> health_reporters .fw_status = * health_info ;
241
+ devlink_health_report (pf -> health_reporters .fw ,
242
+ "FW syndrome reported" , NULL );
243
+ break ;
244
+ case ICE_AQC_HEALTH_STATUS_PF :
245
+ case ICE_AQC_HEALTH_STATUS_PORT :
246
+ pf -> health_reporters .port_status = * health_info ;
247
+ devlink_health_report (pf -> health_reporters .port ,
248
+ "Port syndrome reported" , NULL );
249
+ break ;
250
+ default :
251
+ dev_err (ice_pf_to_dev (pf ), "Health code with unknown source\n" );
252
+ }
253
+ } else {
254
+ u32 data1 , data2 ;
255
+ u16 source ;
256
+
257
+ source = le16_to_cpu (health_info -> event_source );
258
+ data1 = le32_to_cpu (health_info -> internal_data1 );
259
+ data2 = le32_to_cpu (health_info -> internal_data2 );
260
+ dev_dbg (ice_pf_to_dev (pf ),
261
+ "Received internal health status code 0x%08x, source: 0x%08x, data1: 0x%08x, data2: 0x%08x" ,
262
+ status_code , source , data1 , data2 );
263
+ }
264
+ health_info ++ ;
265
+ }
266
+ }
267
+
10
268
/**
11
269
* ice_devlink_health_report - boilerplate to call given @reporter
12
270
*
@@ -203,14 +461,26 @@ ice_init_devlink_rep(struct ice_pf *pf,
203
461
return rep ;
204
462
}
205
463
206
- #define ICE_DEFINE_HEALTH_REPORTER_OPS (_name ) \
207
- static const struct devlink_health_reporter_ops ice_ ## _name ## _reporter_ops = { \
464
+ #define ICE_HEALTH_REPORTER_OPS_FIELD (_name , _field ) \
465
+ ._field = ice_##_name##_reporter_##_field,
466
+
467
+ #define ICE_DEFINE_HEALTH_REPORTER_OPS_1 (_name , _field1 ) \
468
+ static const struct devlink_health_reporter_ops ice_##_name##_reporter_ops = { \
208
469
.name = #_name, \
209
- .dump = ice_ ## _name ## _reporter_dump, \
210
- }
470
+ ICE_HEALTH_REPORTER_OPS_FIELD(_name, _field1) \
471
+ }
472
+
473
+ #define ICE_DEFINE_HEALTH_REPORTER_OPS_2 (_name , _field1 , _field2 ) \
474
+ static const struct devlink_health_reporter_ops ice_##_name##_reporter_ops = { \
475
+ .name = #_name, \
476
+ ICE_HEALTH_REPORTER_OPS_FIELD(_name, _field1) \
477
+ ICE_HEALTH_REPORTER_OPS_FIELD(_name, _field2) \
478
+ }
211
479
212
- ICE_DEFINE_HEALTH_REPORTER_OPS (mdd );
213
- ICE_DEFINE_HEALTH_REPORTER_OPS (tx_hang );
480
+ ICE_DEFINE_HEALTH_REPORTER_OPS_1 (mdd , dump );
481
+ ICE_DEFINE_HEALTH_REPORTER_OPS_1 (tx_hang , dump );
482
+ ICE_DEFINE_HEALTH_REPORTER_OPS_2 (fw , dump , diagnose );
483
+ ICE_DEFINE_HEALTH_REPORTER_OPS_2 (port , dump , diagnose );
214
484
215
485
/**
216
486
* ice_health_init - allocate and init all ice devlink health reporters and
@@ -224,6 +494,12 @@ void ice_health_init(struct ice_pf *pf)
224
494
225
495
reps -> mdd = ice_init_devlink_rep (pf , & ice_mdd_reporter_ops );
226
496
reps -> tx_hang = ice_init_devlink_rep (pf , & ice_tx_hang_reporter_ops );
497
+
498
+ if (ice_is_fw_health_report_supported (& pf -> hw )) {
499
+ reps -> fw = ice_init_devlink_rep (pf , & ice_fw_reporter_ops );
500
+ reps -> port = ice_init_devlink_rep (pf , & ice_port_reporter_ops );
501
+ ice_config_health_events (pf , true);
502
+ }
227
503
}
228
504
229
505
/**
@@ -246,6 +522,11 @@ void ice_health_deinit(struct ice_pf *pf)
246
522
{
247
523
ice_deinit_devl_reporter (pf -> health_reporters .mdd );
248
524
ice_deinit_devl_reporter (pf -> health_reporters .tx_hang );
525
+ if (ice_is_fw_health_report_supported (& pf -> hw )) {
526
+ ice_deinit_devl_reporter (pf -> health_reporters .fw );
527
+ ice_deinit_devl_reporter (pf -> health_reporters .port );
528
+ ice_config_health_events (pf , false);
529
+ }
249
530
}
250
531
251
532
static
0 commit comments