xref: /linux/drivers/net/ethernet/mellanox/mlx5/core/en/health.c (revision 3503d56cc7233ced602e38a4c13caa64f00ab2aa)
1 // SPDX-License-Identifier: GPL-2.0
2 // Copyright (c) 2019 Mellanox Technologies.
3 
4 #include "health.h"
5 #include "lib/eq.h"
6 #include "lib/mlx5.h"
7 
/*
 * Open a named object in @fmsg: start a pair nest labelled @name and then
 * start an object nest inside it.
 *
 * Returns 0 on success, otherwise the error code of the first devlink fmsg
 * call that failed.
 */
int mlx5e_reporter_named_obj_nest_start(struct devlink_fmsg *fmsg, char *name)
{
	int ret = devlink_fmsg_pair_nest_start(fmsg, name);

	if (ret)
		return ret;

	/* The object nest is only opened once the pair nest is in place. */
	return devlink_fmsg_obj_nest_start(fmsg);
}
22 
/*
 * Close a named object in @fmsg: end the object nest first, then the pair
 * nest that encloses it (the reverse of the order used to open them).
 *
 * Returns 0 on success, otherwise the error code of the first devlink fmsg
 * call that failed.
 */
int mlx5e_reporter_named_obj_nest_end(struct devlink_fmsg *fmsg)
{
	int ret = devlink_fmsg_obj_nest_end(fmsg);

	if (ret)
		return ret;

	return devlink_fmsg_pair_nest_end(fmsg);
}
37 
38 int mlx5e_reporter_cq_diagnose(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg)
39 {
40 	struct mlx5e_priv *priv = cq->channel->priv;
41 	u32 out[MLX5_ST_SZ_DW(query_cq_out)] = {};
42 	u8 hw_status;
43 	void *cqc;
44 	int err;
45 
46 	err = mlx5_core_query_cq(priv->mdev, &cq->mcq, out);
47 	if (err)
48 		return err;
49 
50 	cqc = MLX5_ADDR_OF(query_cq_out, out, cq_context);
51 	hw_status = MLX5_GET(cqc, cqc, status);
52 
53 	err = mlx5e_reporter_named_obj_nest_start(fmsg, "CQ");
54 	if (err)
55 		return err;
56 
57 	err = devlink_fmsg_u32_pair_put(fmsg, "cqn", cq->mcq.cqn);
58 	if (err)
59 		return err;
60 
61 	err = devlink_fmsg_u8_pair_put(fmsg, "HW status", hw_status);
62 	if (err)
63 		return err;
64 
65 	err = mlx5e_reporter_named_obj_nest_end(fmsg);
66 	if (err)
67 		return err;
68 
69 	return 0;
70 }
71 
72 int mlx5e_reporter_cq_common_diagnose(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg)
73 {
74 	u8 cq_log_stride;
75 	u32 cq_sz;
76 	int err;
77 
78 	cq_sz = mlx5_cqwq_get_size(&cq->wq);
79 	cq_log_stride = mlx5_cqwq_get_log_stride_size(&cq->wq);
80 
81 	err = mlx5e_reporter_named_obj_nest_start(fmsg, "CQ");
82 	if (err)
83 		return err;
84 
85 	err = devlink_fmsg_u64_pair_put(fmsg, "stride size", BIT(cq_log_stride));
86 	if (err)
87 		return err;
88 
89 	err = devlink_fmsg_u32_pair_put(fmsg, "size", cq_sz);
90 	if (err)
91 		return err;
92 
93 	err = mlx5e_reporter_named_obj_nest_end(fmsg);
94 	if (err)
95 		return err;
96 
97 	return 0;
98 }
99 
/*
 * Create the TX reporter and then the RX reporter for @priv.
 *
 * Returns 0 when both calls succeed, otherwise the error code of the call
 * that failed. If the RX creation fails, this function does not undo the
 * preceding TX creation.
 */
int mlx5e_health_create_reporters(struct mlx5e_priv *priv)
{
	int ret = mlx5e_reporter_tx_create(priv);

	if (ret)
		return ret;

	return mlx5e_reporter_rx_create(priv);
}
114 
/*
 * Destroy the health reporters of @priv. The RX reporter is destroyed
 * before the TX reporter, i.e. in the reverse of the order used by
 * mlx5e_health_create_reporters().
 */
void mlx5e_health_destroy_reporters(struct mlx5e_priv *priv)
{
	mlx5e_reporter_rx_destroy(priv);

	mlx5e_reporter_tx_destroy(priv);
}
120 
121 void mlx5e_health_channels_update(struct mlx5e_priv *priv)
122 {
123 	if (priv->tx_reporter)
124 		devlink_health_reporter_state_update(priv->tx_reporter,
125 						     DEVLINK_HEALTH_REPORTER_STATE_HEALTHY);
126 	if (priv->rx_reporter)
127 		devlink_health_reporter_state_update(priv->rx_reporter,
128 						     DEVLINK_HEALTH_REPORTER_STATE_HEALTHY);
129 }
130 
131 int mlx5e_health_sq_to_ready(struct mlx5e_channel *channel, u32 sqn)
132 {
133 	struct mlx5_core_dev *mdev = channel->mdev;
134 	struct net_device *dev = channel->netdev;
135 	struct mlx5e_modify_sq_param msp = {};
136 	int err;
137 
138 	msp.curr_state = MLX5_SQC_STATE_ERR;
139 	msp.next_state = MLX5_SQC_STATE_RST;
140 
141 	err = mlx5e_modify_sq(mdev, sqn, &msp);
142 	if (err) {
143 		netdev_err(dev, "Failed to move sq 0x%x to reset\n", sqn);
144 		return err;
145 	}
146 
147 	memset(&msp, 0, sizeof(msp));
148 	msp.curr_state = MLX5_SQC_STATE_RST;
149 	msp.next_state = MLX5_SQC_STATE_RDY;
150 
151 	err = mlx5e_modify_sq(mdev, sqn, &msp);
152 	if (err) {
153 		netdev_err(dev, "Failed to move sq 0x%x to ready\n", sqn);
154 		return err;
155 	}
156 
157 	return 0;
158 }
159 
160 int mlx5e_health_recover_channels(struct mlx5e_priv *priv)
161 {
162 	int err = 0;
163 
164 	rtnl_lock();
165 	mutex_lock(&priv->state_lock);
166 
167 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
168 		goto out;
169 
170 	err = mlx5e_safe_reopen_channels(priv);
171 
172 out:
173 	mutex_unlock(&priv->state_lock);
174 	rtnl_unlock();
175 
176 	return err;
177 }
178 
179 int mlx5e_health_channel_eq_recover(struct mlx5_eq_comp *eq, struct mlx5e_channel *channel)
180 {
181 	u32 eqe_count;
182 
183 	netdev_err(channel->netdev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n",
184 		   eq->core.eqn, eq->core.cons_index, eq->core.irqn);
185 
186 	eqe_count = mlx5_eq_poll_irq_disabled(eq);
187 	if (!eqe_count)
188 		return -EIO;
189 
190 	netdev_err(channel->netdev, "Recovered %d eqes on EQ 0x%x\n",
191 		   eqe_count, eq->core.eqn);
192 
193 	channel->stats->eq_rearm++;
194 	return 0;
195 }
196 
197 int mlx5e_health_report(struct mlx5e_priv *priv,
198 			struct devlink_health_reporter *reporter, char *err_str,
199 			struct mlx5e_err_ctx *err_ctx)
200 {
201 	netdev_err(priv->netdev, "%s\n", err_str);
202 
203 	if (!reporter)
204 		return err_ctx->recover(err_ctx->ctx);
205 
206 	return devlink_health_report(reporter, err_str, err_ctx);
207 }
208 
209 #define MLX5_HEALTH_DEVLINK_MAX_SIZE 1024
210 static int mlx5e_health_rsc_fmsg_binary(struct devlink_fmsg *fmsg,
211 					const void *value, u32 value_len)
212 
213 {
214 	u32 data_size;
215 	u32 offset;
216 	int err;
217 
218 	for (offset = 0; offset < value_len; offset += data_size) {
219 		data_size = value_len - offset;
220 		if (data_size > MLX5_HEALTH_DEVLINK_MAX_SIZE)
221 			data_size = MLX5_HEALTH_DEVLINK_MAX_SIZE;
222 		err = devlink_fmsg_binary_put(fmsg, value + offset, data_size);
223 		if (err)
224 			break;
225 	}
226 	return err;
227 }
228 
229 int mlx5e_health_rsc_fmsg_dump(struct mlx5e_priv *priv, struct mlx5_rsc_key *key,
230 			       struct devlink_fmsg *fmsg)
231 {
232 	struct mlx5_core_dev *mdev = priv->mdev;
233 	struct mlx5_rsc_dump_cmd *cmd;
234 	struct page *page;
235 	int cmd_err, err;
236 	int end_err;
237 	int size;
238 
239 	if (IS_ERR_OR_NULL(mdev->rsc_dump))
240 		return -EOPNOTSUPP;
241 
242 	page = alloc_page(GFP_KERNEL);
243 	if (!page)
244 		return -ENOMEM;
245 
246 	err = devlink_fmsg_binary_pair_nest_start(fmsg, "data");
247 	if (err)
248 		return err;
249 
250 	cmd = mlx5_rsc_dump_cmd_create(mdev, key);
251 	if (IS_ERR(cmd)) {
252 		err = PTR_ERR(cmd);
253 		goto free_page;
254 	}
255 
256 	do {
257 		cmd_err = mlx5_rsc_dump_next(mdev, cmd, page, &size);
258 		if (cmd_err < 0) {
259 			err = cmd_err;
260 			goto destroy_cmd;
261 		}
262 
263 		err = mlx5e_health_rsc_fmsg_binary(fmsg, page_address(page), size);
264 		if (err)
265 			goto destroy_cmd;
266 
267 	} while (cmd_err > 0);
268 
269 destroy_cmd:
270 	mlx5_rsc_dump_cmd_destroy(cmd);
271 	end_err = devlink_fmsg_binary_pair_nest_end(fmsg);
272 	if (end_err)
273 		err = end_err;
274 free_page:
275 	__free_page(page);
276 	return err;
277 }
278 
279 int mlx5e_health_queue_dump(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
280 			    int queue_idx, char *lbl)
281 {
282 	struct mlx5_rsc_key key = {};
283 	int err;
284 
285 	key.rsc = MLX5_SGMT_TYPE_FULL_QPC;
286 	key.index1 = queue_idx;
287 	key.size = PAGE_SIZE;
288 	key.num_of_obj1 = 1;
289 
290 	err = devlink_fmsg_obj_nest_start(fmsg);
291 	if (err)
292 		return err;
293 
294 	err = mlx5e_reporter_named_obj_nest_start(fmsg, lbl);
295 	if (err)
296 		return err;
297 
298 	err = devlink_fmsg_u32_pair_put(fmsg, "index", queue_idx);
299 	if (err)
300 		return err;
301 
302 	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
303 	if (err)
304 		return err;
305 
306 	err = mlx5e_reporter_named_obj_nest_end(fmsg);
307 	if (err)
308 		return err;
309 
310 	return devlink_fmsg_obj_nest_end(fmsg);
311 }
312