xref: /linux/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c (revision cbdb1f163af2bb90d01be1f0263df1d8d5c9d9d3)
1 /*
2  * Copyright (c) 2017, Mellanox Technologies. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 #include <linux/etherdevice.h>
34 #include <linux/mlx5/driver.h>
35 
36 #include "mlx5_core.h"
37 #include "lib/mlx5.h"
38 #include "lib/eq.h"
39 #include "fpga/core.h"
40 #include "fpga/conn.h"
41 
/* Human-readable names for FPGA error syndromes; the table is indexed
 * directly by the syndrome value carried in the error event.
 */
static const char *const mlx5_fpga_error_strings[] = {
	[0] = "Null Syndrome",
	[1] = "Corrupted DDR",
	[2] = "Flash Timeout",
	[3] = "Internal Link Error",
	[4] = "Watchdog HW Failure",
	[5] = "I2C Failure",
	[6] = "Image Changed",
	[7] = "Temperature Critical",
};
52 
/* Human-readable names for FPGA QP error syndromes; the table is indexed
 * directly by the syndrome value carried in the QP error event.
 */
static const char *const mlx5_fpga_qp_error_strings[] = {
	[0] = "Null Syndrome",
	[1] = "Retry Counter Expired",
	[2] = "RNR Expired",
};
58 static struct mlx5_fpga_device *mlx5_fpga_device_alloc(void)
59 {
60 	struct mlx5_fpga_device *fdev = NULL;
61 
62 	fdev = kzalloc(sizeof(*fdev), GFP_KERNEL);
63 	if (!fdev)
64 		return NULL;
65 
66 	spin_lock_init(&fdev->state_lock);
67 	fdev->state = MLX5_FPGA_STATUS_NONE;
68 	return fdev;
69 }
70 
71 static const char *mlx5_fpga_image_name(enum mlx5_fpga_image image)
72 {
73 	switch (image) {
74 	case MLX5_FPGA_IMAGE_USER:
75 		return "user";
76 	case MLX5_FPGA_IMAGE_FACTORY:
77 		return "factory";
78 	default:
79 		return "unknown";
80 	}
81 }
82 
83 static const char *mlx5_fpga_name(u32 fpga_id)
84 {
85 	static char ret[32];
86 
87 	switch (fpga_id) {
88 	case MLX5_FPGA_NEWTON:
89 		return "Newton";
90 	case MLX5_FPGA_EDISON:
91 		return "Edison";
92 	case MLX5_FPGA_MORSE:
93 		return "Morse";
94 	case MLX5_FPGA_MORSEQ:
95 		return "MorseQ";
96 	}
97 
98 	snprintf(ret, sizeof(ret), "Unknown %d", fpga_id);
99 	return ret;
100 }
101 
102 static int mlx5_is_fpga_lookaside(u32 fpga_id)
103 {
104 	return fpga_id != MLX5_FPGA_NEWTON && fpga_id != MLX5_FPGA_EDISON;
105 }
106 
107 static int mlx5_fpga_device_load_check(struct mlx5_fpga_device *fdev)
108 {
109 	struct mlx5_fpga_query query;
110 	int err;
111 
112 	err = mlx5_fpga_query(fdev->mdev, &query);
113 	if (err) {
114 		mlx5_fpga_err(fdev, "Failed to query status: %d\n", err);
115 		return err;
116 	}
117 
118 	fdev->last_admin_image = query.admin_image;
119 	fdev->last_oper_image = query.oper_image;
120 
121 	mlx5_fpga_info(fdev, "Status %u; Admin image %u; Oper image %u\n",
122 		       query.status, query.admin_image, query.oper_image);
123 
124 	/* for FPGA lookaside projects FPGA load status is not important */
125 	if (mlx5_is_fpga_lookaside(MLX5_CAP_FPGA(fdev->mdev, fpga_id)))
126 		return 0;
127 
128 	if (query.status != MLX5_FPGA_STATUS_SUCCESS) {
129 		mlx5_fpga_err(fdev, "%s image failed to load; status %u\n",
130 			      mlx5_fpga_image_name(fdev->last_oper_image),
131 			      query.status);
132 		return -EIO;
133 	}
134 
135 	return 0;
136 }
137 
138 static int mlx5_fpga_device_brb(struct mlx5_fpga_device *fdev)
139 {
140 	int err;
141 	struct mlx5_core_dev *mdev = fdev->mdev;
142 
143 	err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_SANDBOX_BYPASS_ON);
144 	if (err) {
145 		mlx5_fpga_err(fdev, "Failed to set bypass on: %d\n", err);
146 		return err;
147 	}
148 	err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_RESET_SANDBOX);
149 	if (err) {
150 		mlx5_fpga_err(fdev, "Failed to reset SBU: %d\n", err);
151 		return err;
152 	}
153 	err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_SANDBOX_BYPASS_OFF);
154 	if (err) {
155 		mlx5_fpga_err(fdev, "Failed to set bypass off: %d\n", err);
156 		return err;
157 	}
158 	return 0;
159 }
160 
/* Defined further down; declared here for the notifier callbacks below. */
static int mlx5_fpga_event(struct mlx5_fpga_device *fdev,
			   unsigned long event, void *eqe);
162 
163 static int fpga_err_event(struct notifier_block *nb, unsigned long event, void *eqe)
164 {
165 	struct mlx5_fpga_device *fdev = mlx5_nb_cof(nb, struct mlx5_fpga_device, fpga_err_nb);
166 
167 	return mlx5_fpga_event(fdev, event, eqe);
168 }
169 
170 static int fpga_qp_err_event(struct notifier_block *nb, unsigned long event, void *eqe)
171 {
172 	struct mlx5_fpga_device *fdev = mlx5_nb_cof(nb, struct mlx5_fpga_device, fpga_qp_err_nb);
173 
174 	return mlx5_fpga_event(fdev, event, eqe);
175 }
176 
177 int mlx5_fpga_device_start(struct mlx5_core_dev *mdev)
178 {
179 	struct mlx5_fpga_device *fdev = mdev->fpga;
180 	unsigned int max_num_qps;
181 	unsigned long flags;
182 	u32 fpga_id;
183 	int err;
184 
185 	if (!fdev)
186 		return 0;
187 
188 	err = mlx5_fpga_caps(fdev->mdev);
189 	if (err)
190 		goto out;
191 
192 	err = mlx5_fpga_device_load_check(fdev);
193 	if (err)
194 		goto out;
195 
196 	fpga_id = MLX5_CAP_FPGA(fdev->mdev, fpga_id);
197 	mlx5_fpga_info(fdev, "FPGA card %s:%u\n", mlx5_fpga_name(fpga_id), fpga_id);
198 
199 	/* No QPs if FPGA does not participate in net processing */
200 	if (mlx5_is_fpga_lookaside(fpga_id))
201 		goto out;
202 
203 	mlx5_fpga_info(fdev, "%s(%d): image, version %u; SBU %06x:%04x version %d\n",
204 		       mlx5_fpga_image_name(fdev->last_oper_image),
205 		       fdev->last_oper_image,
206 		       MLX5_CAP_FPGA(fdev->mdev, image_version),
207 		       MLX5_CAP_FPGA(fdev->mdev, ieee_vendor_id),
208 		       MLX5_CAP_FPGA(fdev->mdev, sandbox_product_id),
209 		       MLX5_CAP_FPGA(fdev->mdev, sandbox_product_version));
210 
211 	max_num_qps = MLX5_CAP_FPGA(mdev, shell_caps.max_num_qps);
212 	if (!max_num_qps) {
213 		mlx5_fpga_err(fdev, "FPGA reports 0 QPs in SHELL_CAPS\n");
214 		err = -ENOTSUPP;
215 		goto out;
216 	}
217 
218 	err = mlx5_core_reserve_gids(mdev, max_num_qps);
219 	if (err)
220 		goto out;
221 
222 	MLX5_NB_INIT(&fdev->fpga_err_nb, fpga_err_event, FPGA_ERROR);
223 	MLX5_NB_INIT(&fdev->fpga_qp_err_nb, fpga_qp_err_event, FPGA_QP_ERROR);
224 	mlx5_eq_notifier_register(fdev->mdev, &fdev->fpga_err_nb);
225 	mlx5_eq_notifier_register(fdev->mdev, &fdev->fpga_qp_err_nb);
226 
227 	err = mlx5_fpga_conn_device_init(fdev);
228 	if (err)
229 		goto err_rsvd_gid;
230 
231 	if (fdev->last_oper_image == MLX5_FPGA_IMAGE_USER) {
232 		err = mlx5_fpga_device_brb(fdev);
233 		if (err)
234 			goto err_conn_init;
235 	}
236 
237 	goto out;
238 
239 err_conn_init:
240 	mlx5_fpga_conn_device_cleanup(fdev);
241 
242 err_rsvd_gid:
243 	mlx5_eq_notifier_unregister(fdev->mdev, &fdev->fpga_err_nb);
244 	mlx5_eq_notifier_unregister(fdev->mdev, &fdev->fpga_qp_err_nb);
245 	mlx5_core_unreserve_gids(mdev, max_num_qps);
246 out:
247 	spin_lock_irqsave(&fdev->state_lock, flags);
248 	fdev->state = err ? MLX5_FPGA_STATUS_FAILURE : MLX5_FPGA_STATUS_SUCCESS;
249 	spin_unlock_irqrestore(&fdev->state_lock, flags);
250 	return err;
251 }
252 
253 int mlx5_fpga_init(struct mlx5_core_dev *mdev)
254 {
255 	struct mlx5_fpga_device *fdev = NULL;
256 
257 	if (!MLX5_CAP_GEN(mdev, fpga)) {
258 		mlx5_core_dbg(mdev, "FPGA capability not present\n");
259 		return 0;
260 	}
261 
262 	mlx5_core_dbg(mdev, "Initializing FPGA\n");
263 
264 	fdev = mlx5_fpga_device_alloc();
265 	if (!fdev)
266 		return -ENOMEM;
267 
268 	fdev->mdev = mdev;
269 	mdev->fpga = fdev;
270 
271 	return 0;
272 }
273 
274 void mlx5_fpga_device_stop(struct mlx5_core_dev *mdev)
275 {
276 	struct mlx5_fpga_device *fdev = mdev->fpga;
277 	unsigned int max_num_qps;
278 	unsigned long flags;
279 	int err;
280 
281 	if (!fdev)
282 		return;
283 
284 	if (mlx5_is_fpga_lookaside(MLX5_CAP_FPGA(fdev->mdev, fpga_id)))
285 		return;
286 
287 	spin_lock_irqsave(&fdev->state_lock, flags);
288 	if (fdev->state != MLX5_FPGA_STATUS_SUCCESS) {
289 		spin_unlock_irqrestore(&fdev->state_lock, flags);
290 		return;
291 	}
292 	fdev->state = MLX5_FPGA_STATUS_NONE;
293 	spin_unlock_irqrestore(&fdev->state_lock, flags);
294 
295 	if (fdev->last_oper_image == MLX5_FPGA_IMAGE_USER) {
296 		err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_SANDBOX_BYPASS_ON);
297 		if (err)
298 			mlx5_fpga_err(fdev, "Failed to re-set SBU bypass on: %d\n",
299 				      err);
300 	}
301 
302 	mlx5_fpga_conn_device_cleanup(fdev);
303 	mlx5_eq_notifier_unregister(fdev->mdev, &fdev->fpga_err_nb);
304 	mlx5_eq_notifier_unregister(fdev->mdev, &fdev->fpga_qp_err_nb);
305 
306 	max_num_qps = MLX5_CAP_FPGA(mdev, shell_caps.max_num_qps);
307 	mlx5_core_unreserve_gids(mdev, max_num_qps);
308 }
309 
310 void mlx5_fpga_cleanup(struct mlx5_core_dev *mdev)
311 {
312 	struct mlx5_fpga_device *fdev = mdev->fpga;
313 
314 	mlx5_fpga_device_stop(mdev);
315 	kfree(fdev);
316 	mdev->fpga = NULL;
317 }
318 
319 static const char *mlx5_fpga_syndrome_to_string(u8 syndrome)
320 {
321 	if (syndrome < ARRAY_SIZE(mlx5_fpga_error_strings))
322 		return mlx5_fpga_error_strings[syndrome];
323 	return "Unknown";
324 }
325 
326 static const char *mlx5_fpga_qp_syndrome_to_string(u8 syndrome)
327 {
328 	if (syndrome < ARRAY_SIZE(mlx5_fpga_qp_error_strings))
329 		return mlx5_fpga_qp_error_strings[syndrome];
330 	return "Unknown";
331 }
332 
333 static int mlx5_fpga_event(struct mlx5_fpga_device *fdev,
334 			   unsigned long event, void *eqe)
335 {
336 	void *data = ((struct mlx5_eqe *)eqe)->data.raw;
337 	const char *event_name;
338 	bool teardown = false;
339 	unsigned long flags;
340 	u8 syndrome;
341 
342 	switch (event) {
343 	case MLX5_EVENT_TYPE_FPGA_ERROR:
344 		syndrome = MLX5_GET(fpga_error_event, data, syndrome);
345 		event_name = mlx5_fpga_syndrome_to_string(syndrome);
346 		break;
347 	case MLX5_EVENT_TYPE_FPGA_QP_ERROR:
348 		syndrome = MLX5_GET(fpga_qp_error_event, data, syndrome);
349 		event_name = mlx5_fpga_qp_syndrome_to_string(syndrome);
350 		break;
351 	default:
352 		return NOTIFY_DONE;
353 	}
354 
355 	spin_lock_irqsave(&fdev->state_lock, flags);
356 	switch (fdev->state) {
357 	case MLX5_FPGA_STATUS_SUCCESS:
358 		mlx5_fpga_warn(fdev, "Error %u: %s\n", syndrome, event_name);
359 		teardown = true;
360 		break;
361 	default:
362 		mlx5_fpga_warn_ratelimited(fdev, "Unexpected error event %u: %s\n",
363 					   syndrome, event_name);
364 	}
365 	spin_unlock_irqrestore(&fdev->state_lock, flags);
366 	/* We tear-down the card's interfaces and functionality because
367 	 * the FPGA bump-on-the-wire is misbehaving and we lose ability
368 	 * to communicate with the network. User may still be able to
369 	 * recover by re-programming or debugging the FPGA
370 	 */
371 	if (teardown)
372 		mlx5_trigger_health_work(fdev->mdev);
373 
374 	return NOTIFY_OK;
375 }
376