xref: /linux/arch/powerpc/platforms/powernv/opal-hmi.c (revision 3bdab16c55f57a24245c97d707241dd9b48d1a91)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
4  *
5  * Copyright 2014 IBM Corporation
6  * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
7  */
8 
9 #undef DEBUG
10 
11 #include <linux/kernel.h>
12 #include <linux/init.h>
13 #include <linux/of.h>
14 #include <linux/mm.h>
15 #include <linux/slab.h>
16 
17 #include <asm/opal.h>
18 #include <asm/cputable.h>
19 #include <asm/machdep.h>
20 
21 #include "powernv.h"
22 
23 static int opal_hmi_handler_nb_init;
24 struct OpalHmiEvtNode {
25 	struct list_head list;
26 	struct OpalHMIEvent hmi_evt;
27 };
28 
29 struct xstop_reason {
30 	uint32_t xstop_reason;
31 	const char *unit_failed;
32 	const char *description;
33 };
34 
35 static LIST_HEAD(opal_hmi_evt_list);
36 static DEFINE_SPINLOCK(opal_hmi_evt_lock);
37 
38 static void print_core_checkstop_reason(const char *level,
39 					struct OpalHMIEvent *hmi_evt)
40 {
41 	int i;
42 	static const struct xstop_reason xstop_reason[] = {
43 		{ CORE_CHECKSTOP_IFU_REGFILE, "IFU",
44 				"RegFile core check stop" },
45 		{ CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
46 		{ CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
47 				"Core checkstop during recovery" },
48 		{ CORE_CHECKSTOP_ISU_REGFILE, "ISU",
49 				"RegFile core check stop (mapper error)" },
50 		{ CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
51 		{ CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
52 		{ CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
53 		{ CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
54 				"Recovery in maintenance mode" },
55 		{ CORE_CHECKSTOP_LSU_REGFILE, "LSU",
56 				"RegFile core check stop" },
57 		{ CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
58 				"Forward Progress Error" },
59 		{ CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
60 		{ CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
61 		{ CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
62 				"Hypervisor Resource error - core check stop" },
63 		{ CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
64 				"Hang Recovery Failed (core check stop)" },
65 		{ CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
66 				"Ambiguous Hang Detected (unknown source)" },
67 		{ CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
68 				"Debug Trigger Error inject" },
69 		{ CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
70 				"Hypervisor check stop via SPRC/SPRD" },
71 	};
72 
73 	/* Validity check */
74 	if (!hmi_evt->u.xstop_error.xstop_reason) {
75 		printk("%s	Unknown Core check stop.\n", level);
76 		return;
77 	}
78 
79 	printk("%s	CPU PIR: %08x\n", level,
80 			be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
81 	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
82 		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
83 					xstop_reason[i].xstop_reason)
84 			printk("%s	[Unit: %-3s] %s\n", level,
85 					xstop_reason[i].unit_failed,
86 					xstop_reason[i].description);
87 }
88 
89 static void print_nx_checkstop_reason(const char *level,
90 					struct OpalHMIEvent *hmi_evt)
91 {
92 	int i;
93 	static const struct xstop_reason xstop_reason[] = {
94 		{ NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
95 					"SHM invalid state error" },
96 		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
97 					"DMA invalid state error bit 15" },
98 		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
99 					"DMA invalid state error bit 16" },
100 		{ NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
101 					"Channel 0 invalid state error" },
102 		{ NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
103 					"Channel 1 invalid state error" },
104 		{ NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
105 					"Channel 2 invalid state error" },
106 		{ NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
107 					"Channel 3 invalid state error" },
108 		{ NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
109 					"Channel 4 invalid state error" },
110 		{ NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
111 					"Channel 5 invalid state error" },
112 		{ NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
113 					"Channel 6 invalid state error" },
114 		{ NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
115 					"Channel 7 invalid state error" },
116 		{ NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
117 					"UE error on CRB(CSB address, CCB)" },
118 		{ NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
119 					"SUE error on CRB(CSB address, CCB)" },
120 		{ NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
121 		"CRB Kill ISN received while holding ISN with UE error" },
122 	};
123 
124 	/* Validity check */
125 	if (!hmi_evt->u.xstop_error.xstop_reason) {
126 		printk("%s	Unknown NX check stop.\n", level);
127 		return;
128 	}
129 
130 	printk("%s	NX checkstop on CHIP ID: %x\n", level,
131 			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
132 	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
133 		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
134 					xstop_reason[i].xstop_reason)
135 			printk("%s	[Unit: %-3s] %s\n", level,
136 					xstop_reason[i].unit_failed,
137 					xstop_reason[i].description);
138 }
139 
140 static void print_checkstop_reason(const char *level,
141 					struct OpalHMIEvent *hmi_evt)
142 {
143 	uint8_t type = hmi_evt->u.xstop_error.xstop_type;
144 	switch (type) {
145 	case CHECKSTOP_TYPE_CORE:
146 		print_core_checkstop_reason(level, hmi_evt);
147 		break;
148 	case CHECKSTOP_TYPE_NX:
149 		print_nx_checkstop_reason(level, hmi_evt);
150 		break;
151 	default:
152 		printk("%s	Unknown Malfunction Alert of type %d\n",
153 		       level, type);
154 		break;
155 	}
156 }
157 
158 static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
159 {
160 	const char *level, *sevstr, *error_info;
161 	static const char *hmi_error_types[] = {
162 		"Malfunction Alert",
163 		"Processor Recovery done",
164 		"Processor recovery occurred again",
165 		"Processor recovery occurred for masked error",
166 		"Timer facility experienced an error",
167 		"TFMR SPR is corrupted",
168 		"UPS (Uninterrupted Power System) Overflow indication",
169 		"An XSCOM operation failure",
170 		"An XSCOM operation completed",
171 		"SCOM has set a reserved FIR bit to cause recovery",
172 		"Debug trigger has set a reserved FIR bit to cause recovery",
173 		"A hypervisor resource error occurred",
174 		"CAPP recovery process is in progress",
175 	};
176 
177 	/* Print things out */
178 	if (hmi_evt->version < OpalHMIEvt_V1) {
179 		pr_err("HMI Interrupt, Unknown event version %d !\n",
180 			hmi_evt->version);
181 		return;
182 	}
183 	switch (hmi_evt->severity) {
184 	case OpalHMI_SEV_NO_ERROR:
185 		level = KERN_INFO;
186 		sevstr = "Harmless";
187 		break;
188 	case OpalHMI_SEV_WARNING:
189 		level = KERN_WARNING;
190 		sevstr = "";
191 		break;
192 	case OpalHMI_SEV_ERROR_SYNC:
193 		level = KERN_ERR;
194 		sevstr = "Severe";
195 		break;
196 	case OpalHMI_SEV_FATAL:
197 	default:
198 		level = KERN_ERR;
199 		sevstr = "Fatal";
200 		break;
201 	}
202 
203 	printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
204 		level, sevstr,
205 		hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
206 		"Recovered" : "Not recovered");
207 	error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
208 			hmi_error_types[hmi_evt->type]
209 			: "Unknown";
210 	printk("%s Error detail: %s\n", level, error_info);
211 	printk("%s	HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
212 	if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
213 		(hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
214 		printk("%s	TFMR: %016llx\n", level,
215 						be64_to_cpu(hmi_evt->tfmr));
216 
217 	if (hmi_evt->version < OpalHMIEvt_V2)
218 		return;
219 
220 	/* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
221 	if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
222 		print_checkstop_reason(level, hmi_evt);
223 }
224 
225 static void hmi_event_handler(struct work_struct *work)
226 {
227 	unsigned long flags;
228 	struct OpalHMIEvent *hmi_evt;
229 	struct OpalHmiEvtNode *msg_node;
230 	uint8_t disposition;
231 	struct opal_msg msg;
232 	int unrecoverable = 0;
233 
234 	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
235 	while (!list_empty(&opal_hmi_evt_list)) {
236 		msg_node = list_entry(opal_hmi_evt_list.next,
237 					   struct OpalHmiEvtNode, list);
238 		list_del(&msg_node->list);
239 		spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
240 
241 		hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
242 		print_hmi_event_info(hmi_evt);
243 		disposition = hmi_evt->disposition;
244 		kfree(msg_node);
245 
246 		/*
247 		 * Check if HMI event has been recovered or not. If not
248 		 * then kernel can't continue, we need to panic.
249 		 * But before we do that, display all the HMI event
250 		 * available on the list and set unrecoverable flag to 1.
251 		 */
252 		if (disposition != OpalHMI_DISPOSITION_RECOVERED)
253 			unrecoverable = 1;
254 
255 		spin_lock_irqsave(&opal_hmi_evt_lock, flags);
256 	}
257 	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
258 
259 	if (unrecoverable) {
260 		/* Pull all HMI events from OPAL before we panic. */
261 		while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
262 			u32 type;
263 
264 			type = be32_to_cpu(msg.msg_type);
265 
266 			/* skip if not HMI event */
267 			if (type != OPAL_MSG_HMI_EVT)
268 				continue;
269 
270 			/* HMI event info starts from param[0] */
271 			hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
272 			print_hmi_event_info(hmi_evt);
273 		}
274 
275 		pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");
276 	}
277 }
278 
279 static DECLARE_WORK(hmi_event_work, hmi_event_handler);
280 /*
281  * opal_handle_hmi_event - notifier handler that queues up HMI events
282  * to be preocessed later.
283  */
284 static int opal_handle_hmi_event(struct notifier_block *nb,
285 			  unsigned long msg_type, void *msg)
286 {
287 	unsigned long flags;
288 	struct OpalHMIEvent *hmi_evt;
289 	struct opal_msg *hmi_msg = msg;
290 	struct OpalHmiEvtNode *msg_node;
291 
292 	/* Sanity Checks */
293 	if (msg_type != OPAL_MSG_HMI_EVT)
294 		return 0;
295 
296 	/* HMI event info starts from param[0] */
297 	hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
298 
299 	/* Delay the logging of HMI events to workqueue. */
300 	msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
301 	if (!msg_node) {
302 		pr_err("HMI: out of memory, Opal message event not handled\n");
303 		return -ENOMEM;
304 	}
305 	memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(*hmi_evt));
306 
307 	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
308 	list_add(&msg_node->list, &opal_hmi_evt_list);
309 	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
310 
311 	schedule_work(&hmi_event_work);
312 	return 0;
313 }
314 
315 static struct notifier_block opal_hmi_handler_nb = {
316 	.notifier_call	= opal_handle_hmi_event,
317 	.next		= NULL,
318 	.priority	= 0,
319 };
320 
321 int __init opal_hmi_handler_init(void)
322 {
323 	int ret;
324 
325 	if (!opal_hmi_handler_nb_init) {
326 		ret = opal_message_notifier_register(
327 				OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
328 		if (ret) {
329 			pr_err("%s: Can't register OPAL event notifier (%d)\n",
330 			       __func__, ret);
331 			return ret;
332 		}
333 		opal_hmi_handler_nb_init = 1;
334 	}
335 	return 0;
336 }
337