xref: /illumos-gate/usr/src/cmd/fm/modules/common/sensor-transport/sensor_transport.c (revision 581cede61ac9c14d8d4ea452562a567189eead78)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <fm/fmd_api.h>
27 #include <fm/libtopo.h>
28 #include <fm/topo_hc.h>
29 #include <fm/topo_mod.h>
30 #include <fm/topo_method.h>
31 
32 #include <sys/fm/protocol.h>
33 #include <sys/systeminfo.h>
34 
35 #include <string.h>
36 
/*
 * Event class of the ereport this module posts when a fan or power supply
 * is found to be faulted.
 */
#define	ST_EREPORT_CLASS	"ereport.sensor.failure"
38 
/*
 * One remembered FRU.  Entries live on the singly linked list headed by
 * sensor_transport_t.st_faults and carry the per-pass state used to decide
 * whether to post an ereport or request a repair.
 */
typedef struct sensor_fault {
	struct sensor_fault	*sf_next;	/* next entry, NULL at tail */
	char			*sf_fru;	/* FRU FMRI string (owned copy) */
	boolean_t		sf_last_faulted; /* sf_faulted from prior pass */
	boolean_t		sf_faulted;	/* failure seen in this pass */
	boolean_t		sf_unknown;	/* no sensor result this pass */
} sensor_fault_t;
46 
/*
 * Per-module state, allocated in _fmd_init() and attached to the fmd handle
 * with fmd_hdl_setspecific().
 */
typedef struct sensor_transport {
	fmd_hdl_t	*st_hdl;	/* owning fmd module handle */
	fmd_xprt_t	*st_xprt;	/* transport used to post ereports */
	hrtime_t	st_interval;	/* value of the "interval" property */
	id_t		st_timer;	/* id from last fmd_timer_install() */
	sensor_fault_t	*st_faults;	/* list of remembered FRUs */
	boolean_t	st_first;	/* B_TRUE until first full pass ends */
} sensor_transport_t;
55 
/*
 * Module statistics.  The structure is handed to fmd_stat_create() as an
 * array of fmd_stat_t, so it must contain nothing but fmd_stat_t members.
 */
typedef struct st_stats {
	fmd_stat_t st_bad_fmri;		/* resource/FRU FMRI lookup failures */
	fmd_stat_t st_topo_errs;	/* failed topology walks */
	fmd_stat_t st_repairs;		/* FRUs passed to fmd_repair_fru() */
} st_stats_t;
61 
/*
 * The statistics instance registered in _fmd_init().  Initializers are
 * positional and must stay in the same order as the members of st_stats_t.
 */
st_stats_t st_stats = {
	{ "bad_fmri", FMD_TYPE_UINT64, "bad or missing resource/FRU FMRI" },
	{ "topo_errors", FMD_TYPE_UINT64, "errors walking topology" },
	{ "repairs", FMD_TYPE_UINT64, "auto repairs" }
};
67 
68 static int
69 st_check_component(topo_hdl_t *thp, tnode_t *node, void *arg)
70 {
71 	sensor_transport_t *stp = arg;
72 	fmd_hdl_t *hdl = stp->st_hdl;
73 	const char *name = topo_node_name(node);
74 	nvlist_t *nvl, *props, *rsrc, *fru;
75 	char *fmri;
76 	int err;
77 	int32_t last_source, source = -1;
78 	boolean_t nonrecov, faulted, predictive, source_diff;
79 	nvpair_t *nvp;
80 	uint64_t ena;
81 	nvlist_t *event;
82 	sensor_fault_t *sfp, **current;
83 
84 	if (strcmp(name, FAN) != 0 && strcmp(name, PSU) != 0)
85 		return (0);
86 
87 	if (topo_method_invoke(node, TOPO_METH_SENSOR_FAILURE,
88 	    TOPO_METH_SENSOR_FAILURE_VERSION, NULL, &nvl, &err) != 0) {
89 		if (err == ETOPO_METHOD_NOTSUP) {
90 			fmd_hdl_debug(hdl, "Method %s not supported on %s=%d",
91 			    TOPO_METH_SENSOR_FAILURE, name,
92 			    topo_node_instance(node));
93 			return (0);
94 		}
95 		nvl = NULL;
96 	}
97 
98 	if (topo_node_resource(node, &rsrc, NULL) != 0) {
99 		st_stats.st_bad_fmri.fmds_value.ui64++;
100 		nvlist_free(nvl);
101 		return (0);
102 	}
103 
104 	if (topo_node_fru(node, &fru, NULL, NULL) != 0) {
105 		st_stats.st_bad_fmri.fmds_value.ui64++;
106 		nvlist_free(nvl);
107 		nvlist_free(rsrc);
108 		return (0);
109 	}
110 
111 	if (topo_fmri_nvl2str(thp, fru, &fmri, &err) != 0) {
112 		st_stats.st_bad_fmri.fmds_value.ui64++;
113 		nvlist_free(nvl);
114 		nvlist_free(fru);
115 		nvlist_free(rsrc);
116 		return (0);
117 	}
118 
119 	nvlist_free(fru);
120 
121 	faulted = nonrecov = source_diff = B_FALSE;
122 	predictive = B_TRUE;
123 	if (nvl != NULL)  {
124 		nvp = NULL;
125 		while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
126 			if (nvpair_value_nvlist(nvp, &props) != 0)
127 				continue;
128 
129 			faulted = B_TRUE;
130 
131 			/*
132 			 * We need some simple rules to handle the case where
133 			 * there are multiple facility nodes that indicate
134 			 * a problem with this FRU, but disagree on the values
135 			 * of nonrecov, predictive or source:
136 			 *
137 			 * 1) nonrecov will be set to true if one or more
138 			 *   facility nodes indicates true.  Otherwise it will
139 			 *   default to false
140 			 *
141 			 * 2) predictive will default to false and remain false
142 			 *    if one or more facility nodes indicate false.
143 			 *
144 			 * 3) source will be set to unknown unless all facility
145 			 *    nodes agree on the source
146 			 */
147 			if (nonrecov == B_FALSE)
148 				if (nvlist_lookup_boolean_value(props,
149 				    "nonrecov", &nonrecov) != 0)
150 					nonrecov = B_FALSE;
151 			if (predictive == B_TRUE)
152 				if (nvlist_lookup_boolean_value(props,
153 				    "predictive", &predictive) != 0)
154 					predictive = B_FALSE;
155 
156 			last_source = source;
157 			if (nvlist_lookup_uint32(props, "source",
158 			    (uint32_t *)&source) != 0)
159 				source = TOPO_SENSOR_ERRSRC_UNKNOWN;
160 			if (last_source != -1 && last_source != source)
161 				source_diff = B_TRUE;
162 		}
163 		if (source_diff)
164 			source = TOPO_SENSOR_ERRSRC_UNKNOWN;
165 	}
166 
167 	/*
168 	 * See if we know about this fru.
169 	 */
170 	for (current = &stp->st_faults; *current != NULL;
171 	    current = &(*current)->sf_next) {
172 		if (topo_fmri_strcmp(thp, fmri,
173 		    (*current)->sf_fru))
174 			break;
175 	}
176 
177 	sfp = *current;
178 	if (sfp == NULL) {
179 		/*
180 		 * We add this FRU to our list under two circumstances:
181 		 *
182 		 * 	1. This FRU is faulted and needs to be remembered to
183 		 *	   avoid duplicate ereports.
184 		 *
185 		 * 	2. This is the initial pass, and we want to repair the
186 		 *	   FRU in case it was repaired while we were offline.
187 		 */
188 		if (stp->st_first || faulted) {
189 			sfp = fmd_hdl_zalloc(hdl, sizeof (sensor_fault_t),
190 			    FMD_SLEEP);
191 			sfp->sf_fru = fmd_hdl_strdup(hdl, fmri, FMD_SLEEP);
192 			sfp->sf_next = stp->st_faults;
193 			stp->st_faults = sfp;
194 		} else {
195 			goto out;
196 		}
197 	}
198 
199 	if (nvl == NULL)
200 		sfp->sf_unknown = B_TRUE;
201 
202 	if (faulted) {
203 		/*
204 		 * Construct and post the ereport.
205 		 *
206 		 * XXFM we only post one ereport per fru.  It should be possible
207 		 * to uniquely identify faulty resources instead and post one
208 		 * per resource, even if they share the same FRU.
209 		 */
210 		if (!sfp->sf_last_faulted) {
211 			ena = fmd_event_ena_create(hdl);
212 			event = fmd_nvl_alloc(hdl, FMD_SLEEP);
213 
214 			(void) nvlist_add_string(event, "type", name);
215 			(void) nvlist_add_boolean_value(event, "nonrecov",
216 			    nonrecov);
217 			(void) nvlist_add_boolean_value(event, "predictive",
218 			    predictive);
219 			(void) nvlist_add_uint32(event, "source",
220 			    (uint32_t)source);
221 			(void) nvlist_add_nvlist(event, "details", nvl);
222 			(void) nvlist_add_string(event, FM_CLASS,
223 			    ST_EREPORT_CLASS);
224 			(void) nvlist_add_uint8(event, FM_VERSION,
225 			    FM_EREPORT_VERSION);
226 			(void) nvlist_add_uint64(event, FM_EREPORT_ENA, ena);
227 			(void) nvlist_add_nvlist(event, FM_EREPORT_DETECTOR,
228 			    rsrc);
229 
230 			fmd_xprt_post(hdl, stp->st_xprt, event, 0);
231 			fmd_hdl_debug(hdl, "posted ereport: %s",
232 			    ST_EREPORT_CLASS);
233 		}
234 
235 		sfp->sf_faulted = B_TRUE;
236 	}
237 
238 out:
239 	topo_hdl_strfree(thp, fmri);
240 	nvlist_free(rsrc);
241 	nvlist_free(nvl);
242 	return (0);
243 }
244 
245 /*ARGSUSED*/
246 static void
247 st_timeout(fmd_hdl_t *hdl, id_t id, void *data)
248 {
249 	sensor_transport_t *stp;
250 	sensor_fault_t *sfp, **current;
251 	topo_hdl_t *thp;
252 	topo_walk_t *twp;
253 	int err;
254 
255 	fmd_hdl_debug(hdl, "timeout: checking topology");
256 
257 	stp = fmd_hdl_getspecific(hdl);
258 	thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION);
259 
260 	if ((twp = topo_walk_init(thp, FM_FMRI_SCHEME_HC, st_check_component,
261 	    stp, &err)) == NULL) {
262 		fmd_hdl_topo_rele(hdl, thp);
263 		fmd_hdl_error(hdl, "failed to walk topology: %s\n",
264 		    topo_strerror(err));
265 		st_stats.st_topo_errs.fmds_value.ui64++;
266 		return;
267 	}
268 
269 	/*
270 	 * Initialize values in our internal FRU list for this iteration of
271 	 * sensor reads.  Keep track of whether the FRU was faulted in the
272 	 * previous pass so we don't send multiple ereports for the same
273 	 * problem.
274 	 */
275 	for (sfp = stp->st_faults; sfp != NULL; sfp = sfp->sf_next) {
276 		sfp->sf_unknown = B_FALSE;
277 		sfp->sf_last_faulted = sfp->sf_faulted;
278 		sfp->sf_faulted = B_FALSE;
279 	}
280 
281 	if (topo_walk_step(twp, TOPO_WALK_CHILD) == TOPO_WALK_ERR) {
282 		topo_walk_fini(twp);
283 		fmd_hdl_topo_rele(hdl, thp);
284 		fmd_hdl_error(hdl, "failed to walk topology\n");
285 		st_stats.st_topo_errs.fmds_value.ui64++;
286 		return;
287 	}
288 
289 	/*
290 	 * Remove any faults that weren't seen in the last pass.
291 	 */
292 	for (current = &stp->st_faults; *current != NULL; ) {
293 		sfp = *current;
294 		if (!sfp->sf_faulted && !sfp->sf_unknown) {
295 			fmd_hdl_debug(hdl, "repairing %s", sfp->sf_fru);
296 			fmd_repair_fru(hdl, sfp->sf_fru);
297 			st_stats.st_repairs.fmds_value.ui64++;
298 			*current = sfp->sf_next;
299 			fmd_hdl_strfree(hdl, sfp->sf_fru);
300 			fmd_hdl_free(hdl, sfp, sizeof (sensor_fault_t));
301 		} else {
302 			current = &sfp->sf_next;
303 		}
304 	}
305 
306 	stp->st_first = B_FALSE;
307 	topo_walk_fini(twp);
308 	fmd_hdl_topo_rele(hdl, thp);
309 
310 	stp->st_timer = fmd_timer_install(hdl, NULL, NULL, stp->st_interval);
311 }
312 
/*
 * Module properties.  "interval" is read in _fmd_init() and used as the
 * delay passed to fmd_timer_install() between topology passes.  The table
 * ends with an all-NULL/zero entry.
 */
static const fmd_prop_t fmd_props[] = {
	{ "interval", FMD_TYPE_TIME, "1min" },
	{ NULL, 0, NULL }
};
317 
/*
 * Module entry points.  Only the timeout entry point is implemented; all
 * work is driven from st_timeout().
 */
static const fmd_hdl_ops_t fmd_ops = {
	NULL,			/* fmdo_recv */
	st_timeout,		/* fmdo_timeout */
	NULL, 			/* fmdo_close */
	NULL,			/* fmdo_stats */
	NULL,			/* fmdo_gc */
	NULL,			/* fmdo_send */
	NULL			/* fmdo_topo */
};
327 
/*
 * Module information handed to fmd_hdl_register() in _fmd_init(); it ties
 * together the entry point table and the property table above.
 */
static const fmd_hdl_info_t fmd_info = {
	"Sensor Transport Agent", "1.0", &fmd_ops, fmd_props
};
331 
332 void
333 _fmd_init(fmd_hdl_t *hdl)
334 {
335 	sensor_transport_t *stp;
336 	char buf[SYS_NMLN];
337 
338 	/*
339 	 * The sensor-transport module is currently only supported on x86
340 	 * platforms.  So to avoid unnecessarily wasting cpu cycles on sparc
341 	 * walking the hc scheme tree every 60 seconds, we'll bail out before
342 	 * registering the handle.
343 	 */
344 	if ((sysinfo(SI_ARCHITECTURE, buf, sizeof (buf)) == -1) ||
345 	    (strcmp(buf, "i386") != 0))
346 		return;
347 
348 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
349 		return;
350 
351 	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC,
352 	    sizeof (st_stats) / sizeof (fmd_stat_t),
353 	    (fmd_stat_t *)&st_stats);
354 
355 	stp = fmd_hdl_zalloc(hdl, sizeof (sensor_transport_t), FMD_SLEEP);
356 	stp->st_interval = fmd_prop_get_int64(hdl, "interval");
357 
358 	fmd_hdl_setspecific(hdl, stp);
359 
360 	stp->st_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY, NULL, NULL);
361 	stp->st_hdl = hdl;
362 	stp->st_first = B_TRUE;
363 
364 	/* kick off the first asynchronous discovery */
365 	stp->st_timer = fmd_timer_install(hdl, NULL, NULL, 0);
366 }
367 
368 void
369 _fmd_fini(fmd_hdl_t *hdl)
370 {
371 	sensor_transport_t *stp;
372 	sensor_fault_t *sfp;
373 
374 	stp = fmd_hdl_getspecific(hdl);
375 	if (stp != NULL) {
376 		fmd_xprt_close(hdl, stp->st_xprt);
377 
378 		while ((sfp = stp->st_faults) != NULL) {
379 			stp->st_faults = sfp->sf_next;
380 
381 			fmd_hdl_strfree(hdl, sfp->sf_fru);
382 			fmd_hdl_free(hdl, sfp, sizeof (sensor_fault_t));
383 		}
384 
385 		fmd_hdl_free(hdl, stp, sizeof (sensor_transport_t));
386 	}
387 }
388