1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <fm/fmd_api.h> 27 #include <fm/libtopo.h> 28 #include <fm/topo_hc.h> 29 #include <fm/topo_mod.h> 30 #include <fm/topo_method.h> 31 32 #include <sys/fm/protocol.h> 33 #include <sys/systeminfo.h> 34 35 #include <string.h> 36 37 #define ST_EREPORT_CLASS "ereport.sensor.failure" 38 39 typedef struct sensor_fault { 40 struct sensor_fault *sf_next; 41 char *sf_fru; 42 boolean_t sf_last_faulted; 43 boolean_t sf_faulted; 44 boolean_t sf_unknown; 45 } sensor_fault_t; 46 47 typedef struct sensor_transport { 48 fmd_hdl_t *st_hdl; 49 fmd_xprt_t *st_xprt; 50 hrtime_t st_interval; 51 id_t st_timer; 52 sensor_fault_t *st_faults; 53 boolean_t st_first; 54 } sensor_transport_t; 55 56 typedef struct st_stats { 57 fmd_stat_t st_bad_fmri; 58 fmd_stat_t st_topo_errs; 59 fmd_stat_t st_repairs; 60 } st_stats_t; 61 62 st_stats_t st_stats = { 63 { "bad_fmri", FMD_TYPE_UINT64, "bad or missing resource/FRU FMRI" }, 64 { "topo_errors", FMD_TYPE_UINT64, "errors walking topology" }, 65 { "repairs", FMD_TYPE_UINT64, "auto repairs" } 66 }; 67 68 static int 69 st_check_component(topo_hdl_t *thp, tnode_t *node, void *arg) 70 { 71 sensor_transport_t *stp = arg; 72 fmd_hdl_t *hdl = stp->st_hdl; 73 const char *name = topo_node_name(node); 74 nvlist_t *nvl, *props, *rsrc, *fru; 75 char *fmri; 76 int err; 77 int32_t last_source, source = -1; 78 boolean_t nonrecov, faulted, predictive, source_diff; 79 nvpair_t *nvp; 80 uint64_t ena; 81 nvlist_t *event; 82 sensor_fault_t *sfp, **current; 83 84 if (strcmp(name, FAN) != 0 && strcmp(name, PSU) != 0) 85 return (0); 86 87 if (topo_method_invoke(node, TOPO_METH_SENSOR_FAILURE, 88 TOPO_METH_SENSOR_FAILURE_VERSION, NULL, &nvl, &err) != 0) { 89 if (err == ETOPO_METHOD_NOTSUP) { 90 fmd_hdl_debug(hdl, "Method %s not supported on %s=%d", 91 TOPO_METH_SENSOR_FAILURE, name, 92 topo_node_instance(node)); 93 return (0); 94 } 95 nvl = NULL; 96 } 97 98 if (topo_node_resource(node, &rsrc, NULL) != 0) { 99 st_stats.st_bad_fmri.fmds_value.ui64++; 100 nvlist_free(nvl); 101 return (0); 102 } 103 104 if (topo_node_fru(node, &fru, NULL, NULL) != 0) { 105 st_stats.st_bad_fmri.fmds_value.ui64++; 106 nvlist_free(nvl); 107 nvlist_free(rsrc); 108 return (0); 109 } 110 111 if (topo_fmri_nvl2str(thp, fru, &fmri, &err) != 0) { 112 st_stats.st_bad_fmri.fmds_value.ui64++; 113 nvlist_free(nvl); 114 nvlist_free(fru); 115 nvlist_free(rsrc); 116 return (0); 117 } 118 119 nvlist_free(fru); 120 121 faulted = nonrecov = source_diff = B_FALSE; 122 predictive = B_TRUE; 123 if (nvl != NULL) { 124 nvp = NULL; 125 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 126 if (nvpair_value_nvlist(nvp, &props) != 0) 127 continue; 128 129 faulted = B_TRUE; 130 131 /* 132 * We need some simple rules to handle the case where 133 * there are multiple facility nodes that indicate 134 * a problem with this FRU, but disagree on the values 135 * of nonrecov, predictive or source: 136 * 137 * 1) nonrecov will be set to true if one or more 138 * facility nodes indicates true. Otherwise it will 139 * default to false 140 * 141 * 2) predictive will default to false and remain false 142 * if one or more facility nodes indicate false. 143 * 144 * 3) source will be set to unknown unless all facility 145 * nodes agree on the source 146 */ 147 if (nonrecov == B_FALSE) 148 if (nvlist_lookup_boolean_value(props, 149 "nonrecov", &nonrecov) != 0) 150 nonrecov = B_FALSE; 151 if (predictive == B_TRUE) 152 if (nvlist_lookup_boolean_value(props, 153 "predictive", &predictive) != 0) 154 predictive = B_FALSE; 155 156 last_source = source; 157 if (nvlist_lookup_uint32(props, "source", 158 (uint32_t *)&source) != 0) 159 source = TOPO_SENSOR_ERRSRC_UNKNOWN; 160 if (last_source != -1 && last_source != source) 161 source_diff = B_TRUE; 162 } 163 if (source_diff) 164 source = TOPO_SENSOR_ERRSRC_UNKNOWN; 165 } 166 167 /* 168 * See if we know about this fru. 169 */ 170 for (current = &stp->st_faults; *current != NULL; 171 current = &(*current)->sf_next) { 172 if (topo_fmri_strcmp(thp, fmri, 173 (*current)->sf_fru)) 174 break; 175 } 176 177 sfp = *current; 178 if (sfp == NULL) { 179 /* 180 * We add this FRU to our list under two circumstances: 181 * 182 * 1. This FRU is faulted and needs to be remembered to 183 * avoid duplicate ereports. 184 * 185 * 2. This is the initial pass, and we want to repair the 186 * FRU in case it was repaired while we were offline. 187 */ 188 if (stp->st_first || faulted) { 189 sfp = fmd_hdl_zalloc(hdl, sizeof (sensor_fault_t), 190 FMD_SLEEP); 191 sfp->sf_fru = fmd_hdl_strdup(hdl, fmri, FMD_SLEEP); 192 sfp->sf_next = stp->st_faults; 193 stp->st_faults = sfp; 194 } else { 195 goto out; 196 } 197 } 198 199 if (nvl == NULL) 200 sfp->sf_unknown = B_TRUE; 201 202 if (faulted) { 203 /* 204 * Construct and post the ereport. 205 * 206 * XXFM we only post one ereport per fru. It should be possible 207 * to uniquely identify faulty resources instead and post one 208 * per resource, even if they share the same FRU. 209 */ 210 if (!sfp->sf_last_faulted) { 211 ena = fmd_event_ena_create(hdl); 212 event = fmd_nvl_alloc(hdl, FMD_SLEEP); 213 214 (void) nvlist_add_string(event, "type", name); 215 (void) nvlist_add_boolean_value(event, "nonrecov", 216 nonrecov); 217 (void) nvlist_add_boolean_value(event, "predictive", 218 predictive); 219 (void) nvlist_add_uint32(event, "source", 220 (uint32_t)source); 221 (void) nvlist_add_nvlist(event, "details", nvl); 222 (void) nvlist_add_string(event, FM_CLASS, 223 ST_EREPORT_CLASS); 224 (void) nvlist_add_uint8(event, FM_VERSION, 225 FM_EREPORT_VERSION); 226 (void) nvlist_add_uint64(event, FM_EREPORT_ENA, ena); 227 (void) nvlist_add_nvlist(event, FM_EREPORT_DETECTOR, 228 rsrc); 229 230 fmd_xprt_post(hdl, stp->st_xprt, event, 0); 231 fmd_hdl_debug(hdl, "posted ereport: %s", 232 ST_EREPORT_CLASS); 233 } 234 235 sfp->sf_faulted = B_TRUE; 236 } 237 238 out: 239 topo_hdl_strfree(thp, fmri); 240 nvlist_free(rsrc); 241 nvlist_free(nvl); 242 return (0); 243 } 244 245 /*ARGSUSED*/ 246 static void 247 st_timeout(fmd_hdl_t *hdl, id_t id, void *data) 248 { 249 sensor_transport_t *stp; 250 sensor_fault_t *sfp, **current; 251 topo_hdl_t *thp; 252 topo_walk_t *twp; 253 int err; 254 255 fmd_hdl_debug(hdl, "timeout: checking topology"); 256 257 stp = fmd_hdl_getspecific(hdl); 258 thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION); 259 260 if ((twp = topo_walk_init(thp, FM_FMRI_SCHEME_HC, st_check_component, 261 stp, &err)) == NULL) { 262 fmd_hdl_topo_rele(hdl, thp); 263 fmd_hdl_error(hdl, "failed to walk topology: %s\n", 264 topo_strerror(err)); 265 st_stats.st_topo_errs.fmds_value.ui64++; 266 return; 267 } 268 269 /* 270 * Initialize values in our internal FRU list for this iteration of 271 * sensor reads. Keep track of whether the FRU was faulted in the 272 * previous pass so we don't send multiple ereports for the same 273 * problem. 274 */ 275 for (sfp = stp->st_faults; sfp != NULL; sfp = sfp->sf_next) { 276 sfp->sf_unknown = B_FALSE; 277 sfp->sf_last_faulted = sfp->sf_faulted; 278 sfp->sf_faulted = B_FALSE; 279 } 280 281 if (topo_walk_step(twp, TOPO_WALK_CHILD) == TOPO_WALK_ERR) { 282 topo_walk_fini(twp); 283 fmd_hdl_topo_rele(hdl, thp); 284 fmd_hdl_error(hdl, "failed to walk topology\n"); 285 st_stats.st_topo_errs.fmds_value.ui64++; 286 return; 287 } 288 289 /* 290 * Remove any faults that weren't seen in the last pass. 291 */ 292 for (current = &stp->st_faults; *current != NULL; ) { 293 sfp = *current; 294 if (!sfp->sf_faulted && !sfp->sf_unknown) { 295 fmd_hdl_debug(hdl, "repairing %s", sfp->sf_fru); 296 fmd_repair_fru(hdl, sfp->sf_fru); 297 st_stats.st_repairs.fmds_value.ui64++; 298 *current = sfp->sf_next; 299 fmd_hdl_strfree(hdl, sfp->sf_fru); 300 fmd_hdl_free(hdl, sfp, sizeof (sensor_fault_t)); 301 } else { 302 current = &sfp->sf_next; 303 } 304 } 305 306 stp->st_first = B_FALSE; 307 topo_walk_fini(twp); 308 fmd_hdl_topo_rele(hdl, thp); 309 310 stp->st_timer = fmd_timer_install(hdl, NULL, NULL, stp->st_interval); 311 } 312 313 static const fmd_prop_t fmd_props[] = { 314 { "interval", FMD_TYPE_TIME, "1min" }, 315 { NULL, 0, NULL } 316 }; 317 318 static const fmd_hdl_ops_t fmd_ops = { 319 NULL, /* fmdo_recv */ 320 st_timeout, /* fmdo_timeout */ 321 NULL, /* fmdo_close */ 322 NULL, /* fmdo_stats */ 323 NULL, /* fmdo_gc */ 324 NULL, /* fmdo_send */ 325 NULL /* fmdo_topo */ 326 }; 327 328 static const fmd_hdl_info_t fmd_info = { 329 "Sensor Transport Agent", "1.0", &fmd_ops, fmd_props 330 }; 331 332 void 333 _fmd_init(fmd_hdl_t *hdl) 334 { 335 sensor_transport_t *stp; 336 char buf[SYS_NMLN]; 337 338 /* 339 * The sensor-transport module is currently only supported on x86 340 * platforms. So to avoid unnecessarily wasting cpu cycles on sparc 341 * walking the hc scheme tree every 60 seconds, we'll bail out before 342 * registering the handle. 343 */ 344 if ((sysinfo(SI_ARCHITECTURE, buf, sizeof (buf)) == -1) || 345 (strcmp(buf, "i386") != 0)) 346 return; 347 348 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) 349 return; 350 351 (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, 352 sizeof (st_stats) / sizeof (fmd_stat_t), 353 (fmd_stat_t *)&st_stats); 354 355 stp = fmd_hdl_zalloc(hdl, sizeof (sensor_transport_t), FMD_SLEEP); 356 stp->st_interval = fmd_prop_get_int64(hdl, "interval"); 357 358 fmd_hdl_setspecific(hdl, stp); 359 360 stp->st_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY, NULL, NULL); 361 stp->st_hdl = hdl; 362 stp->st_first = B_TRUE; 363 364 /* kick off the first asynchronous discovery */ 365 stp->st_timer = fmd_timer_install(hdl, NULL, NULL, 0); 366 } 367 368 void 369 _fmd_fini(fmd_hdl_t *hdl) 370 { 371 sensor_transport_t *stp; 372 sensor_fault_t *sfp; 373 374 stp = fmd_hdl_getspecific(hdl); 375 if (stp != NULL) { 376 fmd_xprt_close(hdl, stp->st_xprt); 377 378 while ((sfp = stp->st_faults) != NULL) { 379 stp->st_faults = sfp->sf_next; 380 381 fmd_hdl_strfree(hdl, sfp->sf_fru); 382 fmd_hdl_free(hdl, sfp, sizeof (sensor_fault_t)); 383 } 384 385 fmd_hdl_free(hdl, stp, sizeof (sensor_transport_t)); 386 } 387 } 388