xref: /illumos-gate/usr/src/uts/common/io/mem.c (revision 5f82aa32fbc5dc2c59bca6ff315f44a4c4c9ea86)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * Copyright (c) 2015, Joyent, Inc.  All rights reserved.
 * Copyright 2017 James S Blachly, MD <james.blachly@gmail.com>
 */

/*
 * Memory special file
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>
#include <sys/vmem.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>

#include <sys/conf.h>
#include <sys/mem.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/fm/protocol.h>

#if defined(__sparc)
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#elif defined(__x86)
#include <sys/cpu_module.h>
#endif	/* __sparc */

/*
 * Turn a byte length into a page count.  The DDI btop() takes a
 * 32-bit size on 32-bit machines; this macro handles 64-bit sizes
 * on large physical-memory 32-bit machines.
 */
#define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))

static kmutex_t mm_lock;
static caddr_t mm_map;

static dev_info_t *mm_dip;	/* private copy of devinfo pointer */

static int mm_kmem_io_access;

static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);

#define	MM_KMEMLOG_NENTRIES	64

static int mm_kmemlogent;
static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];

/*
 * On kmem/allkmem writes, we log information that might be useful in the
 * event that a write is errant (that is, due to operator error) and induces
 * a later problem.  Note that (in particular) in the event of such
 * operator-induced corruption, a search over the kernel address space for
 * the corrupted address will yield the ring buffer entry that recorded the
 * write.  And should it seem baroque or otherwise unnecessary, yes, we need
 * this kind of auditing facility and yes, we learned that the hard way:
 * disturbingly, there exist recommendations for "tuning" the system that
 * involve writing to kernel memory addresses via the kernel debugger, and --
 * as we discovered -- these can easily be applied incorrectly or unsafely,
 * yielding an entirely undebuggable "can't happen" kind of panic.
 */
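/*
 * Concretely (a sketch, not a prescribed workflow): from mdb -k, one might
 * search the kernel address space for the corrupted address with a dcmd
 * such as ::kgrep; a hit that falls within mm_kmemlog identifies the
 * mm_logentry_t that recorded the errant write, complete with its
 * timestamps, pid and psargs.
 */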
static void
mm_logkmem(struct uio *uio)
{
	mm_logentry_t *ent;
	proc_t *p = curthread->t_procp;

	mutex_enter(&mm_lock);

	ent = &mm_kmemlog[mm_kmemlogent++];

	if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
		mm_kmemlogent = 0;

	ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
	ent->mle_len = uio->uio_resid;
	gethrestime(&ent->mle_hrestime);
	ent->mle_hrtime = gethrtime();
	ent->mle_pid = p->p_pidp->pid_id;

	(void) strncpy(ent->mle_psargs,
	    p->p_user.u_psargs, sizeof (ent->mle_psargs));

	mutex_exit(&mm_lock);
}

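/*
 * Attach entry point: create privileged minor nodes for each of the memory
 * devices, install the "phys_installed" kstat, and fetch the kmem_io_access
 * property that governs I/O access through /dev/kmem.
 */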
/*ARGSUSED1*/
static int
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int i;
	struct mem_minor {
		char *name;
		minor_t minor;
		int privonly;
		const char *rdpriv;
		const char *wrpriv;
		mode_t priv_mode;
	} mm[] = {
		{ "mem",	M_MEM,		0,	NULL,	"all",	0640 },
		{ "kmem",	M_KMEM,		0,	NULL,	"all",	0640 },
		{ "allkmem",	M_ALLKMEM,	0,	"all",	"all",	0600 },
		{ "null",	M_NULL,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
		{ "zero",	M_ZERO, PRIVONLY_DEV,	NULL,	NULL,	0666 },
		{ "full",	M_FULL, PRIVONLY_DEV,	NULL,	NULL,	0666 },
	};
	kstat_t *ksp;

	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
		    DDI_FAILURE) {
			ddi_remove_minor_node(devi, NULL);
			return (DDI_FAILURE);
		}
	}

	mm_dip = devi;

	ksp = kstat_create("mm", 0, "phys_installed", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = mm_kstat_update;
		ksp->ks_snapshot = mm_kstat_snapshot;
		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
		kstat_install(ksp);
	}

	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "kmem_io_access", 0);

	return (DDI_SUCCESS);
}

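/*
 * getinfo(9E) entry point.
 */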
/*ARGSUSED*/
static int
mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)mm_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

/*ARGSUSED1*/
static int
mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
{
	switch (getminor(*devp)) {
	case M_NULL:
	case M_ZERO:
	case M_FULL:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		/* standard devices */
		break;

	default:
		/* Unsupported or unknown type */
		return (EINVAL);
	}
	/* must be a character device */
	if (typ != OTYP_CHR)
		return (EINVAL);
	return (0);
}

struct pollhead	mm_pollhd;

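/*
 * chpoll(9E) entry point: the memory devices never block, so report every
 * requested level-triggered event as immediately ready.
 */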
/*ARGSUSED*/
static int
mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	switch (getminor(dev)) {
	case M_NULL:
	case M_ZERO:
	case M_FULL:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
		/*
		 * A non-NULL pollhead pointer must be returned in case the
		 * user polls for 0 events.
		 */
		*phpp = !anyyet && !*reventsp ?
		    &mm_pollhd : (struct pollhead *)NULL;
		return (0);
	default:
		/* no other devices currently support polling */
		return (ENXIO);
	}
}

static int
mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
    char *name, caddr_t valuep, int *lengthp)
{
	/*
	 * Implement zero size to reduce overhead (avoid two failing
	 * property lookups per stat).
	 */
	return (ddi_prop_op_size(dev, dip, prop_op,
	    flags, name, valuep, lengthp, 0));
}

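
/*
 * Common page-at-a-time I/O routine: map the page containing pfn into
 * kernel virtual space -- through kpm when the pfn is memory and kpm is
 * enabled, otherwise through the single preallocated mm_map window -- and
 * uiomove() at most a page of data.  Non-memory (device) pfns are handled
 * with ddi_peekpokeio() when allowio permits, so that faulting accesses to
 * device space are caught rather than fatal.
 */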
284 
285 static int
286 mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
287     page_t *pp)
288 {
289 	int error = 0;
290 	int devload = 0;
291 	int is_memory = pf_is_memory(pfn);
292 	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
293 	    (size_t)uio->uio_iov->iov_len);
294 	caddr_t va = NULL;
295 
296 	mutex_enter(&mm_lock);
297 
298 	if (is_memory && kpm_enable) {
299 		if (pp)
300 			va = hat_kpm_mapin(pp, NULL);
301 		else
302 			va = hat_kpm_mapin_pfn(pfn);
303 	}
304 
305 	if (va == NULL) {
306 		hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
307 		    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
308 		    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
309 		va = mm_map;
310 		devload = 1;
311 	}
312 
313 	if (!is_memory) {
314 		if (allowio) {
315 			size_t c = uio->uio_iov->iov_len;
316 
317 			if (ddi_peekpokeio(NULL, uio, rw,
318 			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
319 			    sizeof (int32_t)) != DDI_SUCCESS)
320 				error = EFAULT;
321 		} else
322 			error = EIO;
323 	} else
324 		error = uiomove(va + pageoff, nbytes, rw, uio);
325 
326 	if (devload)
327 		hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
328 	else if (pp)
329 		hat_kpm_mapout(pp, NULL, va);
330 	else
331 		hat_kpm_mapout_pfn(pfn);
332 
333 	mutex_exit(&mm_lock);
334 	return (error);
335 }
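
/*
 * Determine whether the segment backing va can tolerate as_pagelock()
 * without minor faults (S_CAPABILITY_NOMINFLT); mmrw() uses this to decide
 * whether a kernel virtual address should be locked across the copy.
 */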
336 
337 static int
338 mmpagelock(struct as *as, caddr_t va)
339 {
340 	struct seg *seg;
341 	int i;
342 
343 	AS_LOCK_ENTER(as, RW_READER);
344 	seg = as_segat(as, va);
345 	i = (seg != NULL)? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
346 	AS_LOCK_EXIT(as);
347 
348 	return (i);
349 }
350 
351 #ifdef	__sparc
352 
353 #define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)
354 
355 #else	/* __i386, __amd64 */
356 
357 #define	NEED_LOCK_KVADDR(va)	0
358 
359 #endif	/* __sparc */
360 
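/*
 * read(9E)/write(9E) workhorse for all of the memory device minors: iterate
 * over the uio, dispatching on the minor number and moving at most a page
 * of data per iteration.
 */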
/*ARGSUSED3*/
static int
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
{
	pfn_t v;
	struct iovec *iov;
	int error = 0;
	size_t c;
	ssize_t oresid = uio->uio_resid;
	minor_t minor = getminor(dev);

	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("mmrw");
			continue;
		}
		switch (minor) {

		case M_MEM:
			memlist_read_lock();
			if (!address_in_memlist(phys_install,
			    (uint64_t)uio->uio_loffset, 1)) {
				memlist_read_unlock();
				error = EFAULT;
				break;
			}
			memlist_read_unlock();

			v = BTOP((u_offset_t)uio->uio_loffset);
			error = mmio(uio, rw, v,
			    uio->uio_loffset & PAGEOFFSET, 0, NULL);
			break;

		case M_KMEM:
		case M_ALLKMEM:
			{
			page_t **ppp = NULL;
			caddr_t vaddr = (caddr_t)uio->uio_offset;
			int try_lock = NEED_LOCK_KVADDR(vaddr);
			int locked = 0;

			if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
				break;

			if (rw == UIO_WRITE)
				mm_logkmem(uio);

			/*
			 * If vaddr does not map a valid page, as_pagelock()
			 * will return failure. Hence we can't check the
			 * return value and return EFAULT here as we'd like.
			 * seg_kp and seg_kpm do not properly support
			 * as_pagelock() for this context, so we avoid it
			 * using the try_lock check above.  Some day, when
			 * the kernel page locking gets redesigned, all this
			 * muck can be cleaned up.
			 */
			if (try_lock)
				locked = (as_pagelock(&kas, &ppp, vaddr,
				    PAGESIZE, S_WRITE) == 0);

			v = hat_getpfnum(kas.a_hat,
			    (caddr_t)(uintptr_t)uio->uio_loffset);
			if (v == PFN_INVALID) {
				if (locked)
					as_pageunlock(&kas, ppp, vaddr,
					    PAGESIZE, S_WRITE);
				error = EFAULT;
				break;
			}

			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
			    minor == M_ALLKMEM || mm_kmem_io_access,
			    (locked && ppp) ? *ppp : NULL);
			if (locked)
				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
				    S_WRITE);
			}

			break;

		case M_FULL:
			if (rw == UIO_WRITE) {
				error = ENOSPC;
				break;
			}
			/* else it's a read, fall through to zero case */
			/*FALLTHROUGH*/

		case M_ZERO:
			if (rw == UIO_READ) {
				label_t ljb;

				if (on_fault(&ljb)) {
					no_fault();
					error = EFAULT;
					break;
				}
				uzero(iov->iov_base, iov->iov_len);
				no_fault();
				uio->uio_resid -= iov->iov_len;
				uio->uio_loffset += iov->iov_len;
				break;
			}
			/* else it's a write, fall through to NULL case */
			/*FALLTHROUGH*/

		case M_NULL:
			if (rw == UIO_READ)
				return (0);
			c = iov->iov_len;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_loffset += c;
			uio->uio_resid -= c;
			break;
		}
	}
	return (uio->uio_resid == oresid ? error : 0);
}

static int
mmread(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_READ, cred));
}

static int
mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_WRITE, cred));
}

/*
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.
 */
static int
mmioctl_vtop(intptr_t data)
{
#ifdef _SYSCALL32
	mem_vtop32_t vtop32;
#endif
	mem_vtop_t mem_vtop;
	proc_t *p;
	pfn_t pfn = (pfn_t)PFN_INVALID;
	pid_t pid = 0;
	struct as *as;
	struct seg *seg;

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
			return (EFAULT);
		mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
		mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;

		if (mem_vtop.m_as != NULL)
			return (EINVAL);
	}
#endif

	if (mem_vtop.m_as == &kas) {
		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
	} else {
		if (mem_vtop.m_as == NULL) {
			/*
			 * Assume the calling process's address space if the
			 * caller didn't specify one.
			 */
			p = curthread->t_procp;
			if (p == NULL)
				return (EIO);
			mem_vtop.m_as = p->p_as;
		}

		mutex_enter(&pidlock);
		for (p = practive; p != NULL; p = p->p_next) {
			if (p->p_as == mem_vtop.m_as) {
				pid = p->p_pid;
				break;
			}
		}
		mutex_exit(&pidlock);
		if (p == NULL)
			return (EIO);
		p = sprlock(pid);
		if (p == NULL)
			return (EIO);
		as = p->p_as;
		if (as == mem_vtop.m_as) {
			mutex_exit(&p->p_lock);
			AS_LOCK_ENTER(as, RW_READER);
			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg))
				if ((uintptr_t)mem_vtop.m_va -
				    (uintptr_t)seg->s_base < seg->s_size)
					break;
			if (seg != NULL)
				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
			AS_LOCK_EXIT(as);
			mutex_enter(&p->p_lock);
		}
		sprunlock(p);
	}
	mem_vtop.m_pfn = pfn;
	if (pfn == PFN_INVALID)
		return (EIO);

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		vtop32.m_pfn = mem_vtop.m_pfn;
		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
			return (EFAULT);
	}
#endif

	return (0);
}
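
/*
 * A hypothetical consumer (sketch only -- libkvm's actual code may differ)
 * would populate a mem_vtop_t and issue MEM_VTOP against /dev/kmem:
 *
 *	mem_vtop_t vtop;
 *
 *	vtop.m_as = NULL;	(NULL: use the calling process's as)
 *	vtop.m_va = va;
 *	if (ioctl(kmem_fd, MEM_VTOP, &vtop) == 0)
 *		pa = ptob(vtop.m_pfn) + ((uintptr_t)va & PAGEOFFSET);
 */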

/*
 * Given a PA, execute the given page retire command on it.
 */
static int
mmioctl_page_retire(int cmd, intptr_t data)
{
	extern int page_retire_test(void);
	uint64_t pa;

	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
		return (EFAULT);
	}

	switch (cmd) {
	case MEM_PAGE_ISRETIRED:
		return (page_retire_check(pa, NULL));

	case MEM_PAGE_UNRETIRE:
		return (page_unretire(pa));

	case MEM_PAGE_RETIRE:
		return (page_retire(pa, PR_FMA));

	case MEM_PAGE_RETIRE_MCE:
		return (page_retire(pa, PR_MCE));

	case MEM_PAGE_RETIRE_UE:
		return (page_retire(pa, PR_UE));

	case MEM_PAGE_GETERRORS:
		{
			uint64_t page_errors;
			int rc = page_retire_check(pa, &page_errors);
			if (copyout(&page_errors, (void *)data,
			    sizeof (uint64_t))) {
				return (EFAULT);
			}
			return (rc);
		}

	case MEM_PAGE_RETIRE_TEST:
		return (page_retire_test());
	}

	return (EINVAL);
}

#ifdef __sparc
/*
 * Given a syndrome, syndrome type, and address, return the
 * associated memory name in the provided data buffer.
 */
static int
mmioctl_get_mem_name(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) != 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	/*
	 * Call into CPU-specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		return (err);
	}

	if (len >= mem_name.m_namelen) {
		kmem_free(buf, bufsize);
		return (ENOSPC);
	}

	if (copyoutstr(buf, (char *)mem_name.m_name,
	    mem_name.m_namelen, NULL) != 0) {
		kmem_free(buf, bufsize);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	return (0);
}

/*
 * Given a syndrome and address, return information about the associated
 * memory.
 */
static int
mmioctl_get_mem_info(intptr_t data)
{
	mem_info_t mem_info;
	int err;

	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
		return (EFAULT);

	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
		return (err);

	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
		return (EFAULT);

	return (0);
}

/*
 * Given a memory name, return its associated serial id.
 */
static int
mmioctl_get_mem_sid(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	void *name;
	size_t	name_len;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) != 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	if (mem_name.m_namelen > 1024)
		mem_name.m_namelen = 1024; /* cap at 1024 bytes */

	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);

	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
	    mem_name.m_namelen, &name_len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	/*
	 * Call into CPU-specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	if (len > mem_name.m_sidlen) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (ENAMETOOLONG);
	}

	if (copyoutstr(buf, (char *)mem_name.m_sid,
	    mem_name.m_sidlen, NULL) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	kmem_free(name, mem_name.m_namelen);
	return (0);
}
#endif	/* __sparc */

/*
 * Private ioctls for
 *	libkvm to support kvm_physaddr().
 *	FMA support for page_retire() and memory attribute information.
 */
/*ARGSUSED*/
static int
mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
{
	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
		return (ENXIO);

	switch (cmd) {
	case MEM_VTOP:
		return (mmioctl_vtop(data));

	case MEM_PAGE_RETIRE:
	case MEM_PAGE_ISRETIRED:
	case MEM_PAGE_UNRETIRE:
	case MEM_PAGE_RETIRE_MCE:
	case MEM_PAGE_RETIRE_UE:
	case MEM_PAGE_GETERRORS:
	case MEM_PAGE_RETIRE_TEST:
		return (mmioctl_page_retire(cmd, data));

#ifdef __sparc
	case MEM_NAME:
		return (mmioctl_get_mem_name(data));

	case MEM_INFO:
		return (mmioctl_get_mem_info(data));

	case MEM_SID:
		return (mmioctl_get_mem_sid(data));
#else
	case MEM_NAME:
	case MEM_INFO:
	case MEM_SID:
		return (ENOTSUP);
#endif	/* __sparc */
	}
	return (ENXIO);
}

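/*
 * mmap(9E) entry point: translate an offset into /dev/mem into a page frame
 * number, returning -1 for offsets that fall outside installed physical
 * memory and for minors that cannot be mapped this way.
 */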
/*ARGSUSED2*/
static int
mmmmap(dev_t dev, off_t off, int prot)
{
	pfn_t pf;
	struct memlist *pmem;
	minor_t minor = getminor(dev);

	switch (minor) {
	case M_MEM:
		pf = btop(off);
		memlist_read_lock();
		for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
			if (pf >= BTOP(pmem->ml_address) &&
			    pf < BTOP(pmem->ml_address + pmem->ml_size)) {
				memlist_read_unlock();
				return (impl_obmem_pfnum(pf));
			}
		}
		memlist_read_unlock();
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* no longer supported with KPR */
		return (-1);

	case M_FULL:
	case M_ZERO:
		/*
		 * We shouldn't be mmap'ing to /dev/zero here as
		 * mmsegmap() should have already converted
		 * a mapping request for this device to a mapping
		 * using seg_vn for anonymous memory.
		 */
		break;
	}
	return (-1);
}

/*
 * This function is called when a memory device is mmap'ed.
 * Set up the mapping to the correct segment driver.
 */
static int
mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
{
	struct segvn_crargs vn_a;
	struct segdev_crargs dev_a;
	int error;
	minor_t minor;
	off_t i;

	minor = getminor(dev);

	as_rangelock(as);
	/*
	 * No need to worry about vac alignment on /dev/zero
	 * since this is a "clone" object that doesn't yet exist.
	 */
	error = choose_addr(as, addrp, len, off,
	    (minor == M_MEM) || (minor == M_KMEM), flags);
	if (error != 0) {
		as_rangeunlock(as);
		return (error);
	}

	switch (minor) {
	case M_MEM:
		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
		if ((flags & MAP_TYPE) != MAP_SHARED) {
			as_rangeunlock(as);
			return (EINVAL);
		}

		/*
		 * Check to ensure that the entire range is
		 * legal and that we are not trying to map in
		 * more than the device will let us.
		 */
		for (i = 0; i < len; i += PAGESIZE) {
			if (mmmmap(dev, off + i, maxprot) == -1) {
				as_rangeunlock(as);
				return (ENXIO);
			}
		}

		/*
		 * Use the seg_dev segment driver for the /dev/mem mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = (flags & MAP_TYPE);
		dev_a.prot = (uchar_t)prot;
		dev_a.maxprot = (uchar_t)maxprot;
		dev_a.hat_attr = 0;

		/*
		 * Make /dev/mem mappings non-consistent since we can't
		 * alias pages that don't have page structs behind them,
		 * such as kernel stack pages. If someone mmap()s a kernel
		 * stack page and if we give them a tte with cv, a line from
		 * that page can get into both pages of the spitfire d$.
		 * But a snoop from another processor will only invalidate
		 * the first page. This later caused the kernel (xc_attention)
		 * to go into an infinite loop at pil 13 and no interrupts
		 * could come in. See 1203630.
		 */
		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
		dev_a.devmap_data = NULL;

		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	case M_ZERO:
		/*
		 * Use the seg_vn segment driver for the /dev/zero mapping.
		 * Passing in a NULL amp gives us the "cloning" effect.
		 */
		vn_a.vp = NULL;
		vn_a.offset = 0;
		vn_a.type = (flags & MAP_TYPE);
		vn_a.prot = prot;
		vn_a.maxprot = maxprot;
		vn_a.flags = flags & ~MAP_TYPE;
		vn_a.cred = cred;
		vn_a.amp = NULL;
		vn_a.szc = 0;
		vn_a.lgrp_mem_policy_flags = 0;
		error = as_map(as, *addrp, len, segvn_create, &vn_a);
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* No longer supported with KPR. */
		error = ENXIO;
		break;

	case M_NULL:
		/*
		 * Use the seg_dev segment driver for the /dev/null mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
		dev_a.hat_attr = 0;
		dev_a.hat_flags = 0;
		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	default:
		error = ENXIO;
	}

	as_rangeunlock(as);
	return (error);
}

static struct cb_ops mm_cb_ops = {
	mmopen,			/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	mmread,			/* read */
	mmwrite,		/* write */
	mmioctl,		/* ioctl */
	nodev,			/* devmap */
	mmmmap,			/* mmap */
	mmsegmap,		/* segmap */
	mmchpoll,		/* poll */
	mmpropop,		/* prop_op */
	0,			/* streamtab */
	D_NEW | D_MP | D_64BIT | D_U64BIT
};

static struct dev_ops mm_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	mm_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	mm_attach,		/* attach */
	nodev,			/* detach */
	nodev,			/* reset */
	&mm_cb_ops,		/* driver operations */
	(struct bus_ops *)0,	/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops, "memory driver", &mm_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

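/*
 * kstat update routine for "phys_installed": one { address, size } pair is
 * exported per phys_install memlist entry, so size the kstat to match the
 * current list.
 */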
static int
mm_kstat_update(kstat_t *ksp, int rw)
{
	struct memlist *pmem;
	uint_t count;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	count = 0;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
		count++;
	}
	memlist_read_unlock();

	ksp->ks_ndata = count;
	ksp->ks_data_size = count * 2 * sizeof (uint64_t);

	return (0);
}

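/*
 * kstat snapshot routine: copy the current phys_install list into the
 * buffer sized by mm_kstat_update(), stopping early if the list has grown
 * since the buffer was sized.
 */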
static int
mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
	struct memlist *pmem;
	struct memunit {
		uint64_t address;
		uint64_t size;
	} *kspmem;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ksp->ks_snaptime = gethrtime();

	kspmem = (struct memunit *)buf;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL;
	    pmem = pmem->ml_next, kspmem++) {
		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
			break;
		kspmem->address = pmem->ml_address;
		kspmem->size = pmem->ml_size;
	}
	memlist_read_unlock();

	return (0);
}

/*
 * Read a mem_name_t from user-space and store it in the mem_name_t
 * pointed to by the mem_name argument, converting from the ILP32 layout
 * as necessary.
 */
static int
mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
{
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
			return (EFAULT);
	}
#ifdef	_SYSCALL32
	else {
		mem_name32_t mem_name32;

		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
			return (EFAULT);
		mem_name->m_addr = mem_name32.m_addr;
		mem_name->m_synd = mem_name32.m_synd;
		mem_name->m_type[0] = mem_name32.m_type[0];
		mem_name->m_type[1] = mem_name32.m_type[1];
		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
	}
#endif	/* _SYSCALL32 */

	return (0);
}
1119