xref: /illumos-gate/usr/src/uts/i86pc/dboot/dboot_startkern.c (revision 5f82aa32fbc5dc2c59bca6ff315f44a4c4c9ea86)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright 2013 Joyent, Inc.  All rights reserved.
27  */
28 
29 
30 #include <sys/types.h>
31 #include <sys/machparam.h>
32 #include <sys/x86_archext.h>
33 #include <sys/systm.h>
34 #include <sys/mach_mmu.h>
35 #include <sys/multiboot.h>
36 #include <sys/multiboot2.h>
37 #include <sys/multiboot2_impl.h>
38 #include <sys/sysmacros.h>
39 #include <sys/sha1.h>
40 #include <util/string.h>
41 #include <util/strtolctype.h>
42 
43 #if defined(__xpv)
44 
45 #include <sys/hypervisor.h>
46 uintptr_t xen_virt_start;
47 pfn_t *mfn_to_pfn_mapping;
48 
49 #else /* !__xpv */
50 
51 extern multiboot_header_t mb_header;
52 extern uint32_t mb2_load_addr;
53 extern int have_cpuid(void);
54 
55 #endif /* !__xpv */
56 
57 #include <sys/inttypes.h>
58 #include <sys/bootinfo.h>
59 #include <sys/mach_mmu.h>
60 #include <sys/boot_console.h>
61 
62 #include "dboot_asm.h"
63 #include "dboot_printf.h"
64 #include "dboot_xboot.h"
65 #include "dboot_elfload.h"
66 
/*
 * Length of a SHA-1 digest when written as ASCII hex: two characters
 * per digest byte.
 */
#define	SHA1_ASCII_LENGTH	(SHA1_DIGEST_LENGTH * 2)
68 
69 /*
70  * This file contains code that runs to transition us from either a multiboot
71  * compliant loader (32 bit non-paging) or a XPV domain loader to
72  * regular kernel execution. Its task is to setup the kernel memory image
73  * and page tables.
74  *
75  * The code executes as:
76  *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
77  * 	- a 32 bit program for the 32-bit PV hypervisor
78  *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
79  *
80  * Under the PV hypervisor, we must create mappings for any memory beyond the
81  * initial start of day allocation (such as the kernel itself).
82  *
83  * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running in real mode, all such memory is accessible.
85  */
86 
/*
 * Standard bits used in PTE (page level) and PTP (internal levels).
 * make_ptable() ORs ptp_bits into entries that point at lower level
 * tables; map_ma_at_va() ORs pte_bits into the final mapping entries.
 */
x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;

/*
 * This is the target address (physical) where the kernel text and data
 * nucleus pages will be unpacked. On the hypervisor this is actually a
 * virtual address.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */

/*
 * The stack is setup in assembler before entering startup_kernel()
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation. check_higher() moves this
 * past any address that is reported to it as already in use.
 */
static paddr_t next_avail_addr = 0;
112 
#if defined(__xpv)
/*
 * Additional information needed for hypervisor memory allocation.
 * Only memory up to scratch_end is mapped by page tables.
 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 * to derive a pfn from a pointer, you subtract mfn_base.
 */

static paddr_t scratch_end = 0;	/* we can't write all of mem here */
static paddr_t mfn_base;		/* addr corresponding to mfn_list[0] */
start_info_t *xen_info;

#else	/* __xpv */

/*
 * If on the metal, then we have a multiboot loader.
 *
 * multiboot_version selects which of the two information formats the
 * accessors in this file read: 1 uses mb_info, 2 uses mb2_info (and
 * mb2_mmap_tagp for the memory map); any other value makes them panic.
 */
uint32_t mb_magic;			/* magic from boot loader */
uint32_t mb_addr;			/* multiboot info package from loader */
int multiboot_version;
multiboot_info_t *mb_info;
multiboot2_info_header_t *mb2_info;
multiboot_tag_mmap_t *mb2_mmap_tagp;
int num_entries;			/* mmap entry count */
boolean_t num_entries_set;		/* is mmap entry count set */
uintptr_t load_addr;

#endif	/* __xpv */
141 
/*
 * This contains information passed to the kernel
 */
struct xboot_info boot_info[2];	/* extra space to fix alignment for amd64 */
struct xboot_info *bi;

/*
 * Page table and memory stuff.
 */
static paddr_t max_mem;			/* maximum memory address */

/*
 * Information about processor MMU
 */
int amd64_support = 0;
int largepage_support = 0;
int pae_support = 0;
int pge_support = 0;
int NX_support = 0;

/*
 * Low 32 bits of kernel entry address passed back to assembler.
 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 */
uint32_t entry_addr_low;

/*
 * Memlists for the kernel. We shouldn't need a lot of these.
 * Each table has a matching *_used count of the entries currently filled in.
 */
#define	MAX_MEMLIST (50)
struct boot_memlist memlists[MAX_MEMLIST];
uint_t memlists_used = 0;
struct boot_memlist pcimemlists[MAX_MEMLIST];
uint_t pcimemlists_used = 0;
struct boot_memlist rsvdmemlists[MAX_MEMLIST];
uint_t rsvdmemlists_used = 0;

/*
 * This should match what's in the bootloader.  It's arbitrary, but GRUB
 * in particular has limitations on how much space it can use before it
 * stops working properly.  This should be enough.
 */
struct boot_modules modules[MAX_BOOT_MODULES];
uint_t modules_used = 0;
186 
#ifdef __xpv
/*
 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 * definition in Xen source.
 */
typedef struct {
	uint32_t	base_addr_low;
	uint32_t	base_addr_high;
	uint32_t	length_low;
	uint32_t	length_high;
	uint32_t	type;
} mmap_t;

/*
 * There is 512KB of scratch area after the boot stack page.
 * We'll use that for everything except the kernel nucleus pages which are too
 * big to fit there and are allocated last anyway.
 */
/* MAXMAPS is the capacity of map_buffer, the hypervisor memory map copy. */
#define	MAXMAPS	100
static mmap_t map_buffer[MAXMAPS];
#else
/* On the metal a memory map entry is the multiboot structure itself. */
typedef mb_memory_map_t mmap_t;
#endif
210 
/*
 * Debugging flags
 *
 * prom_debug enables the general diagnostic dboot_printf() output in this
 * file; map_debug enables the messages about page tables and mappings.
 */
uint_t prom_debug = 0;
uint_t map_debug = 0;

/* NOTE(review): presumably a placeholder name string; confirm at its users. */
static char noname[2] = "-";
218 
219 /*
220  * Either hypervisor-specific or grub-specific code builds the initial
221  * memlists. This code does the sort/merge/link for final use.
222  */
223 static void
224 sort_physinstall(void)
225 {
226 	int i;
227 #if !defined(__xpv)
228 	int j;
229 	struct boot_memlist tmp;
230 
231 	/*
232 	 * Now sort the memlists, in case they weren't in order.
233 	 * Yeah, this is a bubble sort; small, simple and easy to get right.
234 	 */
235 	DBG_MSG("Sorting phys-installed list\n");
236 	for (j = memlists_used - 1; j > 0; --j) {
237 		for (i = 0; i < j; ++i) {
238 			if (memlists[i].addr < memlists[i + 1].addr)
239 				continue;
240 			tmp = memlists[i];
241 			memlists[i] = memlists[i + 1];
242 			memlists[i + 1] = tmp;
243 		}
244 	}
245 
246 	/*
247 	 * Merge any memlists that don't have holes between them.
248 	 */
249 	for (i = 0; i <= memlists_used - 1; ++i) {
250 		if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
251 			continue;
252 
253 		if (prom_debug)
254 			dboot_printf(
255 			    "merging mem segs %" PRIx64 "...%" PRIx64
256 			    " w/ %" PRIx64 "...%" PRIx64 "\n",
257 			    memlists[i].addr,
258 			    memlists[i].addr + memlists[i].size,
259 			    memlists[i + 1].addr,
260 			    memlists[i + 1].addr + memlists[i + 1].size);
261 
262 		memlists[i].size += memlists[i + 1].size;
263 		for (j = i + 1; j < memlists_used - 1; ++j)
264 			memlists[j] = memlists[j + 1];
265 		--memlists_used;
266 		DBG(memlists_used);
267 		--i;	/* after merging we need to reexamine, so do this */
268 	}
269 #endif	/* __xpv */
270 
271 	if (prom_debug) {
272 		dboot_printf("\nFinal memlists:\n");
273 		for (i = 0; i < memlists_used; ++i) {
274 			dboot_printf("\t%d: addr=%" PRIx64 " size=%"
275 			    PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
276 		}
277 	}
278 
279 	/*
280 	 * link together the memlists with native size pointers
281 	 */
282 	memlists[0].next = 0;
283 	memlists[0].prev = 0;
284 	for (i = 1; i < memlists_used; ++i) {
285 		memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
286 		memlists[i].next = 0;
287 		memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
288 	}
289 	bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
290 	DBG(bi->bi_phys_install);
291 }
292 
293 /*
294  * build bios reserved memlists
295  */
296 static void
297 build_rsvdmemlists(void)
298 {
299 	int i;
300 
301 	rsvdmemlists[0].next = 0;
302 	rsvdmemlists[0].prev = 0;
303 	for (i = 1; i < rsvdmemlists_used; ++i) {
304 		rsvdmemlists[i].prev =
305 		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
306 		rsvdmemlists[i].next = 0;
307 		rsvdmemlists[i - 1].next =
308 		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
309 	}
310 	bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
311 	DBG(bi->bi_rsvdmem);
312 }
313 
314 #if defined(__xpv)
315 
316 /*
317  * halt on the hypervisor after a delay to drain console output
318  */
319 void
320 dboot_halt(void)
321 {
322 	uint_t i = 10000;
323 
324 	while (--i)
325 		(void) HYPERVISOR_yield();
326 	(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
327 }
328 
329 /*
330  * From a machine address, find the corresponding pseudo-physical address.
331  * Pseudo-physical address are contiguous and run from mfn_base in each VM.
332  * Machine addresses are the real underlying hardware addresses.
333  * These are needed for page table entries. Note that this routine is
334  * poorly protected. A bad value of "ma" will cause a page fault.
335  */
336 paddr_t
337 ma_to_pa(maddr_t ma)
338 {
339 	ulong_t pgoff = ma & MMU_PAGEOFFSET;
340 	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
341 	paddr_t pa;
342 
343 	if (pfn >= xen_info->nr_pages)
344 		return (-(paddr_t)1);
345 	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
346 #ifdef DEBUG
347 	if (ma != pa_to_ma(pa))
348 		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
349 		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
350 #endif
351 	return (pa);
352 }
353 
354 /*
355  * From a pseudo-physical address, find the corresponding machine address.
356  */
357 maddr_t
358 pa_to_ma(paddr_t pa)
359 {
360 	pfn_t pfn;
361 	ulong_t mfn;
362 
363 	pfn = mmu_btop(pa - mfn_base);
364 	if (pa < mfn_base || pfn >= xen_info->nr_pages)
365 		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
366 	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
367 #ifdef DEBUG
368 	if (mfn_to_pfn_mapping[mfn] != pfn)
369 		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
370 		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
371 #endif
372 	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
373 }
374 
375 #endif	/* __xpv */
376 
377 x86pte_t
378 get_pteval(paddr_t table, uint_t index)
379 {
380 	if (pae_support)
381 		return (((x86pte_t *)(uintptr_t)table)[index]);
382 	return (((x86pte32_t *)(uintptr_t)table)[index]);
383 }
384 
385 /*ARGSUSED*/
386 void
387 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
388 {
389 #ifdef __xpv
390 	mmu_update_t t;
391 	maddr_t mtable = pa_to_ma(table);
392 	int retcnt;
393 
394 	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
395 	t.val = pteval;
396 	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
397 		dboot_panic("HYPERVISOR_mmu_update() failed");
398 #else /* __xpv */
399 	uintptr_t tab_addr = (uintptr_t)table;
400 
401 	if (pae_support)
402 		((x86pte_t *)tab_addr)[index] = pteval;
403 	else
404 		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
405 	if (level == top_level && level == 2)
406 		reload_cr3();
407 #endif /* __xpv */
408 }
409 
410 paddr_t
411 make_ptable(x86pte_t *pteval, uint_t level)
412 {
413 	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
414 
415 	if (level == top_level && level == 2)
416 		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
417 	else
418 		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
419 
420 #ifdef __xpv
421 	/* Remove write permission to the new page table. */
422 	if (HYPERVISOR_update_va_mapping(new_table,
423 	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
424 		dboot_panic("HYP_update_va_mapping error");
425 #endif
426 
427 	if (map_debug)
428 		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
429 		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
430 	return (new_table);
431 }
432 
433 x86pte_t *
434 map_pte(paddr_t table, uint_t index)
435 {
436 	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
437 }
438 
439 /*
440  * dump out the contents of page tables...
441  */
442 static void
443 dump_tables(void)
444 {
445 	uint_t save_index[4];	/* for recursion */
446 	char *save_table[4];	/* for recursion */
447 	uint_t	l;
448 	uint64_t va;
449 	uint64_t pgsize;
450 	int index;
451 	int i;
452 	x86pte_t pteval;
453 	char *table;
454 	static char *tablist = "\t\t\t";
455 	char *tabs = tablist + 3 - top_level;
456 	uint_t pa, pa1;
457 #if !defined(__xpv)
458 #define	maddr_t paddr_t
459 #endif /* !__xpv */
460 
461 	dboot_printf("Finished pagetables:\n");
462 	table = (char *)(uintptr_t)top_page_table;
463 	l = top_level;
464 	va = 0;
465 	for (index = 0; index < ptes_per_table; ++index) {
466 		pgsize = 1ull << shift_amt[l];
467 		if (pae_support)
468 			pteval = ((x86pte_t *)table)[index];
469 		else
470 			pteval = ((x86pte32_t *)table)[index];
471 		if (pteval == 0)
472 			goto next_entry;
473 
474 		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
475 		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
476 		pa = ma_to_pa(pteval & MMU_PAGEMASK);
477 		dboot_printf(" physaddr=%x\n", pa);
478 
479 		/*
480 		 * Don't try to walk hypervisor private pagetables
481 		 */
482 		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
483 			save_table[l] = table;
484 			save_index[l] = index;
485 			--l;
486 			index = -1;
487 			table = (char *)(uintptr_t)
488 			    ma_to_pa(pteval & MMU_PAGEMASK);
489 			goto recursion;
490 		}
491 
492 		/*
493 		 * shorten dump for consecutive mappings
494 		 */
495 		for (i = 1; index + i < ptes_per_table; ++i) {
496 			if (pae_support)
497 				pteval = ((x86pte_t *)table)[index + i];
498 			else
499 				pteval = ((x86pte32_t *)table)[index + i];
500 			if (pteval == 0)
501 				break;
502 			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
503 			if (pa1 != pa + i * pgsize)
504 				break;
505 		}
506 		if (i > 2) {
507 			dboot_printf("%s...\n", tabs + l);
508 			va += pgsize * (i - 2);
509 			index += i - 2;
510 		}
511 next_entry:
512 		va += pgsize;
513 		if (l == 3 && index == 256)	/* VA hole */
514 			va = 0xffff800000000000ull;
515 recursion:
516 		;
517 	}
518 	if (l < top_level) {
519 		++l;
520 		index = save_index[l];
521 		table = save_table[l];
522 		goto recursion;
523 	}
524 }
525 
526 /*
527  * Add a mapping for the machine page at the given virtual address.
528  */
529 static void
530 map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
531 {
532 	x86pte_t *ptep;
533 	x86pte_t pteval;
534 
535 	pteval = ma | pte_bits;
536 	if (level > 0)
537 		pteval |= PT_PAGESIZE;
538 	if (va >= target_kernel_text && pge_support)
539 		pteval |= PT_GLOBAL;
540 
541 	if (map_debug && ma != va)
542 		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
543 		    " pte=0x%" PRIx64 " l=%d\n",
544 		    (uint64_t)ma, (uint64_t)va, pteval, level);
545 
546 #if defined(__xpv)
547 	/*
548 	 * see if we can avoid find_pte() on the hypervisor
549 	 */
550 	if (HYPERVISOR_update_va_mapping(va, pteval,
551 	    UVMF_INVLPG | UVMF_LOCAL) == 0)
552 		return;
553 #endif
554 
555 	/*
556 	 * Find the pte that will map this address. This creates any
557 	 * missing intermediate level page tables
558 	 */
559 	ptep = find_pte(va, NULL, level, 0);
560 
561 	/*
562 	 * When paravirtualized, we must use hypervisor calls to modify the
563 	 * PTE, since paging is active. On real hardware we just write to
564 	 * the pagetables which aren't in use yet.
565 	 */
566 #if defined(__xpv)
567 	ptep = ptep;	/* shut lint up */
568 	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
569 		dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
570 		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
571 		    (uint64_t)va, level, (uint64_t)ma, pteval);
572 #else
573 	if (va < 1024 * 1024)
574 		pteval |= PT_NOCACHE;		/* for video RAM */
575 	if (pae_support)
576 		*ptep = pteval;
577 	else
578 		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
579 #endif
580 }
581 
582 /*
583  * Add a mapping for the physical page at the given virtual address.
584  */
585 static void
586 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
587 {
588 	map_ma_at_va(pa_to_ma(pa), va, level);
589 }
590 
591 /*
592  * This is called to remove start..end from the
593  * possible range of PCI addresses.
594  */
595 const uint64_t pci_lo_limit = 0x00100000ul;
596 const uint64_t pci_hi_limit = 0xfff00000ul;
597 static void
598 exclude_from_pci(uint64_t start, uint64_t end)
599 {
600 	int i;
601 	int j;
602 	struct boot_memlist *ml;
603 
604 	for (i = 0; i < pcimemlists_used; ++i) {
605 		ml = &pcimemlists[i];
606 
607 		/* delete the entire range? */
608 		if (start <= ml->addr && ml->addr + ml->size <= end) {
609 			--pcimemlists_used;
610 			for (j = i; j < pcimemlists_used; ++j)
611 				pcimemlists[j] = pcimemlists[j + 1];
612 			--i;	/* to revisit the new one at this index */
613 		}
614 
615 		/* split a range? */
616 		else if (ml->addr < start && end < ml->addr + ml->size) {
617 
618 			++pcimemlists_used;
619 			if (pcimemlists_used > MAX_MEMLIST)
620 				dboot_panic("too many pcimemlists");
621 
622 			for (j = pcimemlists_used - 1; j > i; --j)
623 				pcimemlists[j] = pcimemlists[j - 1];
624 			ml->size = start - ml->addr;
625 
626 			++ml;
627 			ml->size = (ml->addr + ml->size) - end;
628 			ml->addr = end;
629 			++i;	/* skip on to next one */
630 		}
631 
632 		/* cut memory off the start? */
633 		else if (ml->addr < end && end < ml->addr + ml->size) {
634 			ml->size -= end - ml->addr;
635 			ml->addr = end;
636 		}
637 
638 		/* cut memory off the end? */
639 		else if (ml->addr <= start && start < ml->addr + ml->size) {
640 			ml->size = start - ml->addr;
641 		}
642 	}
643 }
644 
645 /*
646  * During memory allocation, find the highest address not used yet.
647  */
648 static void
649 check_higher(paddr_t a)
650 {
651 	if (a < next_avail_addr)
652 		return;
653 	next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
654 	DBG(next_avail_addr);
655 }
656 
657 static int
658 dboot_loader_mmap_entries(void)
659 {
660 #if !defined(__xpv)
661 	if (num_entries_set == B_TRUE)
662 		return (num_entries);
663 
664 	switch (multiboot_version) {
665 	case 1:
666 		DBG(mb_info->flags);
667 		if (mb_info->flags & 0x40) {
668 			mb_memory_map_t *mmap;
669 
670 			DBG(mb_info->mmap_addr);
671 			DBG(mb_info->mmap_length);
672 			check_higher(mb_info->mmap_addr + mb_info->mmap_length);
673 
674 			for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
675 			    (uint32_t)mmap < mb_info->mmap_addr +
676 			    mb_info->mmap_length;
677 			    mmap = (mb_memory_map_t *)((uint32_t)mmap +
678 			    mmap->size + sizeof (mmap->size)))
679 				++num_entries;
680 
681 			num_entries_set = B_TRUE;
682 		}
683 		break;
684 	case 2:
685 		num_entries_set = B_TRUE;
686 		num_entries = dboot_multiboot2_mmap_nentries(mb2_info,
687 		    mb2_mmap_tagp);
688 		break;
689 	default:
690 		dboot_panic("Unknown multiboot version: %d\n",
691 		    multiboot_version);
692 		break;
693 	}
694 	return (num_entries);
695 #else
696 	return (MAXMAPS);
697 #endif
698 }
699 
700 static uint32_t
701 dboot_loader_mmap_get_type(int index)
702 {
703 #if !defined(__xpv)
704 	mb_memory_map_t *mp, *mpend;
705 	int i;
706 
707 	switch (multiboot_version) {
708 	case 1:
709 		mp = (mb_memory_map_t *)mb_info->mmap_addr;
710 		mpend = (mb_memory_map_t *)
711 		    (mb_info->mmap_addr + mb_info->mmap_length);
712 
713 		for (i = 0; mp < mpend && i != index; i++)
714 			mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
715 			    sizeof (mp->size));
716 		if (mp >= mpend) {
717 			dboot_panic("dboot_loader_mmap_get_type(): index "
718 			    "out of bounds: %d\n", index);
719 		}
720 		return (mp->type);
721 
722 	case 2:
723 		return (dboot_multiboot2_mmap_get_type(mb2_info,
724 		    mb2_mmap_tagp, index));
725 
726 	default:
727 		dboot_panic("Unknown multiboot version: %d\n",
728 		    multiboot_version);
729 		break;
730 	}
731 	return (0);
732 #else
733 	return (map_buffer[index].type);
734 #endif
735 }
736 
737 static uint64_t
738 dboot_loader_mmap_get_base(int index)
739 {
740 #if !defined(__xpv)
741 	mb_memory_map_t *mp, *mpend;
742 	int i;
743 
744 	switch (multiboot_version) {
745 	case 1:
746 		mp = (mb_memory_map_t *)mb_info->mmap_addr;
747 		mpend = (mb_memory_map_t *)
748 		    (mb_info->mmap_addr + mb_info->mmap_length);
749 
750 		for (i = 0; mp < mpend && i != index; i++)
751 			mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
752 			    sizeof (mp->size));
753 		if (mp >= mpend) {
754 			dboot_panic("dboot_loader_mmap_get_base(): index "
755 			    "out of bounds: %d\n", index);
756 		}
757 		return (((uint64_t)mp->base_addr_high << 32) +
758 		    (uint64_t)mp->base_addr_low);
759 
760 	case 2:
761 		return (dboot_multiboot2_mmap_get_base(mb2_info,
762 		    mb2_mmap_tagp, index));
763 
764 	default:
765 		dboot_panic("Unknown multiboot version: %d\n",
766 		    multiboot_version);
767 		break;
768 	}
769 	return (0);
770 #else
771 	return (((uint64_t)map_buffer[index].base_addr_high << 32) +
772 	    (uint64_t)map_buffer[index].base_addr_low);
773 #endif
774 }
775 
776 static uint64_t
777 dboot_loader_mmap_get_length(int index)
778 {
779 #if !defined(__xpv)
780 	mb_memory_map_t *mp, *mpend;
781 	int i;
782 
783 	switch (multiboot_version) {
784 	case 1:
785 		mp = (mb_memory_map_t *)mb_info->mmap_addr;
786 		mpend = (mb_memory_map_t *)
787 		    (mb_info->mmap_addr + mb_info->mmap_length);
788 
789 		for (i = 0; mp < mpend && i != index; i++)
790 			mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
791 			    sizeof (mp->size));
792 		if (mp >= mpend) {
793 			dboot_panic("dboot_loader_mmap_get_length(): index "
794 			    "out of bounds: %d\n", index);
795 		}
796 		return (((uint64_t)mp->length_high << 32) +
797 		    (uint64_t)mp->length_low);
798 
799 	case 2:
800 		return (dboot_multiboot2_mmap_get_length(mb2_info,
801 		    mb2_mmap_tagp, index));
802 
803 	default:
804 		dboot_panic("Unknown multiboot version: %d\n",
805 		    multiboot_version);
806 		break;
807 	}
808 	return (0);
809 #else
810 	return (((uint64_t)map_buffer[index].length_high << 32) +
811 	    (uint64_t)map_buffer[index].length_low);
812 #endif
813 }
814 
815 static void
816 build_pcimemlists(void)
817 {
818 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
819 	uint64_t start;
820 	uint64_t end;
821 	int i, num;
822 
823 	/*
824 	 * initialize
825 	 */
826 	pcimemlists[0].addr = pci_lo_limit;
827 	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
828 	pcimemlists_used = 1;
829 
830 	num = dboot_loader_mmap_entries();
831 	/*
832 	 * Fill in PCI memlists.
833 	 */
834 	for (i = 0; i < num; ++i) {
835 		start = dboot_loader_mmap_get_base(i);
836 		end = start + dboot_loader_mmap_get_length(i);
837 
838 		if (prom_debug)
839 			dboot_printf("\ttype: %d %" PRIx64 "..%"
840 			    PRIx64 "\n", dboot_loader_mmap_get_type(i),
841 			    start, end);
842 
843 		/*
844 		 * page align start and end
845 		 */
846 		start = (start + page_offset) & ~page_offset;
847 		end &= ~page_offset;
848 		if (end <= start)
849 			continue;
850 
851 		exclude_from_pci(start, end);
852 	}
853 
854 	/*
855 	 * Finish off the pcimemlist
856 	 */
857 	if (prom_debug) {
858 		for (i = 0; i < pcimemlists_used; ++i) {
859 			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
860 			    PRIx64 "\n", pcimemlists[i].addr,
861 			    pcimemlists[i].addr + pcimemlists[i].size);
862 		}
863 	}
864 	pcimemlists[0].next = 0;
865 	pcimemlists[0].prev = 0;
866 	for (i = 1; i < pcimemlists_used; ++i) {
867 		pcimemlists[i].prev =
868 		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
869 		pcimemlists[i].next = 0;
870 		pcimemlists[i - 1].next =
871 		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
872 	}
873 	bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
874 	DBG(bi->bi_pcimem);
875 }
876 
877 #if defined(__xpv)
878 /*
879  * Initialize memory allocator stuff from hypervisor-supplied start info.
880  */
881 static void
882 init_mem_alloc(void)
883 {
884 	int	local;	/* variables needed to find start region */
885 	paddr_t	scratch_start;
886 	xen_memory_map_t map;
887 
888 	DBG_MSG("Entered init_mem_alloc()\n");
889 
890 	/*
891 	 * Free memory follows the stack. There's at least 512KB of scratch
892 	 * space, rounded up to at least 2Mb alignment.  That should be enough
893 	 * for the page tables we'll need to build.  The nucleus memory is
894 	 * allocated last and will be outside the addressible range.  We'll
895 	 * switch to new page tables before we unpack the kernel
896 	 */
897 	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
898 	DBG(scratch_start);
899 	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
900 	DBG(scratch_end);
901 
902 	/*
903 	 * For paranoia, leave some space between hypervisor data and ours.
904 	 * Use 500 instead of 512.
905 	 */
906 	next_avail_addr = scratch_end - 500 * 1024;
907 	DBG(next_avail_addr);
908 
909 	/*
910 	 * The domain builder gives us at most 1 module
911 	 */
912 	DBG(xen_info->mod_len);
913 	if (xen_info->mod_len > 0) {
914 		DBG(xen_info->mod_start);
915 		modules[0].bm_addr =
916 		    (native_ptr_t)(uintptr_t)xen_info->mod_start;
917 		modules[0].bm_size = xen_info->mod_len;
918 		bi->bi_module_cnt = 1;
919 		bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
920 	} else {
921 		bi->bi_module_cnt = 0;
922 		bi->bi_modules = (native_ptr_t)(uintptr_t)NULL;
923 	}
924 	DBG(bi->bi_module_cnt);
925 	DBG(bi->bi_modules);
926 
927 	DBG(xen_info->mfn_list);
928 	DBG(xen_info->nr_pages);
929 	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
930 	DBG(max_mem);
931 
932 	/*
933 	 * Using pseudo-physical addresses, so only 1 memlist element
934 	 */
935 	memlists[0].addr = 0;
936 	DBG(memlists[0].addr);
937 	memlists[0].size = max_mem;
938 	DBG(memlists[0].size);
939 	memlists_used = 1;
940 	DBG(memlists_used);
941 
942 	/*
943 	 * finish building physinstall list
944 	 */
945 	sort_physinstall();
946 
947 	/*
948 	 * build bios reserved memlists
949 	 */
950 	build_rsvdmemlists();
951 
952 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
953 		/*
954 		 * build PCI Memory list
955 		 */
956 		map.nr_entries = MAXMAPS;
957 		/*LINTED: constant in conditional context*/
958 		set_xen_guest_handle(map.buffer, map_buffer);
959 		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
960 			dboot_panic("getting XENMEM_machine_memory_map failed");
961 		build_pcimemlists();
962 	}
963 }
964 
965 #else	/* !__xpv */
966 
/*
 * Currently a no-op: the body is empty.
 * NOTE(review): by its name this looks like a hook for passing console
 * information from a multiboot 1 loader on to the kernel; confirm against
 * its callers.
 */
static void
dboot_multiboot1_xboot_consinfo(void)
{
}
971 
/*
 * Currently a no-op: the body is empty.
 * NOTE(review): by its name this looks like a hook for passing console
 * information from a multiboot 2 loader on to the kernel; confirm against
 * its callers.
 */
static void
dboot_multiboot2_xboot_consinfo(void)
{
}
976 
977 static int
978 dboot_multiboot_modcount(void)
979 {
980 	switch (multiboot_version) {
981 	case 1:
982 		return (mb_info->mods_count);
983 
984 	case 2:
985 		return (dboot_multiboot2_modcount(mb2_info));
986 
987 	default:
988 		dboot_panic("Unknown multiboot version: %d\n",
989 		    multiboot_version);
990 		break;
991 	}
992 	return (0);
993 }
994 
995 static uint32_t
996 dboot_multiboot_modstart(int index)
997 {
998 	switch (multiboot_version) {
999 	case 1:
1000 		return (((mb_module_t *)mb_info->mods_addr)[index].mod_start);
1001 
1002 	case 2:
1003 		return (dboot_multiboot2_modstart(mb2_info, index));
1004 
1005 	default:
1006 		dboot_panic("Unknown multiboot version: %d\n",
1007 		    multiboot_version);
1008 		break;
1009 	}
1010 	return (0);
1011 }
1012 
1013 static uint32_t
1014 dboot_multiboot_modend(int index)
1015 {
1016 	switch (multiboot_version) {
1017 	case 1:
1018 		return (((mb_module_t *)mb_info->mods_addr)[index].mod_end);
1019 
1020 	case 2:
1021 		return (dboot_multiboot2_modend(mb2_info, index));
1022 
1023 	default:
1024 		dboot_panic("Unknown multiboot version: %d\n",
1025 		    multiboot_version);
1026 		break;
1027 	}
1028 	return (0);
1029 }
1030 
1031 static char *
1032 dboot_multiboot_modcmdline(int index)
1033 {
1034 	switch (multiboot_version) {
1035 	case 1:
1036 		return ((char *)((mb_module_t *)
1037 		    mb_info->mods_addr)[index].mod_name);
1038 
1039 	case 2:
1040 		return (dboot_multiboot2_modcmdline(mb2_info, index));
1041 
1042 	default:
1043 		dboot_panic("Unknown multiboot version: %d\n",
1044 		    multiboot_version);
1045 		break;
1046 	}
1047 	return (0);
1048 }
1049 
1050 /*
1051  * Find the environment module for console setup.
1052  * Since we need the console to print early boot messages, the console is set up
1053  * before anything else and therefore we need to pick up the environment module
1054  * early too.
1055  *
1056  * Note, we just will search for and if found, will pass the env
1057  * module to console setup, the proper module list processing will happen later.
1058  */
1059 static void
1060 dboot_find_env(void)
1061 {
1062 	int i, modcount;
1063 	uint32_t mod_start, mod_end;
1064 	char *cmdline;
1065 
1066 	modcount = dboot_multiboot_modcount();
1067 
1068 	for (i = 0; i < modcount; ++i) {
1069 		cmdline = dboot_multiboot_modcmdline(i);
1070 		if (cmdline == NULL)
1071 			continue;
1072 
1073 		if (strstr(cmdline, "type=environment") == NULL)
1074 			continue;
1075 
1076 		mod_start = dboot_multiboot_modstart(i);
1077 		mod_end = dboot_multiboot_modend(i);
1078 		modules[0].bm_addr = (native_ptr_t)(uintptr_t)mod_start;
1079 		modules[0].bm_size = mod_end - mod_start;
1080 		modules[0].bm_name = (native_ptr_t)(uintptr_t)NULL;
1081 		modules[0].bm_hash = (native_ptr_t)(uintptr_t)NULL;
1082 		modules[0].bm_type = BMT_ENV;
1083 		bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1084 		bi->bi_module_cnt = 1;
1085 		return;
1086 	}
1087 }
1088 
1089 static boolean_t
1090 dboot_multiboot_basicmeminfo(uint32_t *lower, uint32_t *upper)
1091 {
1092 	boolean_t rv = B_FALSE;
1093 
1094 	switch (multiboot_version) {
1095 	case 1:
1096 		if (mb_info->flags & 0x01) {
1097 			*lower = mb_info->mem_lower;
1098 			*upper = mb_info->mem_upper;
1099 			rv = B_TRUE;
1100 		}
1101 		break;
1102 
1103 	case 2:
1104 		return (dboot_multiboot2_basicmeminfo(mb2_info, lower, upper));
1105 
1106 	default:
1107 		dboot_panic("Unknown multiboot version: %d\n",
1108 		    multiboot_version);
1109 		break;
1110 	}
1111 	return (rv);
1112 }
1113 
/*
 * Convert one ASCII hex character ('0'-'9', 'a'-'f' or 'A'-'F') to its
 * value, 0 through 15.
 *
 * Any other character is a fatal error. Both ends of each range are
 * checked so that characters which merely sort after '0', 'A' or 'a'
 * (such as ':', 'G' or 'z') are rejected rather than being turned into a
 * meaningless value.
 */
static uint8_t
dboot_a2h(char v)
{
	if (v >= '0' && v <= '9')
		return (v - '0');
	if (v >= 'a' && v <= 'f')
		return (v - 'a' + 0xa);
	if (v >= 'A' && v <= 'F')
		return (v - 'A' + 0xa);

	dboot_panic("bad ASCII hex character %c\n", v);

	return (0);
}
1128 
1129 static void
1130 digest_a2h(const char *ascii, uint8_t *digest)
1131 {
1132 	unsigned int i;
1133 
1134 	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1135 		digest[i] = dboot_a2h(ascii[i * 2]) << 4;
1136 		digest[i] |= dboot_a2h(ascii[i * 2 + 1]);
1137 	}
1138 }
1139 
1140 /*
1141  * Generate a SHA-1 hash of the first len bytes of image, and compare it with
1142  * the ASCII-format hash found in the 40-byte buffer at ascii.  If they
1143  * match, return 0, otherwise -1.  This works only for images smaller than
1144  * 4 GB, which should not be a problem.
1145  */
1146 static int
1147 check_image_hash(uint_t midx)
1148 {
1149 	const char *ascii;
1150 	const void *image;
1151 	size_t len;
1152 	SHA1_CTX ctx;
1153 	uint8_t digest[SHA1_DIGEST_LENGTH];
1154 	uint8_t baseline[SHA1_DIGEST_LENGTH];
1155 	unsigned int i;
1156 
1157 	ascii = (const char *)(uintptr_t)modules[midx].bm_hash;
1158 	image = (const void *)(uintptr_t)modules[midx].bm_addr;
1159 	len = (size_t)modules[midx].bm_size;
1160 
1161 	digest_a2h(ascii, baseline);
1162 
1163 	SHA1Init(&ctx);
1164 	SHA1Update(&ctx, image, len);
1165 	SHA1Final(digest, &ctx);
1166 
1167 	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1168 		if (digest[i] != baseline[i])
1169 			return (-1);
1170 	}
1171 
1172 	return (0);
1173 }
1174 
1175 static const char *
1176 type_to_str(boot_module_type_t type)
1177 {
1178 	switch (type) {
1179 	case BMT_ROOTFS:
1180 		return ("rootfs");
1181 	case BMT_FILE:
1182 		return ("file");
1183 	case BMT_HASH:
1184 		return ("hash");
1185 	case BMT_ENV:
1186 		return ("environment");
1187 	default:
1188 		return ("unknown");
1189 	}
1190 }
1191 
1192 static void
1193 check_images(void)
1194 {
1195 	uint_t i;
1196 	char displayhash[SHA1_ASCII_LENGTH + 1];
1197 
1198 	for (i = 0; i < modules_used; i++) {
1199 		if (prom_debug) {
1200 			dboot_printf("module #%d: name %s type %s "
1201 			    "addr %lx size %lx\n",
1202 			    i, (char *)(uintptr_t)modules[i].bm_name,
1203 			    type_to_str(modules[i].bm_type),
1204 			    (ulong_t)modules[i].bm_addr,
1205 			    (ulong_t)modules[i].bm_size);
1206 		}
1207 
1208 		if (modules[i].bm_type == BMT_HASH ||
1209 		    modules[i].bm_hash == (native_ptr_t)(uintptr_t)NULL) {
1210 			DBG_MSG("module has no hash; skipping check\n");
1211 			continue;
1212 		}
1213 		(void) memcpy(displayhash,
1214 		    (void *)(uintptr_t)modules[i].bm_hash,
1215 		    SHA1_ASCII_LENGTH);
1216 		displayhash[SHA1_ASCII_LENGTH] = '\0';
1217 		if (prom_debug) {
1218 			dboot_printf("checking expected hash [%s]: ",
1219 			    displayhash);
1220 		}
1221 
1222 		if (check_image_hash(i) != 0)
1223 			dboot_panic("hash mismatch!\n");
1224 		else
1225 			DBG_MSG("OK\n");
1226 	}
1227 }
1228 
1229 /*
1230  * Determine the module's starting address, size, name, and type, and fill the
1231  * boot_modules structure.  This structure is used by the bop code, except for
1232  * hashes which are checked prior to transferring control to the kernel.
1233  */
1234 static void
1235 process_module(int midx)
1236 {
1237 	uint32_t mod_start = dboot_multiboot_modstart(midx);
1238 	uint32_t mod_end = dboot_multiboot_modend(midx);
1239 	char *cmdline = dboot_multiboot_modcmdline(midx);
1240 	char *p, *q;
1241 
1242 	check_higher(mod_end);
1243 	if (prom_debug) {
1244 		dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n",
1245 		    midx, cmdline, (ulong_t)mod_start, (ulong_t)mod_end);
1246 	}
1247 
1248 	if (mod_start > mod_end) {
1249 		dboot_panic("module #%d: module start address 0x%lx greater "
1250 		    "than end address 0x%lx", midx,
1251 		    (ulong_t)mod_start, (ulong_t)mod_end);
1252 	}
1253 
1254 	/*
1255 	 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
1256 	 * the address of the last valid byte in a module plus 1 as mod_end.
1257 	 * This is of course a bug; the multiboot specification simply states
1258 	 * that mod_start and mod_end "contain the start and end addresses of
1259 	 * the boot module itself" which is pretty obviously not what GRUB is
1260 	 * doing.  However, fixing it requires that not only this code be
1261 	 * changed but also that other code consuming this value and values
1262 	 * derived from it be fixed, and that the kernel and GRUB must either
1263 	 * both have the bug or neither.  While there are a lot of combinations
1264 	 * that will work, there are also some that won't, so for simplicity
1265 	 * we'll just cope with the bug.  That means we won't actually hash the
1266 	 * byte at mod_end, and we will expect that mod_end for the hash file
1267 	 * itself is one greater than some multiple of 41 (40 bytes of ASCII
1268 	 * hash plus a newline for each module).  We set bm_size to the true
1269 	 * correct number of bytes in each module, achieving exactly this.
1270 	 */
1271 
1272 	modules[midx].bm_addr = (native_ptr_t)(uintptr_t)mod_start;
1273 	modules[midx].bm_size = mod_end - mod_start;
1274 	modules[midx].bm_name = (native_ptr_t)(uintptr_t)cmdline;
1275 	modules[midx].bm_hash = (native_ptr_t)(uintptr_t)NULL;
1276 	modules[midx].bm_type = BMT_FILE;
1277 
1278 	if (cmdline == NULL) {
1279 		modules[midx].bm_name = (native_ptr_t)(uintptr_t)noname;
1280 		return;
1281 	}
1282 
1283 	p = cmdline;
1284 	modules[midx].bm_name =
1285 	    (native_ptr_t)(uintptr_t)strsep(&p, " \t\f\n\r");
1286 
1287 	while (p != NULL) {
1288 		q = strsep(&p, " \t\f\n\r");
1289 		if (strncmp(q, "name=", 5) == 0) {
1290 			if (q[5] != '\0' && !isspace(q[5])) {
1291 				modules[midx].bm_name =
1292 				    (native_ptr_t)(uintptr_t)(q + 5);
1293 			}
1294 			continue;
1295 		}
1296 
1297 		if (strncmp(q, "type=", 5) == 0) {
1298 			if (q[5] == '\0' || isspace(q[5]))
1299 				continue;
1300 			q += 5;
1301 			if (strcmp(q, "rootfs") == 0) {
1302 				modules[midx].bm_type = BMT_ROOTFS;
1303 			} else if (strcmp(q, "hash") == 0) {
1304 				modules[midx].bm_type = BMT_HASH;
1305 			} else if (strcmp(q, "environment") == 0) {
1306 				modules[midx].bm_type = BMT_ENV;
1307 			} else if (strcmp(q, "file") != 0) {
1308 				dboot_printf("\tmodule #%d: unknown module "
1309 				    "type '%s'; defaulting to 'file'",
1310 				    midx, q);
1311 			}
1312 			continue;
1313 		}
1314 
1315 		if (strncmp(q, "hash=", 5) == 0) {
1316 			if (q[5] != '\0' && !isspace(q[5])) {
1317 				modules[midx].bm_hash =
1318 				    (native_ptr_t)(uintptr_t)(q + 5);
1319 			}
1320 			continue;
1321 		}
1322 
1323 		dboot_printf("ignoring unknown option '%s'\n", q);
1324 	}
1325 }
1326 
1327 /*
1328  * Backward compatibility: if there are exactly one or two modules, both
1329  * of type 'file' and neither with an embedded hash value, we have been
1330  * given the legacy style modules.  In this case we need to treat the first
1331  * module as a rootfs and the second as a hash referencing that module.
1332  * Otherwise, even if the configuration is invalid, we assume that the
1333  * operator knows what he's doing or at least isn't being bitten by this
1334  * interface change.
1335  */
1336 static void
1337 fixup_modules(void)
1338 {
1339 	if (modules_used == 0 || modules_used > 2)
1340 		return;
1341 
1342 	if (modules[0].bm_type != BMT_FILE ||
1343 	    modules_used > 1 && modules[1].bm_type != BMT_FILE) {
1344 		return;
1345 	}
1346 
1347 	if (modules[0].bm_hash != (native_ptr_t)(uintptr_t)NULL ||
1348 	    modules_used > 1 &&
1349 	    modules[1].bm_hash != (native_ptr_t)(uintptr_t)NULL) {
1350 		return;
1351 	}
1352 
1353 	modules[0].bm_type = BMT_ROOTFS;
1354 	if (modules_used > 1) {
1355 		modules[1].bm_type = BMT_HASH;
1356 		modules[1].bm_name = modules[0].bm_name;
1357 	}
1358 }
1359 
1360 /*
1361  * For modules that do not have assigned hashes but have a separate hash module,
1362  * find the assigned hash module and set the primary module's bm_hash to point
1363  * to the hash data from that module.  We will then ignore modules of type
1364  * BMT_HASH from this point forward.
1365  */
1366 static void
1367 assign_module_hashes(void)
1368 {
1369 	uint_t i, j;
1370 
1371 	for (i = 0; i < modules_used; i++) {
1372 		if (modules[i].bm_type == BMT_HASH ||
1373 		    modules[i].bm_hash != (native_ptr_t)(uintptr_t)NULL) {
1374 			continue;
1375 		}
1376 
1377 		for (j = 0; j < modules_used; j++) {
1378 			if (modules[j].bm_type != BMT_HASH ||
1379 			    strcmp((char *)(uintptr_t)modules[j].bm_name,
1380 			    (char *)(uintptr_t)modules[i].bm_name) != 0) {
1381 				continue;
1382 			}
1383 
1384 			if (modules[j].bm_size < SHA1_ASCII_LENGTH) {
1385 				dboot_printf("Short hash module of length "
1386 				    "0x%lx bytes; ignoring\n",
1387 				    (ulong_t)modules[j].bm_size);
1388 			} else {
1389 				modules[i].bm_hash = modules[j].bm_addr;
1390 			}
1391 			break;
1392 		}
1393 	}
1394 }
1395 
1396 /*
1397  * Walk through the module information finding the last used address.
1398  * The first available address will become the top level page table.
1399  */
1400 static void
1401 dboot_process_modules(void)
1402 {
1403 	int i, modcount;
1404 	extern char _end[];
1405 
1406 	DBG_MSG("\nFinding Modules\n");
1407 	modcount = dboot_multiboot_modcount();
1408 	if (modcount > MAX_BOOT_MODULES) {
1409 		dboot_panic("Too many modules (%d) -- the maximum is %d.",
1410 		    modcount, MAX_BOOT_MODULES);
1411 	}
1412 	/*
1413 	 * search the modules to find the last used address
1414 	 * we'll build the module list while we're walking through here
1415 	 */
1416 	check_higher((paddr_t)(uintptr_t)&_end);
1417 	for (i = 0; i < modcount; ++i) {
1418 		process_module(i);
1419 		modules_used++;
1420 	}
1421 	bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1422 	DBG(bi->bi_modules);
1423 	bi->bi_module_cnt = modcount;
1424 	DBG(bi->bi_module_cnt);
1425 
1426 	fixup_modules();
1427 	assign_module_hashes();
1428 	check_images();
1429 }
1430 
1431 /*
1432  * We then build the phys_install memlist from the multiboot information.
1433  */
1434 static void
1435 dboot_process_mmap(void)
1436 {
1437 	uint64_t start;
1438 	uint64_t end;
1439 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
1440 	uint32_t lower, upper;
1441 	int i, mmap_entries;
1442 
1443 	/*
1444 	 * Walk through the memory map from multiboot and build our memlist
1445 	 * structures. Note these will have native format pointers.
1446 	 */
1447 	DBG_MSG("\nFinding Memory Map\n");
1448 	num_entries = 0;
1449 	num_entries_set = B_FALSE;
1450 	max_mem = 0;
1451 	if ((mmap_entries = dboot_loader_mmap_entries()) > 0) {
1452 		for (i = 0; i < mmap_entries; i++) {
1453 			uint32_t type = dboot_loader_mmap_get_type(i);
1454 			start = dboot_loader_mmap_get_base(i);
1455 			end = start + dboot_loader_mmap_get_length(i);
1456 
1457 			if (prom_debug)
1458 				dboot_printf("\ttype: %d %" PRIx64 "..%"
1459 				    PRIx64 "\n", type, start, end);
1460 
1461 			/*
1462 			 * page align start and end
1463 			 */
1464 			start = (start + page_offset) & ~page_offset;
1465 			end &= ~page_offset;
1466 			if (end <= start)
1467 				continue;
1468 
1469 			/*
1470 			 * only type 1 is usable RAM
1471 			 */
1472 			switch (type) {
1473 			case 1:
1474 				if (end > max_mem)
1475 					max_mem = end;
1476 				memlists[memlists_used].addr = start;
1477 				memlists[memlists_used].size = end - start;
1478 				++memlists_used;
1479 				if (memlists_used > MAX_MEMLIST)
1480 					dboot_panic("too many memlists");
1481 				break;
1482 			case 2:
1483 				rsvdmemlists[rsvdmemlists_used].addr = start;
1484 				rsvdmemlists[rsvdmemlists_used].size =
1485 				    end - start;
1486 				++rsvdmemlists_used;
1487 				if (rsvdmemlists_used > MAX_MEMLIST)
1488 					dboot_panic("too many rsvdmemlists");
1489 				break;
1490 			default:
1491 				continue;
1492 			}
1493 		}
1494 		build_pcimemlists();
1495 	} else if (dboot_multiboot_basicmeminfo(&lower, &upper)) {
1496 		DBG(lower);
1497 		memlists[memlists_used].addr = 0;
1498 		memlists[memlists_used].size = lower * 1024;
1499 		++memlists_used;
1500 		DBG(upper);
1501 		memlists[memlists_used].addr = 1024 * 1024;
1502 		memlists[memlists_used].size = upper * 1024;
1503 		++memlists_used;
1504 
1505 		/*
1506 		 * Old platform - assume I/O space at the end of memory.
1507 		 */
1508 		pcimemlists[0].addr = (upper * 1024) + (1024 * 1024);
1509 		pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
1510 		pcimemlists[0].next = 0;
1511 		pcimemlists[0].prev = 0;
1512 		bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
1513 		DBG(bi->bi_pcimem);
1514 	} else {
1515 		dboot_panic("No memory info from boot loader!!!");
1516 	}
1517 
1518 	/*
1519 	 * finish processing the physinstall list
1520 	 */
1521 	sort_physinstall();
1522 
1523 	/*
1524 	 * build bios reserved mem lists
1525 	 */
1526 	build_rsvdmemlists();
1527 }
1528 
1529 /*
1530  * The highest address is used as the starting point for dboot's simple
1531  * memory allocator.
1532  *
1533  * Finding the highest address in case of Multiboot 1 protocol is
1534  * quite painful in the sense that some information provided by
1535  * the multiboot info structure points to BIOS data, and some to RAM.
1536  *
1537  * The module list was processed and checked already by dboot_process_modules(),
1538  * so we will check the command line string and the memory map.
1539  *
1540  * This list of to be checked items is based on our current knowledge of
1541  * allocations made by grub1 and will need to be reviewed if there
1542  * are updates about the information provided by Multiboot 1.
1543  *
1544  * In the case of the Multiboot 2, our life is much simpler, as the MB2
1545  * information tag list is one contiguous chunk of memory.
1546  */
1547 static paddr_t
1548 dboot_multiboot1_highest_addr(void)
1549 {
1550 	paddr_t addr = (paddr_t)(uintptr_t)NULL;
1551 	char *cmdl = (char *)mb_info->cmdline;
1552 
1553 	if (mb_info->flags & MB_INFO_CMDLINE)
1554 		addr = ((paddr_t)((uintptr_t)cmdl + strlen(cmdl) + 1));
1555 
1556 	if (mb_info->flags & MB_INFO_MEM_MAP)
1557 		addr = MAX(addr,
1558 		    ((paddr_t)(mb_info->mmap_addr + mb_info->mmap_length)));
1559 	return (addr);
1560 }
1561 
1562 static void
1563 dboot_multiboot_highest_addr(void)
1564 {
1565 	paddr_t addr;
1566 
1567 	switch (multiboot_version) {
1568 	case 1:
1569 		addr = dboot_multiboot1_highest_addr();
1570 		if (addr != (paddr_t)(uintptr_t)NULL)
1571 			check_higher(addr);
1572 		break;
1573 	case 2:
1574 		addr = dboot_multiboot2_highest_addr(mb2_info);
1575 		if (addr != (paddr_t)(uintptr_t)NULL)
1576 			check_higher(addr);
1577 		break;
1578 	default:
1579 		dboot_panic("Unknown multiboot version: %d\n",
1580 		    multiboot_version);
1581 		break;
1582 	}
1583 }
1584 
/*
 * Walk the boot loader provided information and find the highest free
 * address.  Three sources are examined in turn: the module list, the memory
 * map, and the remaining multiboot data.
 */
static void
init_mem_alloc(void)
{
	DBG_MSG("Entered init_mem_alloc()\n");

	dboot_process_modules();
	dboot_process_mmap();
	dboot_multiboot_highest_addr();
}
1596 
1597 static void
1598 dboot_multiboot_get_fwtables(void)
1599 {
1600 	multiboot_tag_new_acpi_t *nacpitagp;
1601 	multiboot_tag_old_acpi_t *oacpitagp;
1602 
1603 	/* no fw tables from multiboot 1 */
1604 	if (multiboot_version != 2)
1605 		return;
1606 
1607 	/* only provide SMBIOS pointer in case of UEFI */
1608 	bi->bi_smbios = (native_ptr_t)(uintptr_t)NULL;
1609 
1610 	nacpitagp = (multiboot_tag_new_acpi_t *)
1611 	    dboot_multiboot2_find_tag(mb2_info,
1612 	    MULTIBOOT_TAG_TYPE_ACPI_NEW);
1613 	oacpitagp = (multiboot_tag_old_acpi_t *)
1614 	    dboot_multiboot2_find_tag(mb2_info,
1615 	    MULTIBOOT_TAG_TYPE_ACPI_OLD);
1616 
1617 	if (nacpitagp != NULL) {
1618 		bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1619 		    &nacpitagp->mb_rsdp[0];
1620 	} else if (oacpitagp != NULL) {
1621 		bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1622 		    &oacpitagp->mb_rsdp[0];
1623 	} else {
1624 		bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)NULL;
1625 	}
1626 }
1627 #endif /* !__xpv */
1628 
1629 /*
1630  * Simple memory allocator, allocates aligned physical memory.
1631  * Note that startup_kernel() only allocates memory, never frees.
1632  * Memory usage just grows in an upward direction.
1633  */
1634 static void *
1635 do_mem_alloc(uint32_t size, uint32_t align)
1636 {
1637 	uint_t i;
1638 	uint64_t best;
1639 	uint64_t start;
1640 	uint64_t end;
1641 
1642 	/*
1643 	 * make sure size is a multiple of pagesize
1644 	 */
1645 	size = RNDUP(size, MMU_PAGESIZE);
1646 	next_avail_addr = RNDUP(next_avail_addr, align);
1647 
1648 	/*
1649 	 * XXPV fixme joe
1650 	 *
1651 	 * a really large bootarchive that causes you to run out of memory
1652 	 * may cause this to blow up
1653 	 */
1654 	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
1655 	best = (uint64_t)-size;
1656 	for (i = 0; i < memlists_used; ++i) {
1657 		start = memlists[i].addr;
1658 #if defined(__xpv)
1659 		start += mfn_base;
1660 #endif
1661 		end = start + memlists[i].size;
1662 
1663 		/*
1664 		 * did we find the desired address?
1665 		 */
1666 		if (start <= next_avail_addr && next_avail_addr + size <= end) {
1667 			best = next_avail_addr;
1668 			goto done;
1669 		}
1670 
1671 		/*
1672 		 * if not is this address the best so far?
1673 		 */
1674 		if (start > next_avail_addr && start < best &&
1675 		    RNDUP(start, align) + size <= end)
1676 			best = RNDUP(start, align);
1677 	}
1678 
1679 	/*
1680 	 * We didn't find exactly the address we wanted, due to going off the
1681 	 * end of a memory region. Return the best found memory address.
1682 	 */
1683 done:
1684 	next_avail_addr = best + size;
1685 #if defined(__xpv)
1686 	if (next_avail_addr > scratch_end)
1687 		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
1688 		    "0x%lx", (ulong_t)next_avail_addr,
1689 		    (ulong_t)scratch_end);
1690 #endif
1691 	(void) memset((void *)(uintptr_t)best, 0, size);
1692 	return ((void *)(uintptr_t)best);
1693 }
1694 
1695 void *
1696 mem_alloc(uint32_t size)
1697 {
1698 	return (do_mem_alloc(size, MMU_PAGESIZE));
1699 }
1700 
1701 
1702 /*
1703  * Build page tables to map all of memory used so far as well as the kernel.
1704  */
1705 static void
1706 build_page_tables(void)
1707 {
1708 	uint32_t psize;
1709 	uint32_t level;
1710 	uint32_t off;
1711 	uint64_t start;
1712 #if !defined(__xpv)
1713 	uint32_t i;
1714 	uint64_t end;
1715 #endif	/* __xpv */
1716 
1717 	/*
1718 	 * If we're on metal, we need to create the top level pagetable.
1719 	 */
1720 #if defined(__xpv)
1721 	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
1722 #else /* __xpv */
1723 	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
1724 #endif /* __xpv */
1725 	DBG((uintptr_t)top_page_table);
1726 
1727 	/*
1728 	 * Determine if we'll use large mappings for kernel, then map it.
1729 	 */
1730 	if (largepage_support) {
1731 		psize = lpagesize;
1732 		level = 1;
1733 	} else {
1734 		psize = MMU_PAGESIZE;
1735 		level = 0;
1736 	}
1737 
1738 	DBG_MSG("Mapping kernel\n");
1739 	DBG(ktext_phys);
1740 	DBG(target_kernel_text);
1741 	DBG(ksize);
1742 	DBG(psize);
1743 	for (off = 0; off < ksize; off += psize)
1744 		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);
1745 
1746 	/*
1747 	 * The kernel will need a 1 page window to work with page tables
1748 	 */
1749 	bi->bi_pt_window = (native_ptr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
1750 	DBG(bi->bi_pt_window);
1751 	bi->bi_pte_to_pt_window =
1752 	    (native_ptr_t)(uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
1753 	DBG(bi->bi_pte_to_pt_window);
1754 
1755 #if defined(__xpv)
1756 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
1757 		/* If this is a domU we're done. */
1758 		DBG_MSG("\nPage tables constructed\n");
1759 		return;
1760 	}
1761 #endif /* __xpv */
1762 
1763 	/*
1764 	 * We need 1:1 mappings for the lower 1M of memory to access
1765 	 * BIOS tables used by a couple of drivers during boot.
1766 	 *
1767 	 * The following code works because our simple memory allocator
1768 	 * only grows usage in an upwards direction.
1769 	 *
1770 	 * Note that by this point in boot some mappings for low memory
1771 	 * may already exist because we've already accessed device in low
1772 	 * memory.  (Specifically the video frame buffer and keyboard
1773 	 * status ports.)  If we're booting on raw hardware then GRUB
1774 	 * created these mappings for us.  If we're booting under a
1775 	 * hypervisor then we went ahead and remapped these devices into
1776 	 * memory allocated within dboot itself.
1777 	 */
1778 	if (map_debug)
1779 		dboot_printf("1:1 map pa=0..1Meg\n");
1780 	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
1781 #if defined(__xpv)
1782 		map_ma_at_va(start, start, 0);
1783 #else /* __xpv */
1784 		map_pa_at_va(start, start, 0);
1785 #endif /* __xpv */
1786 	}
1787 
1788 #if !defined(__xpv)
1789 	for (i = 0; i < memlists_used; ++i) {
1790 		start = memlists[i].addr;
1791 
1792 		end = start + memlists[i].size;
1793 
1794 		if (map_debug)
1795 			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
1796 			    start, end);
1797 		while (start < end && start < next_avail_addr) {
1798 			map_pa_at_va(start, start, 0);
1799 			start += MMU_PAGESIZE;
1800 		}
1801 	}
1802 #endif /* !__xpv */
1803 
1804 	DBG_MSG("\nPage tables constructed\n");
1805 }
1806 
/*
 * Message for a boot entry that still mentions "multiboot"; it tells the
 * operator what the grub entry should look like instead.
 */
#define	NO_MULTIBOOT	\
	"multiboot is no longer used to boot the Solaris Operating System.\n" \
	"The grub entry should be changed to:\n" \
	"kernel$ /platform/i86pc/kernel/$ISADIR/unix\n" \
	"module$ /platform/i86pc/$ISADIR/boot_archive\n" \
	"See http://illumos.org/msg/SUNOS-8000-AK for details.\n"
1813 
1814 static void
1815 dboot_init_xboot_consinfo(void)
1816 {
1817 	uintptr_t addr;
1818 	/*
1819 	 * boot info must be 16 byte aligned for 64 bit kernel ABI
1820 	 */
1821 	addr = (uintptr_t)boot_info;
1822 	addr = (addr + 0xf) & ~0xf;
1823 	bi = (struct xboot_info *)addr;
1824 
1825 #if !defined(__xpv)
1826 	switch (multiboot_version) {
1827 	case 1:
1828 		dboot_multiboot1_xboot_consinfo();
1829 		break;
1830 	case 2:
1831 		dboot_multiboot2_xboot_consinfo();
1832 		break;
1833 	default:
1834 		dboot_panic("Unknown multiboot version: %d\n",
1835 		    multiboot_version);
1836 		break;
1837 	}
1838 	/*
1839 	 * Lookup environment module for the console. Complete module list
1840 	 * will be built after console setup.
1841 	 */
1842 	dboot_find_env();
1843 #endif
1844 }
1845 
1846 /*
1847  * Set up basic data from the boot loader.
1848  * The load_addr is part of AOUT kludge setup in dboot_grub.s, to support
1849  * 32-bit dboot code setup used to set up and start 64-bit kernel.
1850  * AOUT kludge does allow 32-bit boot loader, such as grub1, to load and
1851  * start 64-bit illumos kernel.
1852  */
1853 static void
1854 dboot_loader_init(void)
1855 {
1856 #if !defined(__xpv)
1857 	mb_info = NULL;
1858 	mb2_info = NULL;
1859 
1860 	switch (mb_magic) {
1861 	case MB_BOOTLOADER_MAGIC:
1862 		multiboot_version = 1;
1863 		mb_info = (multiboot_info_t *)(uintptr_t)mb_addr;
1864 #if defined(_BOOT_TARGET_amd64)
1865 		load_addr = mb_header.load_addr;
1866 #endif
1867 		break;
1868 
1869 	case MULTIBOOT2_BOOTLOADER_MAGIC:
1870 		multiboot_version = 2;
1871 		mb2_info = (multiboot2_info_header_t *)(uintptr_t)mb_addr;
1872 		mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mb2_info);
1873 #if defined(_BOOT_TARGET_amd64)
1874 		load_addr = mb2_load_addr;
1875 #endif
1876 		break;
1877 
1878 	default:
1879 		dboot_panic("Unknown bootloader magic: 0x%x\n", mb_magic);
1880 		break;
1881 	}
1882 #endif	/* !defined(__xpv) */
1883 }
1884 
1885 /* Extract the kernel command line from [multi]boot information. */
1886 static char *
1887 dboot_loader_cmdline(void)
1888 {
1889 	char *line = NULL;
1890 
1891 #if defined(__xpv)
1892 	line = (char *)xen_info->cmd_line;
1893 #else /* __xpv */
1894 
1895 	switch (multiboot_version) {
1896 	case 1:
1897 		if (mb_info->flags & MB_INFO_CMDLINE)
1898 			line = (char *)mb_info->cmdline;
1899 		break;
1900 
1901 	case 2:
1902 		line = dboot_multiboot2_cmdline(mb2_info);
1903 		break;
1904 
1905 	default:
1906 		dboot_panic("Unknown multiboot version: %d\n",
1907 		    multiboot_version);
1908 		break;
1909 	}
1910 
1911 #endif /* __xpv */
1912 
1913 	/*
1914 	 * Make sure we have valid pointer so the string operations
1915 	 * will not crash us.
1916 	 */
1917 	if (line == NULL)
1918 		line = "";
1919 
1920 	return (line);
1921 }
1922 
1923 static char *
1924 dboot_loader_name(void)
1925 {
1926 #if defined(__xpv)
1927 	return (NULL);
1928 #else /* __xpv */
1929 	multiboot_tag_string_t *tag;
1930 
1931 	switch (multiboot_version) {
1932 	case 1:
1933 		return ((char *)mb_info->boot_loader_name);
1934 
1935 	case 2:
1936 		tag = dboot_multiboot2_find_tag(mb2_info,
1937 		    MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME);
1938 		return (tag->mb_string);
1939 	default:
1940 		dboot_panic("Unknown multiboot version: %d\n",
1941 		    multiboot_version);
1942 		break;
1943 	}
1944 
1945 	return (NULL);
1946 #endif /* __xpv */
1947 }
1948 /*
1949  * startup_kernel has a pretty simple job. It builds pagetables which reflect
1950  * 1:1 mappings for all memory in use. It then also adds mappings for
1951  * the kernel nucleus at virtual address of target_kernel_text using large page
1952  * mappings. The page table pages are also accessible at 1:1 mapped
1953  * virtual addresses.
1954  */
1955 /*ARGSUSED*/
1956 void
1957 startup_kernel(void)
1958 {
1959 	char *cmdline;
1960 	char *bootloader;
1961 #if defined(__xpv)
1962 	physdev_set_iopl_t set_iopl;
1963 #endif /* __xpv */
1964 
1965 	dboot_loader_init();
1966 	/*
1967 	 * At this point we are executing in a 32 bit real mode.
1968 	 */
1969 
1970 	bootloader = dboot_loader_name();
1971 	cmdline = dboot_loader_cmdline();
1972 
1973 #if defined(__xpv)
1974 	/*
1975 	 * For dom0, before we initialize the console subsystem we'll
1976 	 * need to enable io operations, so set I/O priveldge level to 1.
1977 	 */
1978 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1979 		set_iopl.iopl = 1;
1980 		(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1981 	}
1982 #endif /* __xpv */
1983 
1984 	dboot_init_xboot_consinfo();
1985 	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
1986 	bcons_init(bi);
1987 
1988 	prom_debug = (find_boot_prop("prom_debug") != NULL);
1989 	map_debug = (find_boot_prop("map_debug") != NULL);
1990 
1991 #if !defined(__xpv)
1992 	dboot_multiboot_get_fwtables();
1993 #endif
1994 	DBG_MSG("\n\nillumos prekernel set: ");
1995 	DBG_MSG(cmdline);
1996 	DBG_MSG("\n");
1997 
1998 	if (bootloader != NULL && prom_debug) {
1999 		dboot_printf("Kernel loaded by: %s\n", bootloader);
2000 #if !defined(__xpv)
2001 		dboot_printf("Using multiboot %d boot protocol.\n",
2002 		    multiboot_version);
2003 #endif
2004 	}
2005 
2006 	if (strstr(cmdline, "multiboot") != NULL) {
2007 		dboot_panic(NO_MULTIBOOT);
2008 	}
2009 
2010 	DBG((uintptr_t)bi);
2011 #if !defined(__xpv)
2012 	DBG((uintptr_t)mb_info);
2013 	DBG((uintptr_t)mb2_info);
2014 	if (mb2_info != NULL)
2015 		DBG(mb2_info->mbi_total_size);
2016 	DBG(bi->bi_acpi_rsdp);
2017 	DBG(bi->bi_smbios);
2018 #endif
2019 
2020 	/*
2021 	 * Need correct target_kernel_text value
2022 	 */
2023 #if defined(_BOOT_TARGET_amd64)
2024 	target_kernel_text = KERNEL_TEXT_amd64;
2025 #elif defined(__xpv)
2026 	target_kernel_text = KERNEL_TEXT_i386_xpv;
2027 #else
2028 	target_kernel_text = KERNEL_TEXT_i386;
2029 #endif
2030 	DBG(target_kernel_text);
2031 
2032 #if defined(__xpv)
2033 
2034 	/*
2035 	 * XXPV	Derive this stuff from CPUID / what the hypervisor has enabled
2036 	 */
2037 
2038 #if defined(_BOOT_TARGET_amd64)
2039 	/*
2040 	 * 64-bit hypervisor.
2041 	 */
2042 	amd64_support = 1;
2043 	pae_support = 1;
2044 
2045 #else	/* _BOOT_TARGET_amd64 */
2046 
2047 	/*
2048 	 * See if we are running on a PAE Hypervisor
2049 	 */
2050 	{
2051 		xen_capabilities_info_t caps;
2052 
2053 		if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
2054 			dboot_panic("HYPERVISOR_xen_version(caps) failed");
2055 		caps[sizeof (caps) - 1] = 0;
2056 		if (prom_debug)
2057 			dboot_printf("xen capabilities %s\n", caps);
2058 		if (strstr(caps, "x86_32p") != NULL)
2059 			pae_support = 1;
2060 	}
2061 
2062 #endif	/* _BOOT_TARGET_amd64 */
2063 	{
2064 		xen_platform_parameters_t p;
2065 
2066 		if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
2067 			dboot_panic("HYPERVISOR_xen_version(parms) failed");
2068 		DBG(p.virt_start);
2069 		mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
2070 	}
2071 
2072 	/*
2073 	 * The hypervisor loads stuff starting at 1Gig
2074 	 */
2075 	mfn_base = ONE_GIG;
2076 	DBG(mfn_base);
2077 
2078 	/*
2079 	 * enable writable page table mode for the hypervisor
2080 	 */
2081 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
2082 	    VMASST_TYPE_writable_pagetables) < 0)
2083 		dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
2084 
2085 	/*
2086 	 * check for NX support
2087 	 */
2088 	if (pae_support) {
2089 		uint32_t eax = 0x80000000;
2090 		uint32_t edx = get_cpuid_edx(&eax);
2091 
2092 		if (eax >= 0x80000001) {
2093 			eax = 0x80000001;
2094 			edx = get_cpuid_edx(&eax);
2095 			if (edx & CPUID_AMD_EDX_NX)
2096 				NX_support = 1;
2097 		}
2098 	}
2099 
2100 #if !defined(_BOOT_TARGET_amd64)
2101 
2102 	/*
2103 	 * The 32-bit hypervisor uses segmentation to protect itself from
2104 	 * guests. This means when a guest attempts to install a flat 4GB
2105 	 * code or data descriptor the 32-bit hypervisor will protect itself
2106 	 * by silently shrinking the segment such that if the guest attempts
2107 	 * any access where the hypervisor lives a #gp fault is generated.
2108 	 * The problem is that some applications expect a full 4GB flat
2109 	 * segment for their current thread pointer and will use negative
2110 	 * offset segment wrap around to access data. TLS support in linux
2111 	 * brand is one example of this.
2112 	 *
2113 	 * The 32-bit hypervisor can catch the #gp fault in these cases
2114 	 * and emulate the access without passing the #gp fault to the guest
2115 	 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
2116 	 * Seems like this should have been the default.
2117 	 * Either way, we want the hypervisor -- and not Solaris -- to deal
2118 	 * to deal with emulating these accesses.
2119 	 */
2120 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
2121 	    VMASST_TYPE_4gb_segments) < 0)
2122 		dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
2123 #endif	/* !_BOOT_TARGET_amd64 */
2124 
2125 #else	/* __xpv */
2126 
2127 	/*
2128 	 * use cpuid to enable MMU features
2129 	 */
2130 	if (have_cpuid()) {
2131 		uint32_t eax, edx;
2132 
2133 		eax = 1;
2134 		edx = get_cpuid_edx(&eax);
2135 		if (edx & CPUID_INTC_EDX_PSE)
2136 			largepage_support = 1;
2137 		if (edx & CPUID_INTC_EDX_PGE)
2138 			pge_support = 1;
2139 		if (edx & CPUID_INTC_EDX_PAE)
2140 			pae_support = 1;
2141 
2142 		eax = 0x80000000;
2143 		edx = get_cpuid_edx(&eax);
2144 		if (eax >= 0x80000001) {
2145 			eax = 0x80000001;
2146 			edx = get_cpuid_edx(&eax);
2147 			if (edx & CPUID_AMD_EDX_LM)
2148 				amd64_support = 1;
2149 			if (edx & CPUID_AMD_EDX_NX)
2150 				NX_support = 1;
2151 		}
2152 	} else {
2153 		dboot_printf("cpuid not supported\n");
2154 	}
2155 #endif /* __xpv */
2156 
2157 
2158 #if defined(_BOOT_TARGET_amd64)
2159 	if (amd64_support == 0)
2160 		dboot_panic("long mode not supported, rebooting");
2161 	else if (pae_support == 0)
2162 		dboot_panic("long mode, but no PAE; rebooting");
2163 #else
2164 	/*
2165 	 * Allow the command line to over-ride use of PAE for 32 bit.
2166 	 */
2167 	if (strstr(cmdline, "disablePAE=true") != NULL) {
2168 		pae_support = 0;
2169 		NX_support = 0;
2170 		amd64_support = 0;
2171 	}
2172 #endif
2173 
2174 	/*
2175 	 * initialize the simple memory allocator
2176 	 */
2177 	init_mem_alloc();
2178 
2179 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
2180 	/*
2181 	 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
2182 	 */
2183 	if (max_mem < FOUR_GIG && NX_support == 0)
2184 		pae_support = 0;
2185 #endif
2186 
2187 	/*
2188 	 * configure mmu information
2189 	 */
2190 	if (pae_support) {
2191 		shift_amt = shift_amt_pae;
2192 		ptes_per_table = 512;
2193 		pte_size = 8;
2194 		lpagesize = TWO_MEG;
2195 #if defined(_BOOT_TARGET_amd64)
2196 		top_level = 3;
2197 #else
2198 		top_level = 2;
2199 #endif
2200 	} else {
2201 		pae_support = 0;
2202 		NX_support = 0;
2203 		shift_amt = shift_amt_nopae;
2204 		ptes_per_table = 1024;
2205 		pte_size = 4;
2206 		lpagesize = FOUR_MEG;
2207 		top_level = 1;
2208 	}
2209 
2210 	DBG(pge_support);
2211 	DBG(NX_support);
2212 	DBG(largepage_support);
2213 	DBG(amd64_support);
2214 	DBG(top_level);
2215 	DBG(pte_size);
2216 	DBG(ptes_per_table);
2217 	DBG(lpagesize);
2218 
2219 #if defined(__xpv)
2220 	ktext_phys = ONE_GIG;		/* from UNIX Mapfile */
2221 #else
2222 	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */
2223 #endif
2224 
2225 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
2226 	/*
2227 	 * For grub, copy kernel bits from the ELF64 file to final place.
2228 	 */
2229 	DBG_MSG("\nAllocating nucleus pages.\n");
2230 	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
2231 	if (ktext_phys == 0)
2232 		dboot_panic("failed to allocate aligned kernel memory");
2233 	DBG(load_addr);
2234 	if (dboot_elfload64(load_addr) != 0)
2235 		dboot_panic("failed to parse kernel ELF image, rebooting");
2236 #endif
2237 
2238 	DBG(ktext_phys);
2239 
2240 	/*
2241 	 * Allocate page tables.
2242 	 */
2243 	build_page_tables();
2244 
2245 	/*
2246 	 * return to assembly code to switch to running kernel
2247 	 */
2248 	entry_addr_low = (uint32_t)target_kernel_text;
2249 	DBG(entry_addr_low);
2250 	bi->bi_use_largepage = largepage_support;
2251 	bi->bi_use_pae = pae_support;
2252 	bi->bi_use_pge = pge_support;
2253 	bi->bi_use_nx = NX_support;
2254 
2255 #if defined(__xpv)
2256 
2257 	bi->bi_next_paddr = next_avail_addr - mfn_base;
2258 	DBG(bi->bi_next_paddr);
2259 	bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
2260 	DBG(bi->bi_next_vaddr);
2261 
2262 	/*
2263 	 * unmap unused pages in start area to make them available for DMA
2264 	 */
2265 	while (next_avail_addr < scratch_end) {
2266 		(void) HYPERVISOR_update_va_mapping(next_avail_addr,
2267 		    0, UVMF_INVLPG | UVMF_LOCAL);
2268 		next_avail_addr += MMU_PAGESIZE;
2269 	}
2270 
2271 	bi->bi_xen_start_info = (native_ptr_t)(uintptr_t)xen_info;
2272 	DBG((uintptr_t)HYPERVISOR_shared_info);
2273 	bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
2274 	bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
2275 
2276 #else /* __xpv */
2277 
2278 	bi->bi_next_paddr = next_avail_addr;
2279 	DBG(bi->bi_next_paddr);
2280 	bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
2281 	DBG(bi->bi_next_vaddr);
2282 	bi->bi_mb_version = multiboot_version;
2283 
2284 	switch (multiboot_version) {
2285 	case 1:
2286 		bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb_info;
2287 		break;
2288 	case 2:
2289 		bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb2_info;
2290 		break;
2291 	default:
2292 		dboot_panic("Unknown multiboot version: %d\n",
2293 		    multiboot_version);
2294 		break;
2295 	}
2296 	bi->bi_top_page_table = (uintptr_t)top_page_table;
2297 
2298 #endif /* __xpv */
2299 
2300 	bi->bi_kseg_size = FOUR_MEG;
2301 	DBG(bi->bi_kseg_size);
2302 
2303 #ifndef __xpv
2304 	if (map_debug)
2305 		dump_tables();
2306 #endif
2307 
2308 	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
2309 }
2310