xref: /illumos-gate/usr/src/lib/libm/common/m9x/__fex_sse.c (revision 9a686fbc186e8e2a64e9a5094d44c7d6fa0ea167)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24  */
25 /*
26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #include <ucontext.h>
31 #include <fenv.h>
32 #if defined(__SUNPRO_C)
33 #include <sunmath.h>
34 #else
35 #include <sys/ieeefp.h>
36 #endif
37 #include <sys/regset.h>
38 #include "fex_handler.h"
39 #include "fenv_inlines.h"
40 
#if !defined(REG_PC)
#define REG_PC	EIP
#endif

#if !defined(REG_PS)
#define REG_PS	EFL
#endif

/*
 * regno(X) converts a register number X, as assembled from the ModRM/SIB
 * fields (plus the REX extension bit on amd64), into the index used for
 * uc_mcontext.gregs[].  On amd64, numbers below 4 count down from
 * REG_RAX, number 4 is REG_RSP, and numbers above 4 count down from
 * REG_RAX + 1; on 32-bit x86 the index simply counts down from EAX.
 *
 * The argument is fully parenthesized so that an expression such as
 * regno(a + b) expands correctly.
 */
#ifdef __amd64
#define regno(X)	(((X) < 4)? REG_RAX - (X) : \
			(((X) > 4)? REG_RAX + 1 - (X) : REG_RSP))
#else
#define regno(X)	(EAX - (X))
#endif
55 
56 /*
57  * Support for SSE instructions
58  */
59 
60 /*
61  * Decode an SSE instruction.  Fill in *inst and return the length of the
62  * instruction in bytes.  Return 0 if the instruction is not recognized.
63  */
64 int
65 __fex_parse_sse(ucontext_t *uap, sseinst_t *inst)
66 {
67 	unsigned char	*ip;
68 	char		*addr;
69 	int		i, dbl, simd, rex, modrm, sib, r;
70 
71 	i = 0;
72 	ip = (unsigned char *)uap->uc_mcontext.gregs[REG_PC];
73 
74 	/* look for pseudo-prefixes */
75 	dbl = 0;
76 	simd = SIMD;
77 	if (ip[i] == 0xF3) {
78 		simd = 0;
79 		i++;
80 	} else if (ip[i] == 0x66) {
81 		dbl = DOUBLE;
82 		i++;
83 	} else if (ip[i] == 0xF2) {
84 		dbl = DOUBLE;
85 		simd = 0;
86 		i++;
87 	}
88 
89 	/* look for AMD64 REX prefix */
90 	rex = 0;
91 	if (ip[i] >= 0x40 && ip[i] <= 0x4F) {
92 		rex = ip[i];
93 		i++;
94 	}
95 
96 	/* parse opcode */
97 	if (ip[i++] != 0x0F)
98 		return 0;
99 	switch (ip[i++]) {
100 	case 0x2A:
101 		inst->op = (int)cvtsi2ss + simd + dbl;
102 		if (!simd)
103 			inst->op = (int)inst->op + (rex & 8);
104 		break;
105 
106 	case 0x2C:
107 		inst->op = (int)cvttss2si + simd + dbl;
108 		if (!simd)
109 			inst->op = (int)inst->op + (rex & 8);
110 		break;
111 
112 	case 0x2D:
113 		inst->op = (int)cvtss2si + simd + dbl;
114 		if (!simd)
115 			inst->op = (int)inst->op + (rex & 8);
116 		break;
117 
118 	case 0x2E:
119 		/* oddball: scalar instruction in a SIMD opcode group */
120 		if (!simd)
121 			return 0;
122 		inst->op = (int)ucomiss + dbl;
123 		break;
124 
125 	case 0x2F:
126 		/* oddball: scalar instruction in a SIMD opcode group */
127 		if (!simd)
128 			return 0;
129 		inst->op = (int)comiss + dbl;
130 		break;
131 
132 	case 0x51:
133 		inst->op = (int)sqrtss + simd + dbl;
134 		break;
135 
136 	case 0x58:
137 		inst->op = (int)addss + simd + dbl;
138 		break;
139 
140 	case 0x59:
141 		inst->op = (int)mulss + simd + dbl;
142 		break;
143 
144 	case 0x5A:
145 		inst->op = (int)cvtss2sd + simd + dbl;
146 		break;
147 
148 	case 0x5B:
149 		if (dbl) {
150 			if (simd)
151 				inst->op = cvtps2dq;
152 			else
153 				return 0;
154 		} else {
155 			inst->op = (simd)? cvtdq2ps : cvttps2dq;
156 		}
157 		break;
158 
159 	case 0x5C:
160 		inst->op = (int)subss + simd + dbl;
161 		break;
162 
163 	case 0x5D:
164 		inst->op = (int)minss + simd + dbl;
165 		break;
166 
167 	case 0x5E:
168 		inst->op = (int)divss + simd + dbl;
169 		break;
170 
171 	case 0x5F:
172 		inst->op = (int)maxss + simd + dbl;
173 		break;
174 
175 	case 0xC2:
176 		inst->op = (int)cmpss + simd + dbl;
177 		break;
178 
179 	case 0xE6:
180 		if (simd) {
181 			if (dbl)
182 				inst->op = cvttpd2dq;
183 			else
184 				return 0;
185 		} else {
186 			inst->op = (dbl)? cvtpd2dq : cvtdq2pd;
187 		}
188 		break;
189 
190 	default:
191 		return 0;
192 	}
193 
194 	/* locate operands */
195 	modrm = ip[i++];
196 
197 	if (inst->op == cvtss2si || inst->op == cvttss2si ||
198 	    inst->op == cvtsd2si || inst->op == cvttsd2si ||
199 	    inst->op == cvtss2siq || inst->op == cvttss2siq ||
200 	    inst->op == cvtsd2siq || inst->op == cvttsd2siq) {
201 		/* op1 is a gp register */
202 		r = ((rex & 4) << 1) | ((modrm >> 3) & 7);
203 		inst->op1 = (sseoperand_t *)&uap->uc_mcontext.gregs[regno(r)];
204 	} else if (inst->op == cvtps2pi || inst->op == cvttps2pi ||
205 	    inst->op == cvtpd2pi || inst->op == cvttpd2pi) {
206 		/* op1 is a mmx register */
207 #ifdef __amd64
208 		inst->op1 = (sseoperand_t *)&uap->uc_mcontext.fpregs.fp_reg_set.
209 		    fpchip_state.st[(modrm >> 3) & 7];
210 #else
211 		inst->op1 = (sseoperand_t *)(10 * ((modrm >> 3) & 7) +
212 		    (char *)&uap->uc_mcontext.fpregs.fp_reg_set.
213 		    fpchip_state.state[7]);
214 #endif
215 	} else {
216 		/* op1 is a xmm register */
217 		r = ((rex & 4) << 1) | ((modrm >> 3) & 7);
218 		inst->op1 = (sseoperand_t *)&uap->uc_mcontext.fpregs.
219 		    fp_reg_set.fpchip_state.xmm[r];
220 	}
221 
222 	if ((modrm >> 6) == 3) {
223 		if (inst->op == cvtsi2ss || inst->op == cvtsi2sd ||
224 		    inst->op == cvtsi2ssq || inst->op == cvtsi2sdq) {
225 			/* op2 is a gp register */
226 			r = ((rex & 1) << 3) | (modrm & 7);
227 			inst->op2 = (sseoperand_t *)&uap->uc_mcontext.
228 			    gregs[regno(r)];
229 		} else if (inst->op == cvtpi2ps || inst->op == cvtpi2pd) {
230 			/* op2 is a mmx register */
231 #ifdef __amd64
232 			inst->op2 = (sseoperand_t *)&uap->uc_mcontext.fpregs.
233 			    fp_reg_set.fpchip_state.st[modrm & 7];
234 #else
235 			inst->op2 = (sseoperand_t *)(10 * (modrm & 7) +
236 			    (char *)&uap->uc_mcontext.fpregs.fp_reg_set.
237 			    fpchip_state.state[7]);
238 #endif
239 		} else {
240 			/* op2 is a xmm register */
241 			r = ((rex & 1) << 3) | (modrm & 7);
242 			inst->op2 = (sseoperand_t *)&uap->uc_mcontext.fpregs.
243 			    fp_reg_set.fpchip_state.xmm[r];
244 		}
245 	} else if ((modrm & 0xc7) == 0x05) {
246 #ifdef __amd64
247 		/* address of next instruction + offset */
248 		r = i + 4;
249 		if (inst->op == cmpss || inst->op == cmpps ||
250 		    inst->op == cmpsd || inst->op == cmppd)
251 			r++;
252 		inst->op2 = (sseoperand_t *)(ip + r + *(int *)(ip + i));
253 #else
254 		/* absolute address */
255 		inst->op2 = (sseoperand_t *)(*(int *)(ip + i));
256 #endif
257 		i += 4;
258 	} else {
259 		/* complex address */
260 		if ((modrm & 7) == 4) {
261 			/* parse sib byte */
262 			sib = ip[i++];
263 			if ((sib & 7) == 5 && (modrm >> 6) == 0) {
264 				/* start with absolute address */
265 				addr = (char *)(uintptr_t)(*(int *)(ip + i));
266 				i += 4;
267 			} else {
268 				/* start with base */
269 				r = ((rex & 1) << 3) | (sib & 7);
270 				addr = (char *)uap->uc_mcontext.gregs[regno(r)];
271 			}
272 			r = ((rex & 2) << 2) | ((sib >> 3) & 7);
273 			if (r != 4) {
274 				/* add scaled index */
275 				addr += uap->uc_mcontext.gregs[regno(r)]
276 				    << (sib >> 6);
277 			}
278 		} else {
279 			r = ((rex & 1) << 3) | (modrm & 7);
280 			addr = (char *)uap->uc_mcontext.gregs[regno(r)];
281 		}
282 
283 		/* add displacement, if any */
284 		if ((modrm >> 6) == 1) {
285 			addr += (char)ip[i++];
286 		} else if ((modrm >> 6) == 2) {
287 			addr += *(int *)(ip + i);
288 			i += 4;
289 		}
290 		inst->op2 = (sseoperand_t *)addr;
291 	}
292 
293 	if (inst->op == cmpss || inst->op == cmpps || inst->op == cmpsd ||
294 	    inst->op == cmppd) {
295 		/* get the immediate operand */
296 		inst->imm = ip[i++];
297 	}
298 
299 	return i;
300 }
301 
302 static enum fp_class_type
303 my_fp_classf(float *x)
304 {
305 	int	i = *(int *)x & ~0x80000000;
306 
307 	if (i < 0x7f800000) {
308 		if (i < 0x00800000)
309 			return ((i == 0)? fp_zero : fp_subnormal);
310 		return fp_normal;
311 	}
312 	else if (i == 0x7f800000)
313 		return fp_infinity;
314 	else if (i & 0x400000)
315 		return fp_quiet;
316 	else
317 		return fp_signaling;
318 }
319 
320 static enum fp_class_type
321 my_fp_class(double *x)
322 {
323 	int	i = *(1+(int *)x) & ~0x80000000;
324 
325 	if (i < 0x7ff00000) {
326 		if (i < 0x00100000)
327 			return (((i | *(int *)x) == 0)? fp_zero : fp_subnormal);
328 		return fp_normal;
329 	}
330 	else if (i == 0x7ff00000 && *(int *)x == 0)
331 		return fp_infinity;
332 	else if (i & 0x80000)
333 		return fp_quiet;
334 	else
335 		return fp_signaling;
336 }
337 
338 /*
339  * Inspect a scalar SSE instruction that incurred an invalid operation
340  * exception to determine which type of exception it was.
341  */
342 static enum fex_exception
343 __fex_get_sse_invalid_type(sseinst_t *inst)
344 {
345 	enum fp_class_type	t1, t2;
346 
347 	/* check op2 for signaling nan */
348 	t2 = ((int)inst->op & DOUBLE)? my_fp_class(&inst->op2->d[0]) :
349 	    my_fp_classf(&inst->op2->f[0]);
350 	if (t2 == fp_signaling)
351 		return fex_inv_snan;
352 
353 	/* eliminate all single-operand instructions */
354 	switch (inst->op) {
355 	case cvtsd2ss:
356 	case cvtss2sd:
357 		/* hmm, this shouldn't have happened */
358 		return (enum fex_exception) -1;
359 
360 	case sqrtss:
361 	case sqrtsd:
362 		return fex_inv_sqrt;
363 
364 	case cvtss2si:
365 	case cvtsd2si:
366 	case cvttss2si:
367 	case cvttsd2si:
368 	case cvtss2siq:
369 	case cvtsd2siq:
370 	case cvttss2siq:
371 	case cvttsd2siq:
372 		return fex_inv_int;
373 	default:
374 		break;
375 	}
376 
377 	/* check op1 for signaling nan */
378 	t1 = ((int)inst->op & DOUBLE)? my_fp_class(&inst->op1->d[0]) :
379 	    my_fp_classf(&inst->op1->f[0]);
380 	if (t1 == fp_signaling)
381 		return fex_inv_snan;
382 
383 	/* check two-operand instructions for other cases */
384 	switch (inst->op) {
385 	case cmpss:
386 	case cmpsd:
387 	case minss:
388 	case minsd:
389 	case maxss:
390 	case maxsd:
391 	case comiss:
392 	case comisd:
393 		return fex_inv_cmp;
394 
395 	case addss:
396 	case addsd:
397 	case subss:
398 	case subsd:
399 		if (t1 == fp_infinity && t2 == fp_infinity)
400 			return fex_inv_isi;
401 		break;
402 
403 	case mulss:
404 	case mulsd:
405 		if ((t1 == fp_zero && t2 == fp_infinity) ||
406 		    (t2 == fp_zero && t1 == fp_infinity))
407 			return fex_inv_zmi;
408 		break;
409 
410 	case divss:
411 	case divsd:
412 		if (t1 == fp_zero && t2 == fp_zero)
413 			return fex_inv_zdz;
414 		if (t1 == fp_infinity && t2 == fp_infinity)
415 			return fex_inv_idi;
416 	default:
417 		break;
418 	}
419 
420 	return (enum fex_exception)-1;
421 }
422 
/*
 * inline templates
 *
 * Each routine below wraps the scalar SSE instruction it is named after;
 * the definitions live outside this file.  As called from this file, the
 * leading pointer arguments are the source operands and the final
 * pointer receives the result.  The comis/ucomis routines take only the
 * two source operands.  The variants with a q suffix use long long
 * integers and are declared on amd64 only.
 */
extern void sse_cmpeqss(float *, float *, int *);
extern void sse_cmpltss(float *, float *, int *);
extern void sse_cmpless(float *, float *, int *);
extern void sse_cmpunordss(float *, float *, int *);
extern void sse_minss(float *, float *, float *);
extern void sse_maxss(float *, float *, float *);
extern void sse_addss(float *, float *, float *);
extern void sse_subss(float *, float *, float *);
extern void sse_mulss(float *, float *, float *);
extern void sse_divss(float *, float *, float *);
extern void sse_sqrtss(float *, float *);
extern void sse_ucomiss(float *, float *);
extern void sse_comiss(float *, float *);
extern void sse_cvtss2sd(float *, double *);
extern void sse_cvtsi2ss(int *, float *);
extern void sse_cvttss2si(float *, int *);
extern void sse_cvtss2si(float *, int *);
#ifdef __amd64
extern void sse_cvtsi2ssq(long long *, float *);
extern void sse_cvttss2siq(float *, long long *);
extern void sse_cvtss2siq(float *, long long *);
#endif
/* double precision counterparts; compare results are 64 bits wide */
extern void sse_cmpeqsd(double *, double *, long long *);
extern void sse_cmpltsd(double *, double *, long long *);
extern void sse_cmplesd(double *, double *, long long *);
extern void sse_cmpunordsd(double *, double *, long long *);
extern void sse_minsd(double *, double *, double *);
extern void sse_maxsd(double *, double *, double *);
extern void sse_addsd(double *, double *, double *);
extern void sse_subsd(double *, double *, double *);
extern void sse_mulsd(double *, double *, double *);
extern void sse_divsd(double *, double *, double *);
extern void sse_sqrtsd(double *, double *);
extern void sse_ucomisd(double *, double *);
extern void sse_comisd(double *, double *);
extern void sse_cvtsd2ss(double *, float *);
extern void sse_cvtsi2sd(int *, double *);
extern void sse_cvttsd2si(double *, int *);
extern void sse_cvtsd2si(double *, int *);
#ifdef __amd64
extern void sse_cvtsi2sdq(long long *, double *);
extern void sse_cvttsd2siq(double *, long long *);
extern void sse_cvtsd2siq(double *, long long *);
#endif
468 
469 /*
470  * Fill in *info with the operands, default untrapped result, and
471  * flags produced by a scalar SSE instruction, and return the type
472  * of trapped exception (if any).  On entry, the mxcsr must have
473  * all exceptions masked and all flags clear.  The same conditions
474  * will hold on exit.
475  *
476  * This routine does not work if the instruction specified by *inst
477  * is not a scalar instruction.
478  */
479 enum fex_exception
480 __fex_get_sse_op(ucontext_t *uap, sseinst_t *inst, fex_info_t *info)
481 {
482 	unsigned int	e, te, mxcsr, oldmxcsr, subnorm;
483 
484 	/*
485 	 * Perform the operation with traps disabled and check the
486 	 * exception flags.  If the underflow trap was enabled, also
487 	 * check for an exact subnormal result.
488 	 */
489 	__fenv_getmxcsr(&oldmxcsr);
490 	subnorm = 0;
491 	if ((int)inst->op & DOUBLE) {
492 		if (inst->op == cvtsi2sd) {
493 			info->op1.type = fex_int;
494 			info->op1.val.i = inst->op2->i[0];
495 			info->op2.type = fex_nodata;
496 		} else if (inst->op == cvtsi2sdq) {
497 			info->op1.type = fex_llong;
498 			info->op1.val.l = inst->op2->l[0];
499 			info->op2.type = fex_nodata;
500 		} else if (inst->op == sqrtsd || inst->op == cvtsd2ss ||
501 		    inst->op == cvttsd2si || inst->op == cvtsd2si ||
502 		    inst->op == cvttsd2siq || inst->op == cvtsd2siq) {
503 			info->op1.type = fex_double;
504 			info->op1.val.d = inst->op2->d[0];
505 			info->op2.type = fex_nodata;
506 		} else {
507 			info->op1.type = fex_double;
508 			info->op1.val.d = inst->op1->d[0];
509 			info->op2.type = fex_double;
510 			info->op2.val.d = inst->op2->d[0];
511 		}
512 		info->res.type = fex_double;
513 		switch (inst->op) {
514 		case cmpsd:
515 			info->op = fex_cmp;
516 			info->res.type = fex_llong;
517 			switch (inst->imm & 3) {
518 			case 0:
519 				sse_cmpeqsd(&info->op1.val.d, &info->op2.val.d,
520 				    &info->res.val.l);
521 				break;
522 
523 			case 1:
524 				sse_cmpltsd(&info->op1.val.d, &info->op2.val.d,
525 				    &info->res.val.l);
526 				break;
527 
528 			case 2:
529 				sse_cmplesd(&info->op1.val.d, &info->op2.val.d,
530 				    &info->res.val.l);
531 				break;
532 
533 			case 3:
534 				sse_cmpunordsd(&info->op1.val.d,
535 				    &info->op2.val.d, &info->res.val.l);
536 			}
537 			if (inst->imm & 4)
538 				info->res.val.l ^= 0xffffffffffffffffull;
539 			break;
540 
541 		case minsd:
542 			info->op = fex_other;
543 			sse_minsd(&info->op1.val.d, &info->op2.val.d,
544 			    &info->res.val.d);
545 			break;
546 
547 		case maxsd:
548 			info->op = fex_other;
549 			sse_maxsd(&info->op1.val.d, &info->op2.val.d,
550 			    &info->res.val.d);
551 			break;
552 
553 		case addsd:
554 			info->op = fex_add;
555 			sse_addsd(&info->op1.val.d, &info->op2.val.d,
556 			    &info->res.val.d);
557 			if (my_fp_class(&info->res.val.d) == fp_subnormal)
558 				subnorm = 1;
559 			break;
560 
561 		case subsd:
562 			info->op = fex_sub;
563 			sse_subsd(&info->op1.val.d, &info->op2.val.d,
564 			    &info->res.val.d);
565 			if (my_fp_class(&info->res.val.d) == fp_subnormal)
566 				subnorm = 1;
567 			break;
568 
569 		case mulsd:
570 			info->op = fex_mul;
571 			sse_mulsd(&info->op1.val.d, &info->op2.val.d,
572 			    &info->res.val.d);
573 			if (my_fp_class(&info->res.val.d) == fp_subnormal)
574 				subnorm = 1;
575 			break;
576 
577 		case divsd:
578 			info->op = fex_div;
579 			sse_divsd(&info->op1.val.d, &info->op2.val.d,
580 			    &info->res.val.d);
581 			if (my_fp_class(&info->res.val.d) == fp_subnormal)
582 				subnorm = 1;
583 			break;
584 
585 		case sqrtsd:
586 			info->op = fex_sqrt;
587 			sse_sqrtsd(&info->op1.val.d, &info->res.val.d);
588 			break;
589 
590 		case cvtsd2ss:
591 			info->op = fex_cnvt;
592 			info->res.type = fex_float;
593 			sse_cvtsd2ss(&info->op1.val.d, &info->res.val.f);
594 			if (my_fp_classf(&info->res.val.f) == fp_subnormal)
595 				subnorm = 1;
596 			break;
597 
598 		case cvtsi2sd:
599 			info->op = fex_cnvt;
600 			sse_cvtsi2sd(&info->op1.val.i, &info->res.val.d);
601 			break;
602 
603 		case cvttsd2si:
604 			info->op = fex_cnvt;
605 			info->res.type = fex_int;
606 			sse_cvttsd2si(&info->op1.val.d, &info->res.val.i);
607 			break;
608 
609 		case cvtsd2si:
610 			info->op = fex_cnvt;
611 			info->res.type = fex_int;
612 			sse_cvtsd2si(&info->op1.val.d, &info->res.val.i);
613 			break;
614 
615 #ifdef __amd64
616 		case cvtsi2sdq:
617 			info->op = fex_cnvt;
618 			sse_cvtsi2sdq(&info->op1.val.l, &info->res.val.d);
619 			break;
620 
621 		case cvttsd2siq:
622 			info->op = fex_cnvt;
623 			info->res.type = fex_llong;
624 			sse_cvttsd2siq(&info->op1.val.d, &info->res.val.l);
625 			break;
626 
627 		case cvtsd2siq:
628 			info->op = fex_cnvt;
629 			info->res.type = fex_llong;
630 			sse_cvtsd2siq(&info->op1.val.d, &info->res.val.l);
631 			break;
632 #endif
633 
634 		case ucomisd:
635 			info->op = fex_cmp;
636 			info->res.type = fex_nodata;
637 			sse_ucomisd(&info->op1.val.d, &info->op2.val.d);
638 			break;
639 
640 		case comisd:
641 			info->op = fex_cmp;
642 			info->res.type = fex_nodata;
643 			sse_comisd(&info->op1.val.d, &info->op2.val.d);
644 			break;
645 		default:
646 			break;
647 		}
648 	} else {
649 		if (inst->op == cvtsi2ss) {
650 			info->op1.type = fex_int;
651 			info->op1.val.i = inst->op2->i[0];
652 			info->op2.type = fex_nodata;
653 		} else if (inst->op == cvtsi2ssq) {
654 			info->op1.type = fex_llong;
655 			info->op1.val.l = inst->op2->l[0];
656 			info->op2.type = fex_nodata;
657 		} else if (inst->op == sqrtss || inst->op == cvtss2sd ||
658 		    inst->op == cvttss2si || inst->op == cvtss2si ||
659 		    inst->op == cvttss2siq || inst->op == cvtss2siq) {
660 			info->op1.type = fex_float;
661 			info->op1.val.f = inst->op2->f[0];
662 			info->op2.type = fex_nodata;
663 		} else {
664 			info->op1.type = fex_float;
665 			info->op1.val.f = inst->op1->f[0];
666 			info->op2.type = fex_float;
667 			info->op2.val.f = inst->op2->f[0];
668 		}
669 		info->res.type = fex_float;
670 		switch (inst->op) {
671 		case cmpss:
672 			info->op = fex_cmp;
673 			info->res.type = fex_int;
674 			switch (inst->imm & 3) {
675 			case 0:
676 				sse_cmpeqss(&info->op1.val.f, &info->op2.val.f,
677 				    &info->res.val.i);
678 				break;
679 
680 			case 1:
681 				sse_cmpltss(&info->op1.val.f, &info->op2.val.f,
682 				    &info->res.val.i);
683 				break;
684 
685 			case 2:
686 				sse_cmpless(&info->op1.val.f, &info->op2.val.f,
687 				    &info->res.val.i);
688 				break;
689 
690 			case 3:
691 				sse_cmpunordss(&info->op1.val.f,
692 				    &info->op2.val.f, &info->res.val.i);
693 			}
694 			if (inst->imm & 4)
695 				info->res.val.i ^= 0xffffffffu;
696 			break;
697 
698 		case minss:
699 			info->op = fex_other;
700 			sse_minss(&info->op1.val.f, &info->op2.val.f,
701 			    &info->res.val.f);
702 			break;
703 
704 		case maxss:
705 			info->op = fex_other;
706 			sse_maxss(&info->op1.val.f, &info->op2.val.f,
707 			    &info->res.val.f);
708 			break;
709 
710 		case addss:
711 			info->op = fex_add;
712 			sse_addss(&info->op1.val.f, &info->op2.val.f,
713 			    &info->res.val.f);
714 			if (my_fp_classf(&info->res.val.f) == fp_subnormal)
715 				subnorm = 1;
716 			break;
717 
718 		case subss:
719 			info->op = fex_sub;
720 			sse_subss(&info->op1.val.f, &info->op2.val.f,
721 			    &info->res.val.f);
722 			if (my_fp_classf(&info->res.val.f) == fp_subnormal)
723 				subnorm = 1;
724 			break;
725 
726 		case mulss:
727 			info->op = fex_mul;
728 			sse_mulss(&info->op1.val.f, &info->op2.val.f,
729 			    &info->res.val.f);
730 			if (my_fp_classf(&info->res.val.f) == fp_subnormal)
731 				subnorm = 1;
732 			break;
733 
734 		case divss:
735 			info->op = fex_div;
736 			sse_divss(&info->op1.val.f, &info->op2.val.f,
737 			    &info->res.val.f);
738 			if (my_fp_classf(&info->res.val.f) == fp_subnormal)
739 				subnorm = 1;
740 			break;
741 
742 		case sqrtss:
743 			info->op = fex_sqrt;
744 			sse_sqrtss(&info->op1.val.f, &info->res.val.f);
745 			break;
746 
747 		case cvtss2sd:
748 			info->op = fex_cnvt;
749 			info->res.type = fex_double;
750 			sse_cvtss2sd(&info->op1.val.f, &info->res.val.d);
751 			break;
752 
753 		case cvtsi2ss:
754 			info->op = fex_cnvt;
755 			sse_cvtsi2ss(&info->op1.val.i, &info->res.val.f);
756 			break;
757 
758 		case cvttss2si:
759 			info->op = fex_cnvt;
760 			info->res.type = fex_int;
761 			sse_cvttss2si(&info->op1.val.f, &info->res.val.i);
762 			break;
763 
764 		case cvtss2si:
765 			info->op = fex_cnvt;
766 			info->res.type = fex_int;
767 			sse_cvtss2si(&info->op1.val.f, &info->res.val.i);
768 			break;
769 
770 #ifdef __amd64
771 		case cvtsi2ssq:
772 			info->op = fex_cnvt;
773 			sse_cvtsi2ssq(&info->op1.val.l, &info->res.val.f);
774 			break;
775 
776 		case cvttss2siq:
777 			info->op = fex_cnvt;
778 			info->res.type = fex_llong;
779 			sse_cvttss2siq(&info->op1.val.f, &info->res.val.l);
780 			break;
781 
782 		case cvtss2siq:
783 			info->op = fex_cnvt;
784 			info->res.type = fex_llong;
785 			sse_cvtss2siq(&info->op1.val.f, &info->res.val.l);
786 			break;
787 #endif
788 
789 		case ucomiss:
790 			info->op = fex_cmp;
791 			info->res.type = fex_nodata;
792 			sse_ucomiss(&info->op1.val.f, &info->op2.val.f);
793 			break;
794 
795 		case comiss:
796 			info->op = fex_cmp;
797 			info->res.type = fex_nodata;
798 			sse_comiss(&info->op1.val.f, &info->op2.val.f);
799 			break;
800 		default:
801 			break;
802 		}
803 	}
804 	__fenv_getmxcsr(&mxcsr);
805 	info->flags = mxcsr & 0x3d;
806 	__fenv_setmxcsr(&oldmxcsr);
807 
808 	/* determine which exception would have been trapped */
809 	te = ~(uap->uc_mcontext.fpregs.fp_reg_set.fpchip_state.mxcsr
810 	    >> 7) & 0x3d;
811 	e = mxcsr & te;
812 	if (e & FE_INVALID)
813 		return __fex_get_sse_invalid_type(inst);
814 	if (e & FE_DIVBYZERO)
815 		return fex_division;
816 	if (e & FE_OVERFLOW)
817 		return fex_overflow;
818 	if ((e & FE_UNDERFLOW) || (subnorm && (te & FE_UNDERFLOW)))
819 		return fex_underflow;
820 	if (e & FE_INEXACT)
821 		return fex_inexact;
822 	return (enum fex_exception)-1;
823 }
824 
825 /*
826  * Emulate a SIMD SSE instruction to determine which exceptions occur
827  * in each part.  For i = 0, 1, 2, and 3, set e[i] to indicate the
828  * trapped exception that would occur if the i-th part of the SIMD
829  * instruction were executed in isolation; set e[i] to -1 if no
830  * trapped exception would occur in this part.  Also fill in info[i]
831  * with the corresponding operands, default untrapped result, and
832  * flags.
833  *
834  * This routine does not work if the instruction specified by *inst
835  * is not a SIMD instruction.
836  */
837 void
838 __fex_get_simd_op(ucontext_t *uap, sseinst_t *inst, enum fex_exception *e,
839     fex_info_t *info)
840 {
841 	sseinst_t	dummy;
842 	int		i;
843 
844 	e[0] = e[1] = e[2] = e[3] = -1;
845 
846 	/* perform each part of the SIMD operation */
847 	switch (inst->op) {
848 	case cmpps:
849 		dummy.op = cmpss;
850 		dummy.imm = inst->imm;
851 		for (i = 0; i < 4; i++) {
852 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
853 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
854 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
855 		}
856 		break;
857 
858 	case minps:
859 		dummy.op = minss;
860 		for (i = 0; i < 4; i++) {
861 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
862 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
863 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
864 		}
865 		break;
866 
867 	case maxps:
868 		dummy.op = maxss;
869 		for (i = 0; i < 4; i++) {
870 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
871 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
872 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
873 		}
874 		break;
875 
876 	case addps:
877 		dummy.op = addss;
878 		for (i = 0; i < 4; i++) {
879 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
880 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
881 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
882 		}
883 		break;
884 
885 	case subps:
886 		dummy.op = subss;
887 		for (i = 0; i < 4; i++) {
888 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
889 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
890 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
891 		}
892 		break;
893 
894 	case mulps:
895 		dummy.op = mulss;
896 		for (i = 0; i < 4; i++) {
897 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
898 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
899 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
900 		}
901 		break;
902 
903 	case divps:
904 		dummy.op = divss;
905 		for (i = 0; i < 4; i++) {
906 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
907 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
908 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
909 		}
910 		break;
911 
912 	case sqrtps:
913 		dummy.op = sqrtss;
914 		for (i = 0; i < 4; i++) {
915 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
916 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
917 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
918 		}
919 		break;
920 
921 	case cvtdq2ps:
922 		dummy.op = cvtsi2ss;
923 		for (i = 0; i < 4; i++) {
924 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
925 			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
926 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
927 		}
928 		break;
929 
930 	case cvttps2dq:
931 		dummy.op = cvttss2si;
932 		for (i = 0; i < 4; i++) {
933 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
934 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
935 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
936 		}
937 		break;
938 
939 	case cvtps2dq:
940 		dummy.op = cvtss2si;
941 		for (i = 0; i < 4; i++) {
942 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
943 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
944 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
945 		}
946 		break;
947 
948 	case cvtpi2ps:
949 		dummy.op = cvtsi2ss;
950 		for (i = 0; i < 2; i++) {
951 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
952 			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
953 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
954 		}
955 		break;
956 
957 	case cvttps2pi:
958 		dummy.op = cvttss2si;
959 		for (i = 0; i < 2; i++) {
960 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
961 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
962 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
963 		}
964 		break;
965 
966 	case cvtps2pi:
967 		dummy.op = cvtss2si;
968 		for (i = 0; i < 2; i++) {
969 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
970 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
971 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
972 		}
973 		break;
974 
975 	case cmppd:
976 		dummy.op = cmpsd;
977 		dummy.imm = inst->imm;
978 		for (i = 0; i < 2; i++) {
979 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
980 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
981 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
982 		}
983 		break;
984 
985 	case minpd:
986 		dummy.op = minsd;
987 		for (i = 0; i < 2; i++) {
988 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
989 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
990 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
991 		}
992 		break;
993 
994 	case maxpd:
995 		dummy.op = maxsd;
996 		for (i = 0; i < 2; i++) {
997 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
998 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
999 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1000 		}
1001 		break;
1002 
1003 	case addpd:
1004 		dummy.op = addsd;
1005 		for (i = 0; i < 2; i++) {
1006 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1007 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1008 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1009 		}
1010 		break;
1011 
1012 	case subpd:
1013 		dummy.op = subsd;
1014 		for (i = 0; i < 2; i++) {
1015 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1016 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1017 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1018 		}
1019 		break;
1020 
1021 	case mulpd:
1022 		dummy.op = mulsd;
1023 		for (i = 0; i < 2; i++) {
1024 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1025 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1026 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1027 		}
1028 		break;
1029 
1030 	case divpd:
1031 		dummy.op = divsd;
1032 		for (i = 0; i < 2; i++) {
1033 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1034 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1035 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1036 		}
1037 		break;
1038 
1039 	case sqrtpd:
1040 		dummy.op = sqrtsd;
1041 		for (i = 0; i < 2; i++) {
1042 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1043 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1044 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1045 		}
1046 		break;
1047 
1048 	case cvtpi2pd:
1049 	case cvtdq2pd:
1050 		dummy.op = cvtsi2sd;
1051 		for (i = 0; i < 2; i++) {
1052 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1053 			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
1054 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1055 		}
1056 		break;
1057 
1058 	case cvttpd2pi:
1059 	case cvttpd2dq:
1060 		dummy.op = cvttsd2si;
1061 		for (i = 0; i < 2; i++) {
1062 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1063 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1064 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1065 		}
1066 		break;
1067 
1068 	case cvtpd2pi:
1069 	case cvtpd2dq:
1070 		dummy.op = cvtsd2si;
1071 		for (i = 0; i < 2; i++) {
1072 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1073 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1074 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1075 		}
1076 		break;
1077 
1078 	case cvtps2pd:
1079 		dummy.op = cvtss2sd;
1080 		for (i = 0; i < 2; i++) {
1081 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1082 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1083 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1084 		}
1085 		break;
1086 
1087 	case cvtpd2ps:
1088 		dummy.op = cvtsd2ss;
1089 		for (i = 0; i < 2; i++) {
1090 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1091 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1092 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1093 		}
1094 	default:
1095 		break;
1096 	}
1097 }
1098 
1099 /*
1100  * Store the result value from *info in the destination of the scalar
1101  * SSE instruction specified by *inst.  If no result is given but the
1102  * exception is underflow or overflow, supply the default trapped result.
1103  *
1104  * This routine does not work if the instruction specified by *inst
1105  * is not a scalar instruction.
1106  */
1107 void
1108 __fex_st_sse_result(ucontext_t *uap, sseinst_t *inst, enum fex_exception e,
1109     fex_info_t *info)
1110 {
1111 	int		i = 0;
1112 	long long	l = 0L;;
1113 	float		f = 0.0, fscl;
1114 	double		d = 0.0L, dscl;
1115 
1116 	/* for compares that write eflags, just set the flags
1117 	   to indicate "unordered" */
1118 	if (inst->op == ucomiss || inst->op == comiss ||
1119 	    inst->op == ucomisd || inst->op == comisd) {
1120 		uap->uc_mcontext.gregs[REG_PS] |= 0x45;
1121 		return;
1122 	}
1123 
1124 	/* if info doesn't specify a result value, try to generate
1125 	   the default trapped result */
1126 	if (info->res.type == fex_nodata) {
1127 		/* set scale factors for exponent wrapping */
1128 		switch (e) {
1129 		case fex_overflow:
1130 			fscl = 1.262177448e-29f; /* 2^-96 */
1131 			dscl = 6.441148769597133308e-232; /* 2^-768 */
1132 			break;
1133 
1134 		case fex_underflow:
1135 			fscl = 7.922816251e+28f; /* 2^96 */
1136 			dscl = 1.552518092300708935e+231; /* 2^768 */
1137 			break;
1138 
1139 		default:
1140 			(void) __fex_get_sse_op(uap, inst, info);
1141 			if (info->res.type == fex_nodata)
1142 				return;
1143 			goto stuff;
1144 		}
1145 
1146 		/* generate the wrapped result */
1147 		if (inst->op == cvtsd2ss) {
1148 			info->op1.type = fex_double;
1149 			info->op1.val.d = inst->op2->d[0];
1150 			info->op2.type = fex_nodata;
1151 			info->res.type = fex_float;
1152 			info->res.val.f = (float)(fscl * (fscl *
1153 			    info->op1.val.d));
1154 		} else if ((int)inst->op & DOUBLE) {
1155 			info->op1.type = fex_double;
1156 			info->op1.val.d = inst->op1->d[0];
1157 			info->op2.type = fex_double;
1158 			info->op2.val.d = inst->op2->d[0];
1159 			info->res.type = fex_double;
1160 			switch (inst->op) {
1161 			case addsd:
1162 				info->res.val.d = dscl * (dscl *
1163 				    info->op1.val.d + dscl * info->op2.val.d);
1164 				break;
1165 
1166 			case subsd:
1167 				info->res.val.d = dscl * (dscl *
1168 				    info->op1.val.d - dscl * info->op2.val.d);
1169 				break;
1170 
1171 			case mulsd:
1172 				info->res.val.d = (dscl * info->op1.val.d) *
1173 				    (dscl * info->op2.val.d);
1174 				break;
1175 
1176 			case divsd:
1177 				info->res.val.d = (dscl * info->op1.val.d) /
1178 				    (info->op2.val.d / dscl);
1179 				break;
1180 
1181 			default:
1182 				return;
1183 			}
1184 		} else {
1185 			info->op1.type = fex_float;
1186 			info->op1.val.f = inst->op1->f[0];
1187 			info->op2.type = fex_float;
1188 			info->op2.val.f = inst->op2->f[0];
1189 			info->res.type = fex_float;
1190 			switch (inst->op) {
1191 			case addss:
1192 				info->res.val.f = fscl * (fscl *
1193 				    info->op1.val.f + fscl * info->op2.val.f);
1194 				break;
1195 
1196 			case subss:
1197 				info->res.val.f = fscl * (fscl *
1198 				    info->op1.val.f - fscl * info->op2.val.f);
1199 				break;
1200 
1201 			case mulss:
1202 				info->res.val.f = (fscl * info->op1.val.f) *
1203 				    (fscl * info->op2.val.f);
1204 				break;
1205 
1206 			case divss:
1207 				info->res.val.f = (fscl * info->op1.val.f) /
1208 				    (info->op2.val.f / fscl);
1209 				break;
1210 
1211 			default:
1212 				return;
1213 			}
1214 		}
1215 	}
1216 
1217 	/* put the result in the destination */
1218 stuff:
1219 	if (inst->op == cmpss || inst->op == cvttss2si || inst->op == cvtss2si
1220 	    || inst->op == cvttsd2si || inst->op == cvtsd2si) {
1221 		switch (info->res.type) {
1222 		case fex_int:
1223 			i = info->res.val.i;
1224 			break;
1225 
1226 		case fex_llong:
1227 			i = info->res.val.l;
1228 			break;
1229 
1230 		case fex_float:
1231 			i = info->res.val.f;
1232 			break;
1233 
1234 		case fex_double:
1235 			i = info->res.val.d;
1236 			break;
1237 
1238 		case fex_ldouble:
1239 			i = info->res.val.q;
1240 			break;
1241 
1242 		default:
1243 			break;
1244 		}
1245 		inst->op1->i[0] = i;
1246 	} else if (inst->op == cmpsd || inst->op == cvttss2siq ||
1247 	    inst->op == cvtss2siq || inst->op == cvttsd2siq ||
1248 	    inst->op == cvtsd2siq) {
1249 		switch (info->res.type) {
1250 		case fex_int:
1251 			l = info->res.val.i;
1252 			break;
1253 
1254 		case fex_llong:
1255 			l = info->res.val.l;
1256 			break;
1257 
1258 		case fex_float:
1259 			l = info->res.val.f;
1260 			break;
1261 
1262 		case fex_double:
1263 			l = info->res.val.d;
1264 			break;
1265 
1266 		case fex_ldouble:
1267 			l = info->res.val.q;
1268 			break;
1269 
1270 		default:
1271 			break;
1272 		}
1273 		inst->op1->l[0] = l;
1274 	} else if ((((int)inst->op & DOUBLE) && inst->op != cvtsd2ss) ||
1275 	    inst->op == cvtss2sd) {
1276 		switch (info->res.type) {
1277 		case fex_int:
1278 			d = info->res.val.i;
1279 			break;
1280 
1281 		case fex_llong:
1282 			d = info->res.val.l;
1283 			break;
1284 
1285 		case fex_float:
1286 			d = info->res.val.f;
1287 			break;
1288 
1289 		case fex_double:
1290 			d = info->res.val.d;
1291 			break;
1292 
1293 		case fex_ldouble:
1294 			d = info->res.val.q;
1295 			break;
1296 
1297 		default:
1298 			break;
1299 		}
1300 		inst->op1->d[0] = d;
1301 	} else {
1302 		switch (info->res.type) {
1303 		case fex_int:
1304 			f = info->res.val.i;
1305 			break;
1306 
1307 		case fex_llong:
1308 			f = info->res.val.l;
1309 			break;
1310 
1311 		case fex_float:
1312 			f = info->res.val.f;
1313 			break;
1314 
1315 		case fex_double:
1316 			f = info->res.val.d;
1317 			break;
1318 
1319 		case fex_ldouble:
1320 			f = info->res.val.q;
1321 			break;
1322 
1323 		default:
1324 			break;
1325 		}
1326 		inst->op1->f[0] = f;
1327 	}
1328 }
1329 
1330 /*
1331  * Store the results from a SIMD instruction.  For each i, store
1332  * the result value from info[i] in the i-th part of the destination
1333  * of the SIMD SSE instruction specified by *inst.  If no result
1334  * is given but the exception indicated by e[i] is underflow or
1335  * overflow, supply the default trapped result.
1336  *
1337  * This routine does not work if the instruction specified by *inst
1338  * is not a SIMD instruction.
1339  */
1340 void
1341 __fex_st_simd_result(ucontext_t *uap, sseinst_t *inst, enum fex_exception *e,
1342     fex_info_t *info)
1343 {
1344 	sseinst_t	dummy;
1345 	int		i;
1346 
1347 	/* store each part */
1348 	switch (inst->op) {
1349 	case cmpps:
1350 		dummy.op = cmpss;
1351 		dummy.imm = inst->imm;
1352 		for (i = 0; i < 4; i++) {
1353 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1354 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1355 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1356 		}
1357 		break;
1358 
1359 	case minps:
1360 		dummy.op = minss;
1361 		for (i = 0; i < 4; i++) {
1362 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1363 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1364 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1365 		}
1366 		break;
1367 
1368 	case maxps:
1369 		dummy.op = maxss;
1370 		for (i = 0; i < 4; i++) {
1371 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1372 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1373 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1374 		}
1375 		break;
1376 
1377 	case addps:
1378 		dummy.op = addss;
1379 		for (i = 0; i < 4; i++) {
1380 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1381 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1382 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1383 		}
1384 		break;
1385 
1386 	case subps:
1387 		dummy.op = subss;
1388 		for (i = 0; i < 4; i++) {
1389 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1390 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1391 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1392 		}
1393 		break;
1394 
1395 	case mulps:
1396 		dummy.op = mulss;
1397 		for (i = 0; i < 4; i++) {
1398 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1399 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1400 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1401 		}
1402 		break;
1403 
1404 	case divps:
1405 		dummy.op = divss;
1406 		for (i = 0; i < 4; i++) {
1407 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1408 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1409 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1410 		}
1411 		break;
1412 
1413 	case sqrtps:
1414 		dummy.op = sqrtss;
1415 		for (i = 0; i < 4; i++) {
1416 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1417 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1418 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1419 		}
1420 		break;
1421 
1422 	case cvtdq2ps:
1423 		dummy.op = cvtsi2ss;
1424 		for (i = 0; i < 4; i++) {
1425 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1426 			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
1427 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1428 		}
1429 		break;
1430 
1431 	case cvttps2dq:
1432 		dummy.op = cvttss2si;
1433 		for (i = 0; i < 4; i++) {
1434 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1435 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1436 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1437 		}
1438 		break;
1439 
1440 	case cvtps2dq:
1441 		dummy.op = cvtss2si;
1442 		for (i = 0; i < 4; i++) {
1443 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1444 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1445 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1446 		}
1447 		break;
1448 
1449 	case cvtpi2ps:
1450 		dummy.op = cvtsi2ss;
1451 		for (i = 0; i < 2; i++) {
1452 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1453 			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
1454 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1455 		}
1456 		break;
1457 
1458 	case cvttps2pi:
1459 		dummy.op = cvttss2si;
1460 		for (i = 0; i < 2; i++) {
1461 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1462 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1463 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1464 		}
1465 		break;
1466 
1467 	case cvtps2pi:
1468 		dummy.op = cvtss2si;
1469 		for (i = 0; i < 2; i++) {
1470 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1471 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1472 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1473 		}
1474 		break;
1475 
1476 	case cmppd:
1477 		dummy.op = cmpsd;
1478 		dummy.imm = inst->imm;
1479 		for (i = 0; i < 2; i++) {
1480 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1481 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1482 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1483 		}
1484 		break;
1485 
1486 	case minpd:
1487 		dummy.op = minsd;
1488 		for (i = 0; i < 2; i++) {
1489 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1490 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1491 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1492 		}
1493 		break;
1494 
1495 	case maxpd:
1496 		dummy.op = maxsd;
1497 		for (i = 0; i < 2; i++) {
1498 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1499 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1500 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1501 		}
1502 		break;
1503 
1504 	case addpd:
1505 		dummy.op = addsd;
1506 		for (i = 0; i < 2; i++) {
1507 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1508 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1509 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1510 		}
1511 		break;
1512 
1513 	case subpd:
1514 		dummy.op = subsd;
1515 		for (i = 0; i < 2; i++) {
1516 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1517 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1518 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1519 		}
1520 		break;
1521 
1522 	case mulpd:
1523 		dummy.op = mulsd;
1524 		for (i = 0; i < 2; i++) {
1525 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1526 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1527 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1528 		}
1529 		break;
1530 
1531 	case divpd:
1532 		dummy.op = divsd;
1533 		for (i = 0; i < 2; i++) {
1534 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1535 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1536 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1537 		}
1538 		break;
1539 
1540 	case sqrtpd:
1541 		dummy.op = sqrtsd;
1542 		for (i = 0; i < 2; i++) {
1543 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1544 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1545 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1546 		}
1547 		break;
1548 
1549 	case cvtpi2pd:
1550 	case cvtdq2pd:
1551 		dummy.op = cvtsi2sd;
1552 		for (i = 0; i < 2; i++) {
1553 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1554 			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
1555 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1556 		}
1557 		break;
1558 
1559 	case cvttpd2pi:
1560 	case cvttpd2dq:
1561 		dummy.op = cvttsd2si;
1562 		for (i = 0; i < 2; i++) {
1563 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1564 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1565 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1566 		}
1567 		/* for cvttpd2dq, zero the high 64 bits of the destination */
1568 		if (inst->op == cvttpd2dq)
1569 			inst->op1->l[1] = 0ll;
1570 		break;
1571 
1572 	case cvtpd2pi:
1573 	case cvtpd2dq:
1574 		dummy.op = cvtsd2si;
1575 		for (i = 0; i < 2; i++) {
1576 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1577 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1578 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1579 		}
1580 		/* for cvtpd2dq, zero the high 64 bits of the destination */
1581 		if (inst->op == cvtpd2dq)
1582 			inst->op1->l[1] = 0ll;
1583 		break;
1584 
1585 	case cvtps2pd:
1586 		dummy.op = cvtss2sd;
1587 		for (i = 0; i < 2; i++) {
1588 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1589 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1590 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1591 		}
1592 		break;
1593 
1594 	case cvtpd2ps:
1595 		dummy.op = cvtsd2ss;
1596 		for (i = 0; i < 2; i++) {
1597 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1598 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1599 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1600 		}
1601 		/* zero the high 64 bits of the destination */
1602 		inst->op1->l[1] = 0ll;
1603 
1604 	default:
1605 		break;
1606 	}
1607 }
1608 
1609