xref: /illumos-gate/usr/src/lib/iconv_modules/zh/common/UTF-8%zh_TW-iso2022-7.c (revision 16d8656330ae5622ec32e5007f62145ebafdc50f)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
21*16d86563SAlexander Pyhalov 
/*
 * Copyright (c) 1995, by Sun Microsystems, Inc.
 * All rights reserved.
 */
26*16d86563SAlexander Pyhalov 
27*16d86563SAlexander Pyhalov #include <stdio.h>
28*16d86563SAlexander Pyhalov #include <stdlib.h>
29*16d86563SAlexander Pyhalov #include <sys/types.h>
30*16d86563SAlexander Pyhalov #include <errno.h>
31*16d86563SAlexander Pyhalov #include "unicode_cns11643_TW.h"	/* UTF8 to CNS 11643 mapping table */
32*16d86563SAlexander Pyhalov #include "common_defs.h"
33*16d86563SAlexander Pyhalov 
/* Byte-level masks and lead bytes */
#define	MSB	0x80	/* most significant bit of a byte */
#define	MBYTE	0x8e	/* multi-byte (4 byte character) lead byte */
#define	PMASK	0xa0	/* plane number mask */
#define	ONEBYTE	0xff	/* keeps only the right most byte */

/* ISO 2022 control characters written to the output stream */
#define	SI	0x0f	/* shift in */
#define	SO	0x0e	/* shift out */
#define	ESC	0x1b	/* escape */

/*
 * Final byte of the "ESC $ )" designation, indexed by plane number.
 * An alternative table, "0GH23456789:;<=>?", is kept here for reference.
 */
static const char plane_char[] = "0GHIJKLMNOPQRSTUV";

/* Look up the designation final byte for plane number i */
#define	GET_PLANEC(i)	(plane_char[(i)])

/* Written (twice) in place of a character that has no CNS 11643 mapping */
#define	NON_ID_CHAR	'?'
49*16d86563SAlexander Pyhalov 
/*
 * Conversion state carried between calls for one conversion descriptor.
 * The three state members hold values of the _CSTATE, _ISTATE and
 * _USTATE enumerations respectively.
 */
typedef struct _icv_state {
	char	keepc[6];	/* bytes of the UTF8 code being assembled (max 6) */
	short	cstate;		/* C0 or C1 */
	short	istate;		/* IN or OUT */
	short	ustate;		/* U0 .. U7: current UTF8 parser state */
	int	_errno;		/* internal errno */
} _iconv_st;
57*16d86563SAlexander Pyhalov 
/* Stored in _iconv_st.cstate; set to C1 when an "ESC $ )" designation is written */
enum _CSTATE {
	C0,
	C1
};

/* Stored in _iconv_st.istate; OUT after SO has been written, IN after SI */
enum _ISTATE {
	IN,
	OUT
};

/* Stored in _iconv_st.ustate; states of the UTF8 interpreting state machine */
enum _USTATE {
	U0,
	U1,
	U2,
	U3,
	U4,
	U5,
	U6,
	U7
};
61*16d86563SAlexander Pyhalov 
62*16d86563SAlexander Pyhalov 
63*16d86563SAlexander Pyhalov static int get_plane_no_by_utf(const char, const char, int *, unsigned long *);
64*16d86563SAlexander Pyhalov static int utf8_to_iso(int, int, unsigned long, char *, size_t);
65*16d86563SAlexander Pyhalov static int binsearch(unsigned long, utf_cns[], int);
66*16d86563SAlexander Pyhalov 
67*16d86563SAlexander Pyhalov /*
68*16d86563SAlexander Pyhalov  * Open; called from iconv_open()
69*16d86563SAlexander Pyhalov  */
70*16d86563SAlexander Pyhalov void *
_icv_open()71*16d86563SAlexander Pyhalov _icv_open()
72*16d86563SAlexander Pyhalov {
73*16d86563SAlexander Pyhalov 	_iconv_st *st;
74*16d86563SAlexander Pyhalov 
75*16d86563SAlexander Pyhalov 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
76*16d86563SAlexander Pyhalov 		errno = ENOMEM;
77*16d86563SAlexander Pyhalov 		return ((void *) -1);
78*16d86563SAlexander Pyhalov 	}
79*16d86563SAlexander Pyhalov 
80*16d86563SAlexander Pyhalov 	st->cstate = C0;
81*16d86563SAlexander Pyhalov 	st->istate = IN;
82*16d86563SAlexander Pyhalov 	st->ustate = U0;
83*16d86563SAlexander Pyhalov 	st->_errno = 0;
84*16d86563SAlexander Pyhalov 
85*16d86563SAlexander Pyhalov #ifdef DEBUG
86*16d86563SAlexander Pyhalov     fprintf(stderr, "==========     iconv(): UTF2 --> ISO2022-7     ==========\n");
87*16d86563SAlexander Pyhalov #endif
88*16d86563SAlexander Pyhalov 
89*16d86563SAlexander Pyhalov 	return ((void *) st);
90*16d86563SAlexander Pyhalov }
91*16d86563SAlexander Pyhalov 
92*16d86563SAlexander Pyhalov 
93*16d86563SAlexander Pyhalov /*
94*16d86563SAlexander Pyhalov  * Close; called from iconv_close()
95*16d86563SAlexander Pyhalov  */
96*16d86563SAlexander Pyhalov void
_icv_close(_iconv_st * st)97*16d86563SAlexander Pyhalov _icv_close(_iconv_st *st)
98*16d86563SAlexander Pyhalov {
99*16d86563SAlexander Pyhalov 	if (!st)
100*16d86563SAlexander Pyhalov 		errno = EBADF;
101*16d86563SAlexander Pyhalov 	else
102*16d86563SAlexander Pyhalov 		free(st);
103*16d86563SAlexander Pyhalov }
104*16d86563SAlexander Pyhalov 
105*16d86563SAlexander Pyhalov 
106*16d86563SAlexander Pyhalov /*
107*16d86563SAlexander Pyhalov  * Actual conversion; called from iconv()
108*16d86563SAlexander Pyhalov  */
109*16d86563SAlexander Pyhalov /*=========================================================
110*16d86563SAlexander Pyhalov  *
111*16d86563SAlexander Pyhalov  *       State Machine for interpreting UTF8 code
112*16d86563SAlexander Pyhalov  *
113*16d86563SAlexander Pyhalov  *=========================================================
114*16d86563SAlexander Pyhalov  *                         2nd byte   3rd byte 4th byte
115*16d86563SAlexander Pyhalov  *          +----->------->------->U5------>U6--------->U7
116*16d86563SAlexander Pyhalov  *          |                                            |
117*16d86563SAlexander Pyhalov  *          |     3 byte unicode                         |
118*16d86563SAlexander Pyhalov  *          +----->------->-------+                      |
119*16d86563SAlexander Pyhalov  *          |                     |                      |
120*16d86563SAlexander Pyhalov  *          ^                     v                      |
121*16d86563SAlexander Pyhalov  *          |  2 byte             U2 ---> U3             |
122*16d86563SAlexander Pyhalov  *          |  unicode                    v              |
123*16d86563SAlexander Pyhalov  * +------> U0 -------> U1                +-------->U4---+
124*16d86563SAlexander Pyhalov  * ^  ascii |           |                           ^    |
125*16d86563SAlexander Pyhalov  * |        |           +-------->--------->--------+    |
126*16d86563SAlexander Pyhalov  * |        v                                            v
127*16d86563SAlexander Pyhalov  * +----<---+-----<------------<------------<------------+
128*16d86563SAlexander Pyhalov  *
129*16d86563SAlexander Pyhalov  *=========================================================*/
130*16d86563SAlexander Pyhalov size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)131*16d86563SAlexander Pyhalov _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
132*16d86563SAlexander Pyhalov 				char **outbuf, size_t *outbytesleft)
133*16d86563SAlexander Pyhalov {
134*16d86563SAlexander Pyhalov 	char c1 = '\0', c2 = '\0';
135*16d86563SAlexander Pyhalov 	int		plane_no, n, unidx;
136*16d86563SAlexander Pyhalov 	/* pre_plane_no: need to be static when re-entry occurs on errno set */
137*16d86563SAlexander Pyhalov 	static int	pre_plane_no = -1;	/* previous plane number */
138*16d86563SAlexander Pyhalov 	unsigned long	cnscode;
139*16d86563SAlexander Pyhalov 
140*16d86563SAlexander Pyhalov 	if (st == NULL) {
141*16d86563SAlexander Pyhalov 		errno = EBADF;
142*16d86563SAlexander Pyhalov 		return ((size_t) -1);
143*16d86563SAlexander Pyhalov 	}
144*16d86563SAlexander Pyhalov 
145*16d86563SAlexander Pyhalov 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
146*16d86563SAlexander Pyhalov 		st->cstate = C0;
147*16d86563SAlexander Pyhalov 		st->istate = IN;
148*16d86563SAlexander Pyhalov 		st->ustate = U0;
149*16d86563SAlexander Pyhalov 		st->_errno = 0;
150*16d86563SAlexander Pyhalov 		return ((size_t) 0);
151*16d86563SAlexander Pyhalov 	}
152*16d86563SAlexander Pyhalov 
153*16d86563SAlexander Pyhalov #ifdef DEBUG
154*16d86563SAlexander Pyhalov     fprintf(stderr, "=== (Re-entry)     iconv(): UTF-8 --> ISO 2022-7 ===\n");
155*16d86563SAlexander Pyhalov     fprintf(stderr, "st->cstate=%d\tst->istate=%d\tst->_errno=%d\tplane_no=%d\n",
156*16d86563SAlexander Pyhalov 	    st->cstate, st->istate, st->_errno, plane_no);
157*16d86563SAlexander Pyhalov #endif
158*16d86563SAlexander Pyhalov 	st->_errno = 0;		/* reset internal errno */
159*16d86563SAlexander Pyhalov 	errno = 0;		/* reset external errno */
160*16d86563SAlexander Pyhalov 
161*16d86563SAlexander Pyhalov 	/* a state machine for interpreting UTF8 code */
162*16d86563SAlexander Pyhalov 	while (*inbytesleft > 0 && *outbytesleft > 0) {
163*16d86563SAlexander Pyhalov 
164*16d86563SAlexander Pyhalov 	        uchar_t  first_byte;
165*16d86563SAlexander Pyhalov 
166*16d86563SAlexander Pyhalov 		switch (st->ustate) {
167*16d86563SAlexander Pyhalov 		case U0:		/* assuming ASCII in the beginning */
168*16d86563SAlexander Pyhalov 			if ((**inbuf & MSB) == 0) {	/* ASCII */
169*16d86563SAlexander Pyhalov 				if (st->istate == OUT) {
170*16d86563SAlexander Pyhalov 					st->cstate = C0;
171*16d86563SAlexander Pyhalov 					st->istate = IN;
172*16d86563SAlexander Pyhalov 					**outbuf = SI;
173*16d86563SAlexander Pyhalov 					(*outbuf)++;
174*16d86563SAlexander Pyhalov 					(*outbytesleft)--;
175*16d86563SAlexander Pyhalov 					if (*outbytesleft <= 0) {
176*16d86563SAlexander Pyhalov 						errno = E2BIG;
177*16d86563SAlexander Pyhalov 						return((size_t) -1);
178*16d86563SAlexander Pyhalov 					}
179*16d86563SAlexander Pyhalov 				}
180*16d86563SAlexander Pyhalov 				**outbuf = **inbuf;
181*16d86563SAlexander Pyhalov 				(*outbuf)++;
182*16d86563SAlexander Pyhalov 				(*outbytesleft)--;
183*16d86563SAlexander Pyhalov 			} else {	/* Chinese character */
184*16d86563SAlexander Pyhalov 				if ((**inbuf & 0xe0) == 0xc0) {	/* 2 byte unicode 0xc2..0xdf */
185*16d86563SAlexander Pyhalov 
186*16d86563SAlexander Pyhalov 				        /* invalid sequence if the first byte is either 0xc0 or 0xc1 */
187*16d86563SAlexander Pyhalov 				        if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
188*16d86563SAlexander Pyhalov 				             st->_errno = errno = EILSEQ;
189*16d86563SAlexander Pyhalov 				        else {
190*16d86563SAlexander Pyhalov 					     st->ustate = U1;
191*16d86563SAlexander Pyhalov 					     st->keepc[0] = **inbuf;
192*16d86563SAlexander Pyhalov 					}
193*16d86563SAlexander Pyhalov 				} else if ((**inbuf & 0xf0) == 0xe0) {	/* 3 byte 0xe0..0xef */
194*16d86563SAlexander Pyhalov 					st->ustate = U2;
195*16d86563SAlexander Pyhalov 					st->keepc[0] = **inbuf;
196*16d86563SAlexander Pyhalov 				} else {
197*16d86563SAlexander Pyhalov 				        /* four bytes of UTF-8 sequences */
198*16d86563SAlexander Pyhalov 				        if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
199*16d86563SAlexander Pyhalov 					     st->_errno = errno = EILSEQ;
200*16d86563SAlexander Pyhalov 				        else
201*16d86563SAlexander Pyhalov 				         {
202*16d86563SAlexander Pyhalov 					     st->ustate = U5;
203*16d86563SAlexander Pyhalov 					     st->keepc[0] = **inbuf;
204*16d86563SAlexander Pyhalov 					 }
205*16d86563SAlexander Pyhalov 				}
206*16d86563SAlexander Pyhalov 			}
207*16d86563SAlexander Pyhalov 			break;
208*16d86563SAlexander Pyhalov 		case U1:		/* 2 byte unicode */
209*16d86563SAlexander Pyhalov 			if ((**inbuf & 0xc0) == 0x80) {
210*16d86563SAlexander Pyhalov 				st->ustate = U4;
211*16d86563SAlexander Pyhalov 				st->keepc[1] = **inbuf;
212*16d86563SAlexander Pyhalov 				c1 = (st->keepc[0]&0x1c)>>2;
213*16d86563SAlexander Pyhalov 				c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f);
214*16d86563SAlexander Pyhalov #ifdef DEBUG
215*16d86563SAlexander Pyhalov     fprintf(stderr, "UTF8: %02x%02x   --> ",
216*16d86563SAlexander Pyhalov 	st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
217*16d86563SAlexander Pyhalov #endif
218*16d86563SAlexander Pyhalov 				continue;	/* should not advance *inbuf */
219*16d86563SAlexander Pyhalov 			} else {
220*16d86563SAlexander Pyhalov 				st->_errno = errno = EILSEQ;
221*16d86563SAlexander Pyhalov 			}
222*16d86563SAlexander Pyhalov 			break;
223*16d86563SAlexander Pyhalov 		case U2:		/* 3 byte unicode - 2nd byte */
224*16d86563SAlexander Pyhalov 
225*16d86563SAlexander Pyhalov 		        first_byte = st->keepc[0];
226*16d86563SAlexander Pyhalov 
227*16d86563SAlexander Pyhalov 		        /* if the first byte is 0xed, it is illegal sequence if the second
228*16d86563SAlexander Pyhalov 			 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
229*16d86563SAlexander Pyhalov 			 */
230*16d86563SAlexander Pyhalov 		        if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
231*16d86563SAlexander Pyhalov 			    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
232*16d86563SAlexander Pyhalov 				st->_errno = errno = EILSEQ;
233*16d86563SAlexander Pyhalov 			else {
234*16d86563SAlexander Pyhalov 				st->ustate = U3;
235*16d86563SAlexander Pyhalov 				st->keepc[1] = **inbuf;
236*16d86563SAlexander Pyhalov 			}
237*16d86563SAlexander Pyhalov 			break;
238*16d86563SAlexander Pyhalov 		case U3:		/* 3 byte unicode - 3rd byte */
239*16d86563SAlexander Pyhalov 			if ((**inbuf & 0xc0) == 0x80) {
240*16d86563SAlexander Pyhalov 				st->ustate = U4;
241*16d86563SAlexander Pyhalov 				st->keepc[2] = **inbuf;
242*16d86563SAlexander Pyhalov 				c1 = ((st->keepc[0]&0x0f)<<4) |
243*16d86563SAlexander Pyhalov 					((st->keepc[1]&0x3c)>>2);
244*16d86563SAlexander Pyhalov 				c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
245*16d86563SAlexander Pyhalov #ifdef DEBUG
246*16d86563SAlexander Pyhalov     fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
247*16d86563SAlexander Pyhalov 		st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
248*16d86563SAlexander Pyhalov #endif
249*16d86563SAlexander Pyhalov 				continue;	/* should not advance *inbuf */
250*16d86563SAlexander Pyhalov 			} else {
251*16d86563SAlexander Pyhalov 				st->_errno = errno = EILSEQ;
252*16d86563SAlexander Pyhalov 			}
253*16d86563SAlexander Pyhalov 			break;
254*16d86563SAlexander Pyhalov 		case U4:
255*16d86563SAlexander Pyhalov 			plane_no = get_plane_no_by_utf(c1, c2, &unidx, &cnscode);
256*16d86563SAlexander Pyhalov 		        if (plane_no == -2)
257*16d86563SAlexander Pyhalov 		         {  /* unicode is either 0xFFFE or 0xFFFF */
258*16d86563SAlexander Pyhalov 			    st->_errno = errno = EILSEQ;
259*16d86563SAlexander Pyhalov 			    break;
260*16d86563SAlexander Pyhalov 		         }
261*16d86563SAlexander Pyhalov 
262*16d86563SAlexander Pyhalov 			if (plane_no > 0) {	/* legal unicode; illegal CNS */
263*16d86563SAlexander Pyhalov 			if ((st->istate == IN) || (pre_plane_no != plane_no)) {
264*16d86563SAlexander Pyhalov 				if ((st->cstate == C0) ||
265*16d86563SAlexander Pyhalov 					(pre_plane_no != plane_no)) {
266*16d86563SAlexander Pyhalov 					/* change plane # in Chinese mode */
267*16d86563SAlexander Pyhalov 					if (st->cstate == C1) {
268*16d86563SAlexander Pyhalov 						**outbuf = SI;
269*16d86563SAlexander Pyhalov 						(*outbuf)++;
270*16d86563SAlexander Pyhalov 						(*outbytesleft)--;
271*16d86563SAlexander Pyhalov 					}
272*16d86563SAlexander Pyhalov 					if (*outbytesleft < 4) {
273*16d86563SAlexander Pyhalov 						st->_errno = errno = E2BIG;
274*16d86563SAlexander Pyhalov 						return((size_t) -1);
275*16d86563SAlexander Pyhalov 					}
276*16d86563SAlexander Pyhalov 					pre_plane_no = plane_no;
277*16d86563SAlexander Pyhalov 					st->cstate = C1;
278*16d86563SAlexander Pyhalov 					**outbuf = ESC;
279*16d86563SAlexander Pyhalov 					*(*outbuf+1) = '$';
280*16d86563SAlexander Pyhalov 					*(*outbuf+2) = ')';
281*16d86563SAlexander Pyhalov 					*(*outbuf+3) = GET_PLANEC(plane_no);
282*16d86563SAlexander Pyhalov #ifdef DEBUG
283*16d86563SAlexander Pyhalov     fprintf(stderr, "\n\t\t\t\tESC $ ) %c\t", *(*outbuf+3));
284*16d86563SAlexander Pyhalov #endif
285*16d86563SAlexander Pyhalov 					(*outbuf) += 4;
286*16d86563SAlexander Pyhalov 					(*outbytesleft) -= 4;
287*16d86563SAlexander Pyhalov 					if (*outbytesleft <= 0) {
288*16d86563SAlexander Pyhalov 						st->_errno = errno = E2BIG;
289*16d86563SAlexander Pyhalov 						return((size_t) -1);
290*16d86563SAlexander Pyhalov 					}
291*16d86563SAlexander Pyhalov 				}
292*16d86563SAlexander Pyhalov 				st->istate = OUT;
293*16d86563SAlexander Pyhalov 				**outbuf = SO;
294*16d86563SAlexander Pyhalov 				(*outbuf)++;
295*16d86563SAlexander Pyhalov 				(*outbytesleft)--;
296*16d86563SAlexander Pyhalov 			}
297*16d86563SAlexander Pyhalov 			}/* get_plane_no OK */
298*16d86563SAlexander Pyhalov 
299*16d86563SAlexander Pyhalov 			n = utf8_to_iso(plane_no, unidx, cnscode,
300*16d86563SAlexander Pyhalov 					*outbuf, *outbytesleft);
301*16d86563SAlexander Pyhalov 			if (n > 0) {
302*16d86563SAlexander Pyhalov 				(*outbuf) += n;
303*16d86563SAlexander Pyhalov 				(*outbytesleft) -= n;
304*16d86563SAlexander Pyhalov 			} else {
305*16d86563SAlexander Pyhalov 				st->_errno = errno;
306*16d86563SAlexander Pyhalov 				return((size_t) -1);
307*16d86563SAlexander Pyhalov 			}
308*16d86563SAlexander Pyhalov 			st->ustate = U0;
309*16d86563SAlexander Pyhalov 			st->_errno = 0;
310*16d86563SAlexander Pyhalov 			break;
311*16d86563SAlexander Pyhalov 	        case U5:
312*16d86563SAlexander Pyhalov 
313*16d86563SAlexander Pyhalov 		        first_byte = st->keepc[0];
314*16d86563SAlexander Pyhalov 
315*16d86563SAlexander Pyhalov 		        /* if the first byte is 0xed, it is illegal sequence if the second
316*16d86563SAlexander Pyhalov 			 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
317*16d86563SAlexander Pyhalov 			 */
318*16d86563SAlexander Pyhalov 		        if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
319*16d86563SAlexander Pyhalov 			    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
320*16d86563SAlexander Pyhalov 				st->_errno = errno = EILSEQ;
321*16d86563SAlexander Pyhalov 			else {
322*16d86563SAlexander Pyhalov 				st->ustate = U6;
323*16d86563SAlexander Pyhalov 				st->keepc[1] = **inbuf;
324*16d86563SAlexander Pyhalov 			}
325*16d86563SAlexander Pyhalov 		        break;
326*16d86563SAlexander Pyhalov 	        case U6:
327*16d86563SAlexander Pyhalov 		        if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
328*16d86563SAlexander Pyhalov 		          {
329*16d86563SAlexander Pyhalov 			     st->ustate = U7;
330*16d86563SAlexander Pyhalov 			     st->keepc[2] = **inbuf;
331*16d86563SAlexander Pyhalov 			  }
332*16d86563SAlexander Pyhalov 		        else
333*16d86563SAlexander Pyhalov 		             st->_errno = errno = EILSEQ;
334*16d86563SAlexander Pyhalov 		        break;
335*16d86563SAlexander Pyhalov 		case U7:
336*16d86563SAlexander Pyhalov 		        if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
337*16d86563SAlexander Pyhalov 		          {  /* skip it to simplify */
338*16d86563SAlexander Pyhalov 			     st->ustate = U0;
339*16d86563SAlexander Pyhalov 			     st->_errno = 0;
340*16d86563SAlexander Pyhalov 			  }
341*16d86563SAlexander Pyhalov 		        else
342*16d86563SAlexander Pyhalov 		             st->_errno = errno = EILSEQ;
343*16d86563SAlexander Pyhalov 		        break;
344*16d86563SAlexander Pyhalov 		default:			/* should never come here */
345*16d86563SAlexander Pyhalov 			st->_errno = errno = EILSEQ;
346*16d86563SAlexander Pyhalov 			st->ustate = U0;	/* reset state */
347*16d86563SAlexander Pyhalov 			break;
348*16d86563SAlexander Pyhalov 		}
349*16d86563SAlexander Pyhalov 
350*16d86563SAlexander Pyhalov 		if (st->_errno) {
351*16d86563SAlexander Pyhalov #ifdef DEBUG
352*16d86563SAlexander Pyhalov     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
353*16d86563SAlexander Pyhalov 		st->_errno, st->ustate);
354*16d86563SAlexander Pyhalov #endif
355*16d86563SAlexander Pyhalov 			break;
356*16d86563SAlexander Pyhalov 		}
357*16d86563SAlexander Pyhalov 		(*inbuf)++;
358*16d86563SAlexander Pyhalov 		(*inbytesleft)--;
359*16d86563SAlexander Pyhalov 	}
360*16d86563SAlexander Pyhalov 
361*16d86563SAlexander Pyhalov 	if (errno)
362*16d86563SAlexander Pyhalov 		return((size_t) -1);
363*16d86563SAlexander Pyhalov 
364*16d86563SAlexander Pyhalov         if (*inbytesleft == 0 && st->ustate != U0) {
365*16d86563SAlexander Pyhalov 	        errno = EINVAL;
366*16d86563SAlexander Pyhalov 	        return ((size_t) -1);
367*16d86563SAlexander Pyhalov 	}
368*16d86563SAlexander Pyhalov 
369*16d86563SAlexander Pyhalov 	if (*inbytesleft > 0 && *outbytesleft == 0) {
370*16d86563SAlexander Pyhalov 		errno = E2BIG;
371*16d86563SAlexander Pyhalov 		return((size_t) -1);
372*16d86563SAlexander Pyhalov 	}
373*16d86563SAlexander Pyhalov 	return (*inbytesleft);
374*16d86563SAlexander Pyhalov }
375*16d86563SAlexander Pyhalov 
376*16d86563SAlexander Pyhalov 
377*16d86563SAlexander Pyhalov /*
378*16d86563SAlexander Pyhalov  * Get plane number by UTF8 code; i.e. plane #1 returns 1, #2 returns 2, etc.
379*16d86563SAlexander Pyhalov  * Returns -1 on error conditions and return -2 due to illegal sequence
380*16d86563SAlexander Pyhalov  *
381*16d86563SAlexander Pyhalov  * Since binary search of the UTF8 to CNS table is necessary, might as well
382*16d86563SAlexander Pyhalov  * return index and CNS code matching to the unicode.
383*16d86563SAlexander Pyhalov  */
get_plane_no_by_utf(const char c1,const char c2,int * unidx,unsigned long * cnscode)384*16d86563SAlexander Pyhalov static int get_plane_no_by_utf(const char c1, const char c2,
385*16d86563SAlexander Pyhalov 			int *unidx, unsigned long *cnscode)
386*16d86563SAlexander Pyhalov {
387*16d86563SAlexander Pyhalov 	int 		ret;
388*16d86563SAlexander Pyhalov 	unsigned long	unicode;
389*16d86563SAlexander Pyhalov 
390*16d86563SAlexander Pyhalov 	unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
391*16d86563SAlexander Pyhalov         /* the 0xfffe and 0xffff should not be allowed */
392*16d86563SAlexander Pyhalov 	if ( unicode == 0xFFFE || unicode == 0xFFFF ) return -2;
393*16d86563SAlexander Pyhalov 
394*16d86563SAlexander Pyhalov 	*unidx = binsearch(unicode, utf_cns_tab, MAX_UTF_NUM);
395*16d86563SAlexander Pyhalov 	if ((*unidx) >= 0)
396*16d86563SAlexander Pyhalov 		*cnscode = utf_cns_tab[*unidx].cnscode;
397*16d86563SAlexander Pyhalov 	else
398*16d86563SAlexander Pyhalov 		return(0);	/* match from UTF8 to CNS not found */
399*16d86563SAlexander Pyhalov #ifdef DEBUG
400*16d86563SAlexander Pyhalov     fprintf(stderr, "Unicode=%04x, idx=%5d, CNS=%x ", unicode, *unidx, *cnscode);
401*16d86563SAlexander Pyhalov #endif
402*16d86563SAlexander Pyhalov 
403*16d86563SAlexander Pyhalov 	ret = (int) (*cnscode >> 16);
404*16d86563SAlexander Pyhalov 	switch (ret) {
405*16d86563SAlexander Pyhalov 	case 0x21:	/* 0x8EA1 - G */
406*16d86563SAlexander Pyhalov 	case 0x22:	/* 0x8EA2 - H */
407*16d86563SAlexander Pyhalov 	case 0x23:	/* 0x8EA3 - I */
408*16d86563SAlexander Pyhalov 	case 0x24:	/* 0x8EA4 - J */
409*16d86563SAlexander Pyhalov 	case 0x25:	/* 0x8EA5 - K */
410*16d86563SAlexander Pyhalov 	case 0x26:	/* 0x8EA6 - L */
411*16d86563SAlexander Pyhalov 	case 0x27:	/* 0x8EA7 - M */
412*16d86563SAlexander Pyhalov 	case 0x28:	/* 0x8EA8 - N */
413*16d86563SAlexander Pyhalov 	case 0x29:	/* 0x8EA9 - O */
414*16d86563SAlexander Pyhalov 	case 0x2a:	/* 0x8EAA - P */
415*16d86563SAlexander Pyhalov 	case 0x2b:	/* 0x8EAB - Q */
416*16d86563SAlexander Pyhalov 	case 0x2c:	/* 0x8EAC - R */
417*16d86563SAlexander Pyhalov 	case 0x2d:	/* 0x8EAD - S */
418*16d86563SAlexander Pyhalov 	case 0x2f:	/* 0x8EAF - U */
419*16d86563SAlexander Pyhalov 	case 0x30:	/* 0x8EB0 - V */
420*16d86563SAlexander Pyhalov 		return (ret - 0x20);	/* so that we can use GET_PLANEC() */
421*16d86563SAlexander Pyhalov 	case 0x2e:	/* 0x8EAE - T */
422*16d86563SAlexander Pyhalov 		return (3);		/* CNS 11643-1992 */
423*16d86563SAlexander Pyhalov 	default:
424*16d86563SAlexander Pyhalov 		return (-1);
425*16d86563SAlexander Pyhalov 	}
426*16d86563SAlexander Pyhalov }
427*16d86563SAlexander Pyhalov 
428*16d86563SAlexander Pyhalov 
429*16d86563SAlexander Pyhalov /*
430*16d86563SAlexander Pyhalov  * ISO/IEC 10646 (Unicode) --> ISO 2022-7
431*16d86563SAlexander Pyhalov  * Unicode --> UTF8 (FSS-UTF)
432*16d86563SAlexander Pyhalov  *             (File System Safe Universal Character Set Transformation Format)
433*16d86563SAlexander Pyhalov  * Return: > 0 - converted with enough space in output buffer
434*16d86563SAlexander Pyhalov  *         = 0 - no space in outbuf
435*16d86563SAlexander Pyhalov  */
utf8_to_iso(int plane_no,int unidx,unsigned long cnscode,char * buf,size_t buflen)436*16d86563SAlexander Pyhalov static int utf8_to_iso(int plane_no, int unidx, unsigned long cnscode,
437*16d86563SAlexander Pyhalov 						    char *buf, size_t buflen)
438*16d86563SAlexander Pyhalov {
439*16d86563SAlexander Pyhalov 	unsigned long	val;		/* CNS 11643 value */
440*16d86563SAlexander Pyhalov #ifdef DEBUG
441*16d86563SAlexander Pyhalov     char	cns_str[5];
442*16d86563SAlexander Pyhalov #endif
443*16d86563SAlexander Pyhalov 
444*16d86563SAlexander Pyhalov 	if (buflen < 2) {
445*16d86563SAlexander Pyhalov 		errno = E2BIG;
446*16d86563SAlexander Pyhalov 		return(0);
447*16d86563SAlexander Pyhalov 	}
448*16d86563SAlexander Pyhalov 
449*16d86563SAlexander Pyhalov 
450*16d86563SAlexander Pyhalov 	if (unidx < 0) {	/* no match from UTF8 to CNS 11643 */
451*16d86563SAlexander Pyhalov 	    *buf = *(buf+1) = NON_ID_CHAR;
452*16d86563SAlexander Pyhalov 	    return(2);
453*16d86563SAlexander Pyhalov 	} else {
454*16d86563SAlexander Pyhalov 		val = cnscode & 0xffff;
455*16d86563SAlexander Pyhalov 		*buf = (val & 0xff00) >> 8;
456*16d86563SAlexander Pyhalov 		*(buf+1) = val & 0xff;
457*16d86563SAlexander Pyhalov 	}
458*16d86563SAlexander Pyhalov #ifdef DEBUG
459*16d86563SAlexander Pyhalov     fprintf(stderr, "\t%02x%02x\t", *buf, *(buf+1));
460*16d86563SAlexander Pyhalov #endif
461*16d86563SAlexander Pyhalov 
462*16d86563SAlexander Pyhalov #ifdef DEBUG
463*16d86563SAlexander Pyhalov     switch (plane_no) {
464*16d86563SAlexander Pyhalov     case 1:
465*16d86563SAlexander Pyhalov 	cns_str[0] = *buf | MSB;
466*16d86563SAlexander Pyhalov 	cns_str[1] = *(buf+1) | MSB;
467*16d86563SAlexander Pyhalov 	cns_str[2] = cns_str[3] = cns_str[4] = NULL;
468*16d86563SAlexander Pyhalov 	break;
469*16d86563SAlexander Pyhalov     case 2:
470*16d86563SAlexander Pyhalov     case 3:
471*16d86563SAlexander Pyhalov     case 4:
472*16d86563SAlexander Pyhalov     case 5:
473*16d86563SAlexander Pyhalov     case 6:
474*16d86563SAlexander Pyhalov     case 7:
475*16d86563SAlexander Pyhalov     case 8:
476*16d86563SAlexander Pyhalov     case 9:
477*16d86563SAlexander Pyhalov     case 10:
478*16d86563SAlexander Pyhalov     case 11:
479*16d86563SAlexander Pyhalov     case 12:
480*16d86563SAlexander Pyhalov     case 13:
481*16d86563SAlexander Pyhalov     case 14:
482*16d86563SAlexander Pyhalov     case 15:
483*16d86563SAlexander Pyhalov     case 16:
484*16d86563SAlexander Pyhalov 	cns_str[0] = MBYTE;
485*16d86563SAlexander Pyhalov 	cns_str[1] = (char) PMASK + plane_no;
486*16d86563SAlexander Pyhalov 	cns_str[2] = (char) *buf | MSB;
487*16d86563SAlexander Pyhalov 	cns_str[3] = (char) *(buf+1) | MSB;
488*16d86563SAlexander Pyhalov 	cns_str[4] = NULL;
489*16d86563SAlexander Pyhalov 	break;
490*16d86563SAlexander Pyhalov     }
491*16d86563SAlexander Pyhalov 
492*16d86563SAlexander Pyhalov     fprintf(stderr, "#%d ->%s<-\n", plane_no, cns_str);
493*16d86563SAlexander Pyhalov #endif
494*16d86563SAlexander Pyhalov 	return(2);
495*16d86563SAlexander Pyhalov }
496*16d86563SAlexander Pyhalov 
497*16d86563SAlexander Pyhalov 
498*16d86563SAlexander Pyhalov /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
binsearch(unsigned long x,utf_cns v[],int n)499*16d86563SAlexander Pyhalov static int binsearch(unsigned long x, utf_cns v[], int n)
500*16d86563SAlexander Pyhalov {
501*16d86563SAlexander Pyhalov 	int low, high, mid;
502*16d86563SAlexander Pyhalov 
503*16d86563SAlexander Pyhalov 	low = 0;
504*16d86563SAlexander Pyhalov 	high = n - 1;
505*16d86563SAlexander Pyhalov 	while (low <= high) {
506*16d86563SAlexander Pyhalov 		mid = (low + high) / 2;
507*16d86563SAlexander Pyhalov 		if (x < v[mid].unicode)
508*16d86563SAlexander Pyhalov 			high = mid - 1;
509*16d86563SAlexander Pyhalov 		else if (x > v[mid].unicode)
510*16d86563SAlexander Pyhalov 			low = mid + 1;
511*16d86563SAlexander Pyhalov 		else	/* found match */
512*16d86563SAlexander Pyhalov 			return mid;
513*16d86563SAlexander Pyhalov 	}
514*16d86563SAlexander Pyhalov 	return (-1);	/* no match */
515*16d86563SAlexander Pyhalov }
516