xref: /illumos-gate/usr/src/lib/libc/port/locale/euc.c (revision b531f6d16eb39863e7bbc34773fb7ef7a282a0a2)
1 /*
2  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved.
5  * Copyright (c) 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Paul Borman at Krystal Technologies.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #include "lint.h"
37 #include <errno.h>
38 #include <limits.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <wchar.h>
42 #include <sys/types.h>
43 #include <sys/euc.h>
44 #include "mblocal.h"
45 #include "lctype.h"
46 
47 static size_t	_EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD,
48     const char *_RESTRICT_KYWD,
49     size_t, mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t,
50     boolean_t);
51 static size_t	_EUC_wcrtomb_impl(char *_RESTRICT_KYWD, wchar_t,
52     mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t);
53 
54 static size_t	_EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD,
55 		    const char *_RESTRICT_KYWD,
56 		    size_t, mbstate_t *_RESTRICT_KYWD, boolean_t);
57 static size_t	_EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD,
58 		    const char *_RESTRICT_KYWD,
59 		    size_t, mbstate_t *_RESTRICT_KYWD, boolean_t);
60 static size_t	_EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD,
61 		    const char *_RESTRICT_KYWD,
62 		    size_t, mbstate_t *_RESTRICT_KYWD, boolean_t);
63 static size_t	_EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD,
64 		    const char *_RESTRICT_KYWD,
65 		    size_t, mbstate_t *_RESTRICT_KYWD, boolean_t);
66 
67 static size_t	_EUC_CN_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
68 		    mbstate_t *_RESTRICT_KYWD);
69 static size_t	_EUC_JP_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
70 		    mbstate_t *_RESTRICT_KYWD);
71 static size_t	_EUC_KR_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
72 		    mbstate_t *_RESTRICT_KYWD);
73 static size_t	_EUC_TW_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
74 		    mbstate_t *_RESTRICT_KYWD);
75 
76 static size_t	_EUC_CN_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
77 		    const char **_RESTRICT_KYWD, size_t, size_t,
78 		    mbstate_t *_RESTRICT_KYWD);
79 static size_t	_EUC_JP_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
80 		    const char **_RESTRICT_KYWD, size_t, size_t,
81 		    mbstate_t *_RESTRICT_KYWD);
82 static size_t	_EUC_KR_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
83 		    const char **_RESTRICT_KYWD, size_t, size_t,
84 		    mbstate_t *_RESTRICT_KYWD);
85 static size_t	_EUC_TW_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
86 		    const char **_RESTRICT_KYWD, size_t, size_t,
87 		    mbstate_t *_RESTRICT_KYWD);
88 
89 static size_t	_EUC_CN_wcsnrtombs(char *_RESTRICT_KYWD,
90 		    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
91 		    mbstate_t *_RESTRICT_KYWD);
92 static size_t	_EUC_JP_wcsnrtombs(char *_RESTRICT_KYWD,
93 		    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
94 		    mbstate_t *_RESTRICT_KYWD);
95 static size_t	_EUC_KR_wcsnrtombs(char *_RESTRICT_KYWD,
96 		    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
97 		    mbstate_t *_RESTRICT_KYWD);
98 static size_t	_EUC_TW_wcsnrtombs(char *_RESTRICT_KYWD,
99 		    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
100 		    mbstate_t *_RESTRICT_KYWD);
101 
102 static int	_EUC_mbsinit(const mbstate_t *);
103 
/*
 * Conversion state carried between calls.  The converters in this file
 * access the caller's mbstate_t through a pointer to this structure.
 */
typedef struct {
	wchar_t	ch;	/* bytes of the partial character read so far */
	int	set;	/* not referenced by the code visible in this file */
	int	want;	/* bytes still missing; 0 means initial state */
} _EucState;
109 
110 int
111 _EUC_mbsinit(const mbstate_t *ps)
112 {
113 
114 	return (ps == NULL || ((const _EucState *)ps)->want == 0);
115 }
116 
117 /*
118  * EUC-CN uses CS0, CS1 and CS2 (4 bytes).
119  */
120 void
121 _EUC_CN_init(struct lc_ctype *lct)
122 {
123 	lct->lc_mbrtowc = _EUC_CN_mbrtowc;
124 	lct->lc_wcrtomb = _EUC_CN_wcrtomb;
125 	lct->lc_mbsnrtowcs = _EUC_CN_mbsnrtowcs;
126 	lct->lc_wcsnrtombs = _EUC_CN_wcsnrtombs;
127 	lct->lc_mbsinit = _EUC_mbsinit;
128 
129 	lct->lc_max_mblen = 4;
130 	lct->lc_is_ascii = 0;
131 }
132 
133 static size_t
134 _EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
135     size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero)
136 {
137 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0, zero));
138 }
139 
140 static size_t
141 _EUC_CN_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
142     const char **_RESTRICT_KYWD src,
143     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
144 {
145 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_CN_mbrtowc));
146 }
147 
148 static size_t
149 _EUC_CN_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
150     mbstate_t *_RESTRICT_KYWD ps)
151 {
152 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
153 }
154 
155 static size_t
156 _EUC_CN_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
157     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
158 {
159 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_CN_wcrtomb));
160 }
161 
162 /*
163  * EUC-KR uses only CS0 and CS1.
164  */
165 void
166 _EUC_KR_init(struct lc_ctype *lct)
167 {
168 	lct->lc_mbrtowc = _EUC_KR_mbrtowc;
169 	lct->lc_wcrtomb = _EUC_KR_wcrtomb;
170 	lct->lc_mbsnrtowcs = _EUC_KR_mbsnrtowcs;
171 	lct->lc_wcsnrtombs = _EUC_KR_wcsnrtombs;
172 	lct->lc_mbsinit = _EUC_mbsinit;
173 
174 	lct->lc_max_mblen = 2;
175 	lct->lc_is_ascii = 0;
176 }
177 
178 static size_t
179 _EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
180     size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero)
181 {
182 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0, zero));
183 }
184 
185 static size_t
186 _EUC_KR_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
187     const char **_RESTRICT_KYWD src,
188     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
189 {
190 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_KR_mbrtowc));
191 }
192 
193 static size_t
194 _EUC_KR_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
195     mbstate_t *_RESTRICT_KYWD ps)
196 {
197 	return (_EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0));
198 }
199 
200 static size_t
201 _EUC_KR_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
202     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
203 {
204 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_KR_wcrtomb));
205 }
206 
207 /*
208  * EUC-JP uses CS0, CS1, CS2, and CS3.
209  */
210 void
211 _EUC_JP_init(struct lc_ctype *lct)
212 {
213 	lct->lc_mbrtowc = _EUC_JP_mbrtowc;
214 	lct->lc_wcrtomb = _EUC_JP_wcrtomb;
215 	lct->lc_mbsnrtowcs = _EUC_JP_mbsnrtowcs;
216 	lct->lc_wcsnrtombs = _EUC_JP_wcsnrtombs;
217 	lct->lc_mbsinit = _EUC_mbsinit;
218 
219 	lct->lc_max_mblen = 3;
220 	lct->lc_is_ascii = 0;
221 }
222 
223 static size_t
224 _EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
225     size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero)
226 {
227 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3, zero));
228 }
229 
230 static size_t
231 _EUC_JP_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
232     const char **_RESTRICT_KYWD src,
233     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
234 {
235 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_JP_mbrtowc));
236 }
237 
238 static size_t
239 _EUC_JP_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
240     mbstate_t *_RESTRICT_KYWD ps)
241 {
242 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3));
243 }
244 
245 static size_t
246 _EUC_JP_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
247     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
248 {
249 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_JP_wcrtomb));
250 }
251 
252 /*
253  * EUC-TW uses CS0, CS1, and CS2.
254  */
255 void
256 _EUC_TW_init(struct lc_ctype *lct)
257 {
258 	lct->lc_mbrtowc = _EUC_TW_mbrtowc;
259 	lct->lc_wcrtomb = _EUC_TW_wcrtomb;
260 	lct->lc_mbsnrtowcs = _EUC_TW_mbsnrtowcs;
261 	lct->lc_wcsnrtombs = _EUC_TW_wcsnrtombs;
262 	lct->lc_mbsinit = _EUC_mbsinit;
263 
264 	lct->lc_max_mblen = 4;
265 	lct->lc_is_ascii = 0;
266 }
267 
268 static size_t
269 _EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
270     size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero)
271 {
272 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0, zero));
273 }
274 
275 static size_t
276 _EUC_TW_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
277     const char **_RESTRICT_KYWD src,
278     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
279 {
280 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_TW_mbrtowc));
281 }
282 
283 static size_t
284 _EUC_TW_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
285     mbstate_t *_RESTRICT_KYWD ps)
286 {
287 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
288 }
289 
290 static size_t
291 _EUC_TW_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
292     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
293 {
294 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_TW_wcrtomb));
295 }
296 
297 /*
298  * Common EUC code.
299  */
300 
301 static size_t
302 _EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
303     size_t n, mbstate_t *_RESTRICT_KYWD ps,
304     uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width,
305     boolean_t zero)
306 {
307 	_EucState *es;
308 	int i, want;
309 	wchar_t wc = 0;
310 	unsigned char ch, chs;
311 
312 	es = (_EucState *)ps;
313 
314 	if (es->want < 0 || es->want > MB_CUR_MAX) {
315 		errno = EINVAL;
316 		return ((size_t)-1);
317 	}
318 
319 	if (s == NULL) {
320 		s = "";
321 		n = 1;
322 		pwc = NULL;
323 	}
324 
325 	if (n == 0)
326 		/* Incomplete multibyte sequence */
327 		return ((size_t)-2);
328 
329 	if (es->want == 0) {
330 		/* Fast path for plain ASCII (CS0) */
331 		if (((ch = (unsigned char)*s) & 0x80) == 0) {
332 			if (pwc != NULL)
333 				*pwc = ch;
334 			if (zero || ch != '\0') {
335 				return (1);
336 			} else {
337 				return (0);
338 			}
339 		}
340 
341 		if (ch >= 0xa1) {
342 			/* CS1 */
343 			want = 2;
344 		} else if (ch == cs2) {
345 			want = cs2width;
346 		} else if (ch == cs3) {
347 			want = cs3width;
348 		} else {
349 			errno = EILSEQ;
350 			return ((size_t)-1);
351 		}
352 
353 
354 		es->want = want;
355 		es->ch = 0;
356 	} else {
357 		want = es->want;
358 		wc = es->ch;
359 	}
360 
361 	for (i = 0; i < MIN(want, n); i++) {
362 		wc <<= 8;
363 		chs = *s;
364 		wc |= chs;
365 		s++;
366 	}
367 	if (i < want) {
368 		/* Incomplete multibyte sequence */
369 		es->want = want - i;
370 		es->ch = wc;
371 		return ((size_t)-2);
372 	}
373 	if (pwc != NULL)
374 		*pwc = wc;
375 	es->want = 0;
376 	if (zero || wc != L'\0') {
377 		return (want);
378 	} else {
379 		return (0);
380 	}
381 }
382 
383 static size_t
384 _EUC_wcrtomb_impl(char *_RESTRICT_KYWD s, wchar_t wc,
385     mbstate_t *_RESTRICT_KYWD ps,
386     uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
387 {
388 	_EucState *es;
389 	int i, len;
390 	wchar_t nm;
391 
392 	es = (_EucState *)ps;
393 
394 	if (es->want != 0) {
395 		errno = EINVAL;
396 		return ((size_t)-1);
397 	}
398 
399 	if (s == NULL)
400 		/* Reset to initial shift state (no-op) */
401 		return (1);
402 
403 	if ((wc & ~0x7f) == 0) {
404 		/* Fast path for plain ASCII (CS0) */
405 		*s = (char)wc;
406 		return (1);
407 	}
408 
409 	/* Determine the "length" */
410 	if ((unsigned)wc > 0xffffff) {
411 		len = 4;
412 	} else if ((unsigned)wc > 0xffff) {
413 		len = 3;
414 	} else if ((unsigned)wc > 0xff) {
415 		len = 2;
416 	} else {
417 		len = 1;
418 	}
419 
420 	if (len > MB_CUR_MAX) {
421 		errno = EILSEQ;
422 		return ((size_t)-1);
423 	}
424 
425 	/* This first check excludes CS1, which is implicitly valid. */
426 	if ((wc < 0xa100) || (wc > 0xffff)) {
427 		/* Check for valid CS2 or CS3 */
428 		nm = (wc >> ((len - 1) * 8));
429 		if (nm == cs2) {
430 			if (len != cs2width) {
431 				errno = EILSEQ;
432 				return ((size_t)-1);
433 			}
434 		} else if (nm == cs3) {
435 			if (len != cs3width) {
436 				errno = EILSEQ;
437 				return ((size_t)-1);
438 			}
439 		} else {
440 			errno = EILSEQ;
441 			return ((size_t)-1);
442 		}
443 	}
444 
445 	/* Stash the bytes, least significant last */
446 	for (i = len - 1; i >= 0; i--) {
447 		s[i] = (wc & 0xff);
448 		wc >>= 8;
449 	}
450 	return (len);
451 }
452