xref: /illumos-gate/usr/src/common/smbsrv/smb_utf8.c (revision 581cede61ac9c14d8d4ea452562a567189eead78)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Multibyte/wide-char conversion routines. Wide-char encoding provides
28  * a fixed size character encoding that maps to the Unicode 16-bit
29  * (UCS-2) character set standard. Multibyte or UCS transformation
30  * format (UTF) encoding is a variable length character encoding scheme
31  * that is compatible with existing ASCII characters and guarantees that
32  * the resultant strings do not contain embedded null characters. Both
33  * types of encoding provide a null terminator: single byte for UTF-8
34  * and a wide-char null for Unicode. See RFC 2044.
35  *
36  * The table below illustrates the UTF-8 encoding scheme. The letter x
37  * indicates bits available for encoding the character value.
38  *
39  *	UCS-2			UTF-8 octet sequence (binary)
40  *	0x0000-0x007F	0xxxxxxx
41  *	0x0080-0x07FF	110xxxxx 10xxxxxx
42  *	0x0800-0xFFFF	1110xxxx 10xxxxxx 10xxxxxx
43  *
44  * RFC 2044
45  * UTF-8,a transformation format of UNICODE and ISO 10646
46  * F. Yergeau
47  * Alis Technologies
48  * October 1996
49  */
50 
51 #pragma ident	"%Z%%M%	%I%	%E% SMI"
52 
53 #ifdef _KERNEL
54 #include <sys/types.h>
55 #include <sys/sunddi.h>
56 #else
57 #include <stdio.h>
58 #include <stdlib.h>
59 #include <assert.h>
60 #include <strings.h>
61 #endif
62 #include <smbsrv/smb_i18n.h>
63 #include <smbsrv/string.h>
64 
65 
66 /*
67  * mbstowcs
68  *
69  * The mbstowcs() function converts a multibyte character string
70  * mbstring into a wide character string wcstring. No more than
71  * nwchars wide characters are stored. A terminating null wide
72  * character is appended if there is room.
73  *
74  * Returns the number of wide characters converted, not counting
75  * any terminating null wide character. Returns -1 if an invalid
76  * multibyte character is encountered.
77  */
78 size_t
79 mts_mbstowcs(mts_wchar_t *wcstring, const char *mbstring, size_t nwchars)
80 {
81 	int len;
82 	mts_wchar_t	*start = wcstring;
83 
84 	while (nwchars--) {
85 		len = mts_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX);
86 		if (len < 0) {
87 			*wcstring = 0;
88 			return ((size_t)-1);
89 		}
90 
91 		if (*mbstring == 0)
92 			break;
93 
94 		++wcstring;
95 		mbstring += len;
96 	}
97 
98 	return (wcstring - start);
99 }
100 
101 
102 /*
103  * mbtowc
104  *
105  * The mbtowc() function converts a multibyte character mbchar into
106  * a wide character and stores the result in the object pointed to
107  * by wcharp. Up to nbytes bytes are examined.
108  *
109  * If mbchar is NULL, mbtowc() returns zero to indicate that shift
110  * states are not supported.  Shift states are used to switch between
111  * representation modes using reserved bytes to signal shifting
112  * without them being interpreted as characters.  If mbchar is null
113  * mbtowc should return non-zero if the current locale requires shift
114  * states.  Otherwise it should be return 0.
115  *
116  * If mbchar is non-null, returns the number of bytes processed in
117  * mbchar.  If mbchar is invalid, returns -1.
118  */
119 int /*ARGSUSED*/
120 mts_mbtowc(mts_wchar_t *wcharp, const char *mbchar, size_t nbytes)
121 {
122 	unsigned char mbyte;
123 	mts_wchar_t wide_char;
124 	int count;
125 	int bytes_left;
126 
127 	if (mbchar == NULL)
128 		return (0); /* no shift states */
129 
130 	/* 0xxxxxxx -> 1 byte ASCII encoding */
131 	if (((mbyte = *mbchar++) & 0x80) == 0) {
132 		if (wcharp)
133 			*wcharp = (mts_wchar_t)mbyte;
134 
135 		return (mbyte ? 1 : 0);
136 	}
137 
138 	/* 10xxxxxx -> invalid first byte */
139 	if ((mbyte & 0x40) == 0)
140 		return (-1);
141 
142 	wide_char = mbyte;
143 	if ((mbyte & 0x20) == 0) {
144 		wide_char &= 0x1f;
145 		bytes_left = 1;
146 	} else if ((mbyte & 0x10) == 0) {
147 		wide_char &= 0x0f;
148 		bytes_left = 2;
149 	} else {
150 		return (-1);
151 	}
152 
153 	count = 1;
154 	while (bytes_left--) {
155 		if (((mbyte = *mbchar++) & 0xc0) != 0x80)
156 			return (-1);
157 
158 		count++;
159 		wide_char = (wide_char << 6) | (mbyte & 0x3f);
160 	}
161 
162 	if (wcharp)
163 		*wcharp = wide_char;
164 
165 	return (count);
166 }
167 
168 
169 /*
170  * wctomb
171  *
172  * The wctomb() function converts a wide character wchar into a multibyte
173  * character and stores the result in mbchar. The object pointed to by
174  * mbchar must be large enough to accommodate the multibyte character.
175  *
176  * Returns the numberof bytes written to mbchar.
177  */
178 int
179 mts_wctomb(char *mbchar, mts_wchar_t wchar)
180 {
181 	if ((wchar & ~0x7f) == 0) {
182 		*mbchar = (char)wchar;
183 		return (1);
184 	}
185 
186 	if ((wchar & ~0x7ff) == 0) {
187 		*mbchar++ = (wchar >> 6) | 0xc0;
188 		*mbchar = (wchar & 0x3f) | 0x80;
189 		return (2);
190 	}
191 
192 	*mbchar++ = (wchar >> 12) | 0xe0;
193 	*mbchar++ = ((wchar >> 6) & 0x3f) | 0x80;
194 	*mbchar = (wchar & 0x3f) | 0x80;
195 	return (3);
196 }
197 
198 
199 /*
200  * wcstombs
201  *
202  * The wcstombs() function converts a wide character string wcstring
203  * into a multibyte character string mbstring. Up to nbytes bytes are
204  * stored in mbstring. Partial multibyte characters at the end of the
205  * string are not stored. The multibyte character string is null
206  * terminated if there is room.
207  *
208  * Returns the number of bytes converted, not counting the terminating
209  * null byte.
210  */
211 size_t
212 mts_wcstombs(char *mbstring, const mts_wchar_t *wcstring, size_t nbytes)
213 {
214 	char *start = mbstring;
215 	const mts_wchar_t *wcp = wcstring;
216 	mts_wchar_t wide_char;
217 	char buf[4];
218 	size_t len;
219 
220 	if ((mbstring == NULL) || (wcstring == NULL))
221 		return (0);
222 
223 	while (nbytes > MTS_MB_CHAR_MAX) {
224 		wide_char = *wcp++;
225 		len = mts_wctomb(mbstring, wide_char);
226 
227 		if (wide_char == 0)
228 			/*LINTED E_PTRDIFF_OVERFLOW*/
229 			return (mbstring - start);
230 
231 		mbstring += len;
232 		nbytes -= len;
233 	}
234 
235 	while (wide_char && nbytes) {
236 		wide_char = *wcp++;
237 		if ((len = mts_wctomb(buf, wide_char)) > nbytes) {
238 			*mbstring = 0;
239 			break;
240 		}
241 
242 		bcopy(buf, mbstring, len);
243 		mbstring += len;
244 		nbytes -= len;
245 	}
246 
247 	/*LINTED E_PTRDIFF_OVERFLOW*/
248 	return (mbstring - start);
249 }
250 
251 
252 /*
253  * Returns the number of bytes that would be written if the multi-
254  * byte string mbs was converted to a wide character string, not
255  * counting the terminating null wide character.
256  */
257 size_t
258 mts_wcequiv_strlen(const char *mbs)
259 {
260 	mts_wchar_t	wide_char;
261 	size_t bytes;
262 	size_t len = 0;
263 
264 	while (*mbs) {
265 		bytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
266 		if (bytes == ((size_t)-1))
267 			return ((size_t)-1);
268 
269 		len += sizeof (mts_wchar_t);
270 		mbs += bytes;
271 	}
272 
273 	return (len);
274 }
275 
276 
277 /*
278  * Returns the number of bytes that would be written if the multi-
279  * byte string mbs was converted to a single byte character string,
280  * not counting the terminating null character.
281  */
282 size_t
283 mts_sbequiv_strlen(const char *mbs)
284 {
285 	mts_wchar_t	wide_char;
286 	size_t nbytes;
287 	size_t len = 0;
288 
289 	while (*mbs) {
290 		nbytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
291 		if (nbytes == ((size_t)-1))
292 			return ((size_t)-1);
293 
294 		if (wide_char & 0xFF00)
295 			len += sizeof (mts_wchar_t);
296 		else
297 			++len;
298 
299 		mbs += nbytes;
300 	}
301 
302 	return (len);
303 }
304 
305 
306 /*
307  * stombs
308  *
309  * Convert a regular null terminated string 'string' to a UTF-8 encoded
310  * null terminated multi-byte string 'mbstring'. Only full converted
311  * UTF-8 characters will be written 'mbstring'. If a character will not
312  * fit within the remaining buffer space or 'mbstring' will overflow
313  * max_mblen, the conversion process will be terminated and 'mbstring'
314  * will be null terminated.
315  *
316  * Returns the number of bytes written to 'mbstring', excluding the
317  * terminating null character.
318  *
319  * If either mbstring or string is a null pointer, -1 is returned.
320  */
321 int
322 mts_stombs(char *mbstring, char *string, int max_mblen)
323 {
324 	char *start = mbstring;
325 	unsigned char *p = (unsigned char *)string;
326 	int space_left = max_mblen;
327 	int	len;
328 	mts_wchar_t	wide_char;
329 	char buf[4];
330 
331 	if (!mbstring || !string)
332 		return (-1);
333 
334 	while (*p && space_left > 2) {
335 		wide_char = *p++;
336 		len = mts_wctomb(mbstring, wide_char);
337 		mbstring += len;
338 		space_left -= len;
339 	}
340 
341 	if (*p) {
342 		wide_char = *p;
343 		if ((len = mts_wctomb(buf, wide_char)) < 2) {
344 			*mbstring = *buf;
345 			mbstring += len;
346 			space_left -= len;
347 		}
348 	}
349 
350 	*mbstring = '\0';
351 
352 	/*LINTED E_PTRDIFF_OVERFLOW*/
353 	return (mbstring - start);
354 }
355 
356 
357 /*
358  * mbstos
359  *
360  * Convert a null terminated multi-byte string 'mbstring' to a regular
361  * null terminated string 'string'.  A 1-byte character in 'mbstring'
362  * maps to a 1-byte character in 'string'. A 2-byte character in
363  * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null.
364  * Otherwise the upper byte null will be discarded to ensure that the
365  * output stream does not contain embedded null characters.
366  *
367  * If the input stream contains invalid multi-byte characters, a value
368  * of -1 will be returned. Otherwise the length of 'string', excluding
369  * the terminating null character, is returned.
370  *
371  * If either mbstring or string is a null pointer, -1 is returned.
372  */
373 int
374 mts_mbstos(char *string, const char *mbstring)
375 {
376 	mts_wchar_t wc;
377 	unsigned char *start = (unsigned char *)string;
378 	int len;
379 
380 	if (string == NULL || mbstring == NULL)
381 		return (-1);
382 
383 	while (*mbstring) {
384 		if ((len = mts_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) {
385 			*string = 0;
386 			return (-1);
387 		}
388 
389 		if (wc & 0xFF00) {
390 			/*LINTED E_BAD_PTR_CAST_ALIGN*/
391 			*((mts_wchar_t *)string) = wc;
392 			string += sizeof (mts_wchar_t);
393 		}
394 		else
395 		{
396 			*string = (unsigned char)wc;
397 			string++;
398 		}
399 
400 		mbstring += len;
401 	}
402 
403 	*string = 0;
404 
405 	/*LINTED E_PTRDIFF_OVERFLOW*/
406 	return ((unsigned char *)string - start);
407 }
408