DEADSOFTWARE

Patched for Linux
[mp3cc.git] / MPC.3.5.LINUX / preverifier / utf.c
1 /*
2 * @(#)utf.c 1.5 02/09/27
3 *
4 * Copyright 1995-1998 by Sun Microsystems, Inc.,
5 * 901 San Antonio Road, Palo Alto, California, 94303, U.S.A.
6 * All rights reserved.
7 *
8 * This software is the confidential and proprietary information
9 * of Sun Microsystems, Inc. ("Confidential Information"). You
10 * shall not disclose such Confidential Information and shall use
11 * it only in accordance with the terms of the license agreement
12 * you entered into with Sun.
13 * Use is subject to license terms.
14 */
16 /*=========================================================================
17 * SYSTEM: Verifier
18 * SUBSYSTEM: Unicode translators.
19 * FILE: utf.c
20 * OVERVIEW: Routines for Unicode -> UTF and UTF -> unicode translators.
21 *
22 * This file implements the unicode -> UTF and UTF -> unicode translators
23 * needed by the various parts of the compiler and interpreter.
24 *
25 * UTF strings are streams of bytes, in which unicode characters are encoded
26 * as follows:
27 * Unicode UTF
28 * 00000000 0jklmnop 0jklmnop
29 * 00000fgh ijklmnop 110fghij 10klmnop
30 * abcdefgh ijklmnop 1110abcd 10efghij 10klmnop
31 *
32 * Unicode characters with 7 or fewer significant bits MUST be converted using
33 * the first format. Characters with 11 or fewer bits MUST be converted using
34 * the second format.
35 *
36 * In JAVA/JAVAC, we deviate slightly from the above.
37 * 1) The null unicode character is represented using the 2-byte format
38 * 2) All UTF strings are null-terminated.
39 * In this way, we do not need to separately maintain a length field for the
40 * UTF string.
41 *
42 * Given a unicode string and its length, convert it to a utf string. Put
43 * the result into the given buffer, whose length is buflength. The utf
44 * string should include a null terminator.
45 *
46 * If both buffer and buflength are 0, then malloc an appropriately sized
47 * buffer for the result.
48 *
49 * AUTHOR: Sheng Liang, Sun Microsystems, Inc.
50 * Edited by Tasneem Sayeed, Sun Microsystems
51 *=======================================================================*/
53 /*=========================================================================
54 * Include files
55 *=======================================================================*/
57 #include <stdio.h>
58 #include <string.h>
59 #include <stdlib.h>
61 #include "oobj.h"
62 #include "utf.h"
63 #include "sys_api.h"
65 char *unicode2utf(unicode *unistring, int length, char *buffer, int buflength)
66 {
67 int i;
68 unicode *uniptr;
69 char *bufptr;
70 unsigned bufleft;
72 if ((buffer == 0) && (buflength == 0)) {
73 buflength = unicode2utfstrlen(unistring, length);
74 if ((buffer = (char *) sysMalloc(buflength)) == 0)
75 return 0;
76 }
78 bufleft = buflength - 1; /* take note of null now! */
80 for(i = length, uniptr = unistring, bufptr = buffer; --i >= 0; uniptr++) {
81 unicode ch = *uniptr;
82 if ((ch != 0) && (ch <=0x7f)) {
83 if ((int)(--bufleft) < 0) /* no space for character */
84 break;
85 *bufptr++ = (char)ch;
86 } else if (ch <= 0x7FF) {
87 /* 11 bits or less. */
88 unsigned char high_five = ch >> 6;
89 unsigned char low_six = ch & 0x3F;
90 if ((int)(bufleft -= 2) < 0) /* no space for character */
91 break;
92 *bufptr++ = high_five | 0xC0; /* 110xxxxx */
93 *bufptr++ = low_six | 0x80; /* 10xxxxxx */
94 } else {
95 /* possibly full 16 bits. */
96 char high_four = ch >> 12;
97 char mid_six = (ch >> 6) & 0x3F;
98 char low_six = ch & 0x3f;
99 if ((int)(bufleft -= 3) < 0) /* no space for character */
100 break;
101 *bufptr++ = high_four | 0xE0; /* 1110xxxx */
102 *bufptr++ = mid_six | 0x80; /* 10xxxxxx */
103 *bufptr++ = low_six | 0x80; /* 10xxxxxx*/
106 *bufptr = 0;
107 return buffer;
110 /* Return the number of characters that would be needed to hold the unicode
111 * string in utf. This INCLUDES the NULL!
112 */
113 int unicode2utfstrlen(unicode *unistring, int unilength)
115 int result_length = 1;
117 for (; unilength > 0; unistring++, unilength--) {
118 unicode ch = *unistring;
119 if ((ch != 0) && (ch <= 0x7f)) /* 1 byte */
120 result_length++;
121 else if (ch <= 0x7FF)
122 result_length += 2; /* 2 byte character */
123 else
124 result_length += 3; /* 3 byte character */
126 return result_length;
/*
 * Count the unicode characters encoded in a null-terminated utf string.
 * Each call to next_utf2unicode() consumes one character's worth of bytes.
 */
int utfstrlen(char *utfstring)
{
    int count = 0;

    while (*utfstring != 0) {
        next_utf2unicode(&utfstring);
        count++;
    }
    return count;
}
138 /* Convert a utfstring to unicode in the buffer provided. Put at most
139 * max_length characters into the buffer. Whether or not we actually overflow
140 * the space, indicate the actual unicode length.
142 * Whether or not we overflow the space, return the actual number of
143 * characters that we used.
144 */
146 void
147 utf2unicode(char *utfstring, unicode *unistring,
148 int max_length, int *lengthp)
150 int length_remaining = max_length;
152 while (length_remaining > 0 && *utfstring != 0) {
153 *unistring++ = next_utf2unicode(&utfstring);
154 length_remaining--;
157 if (length_remaining == 0) {
158 *lengthp = max_length + utfstrlen(utfstring);
159 } else {
160 *lengthp = max_length - length_remaining;
164 bool_t is_simple_utf(char *utfstring)
166 unsigned char *ptr;
167 for (ptr = (unsigned char *)utfstring; *ptr != 0; ptr++) {
168 if (*ptr > 0x80) return FALSE;
170 return TRUE;
174 unicode next_utf2unicode(char **utfstring_ptr) {
175 unsigned char *ptr = (unsigned char *)(*utfstring_ptr);
176 unsigned char ch, ch2, ch3;
177 int length = 1; /* default length */
178 unicode result = 0x80; /* default bad result; */
179 switch ((ch = ptr[0]) >> 4) {
180 default:
181 result = ch;
182 break;
184 case 0x8: case 0x9: case 0xA: case 0xB: case 0xF:
185 /* Shouldn't happen. */
186 break;
188 case 0xC: case 0xD:
189 /* 110xxxxx 10xxxxxx */
190 if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
191 unsigned char high_five = ch & 0x1F;
192 unsigned char low_six = ch2 & 0x3F;
193 result = (high_five << 6) + low_six;
194 length = 2;
195 }
196 break;
198 case 0xE:
199 /* 1110xxxx 10xxxxxx 10xxxxxx */
200 if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
201 if (((ch3 = ptr[2]) & 0xC0) == 0x80) {
202 unsigned char high_four = ch & 0x0f;
203 unsigned char mid_six = ch2 & 0x3f;
204 unsigned char low_six = ch3 & 0x3f;
205 result = (((high_four << 6) + mid_six) << 6) + low_six;
206 length = 3;
207 } else {
208 length = 2;
211 break;
212 } /* end of switch */
214 *utfstring_ptr = (char *)(ptr + length);
215 return result;