/* * @(#)utf.c 1.5 02/09/27 * * Copyright 1995-1998 by Sun Microsystems, Inc., * 901 San Antonio Road, Palo Alto, California, 94303, U.S.A. * All rights reserved. * * This software is the confidential and proprietary information * of Sun Microsystems, Inc. ("Confidential Information"). You * shall not disclose such Confidential Information and shall use * it only in accordance with the terms of the license agreement * you entered into with Sun. * Use is subject to license terms. */ /*========================================================================= * SYSTEM: Verifier * SUBSYSTEM: Unicode translators. * FILE: utf.c * OVERVIEW: Routines for Unicode -> UTF and UTF -> unicode translators. * * This file implements the unicode -> UTF and UTF -> unicode translators * needed by the various parts of the compiler and interpreter. * * UTF strings are streams of bytes, in which unicode characters are encoded * as follows: * Unicode UTF * 00000000 0jklmnop 0jklmnop * 00000fgh ijklmnop 110fghij 10klmnop * abcdefgh ijklmnop 1110abcd 10efghij 10klmnop * * unicode bytes with 7 or fewer significant bits MUST be converted using the * first format. bytes with 11 or fewer bits MUST be converted using the * second format. * * In JAVA/JAVAC, we deviate slightly from the above. * 1) The null unicode character is represented using the 2-byte format * 2) All UTF strings are null-terminated. * In this way, we do not need to separately maintain a length field for the * UTF string. * * Given a unicode string and its length, convert it to a utf string. But * the result into the given buffer, whose length is buflength. The utf * string should include a null terminator. * * If both buffer and buflength are 0, then malloc an appropriately sized * buffer for the result. * * AUTHOR: Sheng Liang, Sun Microsystems, Inc. * Edited by Tasneem Sayeed, Sun Microsystems *=======================================================================*/ /*========================================================================= * Include files *=======================================================================*/ #include #include #include #include "oobj.h" #include "utf.h" #include "sys_api.h" char *unicode2utf(unicode *unistring, int length, char *buffer, int buflength) { int i; unicode *uniptr; char *bufptr; unsigned bufleft; if ((buffer == 0) && (buflength == 0)) { buflength = unicode2utfstrlen(unistring, length); if ((buffer = (char *) sysMalloc(buflength)) == 0) return 0; } bufleft = buflength - 1; /* take note of null now! */ for(i = length, uniptr = unistring, bufptr = buffer; --i >= 0; uniptr++) { unicode ch = *uniptr; if ((ch != 0) && (ch <=0x7f)) { if ((int)(--bufleft) < 0) /* no space for character */ break; *bufptr++ = (char)ch; } else if (ch <= 0x7FF) { /* 11 bits or less. */ unsigned char high_five = ch >> 6; unsigned char low_six = ch & 0x3F; if ((int)(bufleft -= 2) < 0) /* no space for character */ break; *bufptr++ = high_five | 0xC0; /* 110xxxxx */ *bufptr++ = low_six | 0x80; /* 10xxxxxx */ } else { /* possibly full 16 bits. */ char high_four = ch >> 12; char mid_six = (ch >> 6) & 0x3F; char low_six = ch & 0x3f; if ((int)(bufleft -= 3) < 0) /* no space for character */ break; *bufptr++ = high_four | 0xE0; /* 1110xxxx */ *bufptr++ = mid_six | 0x80; /* 10xxxxxx */ *bufptr++ = low_six | 0x80; /* 10xxxxxx*/ } } *bufptr = 0; return buffer; } /* Return the number of characters that would be needed to hold the unicode * string in utf. This INCLUDES the NULL! */ int unicode2utfstrlen(unicode *unistring, int unilength) { int result_length = 1; for (; unilength > 0; unistring++, unilength--) { unicode ch = *unistring; if ((ch != 0) && (ch <= 0x7f)) /* 1 byte */ result_length++; else if (ch <= 0x7FF) result_length += 2; /* 2 byte character */ else result_length += 3; /* 3 byte character */ } return result_length; } /* Give the number of unicode characters in a utf string */ int utfstrlen(char *utfstring) { int length; for (length = 0; *utfstring != 0; length++) next_utf2unicode(&utfstring); return length; } /* Convert a utfstring to unicode in the buffer provided. Put at most * max_length characters into the buffer. Whether or not we actually overflow * the space, indicate the actual unicode length. * * Whether or not we overflow the space, return the actual number of * characters that we used. */ void utf2unicode(char *utfstring, unicode *unistring, int max_length, int *lengthp) { int length_remaining = max_length; while (length_remaining > 0 && *utfstring != 0) { *unistring++ = next_utf2unicode(&utfstring); length_remaining--; } if (length_remaining == 0) { *lengthp = max_length + utfstrlen(utfstring); } else { *lengthp = max_length - length_remaining; } } bool_t is_simple_utf(char *utfstring) { unsigned char *ptr; for (ptr = (unsigned char *)utfstring; *ptr != 0; ptr++) { if (*ptr > 0x80) return FALSE; } return TRUE; } unicode next_utf2unicode(char **utfstring_ptr) { unsigned char *ptr = (unsigned char *)(*utfstring_ptr); unsigned char ch, ch2, ch3; int length = 1; /* default length */ unicode result = 0x80; /* default bad result; */ switch ((ch = ptr[0]) >> 4) { default: result = ch; break; case 0x8: case 0x9: case 0xA: case 0xB: case 0xF: /* Shouldn't happen. */ break; case 0xC: case 0xD: /* 110xxxxx 10xxxxxx */ if (((ch2 = ptr[1]) & 0xC0) == 0x80) { unsigned char high_five = ch & 0x1F; unsigned char low_six = ch2 & 0x3F; result = (high_five << 6) + low_six; length = 2; } break; case 0xE: /* 1110xxxx 10xxxxxx 10xxxxxx */ if (((ch2 = ptr[1]) & 0xC0) == 0x80) { if (((ch3 = ptr[2]) & 0xC0) == 0x80) { unsigned char high_four = ch & 0x0f; unsigned char mid_six = ch2 & 0x3f; unsigned char low_six = ch3 & 0x3f; result = (((high_four << 6) + mid_six) << 6) + low_six; length = 3; } else { length = 2; } } break; } /* end of switch */ *utfstring_ptr = (char *)(ptr + length); return result; }