MPC.3.5.LINUX/preverifier/utf.c

   1 /*
   2  * @(#)utf.c    1.5 02/09/27
   3  *
   4  * Copyright 1995-1998 by Sun Microsystems, Inc.,
   5  * 901 San Antonio Road, Palo Alto, California, 94303, U.S.A.
   6  * All rights reserved.
   7  *
   8  * This software is the confidential and proprietary information
   9  * of Sun Microsystems, Inc. ("Confidential Information").  You
  10  * shall not disclose such Confidential Information and shall use
  11  * it only in accordance with the terms of the license agreement
  12  * you entered into with Sun.
  13  * Use is subject to license terms.
  14  */
  15
  16 /*=========================================================================
  17  * SYSTEM:    Verifier
  18  * SUBSYSTEM: Unicode translators.
  19  * FILE:      utf.c
  20  * OVERVIEW:  Routines for Unicode -> UTF and UTF -> unicode translators.
  21  *
  22  * This file implements the unicode -> UTF and UTF -> unicode translators
  23  * needed by the various parts of the compiler and interpreter.
  24  *
  25  * UTF strings are streams of bytes, in which unicode characters are encoded
  26  * as follows:
  27  *       Unicode                  UTF
  28  *       00000000 0jklmnop       0jklmnop
  29  *       00000fgh ijklmnop       110fghij 10klmnop
  30  *       abcdefgh ijklmnop       1110abcd 10efghij 10klmnop
  31  *
  32  * Unicode characters with 7 or fewer significant bits MUST be converted using
  33  * the first format.  Characters with 11 or fewer bits MUST be converted using
  34  * the second format.
  35  *
  36  * In JAVA/JAVAC, we deviate slightly from the above.
  37  *    1) The null unicode character is represented using the 2-byte format
  38  *    2)  All UTF strings are null-terminated.
  39  * In this way, we do not need to separately maintain a length field for the
  40  * UTF string.
  41  *
  42  * Given a unicode string and its length, convert it to a utf string.  Put
  43  * the result into the given buffer, whose length is buflength.  The utf
  44  * string should include a null terminator.
  45  *
  46  * If both buffer and buflength are 0, then malloc an appropriately sized
  47  * buffer for the result.
  48  *
  49  * AUTHOR:    Sheng Liang, Sun Microsystems, Inc.
  50  *            Edited by Tasneem Sayeed, Sun Microsystems
  51  *=======================================================================*/
  52
  53 /*=========================================================================
  54  * Include files
  55  *=======================================================================*/
  56
  57 #include <stdio.h>
  58 #include <string.h>
  59 #include <stdlib.h>
  60
  61 #include "oobj.h"
  62 #include "utf.h"
  63 #include "sys_api.h"
  64
  65 char *unicode2utf(unicode *unistring, int length, char *buffer, int buflength)
  66 {
  67     int i;
  68     unicode *uniptr;
  69     char *bufptr;
  70     unsigned bufleft;
  71
  72     if ((buffer == 0) && (buflength == 0)) {
  73         buflength = unicode2utfstrlen(unistring, length);
  74         if ((buffer = (char *) sysMalloc(buflength)) == 0)
  75             return 0;
  76     }
  77
  78     bufleft = buflength - 1; /* take note of null now! */
  79
  80     for(i = length, uniptr = unistring, bufptr = buffer; --i >= 0; uniptr++) {
  81         unicode ch = *uniptr;
  82         if ((ch != 0) && (ch <=0x7f)) {
  83             if ((int)(--bufleft) < 0)   /* no space for character */
  84                 break;
  85             *bufptr++ = (char)ch;
  86         } else if (ch <= 0x7FF) {
  87             /* 11 bits or less. */
  88             unsigned char high_five = ch >> 6;
  89             unsigned char low_six = ch & 0x3F;
  90             if ((int)(bufleft -= 2) < 0) /* no space for character */
  91                 break;
  92             *bufptr++ = high_five | 0xC0; /* 110xxxxx */
  93             *bufptr++ = low_six | 0x80;   /* 10xxxxxx */
  94         } else {
  95             /* possibly full 16 bits. */
  96             char high_four = ch >> 12;
  97             char mid_six = (ch >> 6) & 0x3F;
  98             char low_six = ch & 0x3f;
  99             if ((int)(bufleft -= 3) < 0) /* no space for character */
 100                 break;
 101             *bufptr++ = high_four | 0xE0; /* 1110xxxx */
 102             *bufptr++ = mid_six | 0x80;   /* 10xxxxxx */
 103             *bufptr++ = low_six | 0x80;   /* 10xxxxxx*/
 104         }
 105     }
 106     *bufptr = 0;
 107     return buffer;
 108 }
 109
 110 /* Return the number of characters that would be needed to hold the unicode
 111  * string in utf.  This INCLUDES the NULL!
 112  */
 113 int unicode2utfstrlen(unicode *unistring, int unilength)
 114 {
 115     int result_length = 1;
 116
 117     for (; unilength > 0; unistring++, unilength--) {
 118         unicode ch = *unistring;
 119         if ((ch != 0) && (ch <= 0x7f)) /* 1 byte */
 120             result_length++;
 121         else if (ch <= 0x7FF)
 122             result_length += 2; /* 2 byte character */
 123         else
 124             result_length += 3; /* 3 byte character */
 125     }
 126     return result_length;
 127 }
 128
 129 /* Give the number of unicode characters in a utf string */
 130 int utfstrlen(char *utfstring)
 131 {
 132     int length;
 133     for (length = 0; *utfstring != 0; length++)
 134         next_utf2unicode(&utfstring);
 135     return length;
 136 }
 137
 138 /* Convert a utfstring to unicode in the buffer provided.  Put at most
 139  * max_length characters into the buffer.  Whether or not we actually overflow
 140  * the space, indicate the actual unicode length.
 141  *
 142  * Whether or not we overflow the space, return the actual number of
 143  * characters that we used.
 144  */
 145
 146 void
 147 utf2unicode(char *utfstring, unicode *unistring,
 148             int max_length, int *lengthp)
 149 {
 150     int length_remaining = max_length;
 151
 152     while (length_remaining > 0 && *utfstring != 0) {
 153         *unistring++ = next_utf2unicode(&utfstring);
 154         length_remaining--;
 155     }
 156
 157     if (length_remaining == 0) {
 158         *lengthp = max_length + utfstrlen(utfstring);
 159     } else {
 160         *lengthp = max_length - length_remaining;
 161     }
 162 }
 163
 164 bool_t is_simple_utf(char *utfstring)
 165 {
 166     unsigned char *ptr;
 167     for (ptr = (unsigned char *)utfstring; *ptr != 0; ptr++) {
 168         if (*ptr > 0x80) return FALSE;
 169     }
 170     return TRUE;
 171 }
 172
 173
 174 unicode next_utf2unicode(char **utfstring_ptr) {
 175     unsigned char *ptr = (unsigned char *)(*utfstring_ptr);
 176     unsigned char ch, ch2, ch3;
 177     int length = 1;             /* default length */
 178     unicode result = 0x80;      /* default bad result; */
 179     switch ((ch = ptr[0]) >> 4) {
 180         default:
 181             result = ch;
 182             break;
 183
 184         case 0x8: case 0x9: case 0xA: case 0xB: case 0xF:
 185             /* Shouldn't happen. */
 186             break;
 187
 188         case 0xC: case 0xD:
 189             /* 110xxxxx  10xxxxxx */
 190             if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
 191                 unsigned char high_five = ch & 0x1F;
 192                 unsigned char low_six = ch2 & 0x3F;
 193                 result = (high_five << 6) + low_six;
 194                 length = 2;
 195             }
 196             break;
 197
 198         case 0xE:
 199             /* 1110xxxx 10xxxxxx 10xxxxxx */
 200             if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
 201                 if (((ch3 = ptr[2]) & 0xC0) == 0x80) {
 202                     unsigned char high_four = ch & 0x0f;
 203                     unsigned char mid_six = ch2 & 0x3f;
 204                     unsigned char low_six = ch3 & 0x3f;
 205                     result = (((high_four << 6) + mid_six) << 6) + low_six;
 206                     length = 3;
 207                 } else {
 208                     length = 2;
 209                 }
 210             }
 211             break;
 212         } /* end of switch */
 213
 214     *utfstring_ptr = (char *)(ptr + length);
 215     return result;
 216 }