Ruby 1.9.3p327 (2012-11-10 revision 37606)
enc/utf_16le.c
Go to the documentation of this file.
00001 /**********************************************************************
00002   utf_16le.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions
00010  * are met:
00011  * 1. Redistributions of source code must retain the above copyright
00012  *    notice, this list of conditions and the following disclaimer.
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in the
00015  *    documentation and/or other materials provided with the distribution.
00016  *
00017  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00018  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00020  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00021  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00022  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00023  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00024  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00025  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00026  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00027  * SUCH DAMAGE.
00028  */
00029 
00030 #include "regenc.h"
00031 
/*
 * Surrogate tests on a single byte value `c`.  Every caller in this file
 * passes the HIGH byte of a 16-bit code unit (the second byte of the unit
 * in little-endian order).
 *
 *   0xD8..0xDF : any surrogate
 *   0xD8..0xDB : first  (leading)  surrogate of a pair
 *   0xDC..0xDF : second (trailing) surrogate of a pair
 */
#define UTF16_IS_SURROGATE(c)          (((c) & 0xF8) == 0xD8)
#define UTF16_IS_SURROGATE_FIRST(c)    (((c) & 0xFC) == 0xD8)
#define UTF16_IS_SURROGATE_SECOND(c)   (((c) & 0xFC) == 0xDC)
00035 
/*
 * Encoded length in bytes, indexed by a byte value (0x00..0xFF).
 * Every entry is 2 except indices 0xD8..0xDB, which are 4.
 * Within this file the table is referenced only from the disabled
 * (#if 0) utf16le_is_mbc_ambiguous().
 */
static const int EncLen_UTF16[256] = {
  /* 0x00 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x10 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x20 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x30 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x40 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x50 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x60 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x70 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x80 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xA0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xB0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2,
  /* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
00054 
00055 static int
00056 utf16le_mbc_enc_len(const UChar* p, const OnigUChar* e,
00057                     OnigEncoding enc ARG_UNUSED)
00058 {
00059   int len = (int)(e - p);
00060   UChar byte;
00061   if (len < 2)
00062     return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
00063   byte = p[1];
00064   if (!UTF16_IS_SURROGATE(byte)) {
00065     return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2);
00066   }
00067   if (UTF16_IS_SURROGATE_FIRST(byte)) {
00068     if (len < 4)
00069       return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-len);
00070     if (UTF16_IS_SURROGATE_SECOND(p[3]))
00071       return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4);
00072   }
00073   return ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00074 }
00075 
00076 static int
00077 utf16le_is_mbc_newline(const UChar* p, const UChar* end,
00078                        OnigEncoding enc ARG_UNUSED)
00079 {
00080   if (p + 1 < end) {
00081     if (*p == 0x0a && *(p+1) == 0x00)
00082       return 1;
00083 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
00084     if ((
00085 #ifndef USE_CRNL_AS_LINE_TERMINATOR
00086          *p == 0x0d ||
00087 #endif
00088          *p == 0x85) && *(p+1) == 0x00)
00089       return 1;
00090     if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28))
00091       return 1;
00092 #endif
00093   }
00094   return 0;
00095 }
00096 
00097 static OnigCodePoint
00098 utf16le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED,
00099                     OnigEncoding enc ARG_UNUSED)
00100 {
00101   OnigCodePoint code;
00102   UChar c0 = *p;
00103   UChar c1 = *(p+1);
00104 
00105   if (UTF16_IS_SURROGATE_FIRST(c1)) {
00106     code = ((((c1 << 8) + c0) & 0x03ff) << 10)
00107          + (((p[3] << 8) + p[2]) & 0x03ff) + 0x10000;
00108   }
00109   else {
00110     code = c1 * 256 + p[0];
00111   }
00112   return code;
00113 }
00114 
00115 static int
00116 utf16le_code_to_mbclen(OnigCodePoint code,
00117                        OnigEncoding enc ARG_UNUSED)
00118 {
00119   return (code > 0xffff ? 4 : 2);
00120 }
00121 
00122 static int
00123 utf16le_code_to_mbc(OnigCodePoint code, UChar *buf,
00124                     OnigEncoding enc ARG_UNUSED)
00125 {
00126   UChar* p = buf;
00127 
00128   if (code > 0xffff) {
00129     unsigned int high = (code >> 10) + 0xD7C0;
00130     unsigned int low = (code & 0x3FF) + 0xDC00;
00131     *p++ = high & 0xFF;
00132     *p++ = (high >> 8) & 0xFF;
00133     *p++ = low & 0xFF;
00134     *p++ = (low >> 8) & 0xFF;
00135     return 4;
00136   }
00137   else {
00138     *p++ = (UChar )(code & 0xff);
00139     *p++ = (UChar )((code & 0xff00) >> 8);
00140     return 2;
00141   }
00142 }
00143 
00144 static int
00145 utf16le_mbc_case_fold(OnigCaseFoldType flag,
00146                       const UChar** pp, const UChar* end, UChar* fold,
00147                       OnigEncoding enc)
00148 {
00149   const UChar* p = *pp;
00150 
00151   if (ONIGENC_IS_ASCII_CODE(*p) && *(p+1) == 0) {
00152 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
00153     if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
00154       if (*p == 0x49) {
00155         *fold++ = 0x31;
00156         *fold   = 0x01;
00157         (*pp) += 2;
00158         return 2;
00159       }
00160     }
00161 #endif
00162 
00163     *fold++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00164     *fold   = 0;
00165     *pp += 2;
00166     return 2;
00167   }
00168   else
00169     return onigenc_unicode_mbc_case_fold(enc, flag, pp,
00170                                          end, fold);
00171 }
00172 
#if 0
/*
 * Disabled code, kept for reference.
 *
 * Advances *pp by EncLen_UTF16[high byte] and returns TRUE or FALSE
 * depending on the ISO-8859-1 ctype bits of the low byte; only
 * characters whose high byte is 0 can yield TRUE.
 *
 * NOTE(review): `(v | BIT_CTYPE_LOWER) != 0` is true whenever
 * BIT_CTYPE_LOWER is non-zero, which would make the final return
 * unreachable.  A bitwise AND may have been intended - confirm against
 * the definition of ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE before
 * re-enabling this function.
 */
static int
utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp,
                         const UChar* end)
{
  const UChar* p = *pp;
  int c, v;

  (*pp) += EncLen_UTF16[*(p+1)];

  if (*(p+1) != 0)
    return FALSE;

  c = *p;
  if (c == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0)
    return TRUE;

  v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c,
                     (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));
  if ((v | BIT_CTYPE_LOWER) != 0) {
    /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
    return (c >= 0xaa && c <= 0xba) ? FALSE : TRUE;
  }

  return (v != 0 ? TRUE : FALSE);
}
#endif
00205 
00206 static UChar*
00207 utf16le_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end,
00208                               OnigEncoding enc ARG_UNUSED)
00209 {
00210   if (s <= start) return (UChar* )s;
00211 
00212   if ((s - start) % 2 == 1) {
00213     s--;
00214   }
00215 
00216   if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1)
00217     s -= 2;
00218 
00219   return (UChar* )s;
00220 }
00221 
00222 static int
00223 utf16le_get_case_fold_codes_by_str(OnigCaseFoldType flag,
00224                                    const OnigUChar* p, const OnigUChar* end,
00225                                    OnigCaseFoldCodeItem items[],
00226                                    OnigEncoding enc)
00227 {
00228   return onigenc_unicode_get_case_fold_codes_by_str(enc,
00229                                                     flag, p, end, items);
00230 }
00231 
00232 OnigEncodingDefine(utf_16le, UTF_16LE) = {
00233   utf16le_mbc_enc_len,
00234   "UTF-16LE",   /* name */
00235   4,            /* max byte length */
00236   2,            /* min byte length */
00237   utf16le_is_mbc_newline,
00238   utf16le_mbc_to_code,
00239   utf16le_code_to_mbclen,
00240   utf16le_code_to_mbc,
00241   utf16le_mbc_case_fold,
00242   onigenc_unicode_apply_all_case_fold,
00243   utf16le_get_case_fold_codes_by_str,
00244   onigenc_unicode_property_name_to_ctype,
00245   onigenc_unicode_is_code_ctype,
00246   onigenc_utf16_32_get_ctype_code_range,
00247   utf16le_left_adjust_char_head,
00248   onigenc_always_false_is_allowed_reverse_match
00249 };
00250