/* Ruby 1.9.3p327 (2012-11-10 revision 37606) */
00001 /********************************************************************** 00002 utf_16le.c - Oniguruma (regular expression library) 00003 **********************************************************************/ 00004 /*- 00005 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 00006 * All rights reserved. 00007 * 00008 * Redistribution and use in source and binary forms, with or without 00009 * modification, are permitted provided that the following conditions 00010 * are met: 00011 * 1. Redistributions of source code must retain the above copyright 00012 * notice, this list of conditions and the following disclaimer. 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in the 00015 * documentation and/or other materials provided with the distribution. 00016 * 00017 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 00018 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00019 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00020 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 00021 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00022 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 00023 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 00024 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00025 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 00026 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00027 * SUCH DAMAGE. 
00028 */ 00029 00030 #include "regenc.h" 00031 00032 #define UTF16_IS_SURROGATE_FIRST(c) (((c) & 0xfc) == 0xd8) 00033 #define UTF16_IS_SURROGATE_SECOND(c) (((c) & 0xfc) == 0xdc) 00034 #define UTF16_IS_SURROGATE(c) (((c) & 0xf8) == 0xd8) 00035 00036 static const int EncLen_UTF16[] = { 00037 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00038 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00039 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00040 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00041 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00042 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00043 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00044 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00045 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00046 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00047 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00048 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00049 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00050 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 00051 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00052 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 00053 }; 00054 00055 static int 00056 utf16le_mbc_enc_len(const UChar* p, const OnigUChar* e, 00057 OnigEncoding enc ARG_UNUSED) 00058 { 00059 int len = (int)(e - p); 00060 UChar byte; 00061 if (len < 2) 00062 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1); 00063 byte = p[1]; 00064 if (!UTF16_IS_SURROGATE(byte)) { 00065 return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2); 00066 } 00067 if (UTF16_IS_SURROGATE_FIRST(byte)) { 00068 if (len < 4) 00069 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-len); 00070 if (UTF16_IS_SURROGATE_SECOND(p[3])) 00071 return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4); 00072 } 00073 return ONIGENC_CONSTRUCT_MBCLEN_INVALID(); 00074 } 00075 00076 static int 00077 utf16le_is_mbc_newline(const UChar* p, const UChar* end, 00078 OnigEncoding enc ARG_UNUSED) 00079 { 00080 if (p + 1 < end) { 00081 if (*p == 0x0a && *(p+1) == 0x00) 00082 
return 1; 00083 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS 00084 if (( 00085 #ifndef USE_CRNL_AS_LINE_TERMINATOR 00086 *p == 0x0d || 00087 #endif 00088 *p == 0x85) && *(p+1) == 0x00) 00089 return 1; 00090 if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28)) 00091 return 1; 00092 #endif 00093 } 00094 return 0; 00095 } 00096 00097 static OnigCodePoint 00098 utf16le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED, 00099 OnigEncoding enc ARG_UNUSED) 00100 { 00101 OnigCodePoint code; 00102 UChar c0 = *p; 00103 UChar c1 = *(p+1); 00104 00105 if (UTF16_IS_SURROGATE_FIRST(c1)) { 00106 code = ((((c1 << 8) + c0) & 0x03ff) << 10) 00107 + (((p[3] << 8) + p[2]) & 0x03ff) + 0x10000; 00108 } 00109 else { 00110 code = c1 * 256 + p[0]; 00111 } 00112 return code; 00113 } 00114 00115 static int 00116 utf16le_code_to_mbclen(OnigCodePoint code, 00117 OnigEncoding enc ARG_UNUSED) 00118 { 00119 return (code > 0xffff ? 4 : 2); 00120 } 00121 00122 static int 00123 utf16le_code_to_mbc(OnigCodePoint code, UChar *buf, 00124 OnigEncoding enc ARG_UNUSED) 00125 { 00126 UChar* p = buf; 00127 00128 if (code > 0xffff) { 00129 unsigned int high = (code >> 10) + 0xD7C0; 00130 unsigned int low = (code & 0x3FF) + 0xDC00; 00131 *p++ = high & 0xFF; 00132 *p++ = (high >> 8) & 0xFF; 00133 *p++ = low & 0xFF; 00134 *p++ = (low >> 8) & 0xFF; 00135 return 4; 00136 } 00137 else { 00138 *p++ = (UChar )(code & 0xff); 00139 *p++ = (UChar )((code & 0xff00) >> 8); 00140 return 2; 00141 } 00142 } 00143 00144 static int 00145 utf16le_mbc_case_fold(OnigCaseFoldType flag, 00146 const UChar** pp, const UChar* end, UChar* fold, 00147 OnigEncoding enc) 00148 { 00149 const UChar* p = *pp; 00150 00151 if (ONIGENC_IS_ASCII_CODE(*p) && *(p+1) == 0) { 00152 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI 00153 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { 00154 if (*p == 0x49) { 00155 *fold++ = 0x31; 00156 *fold = 0x01; 00157 (*pp) += 2; 00158 return 2; 00159 } 00160 } 00161 #endif 00162 00163 *fold++ = 
ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); 00164 *fold = 0; 00165 *pp += 2; 00166 return 2; 00167 } 00168 else 00169 return onigenc_unicode_mbc_case_fold(enc, flag, pp, 00170 end, fold); 00171 } 00172 00173 #if 0 00174 static int 00175 utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, 00176 const UChar* end) 00177 { 00178 const UChar* p = *pp; 00179 00180 (*pp) += EncLen_UTF16[*(p+1)]; 00181 00182 if (*(p+1) == 0) { 00183 int c, v; 00184 00185 if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { 00186 return TRUE; 00187 } 00188 00189 c = *p; 00190 v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, 00191 (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); 00192 if ((v | BIT_CTYPE_LOWER) != 0) { 00193 /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ 00194 if (c >= 0xaa && c <= 0xba) 00195 return FALSE; 00196 else 00197 return TRUE; 00198 } 00199 return (v != 0 ? TRUE : FALSE); 00200 } 00201 00202 return FALSE; 00203 } 00204 #endif 00205 00206 static UChar* 00207 utf16le_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, 00208 OnigEncoding enc ARG_UNUSED) 00209 { 00210 if (s <= start) return (UChar* )s; 00211 00212 if ((s - start) % 2 == 1) { 00213 s--; 00214 } 00215 00216 if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1) 00217 s -= 2; 00218 00219 return (UChar* )s; 00220 } 00221 00222 static int 00223 utf16le_get_case_fold_codes_by_str(OnigCaseFoldType flag, 00224 const OnigUChar* p, const OnigUChar* end, 00225 OnigCaseFoldCodeItem items[], 00226 OnigEncoding enc) 00227 { 00228 return onigenc_unicode_get_case_fold_codes_by_str(enc, 00229 flag, p, end, items); 00230 } 00231 00232 OnigEncodingDefine(utf_16le, UTF_16LE) = { 00233 utf16le_mbc_enc_len, 00234 "UTF-16LE", /* name */ 00235 4, /* max byte length */ 00236 2, /* min byte length */ 00237 utf16le_is_mbc_newline, 00238 utf16le_mbc_to_code, 00239 utf16le_code_to_mbclen, 00240 utf16le_code_to_mbc, 00241 utf16le_mbc_case_fold, 00242 
onigenc_unicode_apply_all_case_fold, 00243 utf16le_get_case_fold_codes_by_str, 00244 onigenc_unicode_property_name_to_ctype, 00245 onigenc_unicode_is_code_ctype, 00246 onigenc_utf16_32_get_ctype_code_range, 00247 utf16le_left_adjust_char_head, 00248 onigenc_always_false_is_allowed_reverse_match 00249 }; 00250