Ruby 1.9.3p327 (2012-11-10 revision 37606)
|
00001 /********************************************************************** 00002 utf_16be.c - Oniguruma (regular expression library) 00003 **********************************************************************/ 00004 /*- 00005 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 00006 * All rights reserved. 00007 * 00008 * Redistribution and use in source and binary forms, with or without 00009 * modification, are permitted provided that the following conditions 00010 * are met: 00011 * 1. Redistributions of source code must retain the above copyright 00012 * notice, this list of conditions and the following disclaimer. 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in the 00015 * documentation and/or other materials provided with the distribution. 00016 * 00017 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 00018 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00019 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00020 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 00021 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00022 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 00023 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 00024 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00025 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 00026 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00027 * SUCH DAMAGE. 
00028 */ 00029 00030 #include "regenc.h" 00031 00032 #define UTF16_IS_SURROGATE_FIRST(c) (((c) & 0xfc) == 0xd8) 00033 #define UTF16_IS_SURROGATE_SECOND(c) (((c) & 0xfc) == 0xdc) 00034 #define UTF16_IS_SURROGATE(c) (((c) & 0xf8) == 0xd8) 00035 00036 static const int EncLen_UTF16[] = { 00037 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00038 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00039 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00040 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00041 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00042 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00043 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00044 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00045 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00046 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00047 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00048 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00049 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00050 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 00051 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00052 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 00053 }; 00054 00055 static int 00056 utf16be_mbc_enc_len(const UChar* p, const OnigUChar* e ARG_UNUSED, 00057 OnigEncoding enc ARG_UNUSED) 00058 { 00059 int byte = p[0]; 00060 if (!UTF16_IS_SURROGATE(byte)) { 00061 if (2 <= e-p) 00062 return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2); 00063 else 00064 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1); 00065 } 00066 if (UTF16_IS_SURROGATE_FIRST(byte)) { 00067 switch (e-p) { 00068 case 1: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(3); 00069 case 2: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(2); 00070 case 3: 00071 if (UTF16_IS_SURROGATE_SECOND(p[2])) 00072 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1); 00073 break; 00074 default: 00075 if (UTF16_IS_SURROGATE_SECOND(p[2])) 00076 return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4); 00077 break; 00078 } 00079 } 00080 return ONIGENC_CONSTRUCT_MBCLEN_INVALID(); 00081 } 
00082 00083 static int 00084 utf16be_is_mbc_newline(const UChar* p, const UChar* end, 00085 OnigEncoding enc) 00086 { 00087 if (p + 1 < end) { 00088 if (*(p+1) == 0x0a && *p == 0x00) 00089 return 1; 00090 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS 00091 if (( 00092 #ifndef USE_CRNL_AS_LINE_TERMINATOR 00093 *(p+1) == 0x0d || 00094 #endif 00095 *(p+1) == 0x85) && *p == 0x00) 00096 return 1; 00097 if (*p == 0x20 && (*(p+1) == 0x29 || *(p+1) == 0x28)) 00098 return 1; 00099 #endif 00100 } 00101 return 0; 00102 } 00103 00104 static OnigCodePoint 00105 utf16be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED, 00106 OnigEncoding enc) 00107 { 00108 OnigCodePoint code; 00109 00110 if (UTF16_IS_SURROGATE_FIRST(*p)) { 00111 code = ((((p[0] << 8) + p[1]) & 0x03ff) << 10) 00112 + (((p[2] << 8) + p[3]) & 0x03ff) + 0x10000; 00113 } 00114 else { 00115 code = p[0] * 256 + p[1]; 00116 } 00117 return code; 00118 } 00119 00120 static int 00121 utf16be_code_to_mbclen(OnigCodePoint code, 00122 OnigEncoding enc) 00123 { 00124 return (code > 0xffff ? 
4 : 2); 00125 } 00126 00127 static int 00128 utf16be_code_to_mbc(OnigCodePoint code, UChar *buf, 00129 OnigEncoding enc) 00130 { 00131 UChar* p = buf; 00132 00133 if (code > 0xffff) { 00134 unsigned int high = (code >> 10) + 0xD7C0; 00135 unsigned int low = (code & 0x3FF) + 0xDC00; 00136 *p++ = (high >> 8) & 0xFF; 00137 *p++ = high & 0xFF; 00138 *p++ = (low >> 8) & 0xFF; 00139 *p++ = low & 0xFF; 00140 return 4; 00141 } 00142 else { 00143 *p++ = (UChar )((code & 0xff00) >> 8); 00144 *p++ = (UChar )(code & 0xff); 00145 return 2; 00146 } 00147 } 00148 00149 static int 00150 utf16be_mbc_case_fold(OnigCaseFoldType flag, 00151 const UChar** pp, const UChar* end, UChar* fold, 00152 OnigEncoding enc) 00153 { 00154 const UChar* p = *pp; 00155 00156 if (ONIGENC_IS_ASCII_CODE(*(p+1)) && *p == 0) { 00157 p++; 00158 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI 00159 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { 00160 if (*p == 0x49) { 00161 *fold++ = 0x01; 00162 *fold = 0x31; 00163 (*pp) += 2; 00164 return 2; 00165 } 00166 } 00167 #endif 00168 00169 *fold++ = 0; 00170 *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); 00171 *pp += 2; 00172 return 2; 00173 } 00174 else 00175 return onigenc_unicode_mbc_case_fold(enc, flag, 00176 pp, end, fold); 00177 } 00178 00179 #if 0 00180 static int 00181 utf16be_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) 00182 { 00183 const UChar* p = *pp; 00184 00185 (*pp) += EncLen_UTF16[*p]; 00186 00187 if (*p == 0) { 00188 int c, v; 00189 00190 p++; 00191 if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { 00192 return TRUE; 00193 } 00194 00195 c = *p; 00196 v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, 00197 (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); 00198 00199 if ((v | BIT_CTYPE_LOWER) != 0) { 00200 /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ 00201 if (c >= 0xaa && c <= 0xba) 00202 return FALSE; 00203 else 00204 return TRUE; 00205 } 00206 return (v != 0 ? 
TRUE : FALSE); 00207 } 00208 00209 return FALSE; 00210 } 00211 #endif 00212 00213 static UChar* 00214 utf16be_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, 00215 OnigEncoding enc ARG_UNUSED) 00216 { 00217 if (s <= start) return (UChar* )s; 00218 00219 if ((s - start) % 2 == 1) { 00220 s--; 00221 } 00222 00223 if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1) 00224 s -= 2; 00225 00226 return (UChar* )s; 00227 } 00228 00229 static int 00230 utf16be_get_case_fold_codes_by_str(OnigCaseFoldType flag, 00231 const OnigUChar* p, const OnigUChar* end, 00232 OnigCaseFoldCodeItem items[], 00233 OnigEncoding enc) 00234 { 00235 return onigenc_unicode_get_case_fold_codes_by_str(enc, 00236 flag, p, end, items); 00237 } 00238 00239 OnigEncodingDefine(utf_16be, UTF_16BE) = { 00240 utf16be_mbc_enc_len, 00241 "UTF-16BE", /* name */ 00242 4, /* max byte length */ 00243 2, /* min byte length */ 00244 utf16be_is_mbc_newline, 00245 utf16be_mbc_to_code, 00246 utf16be_code_to_mbclen, 00247 utf16be_code_to_mbc, 00248 utf16be_mbc_case_fold, 00249 onigenc_unicode_apply_all_case_fold, 00250 utf16be_get_case_fold_codes_by_str, 00251 onigenc_unicode_property_name_to_ctype, 00252 onigenc_unicode_is_code_ctype, 00253 onigenc_utf16_32_get_ctype_code_range, 00254 utf16be_left_adjust_char_head, 00255 onigenc_always_false_is_allowed_reverse_match 00256 }; 00257 ENC_ALIAS("UCS-2BE", "UTF-16BE") 00258