/* Ruby 1.9.3p327 (2012-11-10 revision 37606) */
00001 /********************************************************************** 00002 utf_32be.c - Oniguruma (regular expression library) 00003 **********************************************************************/ 00004 /*- 00005 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 00006 * All rights reserved. 00007 * 00008 * Redistribution and use in source and binary forms, with or without 00009 * modification, are permitted provided that the following conditions 00010 * are met: 00011 * 1. Redistributions of source code must retain the above copyright 00012 * notice, this list of conditions and the following disclaimer. 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in the 00015 * documentation and/or other materials provided with the distribution. 00016 * 00017 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 00018 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00019 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00020 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 00021 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00022 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 00023 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 00024 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00025 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 00026 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00027 * SUCH DAMAGE. 
00028 */ 00029 00030 #include "regenc.h" 00031 00032 static int 00033 utf32be_mbc_enc_len(const UChar* p ARG_UNUSED, const OnigUChar* e ARG_UNUSED, 00034 OnigEncoding enc ARG_UNUSED) 00035 { 00036 return 4; 00037 } 00038 00039 static int 00040 utf32be_is_mbc_newline(const UChar* p, const UChar* end, 00041 OnigEncoding enc ARG_UNUSED) 00042 { 00043 if (p + 3 < end) { 00044 if (*(p+3) == 0x0a && *(p+2) == 0 && *(p+1) == 0 && *p == 0) 00045 return 1; 00046 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS 00047 if (( 00048 #ifndef USE_CRNL_AS_LINE_TERMINATOR 00049 *(p+3) == 0x0d || 00050 #endif 00051 *(p+3) == 0x85) 00052 && *(p+2) == 0 && *(p+1) == 0 && *p == 0x00) 00053 return 1; 00054 if (*(p+2) == 0x20 && (*(p+3) == 0x29 || *(p+3) == 0x28) 00055 && *(p+1) == 0 && *p == 0) 00056 return 1; 00057 #endif 00058 } 00059 return 0; 00060 } 00061 00062 static OnigCodePoint 00063 utf32be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED, 00064 OnigEncoding enc ARG_UNUSED) 00065 { 00066 return (OnigCodePoint )(((p[0] * 256 + p[1]) * 256 + p[2]) * 256 + p[3]); 00067 } 00068 00069 static int 00070 utf32be_code_to_mbclen(OnigCodePoint code ARG_UNUSED, 00071 OnigEncoding enc ARG_UNUSED) 00072 { 00073 return 4; 00074 } 00075 00076 static int 00077 utf32be_code_to_mbc(OnigCodePoint code, UChar *buf, 00078 OnigEncoding enc ARG_UNUSED) 00079 { 00080 UChar* p = buf; 00081 00082 *p++ = (UChar )((code & 0xff000000) >>24); 00083 *p++ = (UChar )((code & 0xff0000) >>16); 00084 *p++ = (UChar )((code & 0xff00) >> 8); 00085 *p++ = (UChar ) (code & 0xff); 00086 return 4; 00087 } 00088 00089 static int 00090 utf32be_mbc_case_fold(OnigCaseFoldType flag, 00091 const UChar** pp, const UChar* end, UChar* fold, 00092 OnigEncoding enc) 00093 { 00094 const UChar* p = *pp; 00095 00096 if (ONIGENC_IS_ASCII_CODE(*(p+3)) && *(p+2) == 0 && *(p+1) == 0 && *p == 0) { 00097 *fold++ = 0; 00098 *fold++ = 0; 00099 00100 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI 00101 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 
0) { 00102 if (*(p+3) == 0x49) { 00103 *fold++ = 0x01; 00104 *fold = 0x31; 00105 (*pp) += 4; 00106 return 4; 00107 } 00108 } 00109 #endif 00110 00111 *fold++ = 0; 00112 *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*(p+3)); 00113 *pp += 4; 00114 return 4; 00115 } 00116 else 00117 return onigenc_unicode_mbc_case_fold(enc, flag, pp, 00118 end, fold); 00119 } 00120 00121 #if 0 00122 static int 00123 utf32be_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) 00124 { 00125 const UChar* p = *pp; 00126 00127 (*pp) += 4; 00128 00129 if (*(p+2) == 0 && *(p+1) == 0 && *p == 0) { 00130 int c, v; 00131 00132 p += 3; 00133 if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { 00134 return TRUE; 00135 } 00136 00137 c = *p; 00138 v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, 00139 (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); 00140 if ((v | BIT_CTYPE_LOWER) != 0) { 00141 /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ 00142 if (c >= 0xaa && c <= 0xba) 00143 return FALSE; 00144 else 00145 return TRUE; 00146 } 00147 return (v != 0 ? 
TRUE : FALSE); 00148 } 00149 00150 return FALSE; 00151 } 00152 #endif 00153 00154 static UChar* 00155 utf32be_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, 00156 OnigEncoding enc ARG_UNUSED) 00157 { 00158 ptrdiff_t rem; 00159 00160 if (s <= start) return (UChar* )s; 00161 00162 rem = (s - start) % 4; 00163 return (UChar* )(s - rem); 00164 } 00165 00166 static int 00167 utf32be_get_case_fold_codes_by_str(OnigCaseFoldType flag, 00168 const OnigUChar* p, const OnigUChar* end, 00169 OnigCaseFoldCodeItem items[], 00170 OnigEncoding enc) 00171 { 00172 return onigenc_unicode_get_case_fold_codes_by_str(enc, 00173 flag, p, end, items); 00174 } 00175 00176 OnigEncodingDefine(utf_32be, UTF_32BE) = { 00177 utf32be_mbc_enc_len, 00178 "UTF-32BE", /* name */ 00179 4, /* max byte length */ 00180 4, /* min byte length */ 00181 utf32be_is_mbc_newline, 00182 utf32be_mbc_to_code, 00183 utf32be_code_to_mbclen, 00184 utf32be_code_to_mbc, 00185 utf32be_mbc_case_fold, 00186 onigenc_unicode_apply_all_case_fold, 00187 utf32be_get_case_fold_codes_by_str, 00188 onigenc_unicode_property_name_to_ctype, 00189 onigenc_unicode_is_code_ctype, 00190 onigenc_utf16_32_get_ctype_code_range, 00191 utf32be_left_adjust_char_head, 00192 onigenc_always_false_is_allowed_reverse_match 00193 }; 00194 ENC_ALIAS("UCS-4BE", "UTF-32BE") 00195 00196