Ruby 1.9.3p327 (2012-11-10 revision 37606)
|
00001 /********************************************************************** 00002 utf_32le.c - Oniguruma (regular expression library) 00003 **********************************************************************/ 00004 /*- 00005 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 00006 * All rights reserved. 00007 * 00008 * Redistribution and use in source and binary forms, with or without 00009 * modification, are permitted provided that the following conditions 00010 * are met: 00011 * 1. Redistributions of source code must retain the above copyright 00012 * notice, this list of conditions and the following disclaimer. 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in the 00015 * documentation and/or other materials provided with the distribution. 00016 * 00017 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 00018 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00019 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00020 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 00021 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00022 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 00023 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 00024 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00025 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 00026 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00027 * SUCH DAMAGE. 
00028 */ 00029 00030 #include "regenc.h" 00031 00032 static int 00033 utf32le_mbc_enc_len(const UChar* p ARG_UNUSED, const OnigUChar* e ARG_UNUSED, 00034 OnigEncoding enc ARG_UNUSED) 00035 { 00036 return 4; 00037 } 00038 00039 static int 00040 utf32le_is_mbc_newline(const UChar* p, const UChar* end, 00041 OnigEncoding enc ARG_UNUSED) 00042 { 00043 if (p + 3 < end) { 00044 if (*p == 0x0a && *(p+1) == 0 && *(p+2) == 0 && *(p+3) == 0) 00045 return 1; 00046 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS 00047 if (( 00048 #ifndef USE_CRNL_AS_LINE_TERMINATOR 00049 *p == 0x0d || 00050 #endif 00051 *p == 0x85) 00052 && *(p+1) == 0x00 && (p+2) == 0x00 && *(p+3) == 0x00) 00053 return 1; 00054 if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28) 00055 && *(p+2) == 0x00 && *(p+3) == 0x00) 00056 return 1; 00057 #endif 00058 } 00059 return 0; 00060 } 00061 00062 static OnigCodePoint 00063 utf32le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED, 00064 OnigEncoding enc ARG_UNUSED) 00065 { 00066 return (OnigCodePoint )(((p[3] * 256 + p[2]) * 256 + p[1]) * 256 + p[0]); 00067 } 00068 00069 static int 00070 utf32le_code_to_mbclen(OnigCodePoint code ARG_UNUSED, 00071 OnigEncoding enc ARG_UNUSED) 00072 { 00073 return 4; 00074 } 00075 00076 static int 00077 utf32le_code_to_mbc(OnigCodePoint code, UChar *buf, 00078 OnigEncoding enc ARG_UNUSED) 00079 { 00080 UChar* p = buf; 00081 00082 *p++ = (UChar ) (code & 0xff); 00083 *p++ = (UChar )((code & 0xff00) >> 8); 00084 *p++ = (UChar )((code & 0xff0000) >>16); 00085 *p++ = (UChar )((code & 0xff000000) >>24); 00086 return 4; 00087 } 00088 00089 static int 00090 utf32le_mbc_case_fold(OnigCaseFoldType flag, 00091 const UChar** pp, const UChar* end, UChar* fold, 00092 OnigEncoding enc) 00093 { 00094 const UChar* p = *pp; 00095 00096 if (ONIGENC_IS_ASCII_CODE(*p) && *(p+1) == 0 && *(p+2) == 0 && *(p+3) == 0) { 00097 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI 00098 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { 00099 if (*p == 0x49) { 00100 
*fold++ = 0x31; 00101 *fold++ = 0x01; 00102 } 00103 } 00104 else { 00105 #endif 00106 *fold++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); 00107 *fold++ = 0; 00108 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI 00109 } 00110 #endif 00111 00112 *fold++ = 0; 00113 *fold = 0; 00114 *pp += 4; 00115 return 4; 00116 } 00117 else 00118 return onigenc_unicode_mbc_case_fold(enc, flag, pp, 00119 end, fold); 00120 } 00121 00122 #if 0 00123 static int 00124 utf32le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) 00125 { 00126 const UChar* p = *pp; 00127 00128 (*pp) += 4; 00129 00130 if (*(p+1) == 0 && *(p+2) == 0 && *(p+3) == 0) { 00131 int c, v; 00132 00133 if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { 00134 return TRUE; 00135 } 00136 00137 c = *p; 00138 v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, 00139 (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); 00140 if ((v | BIT_CTYPE_LOWER) != 0) { 00141 /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ 00142 if (c >= 0xaa && c <= 0xba) 00143 return FALSE; 00144 else 00145 return TRUE; 00146 } 00147 return (v != 0 ? 
TRUE : FALSE); 00148 } 00149 00150 return FALSE; 00151 } 00152 #endif 00153 00154 static UChar* 00155 utf32le_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, 00156 OnigEncoding enc ARG_UNUSED) 00157 { 00158 ptrdiff_t rem; 00159 00160 if (s <= start) return (UChar* )s; 00161 00162 rem = (s - start) % 4; 00163 return (UChar* )(s - rem); 00164 } 00165 00166 static int 00167 utf32le_get_case_fold_codes_by_str(OnigCaseFoldType flag, 00168 const OnigUChar* p, const OnigUChar* end, 00169 OnigCaseFoldCodeItem items[], 00170 OnigEncoding enc) 00171 { 00172 return onigenc_unicode_get_case_fold_codes_by_str(enc, 00173 flag, p, end, items); 00174 } 00175 00176 OnigEncodingDefine(utf_32le, UTF_32LE) = { 00177 utf32le_mbc_enc_len, 00178 "UTF-32LE", /* name */ 00179 4, /* max byte length */ 00180 4, /* min byte length */ 00181 utf32le_is_mbc_newline, 00182 utf32le_mbc_to_code, 00183 utf32le_code_to_mbclen, 00184 utf32le_code_to_mbc, 00185 utf32le_mbc_case_fold, 00186 onigenc_unicode_apply_all_case_fold, 00187 utf32le_get_case_fold_codes_by_str, 00188 onigenc_unicode_property_name_to_ctype, 00189 onigenc_unicode_is_code_ctype, 00190 onigenc_utf16_32_get_ctype_code_range, 00191 utf32le_left_adjust_char_head, 00192 onigenc_always_false_is_allowed_reverse_match 00193 }; 00194 ENC_ALIAS("UCS-4LE", "UTF-32LE") 00195