/* Ruby 1.9.3p327 (2012-11-10 revision 37606) */
00001 /********************************************************************** 00002 utf_8.c - Oniguruma (regular expression library) 00003 **********************************************************************/ 00004 /*- 00005 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 00006 * All rights reserved. 00007 * 00008 * Redistribution and use in source and binary forms, with or without 00009 * modification, are permitted provided that the following conditions 00010 * are met: 00011 * 1. Redistributions of source code must retain the above copyright 00012 * notice, this list of conditions and the following disclaimer. 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in the 00015 * documentation and/or other materials provided with the distribution. 00016 * 00017 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 00018 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00019 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00020 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 00021 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00022 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 00023 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 00024 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00025 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 00026 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00027 * SUCH DAMAGE. 
00028 */ 00029 00030 #include "regenc.h" 00031 00032 #define USE_INVALID_CODE_SCHEME 00033 00034 #ifdef USE_INVALID_CODE_SCHEME 00035 /* virtual codepoint values for invalid encoding byte 0xfe and 0xff */ 00036 #define INVALID_CODE_FE 0xfffffffe 00037 #define INVALID_CODE_FF 0xffffffff 00038 #define VALID_CODE_LIMIT 0x7fffffff 00039 #endif 00040 00041 #define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80) 00042 00043 static const int EncLen_UTF8[] = { 00044 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00045 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00046 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00047 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00048 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00049 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00050 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00051 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00052 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00053 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00054 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00055 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00056 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00057 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00058 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 00059 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 00060 }; 00061 00062 typedef enum { 00063 FAILURE = -2, 00064 ACCEPT, 00065 S0, S1, S2, S3, 00066 S4, S5, S6, S7 00067 } state_t; 00068 #define A ACCEPT 00069 #define F FAILURE 00070 static const signed char trans[][0x100] = { 00071 { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00072 /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00073 /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00074 /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00075 /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00076 /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00077 /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00078 /* 6 */ A, A, A, A, A, A, 
A, A, A, A, A, A, A, A, A, A, 00079 /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00080 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00081 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00082 /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00083 /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00084 /* c */ F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00085 /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00086 /* e */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 00087 /* f */ 5, 6, 6, 6, 7, F, F, F, F, F, F, F, F, F, F, F 00088 }, 00089 { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00090 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00091 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00092 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00093 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00094 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00095 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00096 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00097 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00098 /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00099 /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00100 /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00101 /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00102 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00103 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00104 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00105 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 00106 }, 00107 { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00108 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00109 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00110 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00111 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00112 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, 
F, F, 00113 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00114 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00115 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00116 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00117 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00118 /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00119 /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00120 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00121 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00122 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00123 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 00124 }, 00125 { /* S3 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00126 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00127 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00128 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00129 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00130 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00131 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00132 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00133 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00134 /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00135 /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00136 /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00137 /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00138 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00139 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00140 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00141 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 00142 }, 00143 { /* S4 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00144 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00145 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00146 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00147 /* 3 */ F, F, 
F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00148 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00149 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00150 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00151 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00152 /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00153 /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00154 /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00155 /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00156 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00157 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00158 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00159 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 00160 }, 00161 { /* S5 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00162 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00163 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00164 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00165 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00166 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00167 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00168 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00169 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00170 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00171 /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 00172 /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 00173 /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 00174 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00175 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00176 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00177 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 00178 }, 00179 { /* S6 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00180 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00181 /* 1 */ F, F, F, F, F, F, F, F, F, F, 
F, F, F, F, F, F, 00182 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00183 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00184 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00185 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00186 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00187 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00188 /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 00189 /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 00190 /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 00191 /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 00192 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00193 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00194 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00195 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 00196 }, 00197 { /* S7 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00198 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00199 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00200 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00201 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00202 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00203 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00204 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00205 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00206 /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 00207 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00208 /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00209 /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00210 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00211 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00212 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00213 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 00214 }, 00215 }; 00216 #undef A 00217 #undef F 00218 00219 
static int 00220 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) 00221 { 00222 int firstbyte = *p++; 00223 state_t s; 00224 s = trans[0][firstbyte]; 00225 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) : 00226 ONIGENC_CONSTRUCT_MBCLEN_INVALID(); 00227 00228 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1); 00229 s = trans[s][*p++]; 00230 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) : 00231 ONIGENC_CONSTRUCT_MBCLEN_INVALID(); 00232 00233 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2); 00234 s = trans[s][*p++]; 00235 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) : 00236 ONIGENC_CONSTRUCT_MBCLEN_INVALID(); 00237 00238 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3); 00239 s = trans[s][*p++]; 00240 return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) : 00241 ONIGENC_CONSTRUCT_MBCLEN_INVALID(); 00242 } 00243 00244 static int 00245 is_mbc_newline(const UChar* p, const UChar* end, OnigEncoding enc) 00246 { 00247 if (p < end) { 00248 if (*p == 0x0a) return 1; 00249 00250 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS 00251 #ifndef USE_CRNL_AS_LINE_TERMINATOR 00252 if (*p == 0x0d) return 1; 00253 #endif 00254 if (p + 1 < end) { 00255 if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */ 00256 return 1; 00257 if (p + 2 < end) { 00258 if ((*(p+2) == 0xa8 || *(p+2) == 0xa9) 00259 && *(p+1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */ 00260 return 1; 00261 } 00262 } 00263 #endif 00264 } 00265 00266 return 0; 00267 } 00268 00269 static OnigCodePoint 00270 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) 00271 { 00272 int c, len; 00273 OnigCodePoint n; 00274 00275 len = enclen(enc, p, end); 00276 c = *p++; 00277 if (len > 1) { 00278 len--; 00279 n = c & ((1 << (6 - len)) - 1); 00280 while (len--) { 00281 c = *p++; 00282 n = (n << 6) | (c & ((1 << 6) - 1)); 00283 } 00284 return n; 00285 } 
00286 else { 00287 #ifdef USE_INVALID_CODE_SCHEME 00288 if (c > 0xfd) { 00289 return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF); 00290 } 00291 #endif 00292 return (OnigCodePoint )c; 00293 } 00294 } 00295 00296 static int 00297 code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED) 00298 { 00299 if ((code & 0xffffff80) == 0) return 1; 00300 else if ((code & 0xfffff800) == 0) return 2; 00301 else if ((code & 0xffff0000) == 0) return 3; 00302 else if ((code & 0xffe00000) == 0) return 4; 00303 else if ((code & 0xfc000000) == 0) return 5; 00304 else if ((code & 0x80000000) == 0) return 6; 00305 #ifdef USE_INVALID_CODE_SCHEME 00306 else if (code == INVALID_CODE_FE) return 1; 00307 else if (code == INVALID_CODE_FF) return 1; 00308 #endif 00309 else 00310 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; 00311 } 00312 00313 static int 00314 code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED) 00315 { 00316 #define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80) 00317 #define UTF8_TRAIL0(code) (UChar )(((code) & 0x3f) | 0x80) 00318 00319 if ((code & 0xffffff80) == 0) { 00320 *buf = (UChar )code; 00321 return 1; 00322 } 00323 else { 00324 UChar *p = buf; 00325 00326 if ((code & 0xfffff800) == 0) { 00327 *p++ = (UChar )(((code>>6)& 0x1f) | 0xc0); 00328 } 00329 else if ((code & 0xffff0000) == 0) { 00330 *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0); 00331 *p++ = UTF8_TRAILS(code, 6); 00332 } 00333 else if ((code & 0xffe00000) == 0) { 00334 *p++ = (UChar )(((code>>18) & 0x07) | 0xf0); 00335 *p++ = UTF8_TRAILS(code, 12); 00336 *p++ = UTF8_TRAILS(code, 6); 00337 } 00338 else if ((code & 0xfc000000) == 0) { 00339 *p++ = (UChar )(((code>>24) & 0x03) | 0xf8); 00340 *p++ = UTF8_TRAILS(code, 18); 00341 *p++ = UTF8_TRAILS(code, 12); 00342 *p++ = UTF8_TRAILS(code, 6); 00343 } 00344 else if ((code & 0x80000000) == 0) { 00345 *p++ = (UChar )(((code>>30) & 0x01) | 0xfc); 00346 *p++ = UTF8_TRAILS(code, 24); 00347 *p++ = UTF8_TRAILS(code, 18); 
00348 *p++ = UTF8_TRAILS(code, 12); 00349 *p++ = UTF8_TRAILS(code, 6); 00350 } 00351 #ifdef USE_INVALID_CODE_SCHEME 00352 else if (code == INVALID_CODE_FE) { 00353 *p = 0xfe; 00354 return 1; 00355 } 00356 else if (code == INVALID_CODE_FF) { 00357 *p = 0xff; 00358 return 1; 00359 } 00360 #endif 00361 else { 00362 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; 00363 } 00364 00365 *p++ = UTF8_TRAIL0(code); 00366 return (int)(p - buf); 00367 } 00368 } 00369 00370 static int 00371 mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, 00372 const UChar* end, UChar* fold, OnigEncoding enc) 00373 { 00374 const UChar* p = *pp; 00375 00376 if (ONIGENC_IS_MBC_ASCII(p)) { 00377 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI 00378 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { 00379 if (*p == 0x49) { 00380 *fold++ = 0xc4; 00381 *fold = 0xb1; 00382 (*pp)++; 00383 return 2; 00384 } 00385 } 00386 #endif 00387 00388 *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); 00389 (*pp)++; 00390 return 1; /* return byte length of converted char to lower */ 00391 } 00392 else { 00393 return onigenc_unicode_mbc_case_fold(enc, flag, pp, end, fold); 00394 } 00395 } 00396 00397 00398 static int 00399 get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out, 00400 const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED) 00401 { 00402 *sb_out = 0x80; 00403 return onigenc_unicode_ctype_code_range(ctype, ranges); 00404 } 00405 00406 00407 static UChar* 00408 left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED) 00409 { 00410 const UChar *p; 00411 00412 if (s <= start) return (UChar* )s; 00413 p = s; 00414 00415 while (!utf8_islead(*p) && p > start) p--; 00416 return (UChar* )p; 00417 } 00418 00419 static int 00420 get_case_fold_codes_by_str(OnigCaseFoldType flag, 00421 const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[], 00422 OnigEncoding enc) 00423 { 00424 return onigenc_unicode_get_case_fold_codes_by_str(enc, flag, p, end, items); 
00425 } 00426 00427 OnigEncodingDefine(utf_8, UTF_8) = { 00428 mbc_enc_len, 00429 "UTF-8", /* name */ 00430 6, /* max byte length */ 00431 1, /* min byte length */ 00432 is_mbc_newline, 00433 mbc_to_code, 00434 code_to_mbclen, 00435 code_to_mbc, 00436 mbc_case_fold, 00437 onigenc_unicode_apply_all_case_fold, 00438 get_case_fold_codes_by_str, 00439 onigenc_unicode_property_name_to_ctype, 00440 onigenc_unicode_is_code_ctype, 00441 get_ctype_code_range, 00442 left_adjust_char_head, 00443 onigenc_always_true_is_allowed_reverse_match 00444 }; 00445 ENC_ALIAS("CP65001", "UTF-8") 00446 00447 /* 00448 * Name: UTF8-MAC 00449 * Link: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/BPFileSystem.html 00450 * Link: http://developer.apple.com/qa/qa2001/qa1235.html 00451 * Link: http://developer.apple.com/jp/qa/qa2001/qa1235.html 00452 * Link: http://www.gnu.org/software/emacs/NEWS.23.2 00453 */ 00454 ENC_REPLICATE("UTF8-MAC", "UTF-8") 00455 ENC_ALIAS("UTF-8-MAC", "UTF8-MAC") 00456 ENC_ALIAS("UTF-8-HFS", "UTF8-MAC") /* Emacs 23.2 */ 00457 00458