/* Ruby 1.9.3p327 (2012-11-10 revision 37606) */
00001 /********************************************************************** 00002 euc_jp.c - Oniguruma (regular expression library) 00003 **********************************************************************/ 00004 /*- 00005 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 00006 * All rights reserved. 00007 * 00008 * Redistribution and use in source and binary forms, with or without 00009 * modification, are permitted provided that the following conditions 00010 * are met: 00011 * 1. Redistributions of source code must retain the above copyright 00012 * notice, this list of conditions and the following disclaimer. 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in the 00015 * documentation and/or other materials provided with the distribution. 00016 * 00017 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 00018 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00019 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00020 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 00021 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00022 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 00023 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 00024 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00025 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 00026 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00027 * SUCH DAMAGE. 
00028 */ 00029 00030 #include "regint.h" 00031 00032 00033 #define eucjp_islead(c) ((UChar )((c) - 0xa1) > 0xfe - 0xa1) 00034 00035 static const int EncLen_EUCJP[] = { 00036 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00037 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00038 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00039 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00040 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00041 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00042 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00043 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00044 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 00045 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00046 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00047 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00048 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00049 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00050 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00051 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 00052 }; 00053 00054 typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2 } state_t; 00055 #define A ACCEPT 00056 #define F FAILURE 00057 static const signed char trans[][0x100] = { 00058 { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00059 /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00060 /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00061 /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00062 /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00063 /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00064 /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00065 /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00066 /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00067 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, 1, 2, 00068 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00069 /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00070 /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 00071 /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00072 /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00073 /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00074 /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F 00075 }, 00076 { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00077 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00078 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00079 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00080 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00081 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00082 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00083 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00084 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00085 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00086 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00087 /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00088 /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00089 /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00090 /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00091 /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00092 /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F 00093 }, 00094 { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00095 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00096 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00097 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00098 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00099 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00100 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00101 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00102 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00103 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00104 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00105 /* a */ 
F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00106 /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00107 /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00108 /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00109 /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00110 /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F 00111 }, 00112 00113 }; 00114 #undef A 00115 #undef F 00116 00117 static int 00118 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) 00119 { 00120 int firstbyte = *p++; 00121 state_t s; 00122 s = trans[0][firstbyte]; 00123 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) : 00124 ONIGENC_CONSTRUCT_MBCLEN_INVALID(); 00125 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-1); 00126 s = trans[s][*p++]; 00127 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) : 00128 ONIGENC_CONSTRUCT_MBCLEN_INVALID(); 00129 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-2); 00130 s = trans[s][*p++]; 00131 return s == ACCEPT ? 
ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) : 00132 ONIGENC_CONSTRUCT_MBCLEN_INVALID(); 00133 } 00134 00135 static OnigCodePoint 00136 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) 00137 { 00138 int c, i, len; 00139 OnigCodePoint n; 00140 00141 len = enclen(enc, p, end); 00142 n = (OnigCodePoint )*p++; 00143 if (len == 1) return n; 00144 00145 for (i = 1; i < len; i++) { 00146 if (p >= end) break; 00147 c = *p++; 00148 n <<= 8; n += c; 00149 } 00150 return n; 00151 } 00152 00153 static int 00154 code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED) 00155 { 00156 if (ONIGENC_IS_CODE_ASCII(code)) return 1; 00157 else if (code > 0xffffff) 00158 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; 00159 else if (code & 0x800000) return 3; 00160 else if (code & 0x8000) return 2; 00161 else 00162 return ONIGERR_INVALID_CODE_POINT_VALUE; 00163 } 00164 00165 #if 0 00166 static int 00167 code_to_mbc_first(OnigCodePoint code) 00168 { 00169 int first; 00170 00171 if ((code & 0xff0000) != 0) { 00172 first = (code >> 16) & 0xff; 00173 } 00174 else if ((code & 0xff00) != 0) { 00175 first = (code >> 8) & 0xff; 00176 } 00177 else { 00178 return (int )code; 00179 } 00180 return first; 00181 } 00182 #endif 00183 00184 static int 00185 code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc) 00186 { 00187 UChar *p = buf; 00188 00189 if ((code & 0xff0000) != 0) *p++ = (UChar )(((code >> 16) & 0xff)); 00190 if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff)); 00191 *p++ = (UChar )(code & 0xff); 00192 00193 #if 1 00194 if (enclen(enc, buf, p) != (p - buf)) 00195 return ONIGERR_INVALID_CODE_POINT_VALUE; 00196 #endif 00197 return (int)(p - buf); 00198 } 00199 00200 static int 00201 mbc_case_fold(OnigCaseFoldType flag, 00202 const UChar** pp, const UChar* end, UChar* lower, 00203 OnigEncoding enc) 00204 { 00205 int len; 00206 const UChar* p = *pp; 00207 00208 if (ONIGENC_IS_MBC_ASCII(p)) { 00209 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); 00210 (*pp)++; 00211 
return 1; 00212 } 00213 else { 00214 int i; 00215 00216 len = enclen(enc, p, end); 00217 for (i = 0; i < len; i++) { 00218 *lower++ = *p++; 00219 } 00220 (*pp) += len; 00221 return len; /* return byte length of converted char to lower */ 00222 } 00223 } 00224 00225 static UChar* 00226 left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc) 00227 { 00228 /* In this encoding 00229 mb-trail bytes doesn't mix with single bytes. 00230 */ 00231 const UChar *p; 00232 int len; 00233 00234 if (s <= start) return (UChar* )s; 00235 p = s; 00236 00237 while (!eucjp_islead(*p) && p > start) p--; 00238 len = enclen(enc, p, end); 00239 if (p + len > s) return (UChar* )p; 00240 p += len; 00241 return (UChar* )(p + ((s - p) & ~1)); 00242 } 00243 00244 static int 00245 is_allowed_reverse_match(const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED) 00246 { 00247 const UChar c = *s; 00248 if (c <= 0x7e || c == 0x8e || c == 0x8f) 00249 return TRUE; 00250 else 00251 return FALSE; 00252 } 00253 00254 00255 static int PropertyInited = 0; 00256 static const OnigCodePoint** PropertyList; 00257 static int PropertyListNum; 00258 static int PropertyListSize; 00259 static hash_table_type* PropertyNameTable; 00260 00261 static const OnigCodePoint CR_Hiragana[] = { 00262 1, 00263 0xa4a1, 0xa4f3 00264 }; /* CR_Hiragana */ 00265 00266 static const OnigCodePoint CR_Katakana[] = { 00267 3, 00268 0xa5a1, 0xa5f6, 00269 0xaaa6, 0xaaaf, 00270 0xaab1, 0xaadd 00271 }; /* CR_Katakana */ 00272 00273 static int 00274 init_property_list(void) 00275 { 00276 int r; 00277 00278 PROPERTY_LIST_ADD_PROP("hiragana", CR_Hiragana); 00279 PROPERTY_LIST_ADD_PROP("katakana", CR_Katakana); 00280 PropertyInited = 1; 00281 00282 end: 00283 return r; 00284 } 00285 00286 static int 00287 property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end) 00288 { 00289 st_data_t ctype; 00290 UChar *s, *e; 00291 00292 PROPERTY_LIST_INIT_CHECK; 00293 00294 s = e = ALLOCA_N(UChar, end-p+1); 
00295 for (; p < end; p++) { 00296 *e++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); 00297 } 00298 00299 if (onig_st_lookup_strend(PropertyNameTable, s, e, &ctype) == 0) { 00300 return onigenc_minimum_property_name_to_ctype(enc, s, e); 00301 } 00302 00303 return (int)ctype; 00304 } 00305 00306 static int 00307 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED) 00308 { 00309 if (ctype <= ONIGENC_MAX_STD_CTYPE) { 00310 if (code < 128) 00311 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); 00312 else { 00313 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) { 00314 return (code_to_mbclen(code, enc) > 1 ? TRUE : FALSE); 00315 } 00316 } 00317 } 00318 else { 00319 PROPERTY_LIST_INIT_CHECK; 00320 00321 ctype -= (ONIGENC_MAX_STD_CTYPE + 1); 00322 if (ctype >= (unsigned int )PropertyListNum) 00323 return ONIGERR_TYPE_BUG; 00324 00325 return onig_is_in_code_range((UChar* )PropertyList[ctype], code); 00326 } 00327 00328 return FALSE; 00329 } 00330 00331 static int 00332 get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out, 00333 const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED) 00334 { 00335 if (ctype <= ONIGENC_MAX_STD_CTYPE) { 00336 return ONIG_NO_SUPPORT_CONFIG; 00337 } 00338 else { 00339 *sb_out = 0x80; 00340 00341 PROPERTY_LIST_INIT_CHECK; 00342 00343 ctype -= (ONIGENC_MAX_STD_CTYPE + 1); 00344 if (ctype >= (OnigCtype )PropertyListNum) 00345 return ONIGERR_TYPE_BUG; 00346 00347 *ranges = PropertyList[ctype]; 00348 return 0; 00349 } 00350 } 00351 00352 00353 OnigEncodingDefine(euc_jp, EUC_JP) = { 00354 mbc_enc_len, 00355 "EUC-JP", /* name */ 00356 3, /* max enc length */ 00357 1, /* min enc length */ 00358 onigenc_is_mbc_newline_0x0a, 00359 mbc_to_code, 00360 code_to_mbclen, 00361 code_to_mbc, 00362 mbc_case_fold, 00363 onigenc_ascii_apply_all_case_fold, 00364 onigenc_ascii_get_case_fold_codes_by_str, 00365 property_name_to_ctype, 00366 is_code_ctype, 00367 get_ctype_code_range, 00368 left_adjust_char_head, 00369 is_allowed_reverse_match, 
00370 0 00371 }; 00372 /* 00373 * Name: EUC-JP 00374 * MIBenum: 18 00375 * Link: http://www.iana.org/assignments/character-sets 00376 * Link: http://home.m05.itscom.net/numa/cde/sjis-euc/sjis-euc.html 00377 * Link: http://home.m05.itscom.net/numa/uocjleE.pdf 00378 */ 00379 ENC_ALIAS("eucJP", "EUC-JP") /* UI-OSF Application Platform Profile for Japanese Environment Version 1.1 */ 00380 00381 /* 00382 * Name: eucJP-ms 00383 * Link: http://home.m05.itscom.net/numa/cde/ucs-conv/ucs-conv.html 00384 * Link: http://www2d.biglobe.ne.jp/~msyk/charcode/cp932/eucJP-ms.html 00385 * Link: http://ja.wikipedia.org/wiki/EUC-JP 00386 */ 00387 ENC_REPLICATE("eucJP-ms", "EUC-JP") /* TOG/JVC CDE/Motif Technical WG */ 00388 ENC_ALIAS("euc-jp-ms", "eucJP-ms") 00389 00390 /* 00391 * Name: CP51932 00392 * MIBenum: 2108 00393 * Link: http://www.iana.org/assignments/charset-reg/CP51932 00394 * Link: http://search.cpan.org/src/NARUSE/Encode-EUCJPMS-0.07/ucm/cp51932.ucm 00395 * Link: http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932 00396 * Link: http://msyk.at.webry.info/200511/article_2.html 00397 */ 00398 ENC_REPLICATE("CP51932", "EUC-JP") 00399