Ruby 1.9.3p327 (2012-11-10 revision 37606)
|
00001 /********************************************************************** 00002 euc_tw.c - Oniguruma (regular expression library) 00003 **********************************************************************/ 00004 /*- 00005 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 00006 * All rights reserved. 00007 * 00008 * Redistribution and use in source and binary forms, with or without 00009 * modification, are permitted provided that the following conditions 00010 * are met: 00011 * 1. Redistributions of source code must retain the above copyright 00012 * notice, this list of conditions and the following disclaimer. 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in the 00015 * documentation and/or other materials provided with the distribution. 00016 * 00017 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 00018 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00019 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00020 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 00021 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00022 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 00023 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 00024 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00025 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 00026 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00027 * SUCH DAMAGE. 
00028 */ 00029 00030 #include "regenc.h" 00031 00032 static const int EncLen_EUCTW[] = { 00033 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00034 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00035 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00036 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00037 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00038 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00039 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00040 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00041 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 00042 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00043 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00044 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00045 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00046 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00047 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00048 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 00049 }; 00050 00051 typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2, S3 } state_t; 00052 #define A ACCEPT 00053 #define F FAILURE 00054 static const signed char trans[][0x100] = { 00055 { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00056 /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00057 /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00058 /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00059 /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00060 /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00061 /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00062 /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00063 /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00064 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, 2, F, 00065 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00066 /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00067 /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00068 /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
00069 /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00070 /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00071 /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F 00072 }, 00073 { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00074 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00075 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00076 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00077 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00078 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00079 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00080 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00081 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00082 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00083 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00084 /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00085 /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00086 /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00087 /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00088 /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00089 /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F 00090 }, 00091 { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00092 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00093 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00094 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00095 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00096 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00097 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00098 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00099 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00100 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00101 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00102 /* a */ F, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 00103 /* b */ 3, F, F, F, 
F, F, F, F, F, F, F, F, F, F, F, F, 00104 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00105 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00106 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00107 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 00108 }, 00109 { /* S3 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00110 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00111 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00112 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00113 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00114 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00115 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00116 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00117 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00118 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00119 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00120 /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00121 /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00122 /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00123 /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00124 /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00125 /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F 00126 } 00127 }; 00128 #undef A 00129 #undef F 00130 00131 static int 00132 euctw_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) 00133 { 00134 int firstbyte = *p++; 00135 state_t s = trans[0][firstbyte]; 00136 #define RETURN(n) \ 00137 return s == ACCEPT ? 
ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) : \ 00138 ONIGENC_CONSTRUCT_MBCLEN_INVALID() 00139 if (s < 0) RETURN(1); 00140 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCTW[firstbyte]-1); 00141 s = trans[s][*p++]; 00142 if (s < 0) RETURN(2); 00143 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-2); 00144 s = trans[s][*p++]; 00145 if (s < 0) RETURN(3); 00146 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-3); 00147 s = trans[s][*p++]; 00148 RETURN(4); 00149 #undef RETURN 00150 } 00151 00152 static OnigCodePoint 00153 euctw_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc ARG_UNUSED) 00154 { 00155 return onigenc_mbn_mbc_to_code(enc, p, end); 00156 } 00157 00158 static int 00159 euctw_code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc) 00160 { 00161 return onigenc_mb4_code_to_mbc(enc, code, buf); 00162 } 00163 00164 static int 00165 euctw_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end, 00166 UChar* lower, OnigEncoding enc) 00167 { 00168 return onigenc_mbn_mbc_case_fold(enc, flag, 00169 pp, end, lower); 00170 } 00171 00172 static int 00173 euctw_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc) 00174 { 00175 return onigenc_mb4_is_code_ctype(enc, code, ctype); 00176 } 00177 00178 #define euctw_islead(c) ((UChar )((c) - 0xa1) > 0xfe - 0xa1) 00179 00180 static UChar* 00181 euctw_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc) 00182 { 00183 /* Assumed in this encoding, 00184 mb-trail bytes don't mix with single bytes. 
00185 */ 00186 const UChar *p; 00187 int len; 00188 00189 if (s <= start) return (UChar* )s; 00190 p = s; 00191 00192 while (!euctw_islead(*p) && p > start) p--; 00193 len = enclen(enc, p, end); 00194 if (p + len > s) return (UChar* )p; 00195 p += len; 00196 return (UChar* )(p + ((s - p) & ~1)); 00197 } 00198 00199 static int 00200 euctw_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED, OnigEncoding enc ARG_UNUSED) 00201 { 00202 const UChar c = *s; 00203 if (c <= 0x7e) return TRUE; 00204 else return FALSE; 00205 } 00206 00207 OnigEncodingDefine(euc_tw, EUC_TW) = { 00208 euctw_mbc_enc_len, 00209 "EUC-TW", /* name */ 00210 4, /* max enc length */ 00211 1, /* min enc length */ 00212 onigenc_is_mbc_newline_0x0a, 00213 euctw_mbc_to_code, 00214 onigenc_mb4_code_to_mbclen, 00215 euctw_code_to_mbc, 00216 euctw_mbc_case_fold, 00217 onigenc_ascii_apply_all_case_fold, 00218 onigenc_ascii_get_case_fold_codes_by_str, 00219 onigenc_minimum_property_name_to_ctype, 00220 euctw_is_code_ctype, 00221 onigenc_not_support_get_ctype_code_range, 00222 euctw_left_adjust_char_head, 00223 euctw_is_allowed_reverse_match 00224 }; 00225 ENC_ALIAS("eucTW", "EUC-TW") 00226