/* Source listing taken from Ruby 1.9.3p327 (2012-11-10, revision 37606). */
00001 /********************************************************************** 00002 sjis.c - Oniguruma (regular expression library) 00003 **********************************************************************/ 00004 /*- 00005 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 00006 * All rights reserved. 00007 * 00008 * Redistribution and use in source and binary forms, with or without 00009 * modification, are permitted provided that the following conditions 00010 * are met: 00011 * 1. Redistributions of source code must retain the above copyright 00012 * notice, this list of conditions and the following disclaimer. 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in the 00015 * documentation and/or other materials provided with the distribution. 00016 * 00017 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 00018 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00019 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00020 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 00021 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00022 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 00023 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 00024 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00025 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 00026 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00027 * SUCH DAMAGE. 
00028 */ 00029 00030 #include "regint.h" 00031 00032 static const int EncLen_SJIS[] = { 00033 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00034 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00035 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00036 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00037 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00038 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00039 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00040 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00041 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00042 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00043 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00044 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00045 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00046 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00047 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00048 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1 00049 }; 00050 00051 static const char SJIS_CAN_BE_TRAIL_TABLE[256] = { 00052 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00053 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00054 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00055 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00056 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00057 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00058 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00059 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 00060 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00061 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00062 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00063 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00064 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00065 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00066 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00067 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 00068 }; 00069 00070 #define SJIS_ISMB_FIRST(byte) (EncLen_SJIS[byte] > 1) 00071 #define SJIS_ISMB_TRAIL(byte) 
SJIS_CAN_BE_TRAIL_TABLE[(byte)] 00072 00073 typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1 } state_t; 00074 #define A ACCEPT 00075 #define F FAILURE 00076 static const signed char trans[][0x100] = { 00077 { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00078 /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00079 /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00080 /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00081 /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00082 /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00083 /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00084 /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00085 /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00086 /* 8 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00087 /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00088 /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00089 /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00090 /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00091 /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00092 /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00093 /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F, F, F 00094 }, 00095 { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00096 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00097 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00098 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00099 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00100 /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00101 /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00102 /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00103 /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F, 00104 /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00105 /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00106 /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00107 /* b */ 
A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00108 /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00109 /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00110 /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00111 /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, F, F, F 00112 } 00113 }; 00114 #undef A 00115 #undef F 00116 00117 static int 00118 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) 00119 { 00120 int firstbyte = *p++; 00121 state_t s; 00122 s = trans[0][firstbyte]; 00123 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) : 00124 ONIGENC_CONSTRUCT_MBCLEN_INVALID(); 00125 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_SJIS[firstbyte]-1); 00126 s = trans[s][*p++]; 00127 return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) : 00128 ONIGENC_CONSTRUCT_MBCLEN_INVALID(); 00129 } 00130 00131 static int 00132 code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED) 00133 { 00134 if (code < 256) { 00135 if (EncLen_SJIS[(int )code] == 1) 00136 return 1; 00137 else 00138 return ONIGERR_INVALID_CODE_POINT_VALUE; 00139 } 00140 else if (code <= 0xffff) { 00141 return 2; 00142 } 00143 else 00144 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; 00145 } 00146 00147 static OnigCodePoint 00148 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) 00149 { 00150 int c, i, len; 00151 OnigCodePoint n; 00152 00153 len = enclen(enc, p, end); 00154 c = *p++; 00155 n = c; 00156 if (len == 1) return n; 00157 00158 for (i = 1; i < len; i++) { 00159 if (p >= end) break; 00160 c = *p++; 00161 n <<= 8; n += c; 00162 } 00163 return n; 00164 } 00165 00166 static int 00167 code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc) 00168 { 00169 UChar *p = buf; 00170 00171 if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff)); 00172 *p++ = (UChar )(code & 0xff); 00173 00174 #if 0 00175 if (enclen(enc, buf) != (p - buf)) 00176 return REGERR_INVALID_CODE_POINT_VALUE; 00177 #endif 00178 
return (int)(p - buf); 00179 } 00180 00181 static int 00182 mbc_case_fold(OnigCaseFoldType flag, 00183 const UChar** pp, const UChar* end, UChar* lower, 00184 OnigEncoding enc) 00185 { 00186 const UChar* p = *pp; 00187 00188 if (ONIGENC_IS_MBC_ASCII(p)) { 00189 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); 00190 (*pp)++; 00191 return 1; 00192 } 00193 else { 00194 int i; 00195 int len = enclen(enc, p, end); 00196 00197 for (i = 0; i < len; i++) { 00198 *lower++ = *p++; 00199 } 00200 (*pp) += len; 00201 return len; /* return byte length of converted char to lower */ 00202 } 00203 } 00204 00205 #if 0 00206 static int 00207 is_mbc_ambiguous(OnigCaseFoldType flag, 00208 const UChar** pp, const UChar* end) 00209 { 00210 return onigenc_mbn_is_mbc_ambiguous(enc, flag, pp, end); 00211 00212 } 00213 #endif 00214 00215 #if 0 00216 static int 00217 is_code_ctype(OnigCodePoint code, unsigned int ctype) 00218 { 00219 if (code < 128) 00220 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); 00221 else { 00222 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) { 00223 return (code_to_mbclen(code) > 1 ? TRUE : FALSE); 00224 } 00225 } 00226 00227 return FALSE; 00228 } 00229 #endif 00230 00231 static UChar* 00232 left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc) 00233 { 00234 const UChar *p; 00235 int len; 00236 00237 if (s <= start) return (UChar* )s; 00238 p = s; 00239 00240 if (SJIS_ISMB_TRAIL(*p)) { 00241 while (p > start) { 00242 if (! SJIS_ISMB_FIRST(*--p)) { 00243 p++; 00244 break; 00245 } 00246 } 00247 } 00248 len = enclen(enc, p, end); 00249 if (p + len > s) return (UChar* )p; 00250 p += len; 00251 return (UChar* )(p + ((s - p) & ~1)); 00252 } 00253 00254 static int 00255 is_allowed_reverse_match(const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED) 00256 { 00257 const UChar c = *s; 00258 return (SJIS_ISMB_TRAIL(c) ? 
FALSE : TRUE); 00259 } 00260 00261 00262 static int PropertyInited = 0; 00263 static const OnigCodePoint** PropertyList; 00264 static int PropertyListNum; 00265 static int PropertyListSize; 00266 static hash_table_type* PropertyNameTable; 00267 00268 static const OnigCodePoint CR_Hiragana[] = { 00269 1, 00270 0x829f, 0x82f1 00271 }; /* CR_Hiragana */ 00272 00273 static const OnigCodePoint CR_Katakana[] = { 00274 4, 00275 0x00a6, 0x00af, 00276 0x00b1, 0x00dd, 00277 0x8340, 0x837e, 00278 0x8380, 0x8396, 00279 }; /* CR_Katakana */ 00280 00281 static int 00282 init_property_list(void) 00283 { 00284 int r; 00285 00286 PROPERTY_LIST_ADD_PROP("hiragana", CR_Hiragana); 00287 PROPERTY_LIST_ADD_PROP("katakana", CR_Katakana); 00288 PropertyInited = 1; 00289 00290 end: 00291 return r; 00292 } 00293 00294 static int 00295 property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end) 00296 { 00297 hash_data_type ctype; 00298 UChar *s, *e; 00299 00300 PROPERTY_LIST_INIT_CHECK; 00301 00302 s = e = ALLOCA_N(UChar, end-p+1); 00303 for (; p < end; p++) { 00304 *e++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); 00305 } 00306 00307 if (onig_st_lookup_strend(PropertyNameTable, s, e, &ctype) == 0) { 00308 return onigenc_minimum_property_name_to_ctype(enc, s, e); 00309 } 00310 00311 return (int)ctype; 00312 } 00313 00314 static int 00315 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc) 00316 { 00317 if (ctype <= ONIGENC_MAX_STD_CTYPE) { 00318 if (code < 128) 00319 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); 00320 else { 00321 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) { 00322 return TRUE; 00323 } 00324 } 00325 } 00326 else { 00327 PROPERTY_LIST_INIT_CHECK; 00328 00329 ctype -= (ONIGENC_MAX_STD_CTYPE + 1); 00330 if (ctype >= (unsigned int )PropertyListNum) 00331 return ONIGERR_TYPE_BUG; 00332 00333 return onig_is_in_code_range((UChar* )PropertyList[ctype], code); 00334 } 00335 00336 return FALSE; 00337 } 00338 00339 static int 00340 get_ctype_code_range(OnigCtype ctype, 
OnigCodePoint* sb_out, 00341 const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED) 00342 { 00343 if (ctype <= ONIGENC_MAX_STD_CTYPE) { 00344 return ONIG_NO_SUPPORT_CONFIG; 00345 } 00346 else { 00347 *sb_out = 0x80; 00348 00349 PROPERTY_LIST_INIT_CHECK; 00350 00351 ctype -= (ONIGENC_MAX_STD_CTYPE + 1); 00352 if (ctype >= (OnigCtype )PropertyListNum) 00353 return ONIGERR_TYPE_BUG; 00354 00355 *ranges = PropertyList[ctype]; 00356 return 0; 00357 } 00358 } 00359 00360 OnigEncodingDefine(shift_jis, Shift_JIS) = { 00361 mbc_enc_len, 00362 "Shift_JIS", /* name */ 00363 2, /* max byte length */ 00364 1, /* min byte length */ 00365 onigenc_is_mbc_newline_0x0a, 00366 mbc_to_code, 00367 code_to_mbclen, 00368 code_to_mbc, 00369 mbc_case_fold, 00370 onigenc_ascii_apply_all_case_fold, 00371 onigenc_ascii_get_case_fold_codes_by_str, 00372 property_name_to_ctype, 00373 is_code_ctype, 00374 get_ctype_code_range, 00375 left_adjust_char_head, 00376 is_allowed_reverse_match, 00377 0 00378 }; 00379 /* 00380 * Name: Shift_JIS 00381 * MIBenum: 17 00382 * Link: http://www.iana.org/assignments/character-sets 00383 * Link: http://ja.wikipedia.org/wiki/Shift_JIS 00384 */ 00385 00386 /* 00387 * Name: Windows-31J 00388 * MIBenum: 2024 00389 * Link: http://www.iana.org/assignments/character-sets 00390 * Link: http://www.microsoft.com/globaldev/reference/dbcs/932.mspx 00391 * Link: http://ja.wikipedia.org/wiki/Windows-31J 00392 * Link: http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-932-2000.ucm 00393 * 00394 * Windows Standard Character Set and its mapping to Unicode by Microsoft. 00395 * Since 1.9.3, SJIS is the alias of Windows-31J because its character 00396 * set is usually this one even if its mapping may differ. 00397 */ 00398 ENC_REPLICATE("Windows-31J", "Shift_JIS") 00399 ENC_ALIAS("CP932", "Windows-31J") 00400 ENC_ALIAS("csWindows31J", "Windows-31J") /* IANA. IE6 don't accept Windows-31J but csWindows31J. 
*/ 00401 ENC_ALIAS("SJIS", "Windows-31J") 00402 00403 /* 00404 * Name: PCK 00405 * Link: http://download.oracle.com/docs/cd/E19253-01/819-0606/x-2chn0/index.html 00406 * Link: http://download.oracle.com/docs/cd/E19253-01/819-0606/appb-pckwarn-1/index.html 00407 * 00408 * Solaris's SJIS variant. Its set is Windows Standard Character Set; it 00409 * consists JIS X 0201 Latin (US-ASCII), JIS X 0201 Katakana, JIS X 0208, NEC 00410 * special characters, NEC-selected IBM extended characters, and IBM extended 00411 * characters. Solaris's iconv seems to use SJIS-open. 00412 */ 00413 ENC_ALIAS("PCK", "Windows-31J") 00414 00415 /* 00416 * Name: MacJapanese 00417 * Link: http://unicode.org/Public/MAPPINGS/VENDORS/APPLE/JAPANESE.TXT 00418 * Link: http://ja.wikipedia.org/wiki/MacJapanese 00419 */ 00420 ENC_REPLICATE("MacJapanese", "Shift_JIS") 00421 ENC_ALIAS("MacJapan", "MacJapanese") 00422