/* Ruby 1.9.3p327 (2012-11-10 revision 37606) */
00001 /********************************************************************** 00002 iso8859_1.c - Oniguruma (regular expression library) 00003 **********************************************************************/ 00004 /*- 00005 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 00006 * All rights reserved. 00007 * 00008 * Redistribution and use in source and binary forms, with or without 00009 * modification, are permitted provided that the following conditions 00010 * are met: 00011 * 1. Redistributions of source code must retain the above copyright 00012 * notice, this list of conditions and the following disclaimer. 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in the 00015 * documentation and/or other materials provided with the distribution. 00016 * 00017 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 00018 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00019 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00020 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 00021 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00022 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 00023 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 00024 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00025 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 00026 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00027 * SUCH DAMAGE. 
00028 */ 00029 00030 #include "regenc.h" 00031 00032 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0])) 00033 00034 #define ENC_IS_ISO_8859_1_CTYPE(code,ctype) \ 00035 ((EncISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0) 00036 00037 static const unsigned short EncISO_8859_1_CtypeTable[256] = { 00038 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 00039 0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008, 00040 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 00041 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 00042 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 00043 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 00044 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 00045 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 00046 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2, 00047 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 00048 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 00049 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0, 00050 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2, 00051 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 00052 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 00053 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008, 00054 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 00055 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 00056 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 00057 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 00058 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 00059 0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0, 00060 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0, 00061 0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0, 
00062 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 00063 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 00064 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0, 00065 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2, 00066 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 00067 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 00068 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0, 00069 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2 00070 }; 00071 00072 static const OnigPairCaseFoldCodes CaseFoldMap[] = { 00073 { 0xc0, 0xe0 }, 00074 { 0xc1, 0xe1 }, 00075 { 0xc2, 0xe2 }, 00076 { 0xc3, 0xe3 }, 00077 { 0xc4, 0xe4 }, 00078 { 0xc5, 0xe5 }, 00079 { 0xc6, 0xe6 }, 00080 { 0xc7, 0xe7 }, 00081 { 0xc8, 0xe8 }, 00082 { 0xc9, 0xe9 }, 00083 { 0xca, 0xea }, 00084 { 0xcb, 0xeb }, 00085 { 0xcc, 0xec }, 00086 { 0xcd, 0xed }, 00087 { 0xce, 0xee }, 00088 { 0xcf, 0xef }, 00089 00090 { 0xd0, 0xf0 }, 00091 { 0xd1, 0xf1 }, 00092 { 0xd2, 0xf2 }, 00093 { 0xd3, 0xf3 }, 00094 { 0xd4, 0xf4 }, 00095 { 0xd5, 0xf5 }, 00096 { 0xd6, 0xf6 }, 00097 { 0xd8, 0xf8 }, 00098 { 0xd9, 0xf9 }, 00099 { 0xda, 0xfa }, 00100 { 0xdb, 0xfb }, 00101 { 0xdc, 0xfc }, 00102 { 0xdd, 0xfd }, 00103 { 0xde, 0xfe } 00104 }; 00105 00106 static int 00107 apply_all_case_fold(OnigCaseFoldType flag, 00108 OnigApplyAllCaseFoldFunc f, void* arg, 00109 OnigEncoding enc ARG_UNUSED) 00110 { 00111 return onigenc_apply_all_case_fold_with_map( 00112 numberof(CaseFoldMap), CaseFoldMap, 1, 00113 flag, f, arg); 00114 } 00115 00116 static int 00117 get_case_fold_codes_by_str(OnigCaseFoldType flag, 00118 const OnigUChar* p, const OnigUChar* end, 00119 OnigCaseFoldCodeItem items[], 00120 OnigEncoding enc ARG_UNUSED) 00121 { 00122 if (0x41 <= *p && *p <= 0x5a) { 00123 items[0].byte_len = 1; 00124 items[0].code_len = 1; 00125 items[0].code[0] = (OnigCodePoint )(*p + 0x20); 00126 if (*p == 0x53 && end > p + 1 00127 && 
(*(p+1) == 0x53 || *(p+1) == 0x73)) { /* SS */ 00128 items[1].byte_len = 2; 00129 items[1].code_len = 1; 00130 items[1].code[0] = (OnigCodePoint )0xdf; 00131 return 2; 00132 } 00133 else 00134 return 1; 00135 } 00136 else if (0x61 <= *p && *p <= 0x7a) { 00137 items[0].byte_len = 1; 00138 items[0].code_len = 1; 00139 items[0].code[0] = (OnigCodePoint )(*p - 0x20); 00140 if (*p == 0x73 && end > p + 1 00141 && (*(p+1) == 0x73 || *(p+1) == 0x53)) { /* ss */ 00142 items[1].byte_len = 2; 00143 items[1].code_len = 1; 00144 items[1].code[0] = (OnigCodePoint )0xdf; 00145 return 2; 00146 } 00147 else 00148 return 1; 00149 } 00150 else if (0xc0 <= *p && *p <= 0xcf) { 00151 items[0].byte_len = 1; 00152 items[0].code_len = 1; 00153 items[0].code[0] = (OnigCodePoint )(*p + 0x20); 00154 return 1; 00155 } 00156 else if (0xd0 <= *p && *p <= 0xdf) { 00157 if (*p == 0xdf) { 00158 items[0].byte_len = 1; 00159 items[0].code_len = 2; 00160 items[0].code[0] = (OnigCodePoint )'s'; 00161 items[0].code[1] = (OnigCodePoint )'s'; 00162 00163 items[1].byte_len = 1; 00164 items[1].code_len = 2; 00165 items[1].code[0] = (OnigCodePoint )'S'; 00166 items[1].code[1] = (OnigCodePoint )'S'; 00167 00168 items[2].byte_len = 1; 00169 items[2].code_len = 2; 00170 items[2].code[0] = (OnigCodePoint )'s'; 00171 items[2].code[1] = (OnigCodePoint )'S'; 00172 00173 items[3].byte_len = 1; 00174 items[3].code_len = 2; 00175 items[3].code[0] = (OnigCodePoint )'S'; 00176 items[3].code[1] = (OnigCodePoint )'s'; 00177 00178 return 4; 00179 } 00180 else if (*p != 0xd7) { 00181 items[0].byte_len = 1; 00182 items[0].code_len = 1; 00183 items[0].code[0] = (OnigCodePoint )(*p + 0x20); 00184 return 1; 00185 } 00186 } 00187 else if (0xe0 <= *p && *p <= 0xef) { 00188 items[0].byte_len = 1; 00189 items[0].code_len = 1; 00190 items[0].code[0] = (OnigCodePoint )(*p - 0x20); 00191 return 1; 00192 } 00193 else if (0xf0 <= *p && *p <= 0xfe) { 00194 if (*p != 0xf7) { 00195 items[0].byte_len = 1; 00196 items[0].code_len = 1; 00197 
items[0].code[0] = (OnigCodePoint )(*p - 0x20); 00198 return 1; 00199 } 00200 } 00201 00202 return 0; 00203 } 00204 00205 static int 00206 mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED, 00207 UChar* lower, OnigEncoding enc ARG_UNUSED) 00208 { 00209 const UChar* p = *pp; 00210 00211 if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { 00212 *lower++ = 's'; 00213 *lower = 's'; 00214 (*pp)++; 00215 return 2; 00216 } 00217 00218 *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); 00219 (*pp)++; 00220 return 1; 00221 } 00222 00223 #if 0 00224 static int 00225 is_mbc_ambiguous(OnigCaseFoldType flag, 00226 const UChar** pp, const UChar* end) 00227 { 00228 int v; 00229 const UChar* p = *pp; 00230 00231 if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { 00232 (*pp)++; 00233 return TRUE; 00234 } 00235 00236 (*pp)++; 00237 v = (EncISO_8859_1_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); 00238 if ((v | BIT_CTYPE_LOWER) != 0) { 00239 /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ 00240 if (*p >= 0xaa && *p <= 0xba) 00241 return FALSE; 00242 else 00243 return TRUE; 00244 } 00245 00246 return (v != 0 ? 
TRUE : FALSE); 00247 } 00248 #endif 00249 00250 static int 00251 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED) 00252 { 00253 if (code < 256) 00254 return ENC_IS_ISO_8859_1_CTYPE(code, ctype); 00255 else 00256 return FALSE; 00257 } 00258 00259 OnigEncodingDefine(iso_8859_1, ISO_8859_1) = { 00260 onigenc_single_byte_mbc_enc_len, 00261 "ISO-8859-1", /* name */ 00262 1, /* max enc length */ 00263 1, /* min enc length */ 00264 onigenc_is_mbc_newline_0x0a, 00265 onigenc_single_byte_mbc_to_code, 00266 onigenc_single_byte_code_to_mbclen, 00267 onigenc_single_byte_code_to_mbc, 00268 mbc_case_fold, 00269 apply_all_case_fold, 00270 get_case_fold_codes_by_str, 00271 onigenc_minimum_property_name_to_ctype, 00272 is_code_ctype, 00273 onigenc_not_support_get_ctype_code_range, 00274 onigenc_single_byte_left_adjust_char_head, 00275 onigenc_always_true_is_allowed_reverse_match 00276 }; 00277 ENC_ALIAS("ISO8859-1", "ISO-8859-1") 00278 00279 /* 00280 * Name: windows-1252 00281 * MIBenum: 2252 00282 * Link: http://www.iana.org/assignments/character-sets 00283 * Link: http://www.microsoft.com/globaldev/reference/sbcs/1252.mspx 00284 * Link: http://en.wikipedia.org/wiki/Windows-1252 00285 */ 00286 ENC_REPLICATE("Windows-1252", "ISO-8859-1") 00287 ENC_ALIAS("CP1252", "Windows-1252") 00288