Ruby 1.9.3p327 (2012-11-10 revision 37606)
enc/iso_8859_1.c
Go to the documentation of this file.
00001 /**********************************************************************
00002   iso8859_1.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions
00010  * are met:
00011  * 1. Redistributions of source code must retain the above copyright
00012  *    notice, this list of conditions and the following disclaimer.
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in the
00015  *    documentation and/or other materials provided with the distribution.
00016  *
00017  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00018  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00020  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00021  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00022  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00023  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00024  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00025  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00026  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00027  * SUCH DAMAGE.
00028  */
00029 
00030 #include "regenc.h"
00031 
/*
 * Element count of a true array (not a pointer), as an int.
 * The whole expansion is parenthesised so the macro behaves as a single
 * primary expression wherever it is used.
 */
#define numberof(array) ((int)(sizeof(array) / sizeof((array)[0])))

/*
 * Nonzero when the table entry for `code` has the bit selected by
 * CTYPE_TO_BIT(ctype) set.  `code` indexes EncISO_8859_1_CtypeTable directly,
 * so the caller must make sure it is below 256.
 */
#define ENC_IS_ISO_8859_1_CTYPE(code,ctype) \
  ((EncISO_8859_1_CtypeTable[(code)] & CTYPE_TO_BIT(ctype)) != 0)
00036 
/*
 * Character-class bitmasks, one entry per byte value (0x00-0xff).
 * ENC_IS_ISO_8859_1_CTYPE() tests a single class by masking an entry with
 * CTYPE_TO_BIT(ctype).  Eight entries per row; the trailing comment gives
 * the code range covered by that row.
 */
static const unsigned short EncISO_8859_1_CtypeTable[256] = {
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, /* 00-07 */
  0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008, /* 08-0f */
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, /* 10-17 */
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, /* 18-1f */
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, /* 20-27 */
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, /* 28-2f */
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, /* 30-37 */
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, /* 38-3f */
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2, /* 40-47 */
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, /* 48-4f */
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, /* 50-57 */
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0, /* 58-5f */
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2, /* 60-67 */
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, /* 68-6f */
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, /* 70-77 */
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008, /* 78-7f */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 80-87 */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 88-8f */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 90-97 */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 98-9f */
  0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, /* a0-a7 */
  0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0, /* a8-af */
  0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0, /* b0-b7 */
  0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0, /* b8-bf */
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, /* c0-c7 */
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, /* c8-cf */
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0, /* d0-d7 */
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2, /* d8-df */
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, /* e0-e7 */
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, /* e8-ef */
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0, /* f0-f7 */
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2  /* f8-ff */
};
00071 
00072 static const OnigPairCaseFoldCodes CaseFoldMap[] = {
00073   { 0xc0, 0xe0 },
00074   { 0xc1, 0xe1 },
00075   { 0xc2, 0xe2 },
00076   { 0xc3, 0xe3 },
00077   { 0xc4, 0xe4 },
00078   { 0xc5, 0xe5 },
00079   { 0xc6, 0xe6 },
00080   { 0xc7, 0xe7 },
00081   { 0xc8, 0xe8 },
00082   { 0xc9, 0xe9 },
00083   { 0xca, 0xea },
00084   { 0xcb, 0xeb },
00085   { 0xcc, 0xec },
00086   { 0xcd, 0xed },
00087   { 0xce, 0xee },
00088   { 0xcf, 0xef },
00089 
00090   { 0xd0, 0xf0 },
00091   { 0xd1, 0xf1 },
00092   { 0xd2, 0xf2 },
00093   { 0xd3, 0xf3 },
00094   { 0xd4, 0xf4 },
00095   { 0xd5, 0xf5 },
00096   { 0xd6, 0xf6 },
00097   { 0xd8, 0xf8 },
00098   { 0xd9, 0xf9 },
00099   { 0xda, 0xfa },
00100   { 0xdb, 0xfb },
00101   { 0xdc, 0xfc },
00102   { 0xdd, 0xfd },
00103   { 0xde, 0xfe }
00104 };
00105 
00106 static int
00107 apply_all_case_fold(OnigCaseFoldType flag,
00108                     OnigApplyAllCaseFoldFunc f, void* arg,
00109                     OnigEncoding enc ARG_UNUSED)
00110 {
00111   return onigenc_apply_all_case_fold_with_map(
00112             numberof(CaseFoldMap), CaseFoldMap, 1,
00113             flag, f, arg);
00114 }
00115 
00116 static int
00117 get_case_fold_codes_by_str(OnigCaseFoldType flag,
00118                            const OnigUChar* p, const OnigUChar* end,
00119                            OnigCaseFoldCodeItem items[],
00120                            OnigEncoding enc ARG_UNUSED)
00121 {
00122   if (0x41 <= *p && *p <= 0x5a) {
00123     items[0].byte_len = 1;
00124     items[0].code_len = 1;
00125     items[0].code[0] = (OnigCodePoint )(*p + 0x20);
00126     if (*p == 0x53 && end > p + 1
00127         && (*(p+1) == 0x53 || *(p+1) == 0x73)) { /* SS */
00128       items[1].byte_len = 2;
00129       items[1].code_len = 1;
00130       items[1].code[0] = (OnigCodePoint )0xdf;
00131       return 2;
00132     }
00133     else
00134       return 1;
00135   }
00136   else if (0x61 <= *p && *p <= 0x7a) {
00137     items[0].byte_len = 1;
00138     items[0].code_len = 1;
00139     items[0].code[0] = (OnigCodePoint )(*p - 0x20);
00140     if (*p == 0x73 && end > p + 1
00141         && (*(p+1) == 0x73 || *(p+1) == 0x53)) { /* ss */
00142       items[1].byte_len = 2;
00143       items[1].code_len = 1;
00144       items[1].code[0] = (OnigCodePoint )0xdf;
00145       return 2;
00146     }
00147     else
00148       return 1;
00149   }
00150   else if (0xc0 <= *p && *p <= 0xcf) {
00151     items[0].byte_len = 1;
00152     items[0].code_len = 1;
00153     items[0].code[0] = (OnigCodePoint )(*p + 0x20);
00154     return 1;
00155   }
00156   else if (0xd0 <= *p && *p <= 0xdf) {
00157     if (*p == 0xdf) {
00158       items[0].byte_len = 1;
00159       items[0].code_len = 2;
00160       items[0].code[0] = (OnigCodePoint )'s';
00161       items[0].code[1] = (OnigCodePoint )'s';
00162 
00163       items[1].byte_len = 1;
00164       items[1].code_len = 2;
00165       items[1].code[0] = (OnigCodePoint )'S';
00166       items[1].code[1] = (OnigCodePoint )'S';
00167 
00168       items[2].byte_len = 1;
00169       items[2].code_len = 2;
00170       items[2].code[0] = (OnigCodePoint )'s';
00171       items[2].code[1] = (OnigCodePoint )'S';
00172 
00173       items[3].byte_len = 1;
00174       items[3].code_len = 2;
00175       items[3].code[0] = (OnigCodePoint )'S';
00176       items[3].code[1] = (OnigCodePoint )'s';
00177 
00178       return 4;
00179     }
00180     else if (*p != 0xd7) {
00181       items[0].byte_len = 1;
00182       items[0].code_len = 1;
00183       items[0].code[0] = (OnigCodePoint )(*p + 0x20);
00184       return 1;
00185     }
00186   }
00187   else if (0xe0 <= *p && *p <= 0xef) {
00188     items[0].byte_len = 1;
00189     items[0].code_len = 1;
00190     items[0].code[0] = (OnigCodePoint )(*p - 0x20);
00191     return 1;
00192   }
00193   else if (0xf0 <= *p && *p <= 0xfe) {
00194     if (*p != 0xf7) {
00195       items[0].byte_len = 1;
00196       items[0].code_len = 1;
00197       items[0].code[0] = (OnigCodePoint )(*p - 0x20);
00198       return 1;
00199     }
00200   }
00201 
00202   return 0;
00203 }
00204 
00205 static int
00206 mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED,
00207               UChar* lower, OnigEncoding enc ARG_UNUSED)
00208 {
00209   const UChar* p = *pp;
00210 
00211   if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
00212     *lower++ = 's';
00213     *lower   = 's';
00214     (*pp)++;
00215     return 2;
00216   }
00217 
00218   *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p);
00219   (*pp)++;
00220   return 1;
00221 }
00222 
#if 0
/*
 * Compiled out; kept for reference only.
 *
 * Consumes one byte from *pp and returns TRUE when that byte has a case
 * variant: 0xdf under INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR, or any byte
 * flagged upper or lower in EncISO_8859_1_CtypeTable, except lower-case
 * entries in 0xaa-0xba.
 */
static int
is_mbc_ambiguous(OnigCaseFoldType flag,
                 const UChar** pp, const UChar* end)
{
  int v;
  const UChar* p = *pp;

  if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
    (*pp)++;
    return TRUE;
  }

  (*pp)++;
  v = (EncISO_8859_1_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));
  /*
   * This test used `|`, which is true for any nonzero BIT_CTYPE_LOWER and
   * made the final return unreachable; `&` tests the lower-case bit.
   */
  if ((v & BIT_CTYPE_LOWER) != 0) {
    /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
    if (*p >= 0xaa && *p <= 0xba)
      return FALSE;
    else
      return TRUE;
  }

  /* Not lower case: ambiguous only if the upper-case bit survived the mask. */
  return (v != 0 ? TRUE : FALSE);
}
#endif
00249 
00250 static int
00251 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
00252 {
00253   if (code < 256)
00254     return ENC_IS_ISO_8859_1_CTYPE(code, ctype);
00255   else
00256     return FALSE;
00257 }
00258 
00259 OnigEncodingDefine(iso_8859_1, ISO_8859_1) = {
00260   onigenc_single_byte_mbc_enc_len,
00261   "ISO-8859-1",  /* name */
00262   1,             /* max enc length */
00263   1,             /* min enc length */
00264   onigenc_is_mbc_newline_0x0a,
00265   onigenc_single_byte_mbc_to_code,
00266   onigenc_single_byte_code_to_mbclen,
00267   onigenc_single_byte_code_to_mbc,
00268   mbc_case_fold,
00269   apply_all_case_fold,
00270   get_case_fold_codes_by_str,
00271   onigenc_minimum_property_name_to_ctype,
00272   is_code_ctype,
00273   onigenc_not_support_get_ctype_code_range,
00274   onigenc_single_byte_left_adjust_char_head,
00275   onigenc_always_true_is_allowed_reverse_match
00276 };
00277 ENC_ALIAS("ISO8859-1", "ISO-8859-1")
00278 
00279 /*
00280  * Name: windows-1252
00281  * MIBenum: 2252
00282  * Link: http://www.iana.org/assignments/character-sets
00283  * Link: http://www.microsoft.com/globaldev/reference/sbcs/1252.mspx
00284  * Link: http://en.wikipedia.org/wiki/Windows-1252
00285  */
00286 ENC_REPLICATE("Windows-1252", "ISO-8859-1")
00287 ENC_ALIAS("CP1252", "Windows-1252")
00288