Ruby 1.9.3p327 (2012-11-10 revision 37606)
enc/euc_jp.c
Go to the documentation of this file.
00001 /**********************************************************************
00002   euc_jp.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions
00010  * are met:
00011  * 1. Redistributions of source code must retain the above copyright
00012  *    notice, this list of conditions and the following disclaimer.
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in the
00015  *    documentation and/or other materials provided with the distribution.
00016  *
00017  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00018  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00020  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00021  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00022  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00023  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00024  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00025  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00026  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00027  * SUCH DAMAGE.
00028  */
00029 
00030 #include "regint.h"
00031 
00032 
/* Nonzero when byte c lies OUTSIDE the range 0xa1..0xfe.  Subtracting 0xa1
 * and casting to UChar makes values below 0xa1 wrap around to large numbers,
 * so one comparison against (0xfe - 0xa1) checks both bounds at once.
 * NOTE(review): relies on UChar being an unsigned 8-bit type -- confirm in
 * regint.h. */
00033 #define eucjp_islead(c)    ((UChar )((c) - 0xa1) > 0xfe - 0xa1)
00034 
/* Expected total character length, indexed by the first byte (256 entries,
 * one row of 16 per high nibble):
 *   0x8e        -> 2
 *   0x8f        -> 3
 *   0xa1..0xfe  -> 2
 *   all others  -> 1
 * mbc_enc_len() uses it to compute how many bytes are still missing when the
 * input ends in the middle of a character. */
00035 static const int EncLen_EUCJP[] = {
00036   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00037   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00038   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00039   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00040   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00041   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00042   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00043   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00044   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
00045   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00046   1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00047   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00048   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00049   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00050   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00051   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
00052 };
00053 
/* States of the byte-sequence validator used by mbc_enc_len().
 * FAILURE and ACCEPT are negative so that a single `s < 0` test detects a
 * terminal state; S0..S2 are non-negative and double as row indices into
 * trans[]. */
00054 typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2 } state_t;
/* One-letter aliases, defined only for the table below and removed again
 * right after it, so the 16x16 grids stay readable. */
00055 #define A ACCEPT
00056 #define F FAILURE
/* trans[state][byte] gives the next state.
 *   S0: start.  0x00..0x7f accept as a one-byte character; 0x8e and
 *       0xa1..0xfe go to S1; 0x8f goes to S2; everything else fails.
 *   S1: one more byte expected.  0xa1..0xfe accepts, everything else fails.
 *   S2: two more bytes expected.  0xa1..0xfe goes to S1, everything else
 *       fails. */
00057 static const signed char trans[][0x100] = {
00058   { /* S0   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
00059     /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00060     /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00061     /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00062     /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00063     /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00064     /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00065     /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00066     /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00067     /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, 1, 2,
00068     /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00069     /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00070     /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00071     /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00072     /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00073     /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00074     /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
00075   },
00076   { /* S1   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
00077     /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00078     /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00079     /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00080     /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00081     /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00082     /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00083     /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00084     /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00085     /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00086     /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00087     /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00088     /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00089     /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00090     /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00091     /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00092     /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F
00093   },
00094   { /* S2   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
00095     /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00096     /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00097     /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00098     /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00099     /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00100     /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00101     /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00102     /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00103     /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00104     /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00105     /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00106     /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00107     /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00108     /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00109     /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00110     /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
00111   },
00112 
00113 };
00114 #undef A
00115 #undef F
00116 
00117 static int
00118 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00119 {
00120   int firstbyte = *p++;
00121   state_t s;
00122   s = trans[0][firstbyte];
00123   if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
00124                                   ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00125   if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-1);
00126   s = trans[s][*p++];
00127   if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
00128                                   ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00129   if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-2);
00130   s = trans[s][*p++];
00131   return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
00132                        ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00133 }
00134 
00135 static OnigCodePoint
00136 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
00137 {
00138   int c, i, len;
00139   OnigCodePoint n;
00140 
00141   len = enclen(enc, p, end);
00142   n = (OnigCodePoint )*p++;
00143   if (len == 1) return n;
00144 
00145   for (i = 1; i < len; i++) {
00146     if (p >= end) break;
00147     c = *p++;
00148     n <<= 8;  n += c;
00149   }
00150   return n;
00151 }
00152 
00153 static int
00154 code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
00155 {
00156   if (ONIGENC_IS_CODE_ASCII(code)) return 1;
00157   else if (code > 0xffffff)
00158       return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
00159   else if (code & 0x800000) return 3;
00160   else if (code & 0x8000) return 2;
00161   else
00162     return ONIGERR_INVALID_CODE_POINT_VALUE;
00163 }
00164 
/* Compiled out by `#if 0`; kept for reference only.
 * Returns the most significant non-zero byte among the low 24 bits of code,
 * or code itself when bits 8..23 are all zero. */
00165 #if 0
00166 static int
00167 code_to_mbc_first(OnigCodePoint code)
00168 {
00169   int first;
00170 
00171   if ((code & 0xff0000) != 0) {
00172     first = (code >> 16) & 0xff;
00173   }
00174   else if ((code & 0xff00) != 0) {
00175     first = (code >> 8) & 0xff;
00176   }
00177   else {
00178     return (int )code;
00179   }
00180   return first;
00181 }
00182 #endif
00183 
00184 static int
00185 code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
00186 {
00187   UChar *p = buf;
00188 
00189   if ((code & 0xff0000) != 0) *p++ = (UChar )(((code >> 16) & 0xff));
00190   if ((code &   0xff00) != 0) *p++ = (UChar )(((code >>  8) & 0xff));
00191   *p++ = (UChar )(code & 0xff);
00192 
00193 #if 1
00194   if (enclen(enc, buf, p) != (p - buf))
00195     return ONIGERR_INVALID_CODE_POINT_VALUE;
00196 #endif
00197   return (int)(p - buf);
00198 }
00199 
00200 static int
00201 mbc_case_fold(OnigCaseFoldType flag,
00202               const UChar** pp, const UChar* end, UChar* lower,
00203               OnigEncoding enc)
00204 {
00205   int len;
00206   const UChar* p = *pp;
00207 
00208   if (ONIGENC_IS_MBC_ASCII(p)) {
00209     *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00210     (*pp)++;
00211     return 1;
00212   }
00213   else {
00214     int i;
00215 
00216     len = enclen(enc, p, end);
00217     for (i = 0; i < len; i++) {
00218       *lower++ = *p++;
00219     }
00220     (*pp) += len;
00221     return len; /* return byte length of converted char to lower */
00222   }
00223 }
00224 
00225 static UChar*
00226 left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
00227 {
00228   /* In this encoding
00229      mb-trail bytes doesn't mix with single bytes.
00230   */
00231   const UChar *p;
00232   int len;
00233 
00234   if (s <= start) return (UChar* )s;
00235   p = s;
00236 
00237   while (!eucjp_islead(*p) && p > start) p--;
00238   len = enclen(enc, p, end);
00239   if (p + len > s) return (UChar* )p;
00240   p += len;
00241   return (UChar* )(p + ((s - p) & ~1));
00242 }
00243 
00244 static int
00245 is_allowed_reverse_match(const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED)
00246 {
00247   const UChar c = *s;
00248   if (c <= 0x7e || c == 0x8e || c == 0x8f)
00249     return TRUE;
00250   else
00251     return FALSE;
00252 }
00253 
00254 
/* File-local state for the extra (non-standard) character properties.
 *   PropertyInited    - set to 1 by init_property_list() once it succeeds.
 *   PropertyList      - array of code-range tables, indexed by
 *                       ctype - (ONIGENC_MAX_STD_CTYPE + 1).
 *   PropertyListNum   - number of valid entries in PropertyList.
 *   PropertyNameTable - lower-cased property name -> ctype lookup table.
 * NOTE(review): PropertyListSize is not referenced directly in this file;
 * presumably the PROPERTY_LIST_* macros use it (and fill in the others) --
 * confirm against their definitions. */
00255 static int PropertyInited = 0;
00256 static const OnigCodePoint** PropertyList;
00257 static int PropertyListNum;
00258 static int PropertyListSize;
00259 static hash_table_type* PropertyNameTable;
00260 
/* Code ranges registered under the property name "hiragana".
 * Layout: a leading count followed by that many (first, last) pairs of
 * EUC-JP code points -- presumably inclusive; confirm against
 * onig_is_in_code_range(). */
00261 static const OnigCodePoint CR_Hiragana[] = {
00262   1,
00263   0xa4a1, 0xa4f3
00264 }; /* CR_Hiragana */
00265 
/* Code ranges registered under the property name "katakana".
 * Layout: a leading count (3) followed by that many (first, last) pairs of
 * EUC-JP code points -- presumably inclusive; confirm against
 * onig_is_in_code_range(). */
00266 static const OnigCodePoint CR_Katakana[] = {
00267   3,
00268   0xa5a1, 0xa5f6,
00269   0xaaa6, 0xaaaf,
00270   0xaab1, 0xaadd
00271 }; /* CR_Katakana */
00272 
/* Register the "hiragana" and "katakana" properties and mark the property
 * list as initialized.  Returns r.
 * NOTE(review): r is never assigned in the visible code and the `end` label
 * has no visible goto; PROPERTY_LIST_ADD_PROP presumably assigns r and jumps
 * to `end` on failure (skipping the PropertyInited assignment) -- confirm
 * against the macro definition before restructuring this function. */
00273 static int
00274 init_property_list(void)
00275 {
00276   int r;
00277 
00278   PROPERTY_LIST_ADD_PROP("hiragana", CR_Hiragana);
00279   PROPERTY_LIST_ADD_PROP("katakana", CR_Katakana);
00280   PropertyInited = 1;
00281 
00282  end:
00283   return r;
00284 }
00285 
00286 static int
00287 property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
00288 {
00289   st_data_t ctype;
00290   UChar *s, *e;
00291 
00292   PROPERTY_LIST_INIT_CHECK;
00293 
00294   s = e = ALLOCA_N(UChar, end-p+1);
00295   for (; p < end; p++) {
00296     *e++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00297   }
00298 
00299   if (onig_st_lookup_strend(PropertyNameTable, s, e, &ctype) == 0) {
00300     return onigenc_minimum_property_name_to_ctype(enc, s, e);
00301   }
00302 
00303   return (int)ctype;
00304 }
00305 
00306 static int
00307 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
00308 {
00309   if (ctype <= ONIGENC_MAX_STD_CTYPE) {
00310     if (code < 128)
00311       return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
00312     else {
00313       if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
00314         return (code_to_mbclen(code, enc) > 1 ? TRUE : FALSE);
00315       }
00316     }
00317   }
00318   else {
00319     PROPERTY_LIST_INIT_CHECK;
00320 
00321     ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
00322     if (ctype >= (unsigned int )PropertyListNum)
00323       return ONIGERR_TYPE_BUG;
00324 
00325     return onig_is_in_code_range((UChar* )PropertyList[ctype], code);
00326   }
00327 
00328   return FALSE;
00329 }
00330 
00331 static int
00332 get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
00333                      const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
00334 {
00335   if (ctype <= ONIGENC_MAX_STD_CTYPE) {
00336     return ONIG_NO_SUPPORT_CONFIG;
00337   }
00338   else {
00339     *sb_out = 0x80;
00340 
00341     PROPERTY_LIST_INIT_CHECK;
00342 
00343     ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
00344     if (ctype >= (OnigCtype )PropertyListNum)
00345       return ONIGERR_TYPE_BUG;
00346 
00347     *ranges = PropertyList[ctype];
00348     return 0;
00349   }
00350 }
00351 
00352 
/* Encoding descriptor for EUC-JP.  It wires this file's static callbacks
 * together with shared onigenc_* helpers for newline detection and ASCII-only
 * case folding.  Entries are positional, so their order must match the
 * encoding structure declared elsewhere.
 * NOTE(review): the meaning of the trailing 0 is not visible in this file --
 * confirm against the structure definition. */
00353 OnigEncodingDefine(euc_jp, EUC_JP) = {
00354   mbc_enc_len,
00355   "EUC-JP",   /* name */
00356   3,          /* max enc length */
00357   1,          /* min enc length */
00358   onigenc_is_mbc_newline_0x0a,
00359   mbc_to_code,
00360   code_to_mbclen,
00361   code_to_mbc,
00362   mbc_case_fold,
00363   onigenc_ascii_apply_all_case_fold,
00364   onigenc_ascii_get_case_fold_codes_by_str,
00365   property_name_to_ctype,
00366   is_code_ctype,
00367   get_ctype_code_range,
00368   left_adjust_char_head,
00369   is_allowed_reverse_match,
00370   0
00371 };
/* Additional names under which this implementation is made available.
 * NOTE(review): ENC_ALIAS and ENC_REPLICATE are defined outside this file;
 * presumably ENC_ALIAS(alias, original) adds another name for an existing
 * encoding, while ENC_REPLICATE(name, original) declares a separate encoding
 * that reuses the original's implementation -- confirm before editing. */
00372 /*
00373  * Name: EUC-JP
00374  * MIBenum: 18
00375  * Link: http://www.iana.org/assignments/character-sets
00376  * Link: http://home.m05.itscom.net/numa/cde/sjis-euc/sjis-euc.html
00377  * Link: http://home.m05.itscom.net/numa/uocjleE.pdf
00378  */
00379 ENC_ALIAS("eucJP", "EUC-JP") /* UI-OSF Application Platform Profile for Japanese Environment Version 1.1 */
00380 
00381 /*
00382  * Name: eucJP-ms
00383  * Link: http://home.m05.itscom.net/numa/cde/ucs-conv/ucs-conv.html
00384  * Link: http://www2d.biglobe.ne.jp/~msyk/charcode/cp932/eucJP-ms.html
00385  * Link: http://ja.wikipedia.org/wiki/EUC-JP
00386  */
00387 ENC_REPLICATE("eucJP-ms", "EUC-JP") /* TOG/JVC CDE/Motif Technical WG */
00388 ENC_ALIAS("euc-jp-ms", "eucJP-ms")
00389 
00390 /*
00391  * Name: CP51932
00392  * MIBenum: 2108
00393  * Link: http://www.iana.org/assignments/charset-reg/CP51932
00394  * Link: http://search.cpan.org/src/NARUSE/Encode-EUCJPMS-0.07/ucm/cp51932.ucm
00395  * Link: http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932
00396  * Link: http://msyk.at.webry.info/200511/article_2.html
00397  */
00398 ENC_REPLICATE("CP51932", "EUC-JP")
00399