Ruby 1.9.3p327 (2012-11-10 revision 37606)
enc/shift_jis.c
Go to the documentation of this file.
00001 /**********************************************************************
00002   sjis.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions
00010  * are met:
00011  * 1. Redistributions of source code must retain the above copyright
00012  *    notice, this list of conditions and the following disclaimer.
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in the
00015  *    documentation and/or other materials provided with the distribution.
00016  *
00017  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00018  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00020  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00021  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00022  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00023  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00024  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00025  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00026  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00027  * SUCH DAMAGE.
00028  */
00029 
00030 #include "regint.h"
00031 
/*
 * EncLen_SJIS[b] is the character length (in bytes) announced by first
 * byte b: 2 for 0x81-0x9f and 0xe0-0xfc, 1 for every other value.
 * One row per high nibble, one column per low nibble.
 */
static const int EncLen_SJIS[] = {
  /* 0x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 1x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 2x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 3x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 4x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 6x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 8x */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 9x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* ax */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* bx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* cx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* dx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* ex */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* fx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
};
00050 
/*
 * SJIS_CAN_BE_TRAIL_TABLE[b] is 1 when byte b may appear as the second
 * byte of a two-byte character: 0x40-0x7e and 0x80-0xfc.  All other
 * entries are 0.  One row per high nibble.
 */
static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
  /* 0x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 1x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 2x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 3x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 4x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 6x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  /* 8x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 9x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* ax */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* bx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* cx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* dx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* ex */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* fx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
00069 
/* True when `byte` announces a two-byte character (EncLen_SJIS entry > 1). */
#define SJIS_ISMB_FIRST(byte)  (EncLen_SJIS[(byte)] > 1)
/* The SJIS_CAN_BE_TRAIL_TABLE flag for `byte` (1 = may be a second byte). */
#define SJIS_ISMB_TRAIL(byte)  (SJIS_CAN_BE_TRAIL_TABLE[(byte)])
00072 
/*
 * Byte-level state machine used by mbc_enc_len().
 *
 *   FAILURE  the byte cannot occur in this state
 *   ACCEPT   the byte completes a character
 *   S0       start state (looking at a first byte)
 *   S1       a lead byte has been consumed, a second byte is expected
 *
 * trans[state][byte] holds the next state, or ACCEPT / FAILURE.
 */
typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1 } state_t;

/* Short spellings so each table row fits on one line; undefined below. */
#define A ACCEPT
#define F FAILURE
static const signed char trans[][0x100] = {
  { /* S0: first byte.  Entries of 1 mean "go to S1". */
    /*          x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */
    /* 0x */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 1x */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 2x */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 3x */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 4x */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 5x */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 6x */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 7x */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 8x */    F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 9x */    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* ax */    F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* bx */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* cx */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* dx */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* ex */    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* fx */    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F, F, F
  },
  { /* S1: second byte.  Only 0x40-0x7e and 0x80-0xfc are accepted. */
    /*          x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */
    /* 0x */    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1x */    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2x */    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3x */    F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4x */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 5x */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 6x */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 7x */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F,
    /* 8x */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 9x */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* ax */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* bx */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* cx */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* dx */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* ex */    A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* fx */    A, A, A, A, A, A, A, A, A, A, A, A, A, F, F, F
  }
};
#undef A
#undef F
00116 
00117 static int
00118 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00119 {
00120   int firstbyte = *p++;
00121   state_t s;
00122   s = trans[0][firstbyte];
00123   if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
00124                                   ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00125   if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_SJIS[firstbyte]-1);
00126   s = trans[s][*p++];
00127   return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
00128                        ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00129 }
00130 
00131 static int
00132 code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
00133 {
00134   if (code < 256) {
00135     if (EncLen_SJIS[(int )code] == 1)
00136       return 1;
00137     else
00138       return ONIGERR_INVALID_CODE_POINT_VALUE;
00139   }
00140   else if (code <= 0xffff) {
00141     return 2;
00142   }
00143   else
00144     return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
00145 }
00146 
00147 static OnigCodePoint
00148 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
00149 {
00150   int c, i, len;
00151   OnigCodePoint n;
00152 
00153   len = enclen(enc, p, end);
00154   c = *p++;
00155   n = c;
00156   if (len == 1) return n;
00157 
00158   for (i = 1; i < len; i++) {
00159     if (p >= end) break;
00160     c = *p++;
00161     n <<= 8;  n += c;
00162   }
00163   return n;
00164 }
00165 
00166 static int
00167 code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
00168 {
00169   UChar *p = buf;
00170 
00171   if ((code & 0xff00) != 0) *p++ = (UChar )(((code >>  8) & 0xff));
00172   *p++ = (UChar )(code & 0xff);
00173 
00174 #if 0
00175   if (enclen(enc, buf) != (p - buf))
00176     return REGERR_INVALID_CODE_POINT_VALUE;
00177 #endif
00178   return (int)(p - buf);
00179 }
00180 
00181 static int
00182 mbc_case_fold(OnigCaseFoldType flag,
00183               const UChar** pp, const UChar* end, UChar* lower,
00184               OnigEncoding enc)
00185 {
00186   const UChar* p = *pp;
00187 
00188   if (ONIGENC_IS_MBC_ASCII(p)) {
00189     *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00190     (*pp)++;
00191     return 1;
00192   }
00193   else {
00194     int i;
00195     int len = enclen(enc, p, end);
00196 
00197     for (i = 0; i < len; i++) {
00198       *lower++ = *p++;
00199     }
00200     (*pp) += len;
00201     return len; /* return byte length of converted char to lower */
00202   }
00203 }
00204 
/*
 * A disabled (#if 0) is_mbc_ambiguous() wrapper used to live here.  It was
 * removed: its body referenced `enc`, which was not among its parameters, so
 * it could not have been re-enabled as written.
 */
00214 
/*
 * A disabled (#if 0) two-argument is_code_ctype() used to live here.  It was
 * removed: the three-argument is_code_ctype() defined later in this file has
 * the same name, and the old body called code_to_mbclen() with one argument
 * although that function takes two.
 */
00230 
00231 static UChar*
00232 left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
00233 {
00234   const UChar *p;
00235   int len;
00236 
00237   if (s <= start) return (UChar* )s;
00238   p = s;
00239 
00240   if (SJIS_ISMB_TRAIL(*p)) {
00241     while (p > start) {
00242       if (! SJIS_ISMB_FIRST(*--p)) {
00243         p++;
00244         break;
00245       }
00246     }
00247   }
00248   len = enclen(enc, p, end);
00249   if (p + len > s) return (UChar* )p;
00250   p += len;
00251   return (UChar* )(p + ((s - p) & ~1));
00252 }
00253 
00254 static int
00255 is_allowed_reverse_match(const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED)
00256 {
00257   const UChar c = *s;
00258   return (SJIS_ISMB_TRAIL(c) ? FALSE : TRUE);
00259 }
00260 
00261 
00262 static int PropertyInited = 0;
00263 static const OnigCodePoint** PropertyList;
00264 static int PropertyListNum;
00265 static int PropertyListSize;
00266 static hash_table_type* PropertyNameTable;
00267 
00268 static const OnigCodePoint CR_Hiragana[] = {
00269   1,
00270   0x829f, 0x82f1
00271 }; /* CR_Hiragana */
00272 
00273 static const OnigCodePoint CR_Katakana[] = {
00274   4,
00275   0x00a6, 0x00af,
00276   0x00b1, 0x00dd,
00277   0x8340, 0x837e,
00278   0x8380, 0x8396,
00279 }; /* CR_Katakana */
00280 
00281 static int
00282 init_property_list(void)
00283 {
00284   int r;
00285 
00286   PROPERTY_LIST_ADD_PROP("hiragana", CR_Hiragana);
00287   PROPERTY_LIST_ADD_PROP("katakana", CR_Katakana);
00288   PropertyInited = 1;
00289 
00290  end:
00291   return r;
00292 }
00293 
00294 static int
00295 property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
00296 {
00297   hash_data_type ctype;
00298   UChar *s, *e;
00299 
00300   PROPERTY_LIST_INIT_CHECK;
00301 
00302   s = e = ALLOCA_N(UChar, end-p+1);
00303   for (; p < end; p++) {
00304     *e++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00305   }
00306 
00307   if (onig_st_lookup_strend(PropertyNameTable, s, e, &ctype) == 0) {
00308     return onigenc_minimum_property_name_to_ctype(enc, s, e);
00309   }
00310 
00311   return (int)ctype;
00312 }
00313 
00314 static int
00315 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc)
00316 {
00317   if (ctype <= ONIGENC_MAX_STD_CTYPE) {
00318     if (code < 128)
00319       return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
00320     else {
00321       if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
00322         return TRUE;
00323       }
00324     }
00325   }
00326   else {
00327     PROPERTY_LIST_INIT_CHECK;
00328 
00329     ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
00330     if (ctype >= (unsigned int )PropertyListNum)
00331       return ONIGERR_TYPE_BUG;
00332 
00333     return onig_is_in_code_range((UChar* )PropertyList[ctype], code);
00334   }
00335 
00336   return FALSE;
00337 }
00338 
00339 static int
00340 get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
00341                      const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
00342 {
00343   if (ctype <= ONIGENC_MAX_STD_CTYPE) {
00344     return ONIG_NO_SUPPORT_CONFIG;
00345   }
00346   else {
00347     *sb_out = 0x80;
00348 
00349     PROPERTY_LIST_INIT_CHECK;
00350 
00351     ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
00352     if (ctype >= (OnigCtype )PropertyListNum)
00353       return ONIGERR_TYPE_BUG;
00354 
00355     *ranges = PropertyList[ctype];
00356     return 0;
00357   }
00358 }
00359 
00360 OnigEncodingDefine(shift_jis, Shift_JIS) = {
00361   mbc_enc_len,
00362   "Shift_JIS",   /* name */
00363   2,             /* max byte length */
00364   1,             /* min byte length */
00365   onigenc_is_mbc_newline_0x0a,
00366   mbc_to_code,
00367   code_to_mbclen,
00368   code_to_mbc,
00369   mbc_case_fold,
00370   onigenc_ascii_apply_all_case_fold,
00371   onigenc_ascii_get_case_fold_codes_by_str,
00372   property_name_to_ctype,
00373   is_code_ctype,
00374   get_ctype_code_range,
00375   left_adjust_char_head,
00376   is_allowed_reverse_match,
00377   0
00378 };
00379 /*
00380  * Name: Shift_JIS
00381  * MIBenum: 17
00382  * Link: http://www.iana.org/assignments/character-sets
00383  * Link: http://ja.wikipedia.org/wiki/Shift_JIS
00384  */
00385 
00386 /*
00387  * Name: Windows-31J
00388  * MIBenum: 2024
00389  * Link: http://www.iana.org/assignments/character-sets
00390  * Link: http://www.microsoft.com/globaldev/reference/dbcs/932.mspx
00391  * Link: http://ja.wikipedia.org/wiki/Windows-31J
00392  * Link: http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-932-2000.ucm
00393  *
00394  * Windows Standard Character Set and its mapping to Unicode by Microsoft.
00395  * Since 1.9.3, SJIS is the alias of Windows-31J because its character
00396  * set is usually this one even if its mapping may differ.
00397  */
00398 ENC_REPLICATE("Windows-31J", "Shift_JIS")
00399 ENC_ALIAS("CP932", "Windows-31J")
00400 ENC_ALIAS("csWindows31J", "Windows-31J") /* IANA.  IE6 don't accept Windows-31J but csWindows31J. */
00401 ENC_ALIAS("SJIS", "Windows-31J")
00402 
00403 /*
00404  * Name: PCK
00405  * Link: http://download.oracle.com/docs/cd/E19253-01/819-0606/x-2chn0/index.html
00406  * Link: http://download.oracle.com/docs/cd/E19253-01/819-0606/appb-pckwarn-1/index.html
00407  *
00408  * Solaris's SJIS variant. Its set is Windows Standard Character Set; it
00409  * consists JIS X 0201 Latin (US-ASCII), JIS X 0201 Katakana, JIS X 0208, NEC
00410  * special characters, NEC-selected IBM extended characters, and IBM extended
00411  * characters. Solaris's iconv seems to use SJIS-open.
00412  */
00413 ENC_ALIAS("PCK", "Windows-31J")
00414 
00415 /*
00416  * Name: MacJapanese
00417  * Link: http://unicode.org/Public/MAPPINGS/VENDORS/APPLE/JAPANESE.TXT
00418  * Link: http://ja.wikipedia.org/wiki/MacJapanese
00419  */
00420 ENC_REPLICATE("MacJapanese", "Shift_JIS")
00421 ENC_ALIAS("MacJapan", "MacJapanese")
00422