/* Ruby 1.9.3p327 (2012-11-10 revision 37606) */
00001 /********************************************************************** 00002 gb18030.c - Oniguruma (regular expression library) 00003 **********************************************************************/ 00004 /*- 00005 * Copyright (c) 2005-2007 KUBO Takehiro <kubo AT jiubao DOT org> 00006 * K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 00007 * All rights reserved. 00008 * 00009 * Redistribution and use in source and binary forms, with or without 00010 * modification, are permitted provided that the following conditions 00011 * are met: 00012 * 1. Redistributions of source code must retain the above copyright 00013 * notice, this list of conditions and the following disclaimer. 00014 * 2. Redistributions in binary form must reproduce the above copyright 00015 * notice, this list of conditions and the following disclaimer in the 00016 * documentation and/or other materials provided with the distribution. 00017 * 00018 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 00019 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00020 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00021 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 00022 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00023 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 00024 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 00025 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00026 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 00027 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00028 * SUCH DAMAGE. 
00029 */ 00030 00031 #include "regenc.h" 00032 00033 #if 1 00034 #define DEBUG_GB18030(arg) 00035 #else 00036 #define DEBUG_GB18030(arg) printf arg 00037 #endif 00038 00039 enum { 00040 C1, /* one-byte char */ 00041 C2, /* one-byte or second of two-byte char */ 00042 C4, /* one-byte or second or fourth of four-byte char */ 00043 CM /* first of two- or four-byte char or second of two-byte char */ 00044 }; 00045 00046 static const char GB18030_MAP[] = { 00047 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, 00048 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, 00049 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, 00050 C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1, 00051 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, 00052 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, 00053 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, 00054 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1, 00055 C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, 00056 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, 00057 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, 00058 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, 00059 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, 00060 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, 00061 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, 00062 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1 00063 }; 00064 00065 typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2, S3 } state_t; 00066 #define A ACCEPT 00067 #define F FAILURE 00068 static const signed char trans[][0x100] = { 00069 { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00070 /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00071 /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00072 /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 
00073 /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00074 /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00075 /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00076 /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00077 /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00078 /* 8 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00079 /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00080 /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00081 /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00082 /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00083 /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00084 /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00085 /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F 00086 }, 00087 { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00088 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00089 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00090 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00091 /* 3 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, F, F, F, F, F, F, 00092 /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00093 /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00094 /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00095 /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F, 00096 /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00097 /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00098 /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00099 /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00100 /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00101 /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00102 /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 00103 /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F 00104 }, 00105 { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00106 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00107 /* 1 */ F, F, F, F, 
F, F, F, F, F, F, F, F, F, F, F, F, 00108 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00109 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00110 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00111 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00112 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00113 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00114 /* 8 */ F, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 00115 /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 00116 /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 00117 /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 00118 /* c */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 00119 /* d */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 00120 /* e */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 00121 /* f */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, F 00122 }, 00123 { /* S3 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 00124 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00125 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00126 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00127 /* 3 */ A, A, A, A, A, A, A, A, A, A, F, F, F, F, F, F, 00128 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00129 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00130 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00131 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00132 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00133 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00134 /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00135 /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00136 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00137 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00138 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 00139 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 00140 } 00141 }; 00142 #undef A 00143 #undef F 
00144 00145 static int 00146 gb18030_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) 00147 { 00148 int firstbyte = *p++; 00149 state_t s = trans[0][firstbyte]; 00150 #define RETURN(n) \ 00151 return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) : \ 00152 ONIGENC_CONSTRUCT_MBCLEN_INVALID() 00153 if (s < 0) RETURN(1); 00154 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(2-1); 00155 s = trans[s][*p++]; 00156 if (s < 0) RETURN(2); 00157 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-2); 00158 s = trans[s][*p++]; 00159 if (s < 0) RETURN(3); 00160 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-3); 00161 s = trans[s][*p++]; 00162 RETURN(4); 00163 #undef RETURN 00164 } 00165 00166 static OnigCodePoint 00167 gb18030_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) 00168 { 00169 int c, i, len; 00170 OnigCodePoint n; 00171 00172 len = enclen(enc, p, end); 00173 n = (OnigCodePoint )(*p++); 00174 if (len == 1) return n; 00175 00176 for (i = 1; i < len; i++) { 00177 if (p >= end) break; 00178 c = *p++; 00179 n <<= 8; n += c; 00180 } 00181 return n; 00182 } 00183 00184 static int 00185 gb18030_code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc) 00186 { 00187 return onigenc_mb4_code_to_mbc(enc, code, buf); 00188 } 00189 00190 static int 00191 gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end, 00192 UChar* lower, OnigEncoding enc) 00193 { 00194 return onigenc_mbn_mbc_case_fold(enc, flag, 00195 pp, end, lower); 00196 } 00197 00198 #if 0 00199 static int 00200 gb18030_is_mbc_ambiguous(OnigCaseFoldType flag, 00201 const UChar** pp, const UChar* end, OnigEncoding enc) 00202 { 00203 return onigenc_mbn_is_mbc_ambiguous(enc, flag, pp, end); 00204 } 00205 #endif 00206 00207 static int 00208 gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc) 00209 { 00210 return onigenc_mb4_is_code_ctype(enc, code, ctype); 00211 } 00212 00213 enum state { 00214 S_START, 
00215 S_one_C2, 00216 S_one_C4, 00217 S_one_CM, 00218 00219 S_odd_CM_one_CX, 00220 S_even_CM_one_CX, 00221 00222 /* CMC4 : pair of "CM C4" */ 00223 S_one_CMC4, 00224 S_odd_CMC4, 00225 S_one_C4_odd_CMC4, 00226 S_even_CMC4, 00227 S_one_C4_even_CMC4, 00228 00229 S_odd_CM_odd_CMC4, 00230 S_even_CM_odd_CMC4, 00231 00232 S_odd_CM_even_CMC4, 00233 S_even_CM_even_CMC4, 00234 00235 /* C4CM : pair of "C4 CM" */ 00236 S_odd_C4CM, 00237 S_one_CM_odd_C4CM, 00238 S_even_C4CM, 00239 S_one_CM_even_C4CM, 00240 00241 S_even_CM_odd_C4CM, 00242 S_odd_CM_odd_C4CM, 00243 S_even_CM_even_C4CM, 00244 S_odd_CM_even_C4CM 00245 }; 00246 00247 static UChar* 00248 gb18030_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc) 00249 { 00250 const UChar *p; 00251 enum state state = S_START; 00252 00253 DEBUG_GB18030(("----------------\n")); 00254 for (p = s; p >= start; p--) { 00255 DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p)); 00256 switch (state) { 00257 case S_START: 00258 switch (GB18030_MAP[*p]) { 00259 case C1: 00260 return (UChar *)s; 00261 case C2: 00262 state = S_one_C2; /* C2 */ 00263 break; 00264 case C4: 00265 state = S_one_C4; /* C4 */ 00266 break; 00267 case CM: 00268 state = S_one_CM; /* CM */ 00269 break; 00270 } 00271 break; 00272 case S_one_C2: /* C2 */ 00273 switch (GB18030_MAP[*p]) { 00274 case C1: 00275 case C2: 00276 case C4: 00277 return (UChar *)s; 00278 case CM: 00279 state = S_odd_CM_one_CX; /* CM C2 */ 00280 break; 00281 } 00282 break; 00283 case S_one_C4: /* C4 */ 00284 switch (GB18030_MAP[*p]) { 00285 case C1: 00286 case C2: 00287 case C4: 00288 return (UChar *)s; 00289 case CM: 00290 state = S_one_CMC4; 00291 break; 00292 } 00293 break; 00294 case S_one_CM: /* CM */ 00295 switch (GB18030_MAP[*p]) { 00296 case C1: 00297 case C2: 00298 return (UChar *)s; 00299 case C4: 00300 state = S_odd_C4CM; 00301 break; 00302 case CM: 00303 state = S_odd_CM_one_CX; /* CM CM */ 00304 break; 00305 } 00306 break; 00307 00308 case 
S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */ 00309 switch (GB18030_MAP[*p]) { 00310 case C1: 00311 case C2: 00312 case C4: 00313 return (UChar *)(s - 1); 00314 case CM: 00315 state = S_even_CM_one_CX; 00316 break; 00317 } 00318 break; 00319 case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */ 00320 switch (GB18030_MAP[*p]) { 00321 case C1: 00322 case C2: 00323 case C4: 00324 return (UChar *)s; 00325 case CM: 00326 state = S_odd_CM_one_CX; 00327 break; 00328 } 00329 break; 00330 00331 case S_one_CMC4: /* CM C4 */ 00332 switch (GB18030_MAP[*p]) { 00333 case C1: 00334 case C2: 00335 return (UChar *)(s - 1); 00336 case C4: 00337 state = S_one_C4_odd_CMC4; /* C4 CM C4 */ 00338 break; 00339 case CM: 00340 state = S_even_CM_one_CX; /* CM CM C4 */ 00341 break; 00342 } 00343 break; 00344 case S_odd_CMC4: /* CM C4 CM C4 CM C4 */ 00345 switch (GB18030_MAP[*p]) { 00346 case C1: 00347 case C2: 00348 return (UChar *)(s - 1); 00349 case C4: 00350 state = S_one_C4_odd_CMC4; 00351 break; 00352 case CM: 00353 state = S_odd_CM_odd_CMC4; 00354 break; 00355 } 00356 break; 00357 case S_one_C4_odd_CMC4: /* C4 CM C4 */ 00358 switch (GB18030_MAP[*p]) { 00359 case C1: 00360 case C2: 00361 case C4: 00362 return (UChar *)(s - 1); 00363 case CM: 00364 state = S_even_CMC4; /* CM C4 CM C4 */ 00365 break; 00366 } 00367 break; 00368 case S_even_CMC4: /* CM C4 CM C4 */ 00369 switch (GB18030_MAP[*p]) { 00370 case C1: 00371 case C2: 00372 return (UChar *)(s - 3); 00373 case C4: 00374 state = S_one_C4_even_CMC4; 00375 break; 00376 case CM: 00377 state = S_odd_CM_even_CMC4; 00378 break; 00379 } 00380 break; 00381 case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */ 00382 switch (GB18030_MAP[*p]) { 00383 case C1: 00384 case C2: 00385 case C4: 00386 return (UChar *)(s - 3); 00387 case CM: 00388 state = S_odd_CMC4; 00389 break; 00390 } 00391 break; 00392 00393 case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */ 00394 switch (GB18030_MAP[*p]) { 00395 case C1: 00396 case C2: 00397 case 
C4: 00398 return (UChar *)(s - 3); 00399 case CM: 00400 state = S_even_CM_odd_CMC4; 00401 break; 00402 } 00403 break; 00404 case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */ 00405 switch (GB18030_MAP[*p]) { 00406 case C1: 00407 case C2: 00408 case C4: 00409 return (UChar *)(s - 1); 00410 case CM: 00411 state = S_odd_CM_odd_CMC4; 00412 break; 00413 } 00414 break; 00415 00416 case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */ 00417 switch (GB18030_MAP[*p]) { 00418 case C1: 00419 case C2: 00420 case C4: 00421 return (UChar *)(s - 1); 00422 case CM: 00423 state = S_even_CM_even_CMC4; 00424 break; 00425 } 00426 break; 00427 case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */ 00428 switch (GB18030_MAP[*p]) { 00429 case C1: 00430 case C2: 00431 case C4: 00432 return (UChar *)(s - 3); 00433 case CM: 00434 state = S_odd_CM_even_CMC4; 00435 break; 00436 } 00437 break; 00438 00439 case S_odd_C4CM: /* C4 CM */ /* C4 CM C4 CM C4 CM*/ 00440 switch (GB18030_MAP[*p]) { 00441 case C1: 00442 case C2: 00443 case C4: 00444 return (UChar *)s; 00445 case CM: 00446 state = S_one_CM_odd_C4CM; /* CM C4 CM */ 00447 break; 00448 } 00449 break; 00450 case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */ 00451 switch (GB18030_MAP[*p]) { 00452 case C1: 00453 case C2: 00454 return (UChar *)(s - 2); /* |CM C4 CM */ 00455 case C4: 00456 state = S_even_C4CM; 00457 break; 00458 case CM: 00459 state = S_even_CM_odd_C4CM; 00460 break; 00461 } 00462 break; 00463 case S_even_C4CM: /* C4 CM C4 CM */ 00464 switch (GB18030_MAP[*p]) { 00465 case C1: 00466 case C2: 00467 case C4: 00468 return (UChar *)(s - 2); /* C4|CM C4 CM */ 00469 case CM: 00470 state = S_one_CM_even_C4CM; 00471 break; 00472 } 00473 break; 00474 case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */ 00475 switch (GB18030_MAP[*p]) { 00476 case C1: 00477 case C2: 00478 return (UChar *)(s - 0); /*|CM C4 CM C4|CM */ 00479 case C4: 00480 state = S_odd_C4CM; 00481 break; 00482 case CM: 00483 state = S_even_CM_even_C4CM; 00484 break; 00485 } 
00486 break; 00487 00488 case S_even_CM_odd_C4CM: /* CM CM C4 CM */ 00489 switch (GB18030_MAP[*p]) { 00490 case C1: 00491 case C2: 00492 case C4: 00493 return (UChar *)(s - 0); /* |CM CM|C4|CM */ 00494 case CM: 00495 state = S_odd_CM_odd_C4CM; 00496 break; 00497 } 00498 break; 00499 case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */ 00500 switch (GB18030_MAP[*p]) { 00501 case C1: 00502 case C2: 00503 case C4: 00504 return (UChar *)(s - 2); /* |CM CM|CM C4 CM */ 00505 case CM: 00506 state = S_even_CM_odd_C4CM; 00507 break; 00508 } 00509 break; 00510 00511 case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */ 00512 switch (GB18030_MAP[*p]) { 00513 case C1: 00514 case C2: 00515 case C4: 00516 return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */ 00517 case CM: 00518 state = S_odd_CM_even_C4CM; 00519 break; 00520 } 00521 break; 00522 case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */ 00523 switch (GB18030_MAP[*p]) { 00524 case C1: 00525 case C2: 00526 case C4: 00527 return (UChar *)(s - 0); /* |CM CM|CM C4 CM C4|CM */ 00528 case CM: 00529 state = S_even_CM_even_C4CM; 00530 break; 00531 } 00532 break; 00533 } 00534 } 00535 00536 DEBUG_GB18030(("state %d\n", state)); 00537 switch (state) { 00538 case S_START: return (UChar *)(s - 0); 00539 case S_one_C2: return (UChar *)(s - 0); 00540 case S_one_C4: return (UChar *)(s - 0); 00541 case S_one_CM: return (UChar *)(s - 0); 00542 00543 case S_odd_CM_one_CX: return (UChar *)(s - 1); 00544 case S_even_CM_one_CX: return (UChar *)(s - 0); 00545 00546 case S_one_CMC4: return (UChar *)(s - 1); 00547 case S_odd_CMC4: return (UChar *)(s - 1); 00548 case S_one_C4_odd_CMC4: return (UChar *)(s - 1); 00549 case S_even_CMC4: return (UChar *)(s - 3); 00550 case S_one_C4_even_CMC4: return (UChar *)(s - 3); 00551 00552 case S_odd_CM_odd_CMC4: return (UChar *)(s - 3); 00553 case S_even_CM_odd_CMC4: return (UChar *)(s - 1); 00554 00555 case S_odd_CM_even_CMC4: return (UChar *)(s - 1); 00556 case S_even_CM_even_CMC4: return (UChar *)(s - 3); 00557 00558 
case S_odd_C4CM: return (UChar *)(s - 0); 00559 case S_one_CM_odd_C4CM: return (UChar *)(s - 2); 00560 case S_even_C4CM: return (UChar *)(s - 2); 00561 case S_one_CM_even_C4CM: return (UChar *)(s - 0); 00562 00563 case S_even_CM_odd_C4CM: return (UChar *)(s - 0); 00564 case S_odd_CM_odd_C4CM: return (UChar *)(s - 2); 00565 case S_even_CM_even_C4CM: return (UChar *)(s - 2); 00566 case S_odd_CM_even_C4CM: return (UChar *)(s - 0); 00567 } 00568 00569 return (UChar* )s; /* never come here. (escape warning) */ 00570 } 00571 00572 static int 00573 gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED, OnigEncoding enc ARG_UNUSED) 00574 { 00575 return GB18030_MAP[*s] == C1 ? TRUE : FALSE; 00576 } 00577 00578 /* 00579 * Name: GB18030 00580 * MIBenum: 114 00581 * Link: http://www.iana.org/assignments/charset-reg/GB18030 00582 */ 00583 OnigEncodingDefine(gb18030, GB18030) = { 00584 gb18030_mbc_enc_len, 00585 "GB18030", /* name */ 00586 4, /* max enc length */ 00587 1, /* min enc length */ 00588 onigenc_is_mbc_newline_0x0a, 00589 gb18030_mbc_to_code, 00590 onigenc_mb4_code_to_mbclen, 00591 gb18030_code_to_mbc, 00592 gb18030_mbc_case_fold, 00593 onigenc_ascii_apply_all_case_fold, 00594 onigenc_ascii_get_case_fold_codes_by_str, 00595 onigenc_minimum_property_name_to_ctype, 00596 gb18030_is_code_ctype, 00597 onigenc_not_support_get_ctype_code_range, 00598 gb18030_left_adjust_char_head, 00599 gb18030_is_allowed_reverse_match 00600 }; 00601 00602