Ruby 1.9.3p327 (2012-11-10 revision 37606)
|
00001 /* -*- mode:c; c-file-style:"gnu" -*- */ 00002 /********************************************************************** 00003 regparse.c - Oniguruma (regular expression library) 00004 **********************************************************************/ 00005 /*- 00006 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 00007 * All rights reserved. 00008 * 00009 * Redistribution and use in source and binary forms, with or without 00010 * modification, are permitted provided that the following conditions 00011 * are met: 00012 * 1. Redistributions of source code must retain the above copyright 00013 * notice, this list of conditions and the following disclaimer. 00014 * 2. Redistributions in binary form must reproduce the above copyright 00015 * notice, this list of conditions and the following disclaimer in the 00016 * documentation and/or other materials provided with the distribution. 00017 * 00018 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 00019 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00020 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00021 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 00022 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00023 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 00024 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 00025 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00026 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 00027 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00028 * SUCH DAMAGE. 
00029 */ 00030 00031 #include "regparse.h" 00032 00033 #define WARN_BUFSIZE 256 00034 00035 #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS 00036 00037 00038 const OnigSyntaxType OnigSyntaxRuby = { 00039 (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | 00040 ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | 00041 ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS | 00042 ONIG_SYN_OP_ESC_C_CONTROL ) 00043 & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) 00044 , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT | 00045 ONIG_SYN_OP2_OPTION_RUBY | 00046 ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF | 00047 ONIG_SYN_OP2_ESC_G_SUBEXP_CALL | 00048 ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY | 00049 ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT | 00050 ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | 00051 ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | 00052 ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB | 00053 ONIG_SYN_OP2_ESC_H_XDIGIT ) 00054 , ( SYN_GNU_REGEX_BV | 00055 ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | 00056 ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | 00057 ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | 00058 ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | 00059 ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | 00060 ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | 00061 ONIG_SYN_WARN_CC_DUP | 00062 ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) 00063 , ONIG_OPTION_NONE 00064 , 00065 { 00066 (OnigCodePoint )'\\' /* esc */ 00067 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ 00068 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ 00069 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ 00070 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ 00071 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ 00072 } 00073 }; 00074 00075 const OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY; 00076 00077 extern void onig_null_warn(const char* s ARG_UNUSED) { } 00078 00079 #ifdef DEFAULT_WARN_FUNCTION 00080 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION; 00081 #else 00082 static OnigWarnFunc onig_warn = onig_null_warn; 00083 #endif 00084 00085 #ifdef DEFAULT_VERB_WARN_FUNCTION 00086 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION; 00087 #else 00088 static OnigWarnFunc onig_verb_warn = onig_null_warn; 00089 #endif 00090 00091 extern void onig_set_warn_func(OnigWarnFunc f) 00092 { 00093 onig_warn = f; 00094 } 00095 00096 extern void onig_set_verb_warn_func(OnigWarnFunc f) 00097 { 00098 onig_verb_warn = f; 00099 } 00100 00101 static void CC_DUP_WARN(ScanEnv *env); 00102 00103 static void 00104 bbuf_free(BBuf* bbuf) 00105 { 00106 if (IS_NOT_NULL(bbuf)) { 00107 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p); 00108 xfree(bbuf); 00109 } 00110 } 00111 00112 static int 00113 bbuf_clone(BBuf** rto, BBuf* from) 00114 { 00115 int r; 00116 BBuf *to; 00117 00118 *rto = to = (BBuf* )xmalloc(sizeof(BBuf)); 00119 CHECK_NULL_RETURN_MEMERR(to); 00120 r = BBUF_INIT(to, from->alloc); 00121 if (r != 0) return r; 00122 to->used = from->used; 00123 xmemcpy(to->p, from->p, from->used); 00124 return 0; 00125 } 00126 00127 #define BACKREF_REL_TO_ABS(rel_no, env) \ 00128 ((env)->num_mem + 1 + (rel_no)) 00129 00130 #define ONOFF(v,f,negative) (negative) ? ((v) &= ~(f)) : ((v) |= (f)) 00131 00132 #define MBCODE_START_POS(enc) \ 00133 (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 
0 : 0x80) 00134 00135 #define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \ 00136 add_code_range_to_buf(pbuf, env, MBCODE_START_POS(enc), ~((OnigCodePoint )0)) 00137 00138 #define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\ 00139 if (! ONIGENC_IS_SINGLEBYTE(enc)) {\ 00140 r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\ 00141 if (r) return r;\ 00142 }\ 00143 } while (0) 00144 00145 00146 #define BITSET_SET_BIT_CHKDUP(bs, pos) do { \ 00147 if (BITSET_AT(bs, pos)) CC_DUP_WARN(env); \ 00148 BS_ROOM(bs, pos) |= BS_BIT(pos); \ 00149 } while (0) 00150 00151 #define BITSET_IS_EMPTY(bs,empty) do {\ 00152 int i;\ 00153 empty = 1;\ 00154 for (i = 0; i < (int )BITSET_SIZE; i++) {\ 00155 if ((bs)[i] != 0) {\ 00156 empty = 0; break;\ 00157 }\ 00158 }\ 00159 } while (0) 00160 00161 static void 00162 bitset_set_range(ScanEnv *env, BitSetRef bs, int from, int to) 00163 { 00164 int i; 00165 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) { 00166 BITSET_SET_BIT_CHKDUP(bs, i); 00167 } 00168 } 00169 00170 #if 0 00171 static void 00172 bitset_set_all(BitSetRef bs) 00173 { 00174 int i; 00175 for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); } 00176 } 00177 #endif 00178 00179 static void 00180 bitset_invert(BitSetRef bs) 00181 { 00182 int i; 00183 for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); } 00184 } 00185 00186 static void 00187 bitset_invert_to(BitSetRef from, BitSetRef to) 00188 { 00189 int i; 00190 for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); } 00191 } 00192 00193 static void 00194 bitset_and(BitSetRef dest, BitSetRef bs) 00195 { 00196 int i; 00197 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; } 00198 } 00199 00200 static void 00201 bitset_or(BitSetRef dest, BitSetRef bs) 00202 { 00203 int i; 00204 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; } 00205 } 00206 00207 static void 00208 bitset_copy(BitSetRef dest, BitSetRef bs) 00209 { 00210 int i; 00211 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; } 00212 } 00213 
00214 extern int 00215 onig_strncmp(const UChar* s1, const UChar* s2, int n) 00216 { 00217 int x; 00218 00219 while (n-- > 0) { 00220 x = *s2++ - *s1++; 00221 if (x) return x; 00222 } 00223 return 0; 00224 } 00225 00226 extern void 00227 onig_strcpy(UChar* dest, const UChar* src, const UChar* end) 00228 { 00229 ptrdiff_t len = end - src; 00230 if (len > 0) { 00231 xmemcpy(dest, src, len); 00232 dest[len] = (UChar )0; 00233 } 00234 } 00235 00236 #ifdef USE_NAMED_GROUP 00237 static UChar* 00238 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end) 00239 { 00240 ptrdiff_t slen; 00241 int term_len, i; 00242 UChar *r; 00243 00244 slen = end - s; 00245 term_len = ONIGENC_MBC_MINLEN(enc); 00246 00247 r = (UChar* )xmalloc(slen + term_len); 00248 CHECK_NULL_RETURN(r); 00249 xmemcpy(r, s, slen); 00250 00251 for (i = 0; i < term_len; i++) 00252 r[slen + i] = (UChar )0; 00253 00254 return r; 00255 } 00256 #endif 00257 00258 /* scan pattern methods */ 00259 #define PEND_VALUE 0 00260 00261 #define PFETCH_READY UChar* pfetch_prev 00262 #define PEND (p < end ? 0 : 1) 00263 #define PUNFETCH p = pfetch_prev 00264 #define PINC do { \ 00265 pfetch_prev = p; \ 00266 p += enclen(enc, p, end); \ 00267 } while (0) 00268 #define PFETCH(c) do { \ 00269 c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \ 00270 pfetch_prev = p; \ 00271 p += enclen(enc, p, end); \ 00272 } while (0) 00273 00274 #define PPEEK (p < end ? 
ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE) 00275 #define PPEEK_IS(c) (PPEEK == (OnigCodePoint )c) 00276 00277 static UChar* 00278 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end, 00279 size_t capa) 00280 { 00281 UChar* r; 00282 00283 if (dest) 00284 r = (UChar* )xrealloc(dest, capa + 1); 00285 else 00286 r = (UChar* )xmalloc(capa + 1); 00287 00288 CHECK_NULL_RETURN(r); 00289 onig_strcpy(r + (dest_end - dest), src, src_end); 00290 return r; 00291 } 00292 00293 /* dest on static area */ 00294 static UChar* 00295 strcat_capa_from_static(UChar* dest, UChar* dest_end, 00296 const UChar* src, const UChar* src_end, size_t capa) 00297 { 00298 UChar* r; 00299 00300 r = (UChar* )xmalloc(capa + 1); 00301 CHECK_NULL_RETURN(r); 00302 onig_strcpy(r, dest, dest_end); 00303 onig_strcpy(r + (dest_end - dest), src, src_end); 00304 return r; 00305 } 00306 00307 00308 #ifdef USE_ST_LIBRARY 00309 00310 #include "ruby/st.h" 00311 00312 typedef struct { 00313 const UChar* s; 00314 const UChar* end; 00315 } st_str_end_key; 00316 00317 static int 00318 str_end_cmp(st_data_t xp, st_data_t yp) 00319 { 00320 const st_str_end_key *x, *y; 00321 const UChar *p, *q; 00322 int c; 00323 00324 x = (const st_str_end_key *)xp; 00325 y = (const st_str_end_key *)yp; 00326 if ((x->end - x->s) != (y->end - y->s)) 00327 return 1; 00328 00329 p = x->s; 00330 q = y->s; 00331 while (p < x->end) { 00332 c = (int )*p - (int )*q; 00333 if (c != 0) return c; 00334 00335 p++; q++; 00336 } 00337 00338 return 0; 00339 } 00340 00341 static st_index_t 00342 str_end_hash(st_data_t xp) 00343 { 00344 const st_str_end_key *x = (const st_str_end_key *)xp; 00345 const UChar *p; 00346 st_index_t val = 0; 00347 00348 p = x->s; 00349 while (p < x->end) { 00350 val = val * 997 + (int )*p++; 00351 } 00352 00353 return val + (val >> 5); 00354 } 00355 00356 extern hash_table_type* 00357 onig_st_init_strend_table_with_size(st_index_t size) 00358 { 00359 static const struct st_hash_type hashType 
= { 00360 str_end_cmp, 00361 str_end_hash, 00362 }; 00363 00364 return (hash_table_type* ) 00365 onig_st_init_table_with_size(&hashType, size); 00366 } 00367 00368 extern int 00369 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key, 00370 const UChar* end_key, hash_data_type *value) 00371 { 00372 st_str_end_key key; 00373 00374 key.s = (UChar* )str_key; 00375 key.end = (UChar* )end_key; 00376 00377 return onig_st_lookup(table, (st_data_t )(&key), value); 00378 } 00379 00380 extern int 00381 onig_st_insert_strend(hash_table_type* table, const UChar* str_key, 00382 const UChar* end_key, hash_data_type value) 00383 { 00384 st_str_end_key* key; 00385 int result; 00386 00387 key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key)); 00388 key->s = (UChar* )str_key; 00389 key->end = (UChar* )end_key; 00390 result = onig_st_insert(table, (st_data_t )key, value); 00391 if (result) { 00392 xfree(key); 00393 } 00394 return result; 00395 } 00396 00397 #endif /* USE_ST_LIBRARY */ 00398 00399 00400 #ifdef USE_NAMED_GROUP 00401 00402 #define INIT_NAME_BACKREFS_ALLOC_NUM 8 00403 00404 typedef struct { 00405 UChar* name; 00406 size_t name_len; /* byte length */ 00407 int back_num; /* number of backrefs */ 00408 int back_alloc; 00409 int back_ref1; 00410 int* back_refs; 00411 } NameEntry; 00412 00413 #ifdef USE_ST_LIBRARY 00414 00415 typedef st_table NameTable; 00416 typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */ 00417 00418 #define NAMEBUF_SIZE 24 00419 #define NAMEBUF_SIZE_1 25 00420 00421 #ifdef ONIG_DEBUG 00422 static int 00423 i_print_name_entry(UChar* key, NameEntry* e, void* arg) 00424 { 00425 int i; 00426 FILE* fp = (FILE* )arg; 00427 00428 fprintf(fp, "%s: ", e->name); 00429 if (e->back_num == 0) 00430 fputs("-", fp); 00431 else if (e->back_num == 1) 00432 fprintf(fp, "%d", e->back_ref1); 00433 else { 00434 for (i = 0; i < e->back_num; i++) { 00435 if (i > 0) fprintf(fp, ", "); 00436 fprintf(fp, "%d", e->back_refs[i]); 00437 } 
00438 } 00439 fputs("\n", fp); 00440 return ST_CONTINUE; 00441 } 00442 00443 extern int 00444 onig_print_names(FILE* fp, regex_t* reg) 00445 { 00446 NameTable* t = (NameTable* )reg->name_table; 00447 00448 if (IS_NOT_NULL(t)) { 00449 fprintf(fp, "name table\n"); 00450 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp); 00451 fputs("\n", fp); 00452 } 00453 return 0; 00454 } 00455 #endif /* ONIG_DEBUG */ 00456 00457 static int 00458 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED) 00459 { 00460 xfree(e->name); 00461 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); 00462 xfree(key); 00463 xfree(e); 00464 return ST_DELETE; 00465 } 00466 00467 static int 00468 names_clear(regex_t* reg) 00469 { 00470 NameTable* t = (NameTable* )reg->name_table; 00471 00472 if (IS_NOT_NULL(t)) { 00473 onig_st_foreach(t, i_free_name_entry, 0); 00474 } 00475 return 0; 00476 } 00477 00478 extern int 00479 onig_names_free(regex_t* reg) 00480 { 00481 int r; 00482 NameTable* t; 00483 00484 r = names_clear(reg); 00485 if (r) return r; 00486 00487 t = (NameTable* )reg->name_table; 00488 if (IS_NOT_NULL(t)) onig_st_free_table(t); 00489 reg->name_table = (void* )NULL; 00490 return 0; 00491 } 00492 00493 static NameEntry* 00494 name_find(regex_t* reg, const UChar* name, const UChar* name_end) 00495 { 00496 NameEntry* e; 00497 NameTable* t = (NameTable* )reg->name_table; 00498 00499 e = (NameEntry* )NULL; 00500 if (IS_NOT_NULL(t)) { 00501 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e))); 00502 } 00503 return e; 00504 } 00505 00506 typedef struct { 00507 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*); 00508 regex_t* reg; 00509 void* arg; 00510 int ret; 00511 OnigEncoding enc; 00512 } INamesArg; 00513 00514 static int 00515 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg) 00516 { 00517 int r = (*(arg->func))(e->name, 00518 e->name + e->name_len, 00519 e->back_num, 00520 (e->back_num > 1 ? 
e->back_refs : &(e->back_ref1)), 00521 arg->reg, arg->arg); 00522 if (r != 0) { 00523 arg->ret = r; 00524 return ST_STOP; 00525 } 00526 return ST_CONTINUE; 00527 } 00528 00529 extern int 00530 onig_foreach_name(regex_t* reg, 00531 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) 00532 { 00533 INamesArg narg; 00534 NameTable* t = (NameTable* )reg->name_table; 00535 00536 narg.ret = 0; 00537 if (IS_NOT_NULL(t)) { 00538 narg.func = func; 00539 narg.reg = reg; 00540 narg.arg = arg; 00541 narg.enc = reg->enc; /* should be pattern encoding. */ 00542 onig_st_foreach(t, i_names, (HashDataType )&narg); 00543 } 00544 return narg.ret; 00545 } 00546 00547 static int 00548 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map) 00549 { 00550 int i; 00551 00552 if (e->back_num > 1) { 00553 for (i = 0; i < e->back_num; i++) { 00554 e->back_refs[i] = map[e->back_refs[i]].new_val; 00555 } 00556 } 00557 else if (e->back_num == 1) { 00558 e->back_ref1 = map[e->back_ref1].new_val; 00559 } 00560 00561 return ST_CONTINUE; 00562 } 00563 00564 extern int 00565 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map) 00566 { 00567 NameTable* t = (NameTable* )reg->name_table; 00568 00569 if (IS_NOT_NULL(t)) { 00570 onig_st_foreach(t, i_renumber_name, (HashDataType )map); 00571 } 00572 return 0; 00573 } 00574 00575 00576 extern int 00577 onig_number_of_names(regex_t* reg) 00578 { 00579 NameTable* t = (NameTable* )reg->name_table; 00580 00581 if (IS_NOT_NULL(t)) 00582 return t->num_entries; 00583 else 00584 return 0; 00585 } 00586 00587 #else /* USE_ST_LIBRARY */ 00588 00589 #define INIT_NAMES_ALLOC_NUM 8 00590 00591 typedef struct { 00592 NameEntry* e; 00593 int num; 00594 int alloc; 00595 } NameTable; 00596 00597 #ifdef ONIG_DEBUG 00598 extern int 00599 onig_print_names(FILE* fp, regex_t* reg) 00600 { 00601 int i, j; 00602 NameEntry* e; 00603 NameTable* t = (NameTable* )reg->name_table; 00604 00605 if (IS_NOT_NULL(t) && t->num > 0) { 00606 
fprintf(fp, "name table\n"); 00607 for (i = 0; i < t->num; i++) { 00608 e = &(t->e[i]); 00609 fprintf(fp, "%s: ", e->name); 00610 if (e->back_num == 0) { 00611 fputs("-", fp); 00612 } 00613 else if (e->back_num == 1) { 00614 fprintf(fp, "%d", e->back_ref1); 00615 } 00616 else { 00617 for (j = 0; j < e->back_num; j++) { 00618 if (j > 0) fprintf(fp, ", "); 00619 fprintf(fp, "%d", e->back_refs[j]); 00620 } 00621 } 00622 fputs("\n", fp); 00623 } 00624 fputs("\n", fp); 00625 } 00626 return 0; 00627 } 00628 #endif 00629 00630 static int 00631 names_clear(regex_t* reg) 00632 { 00633 int i; 00634 NameEntry* e; 00635 NameTable* t = (NameTable* )reg->name_table; 00636 00637 if (IS_NOT_NULL(t)) { 00638 for (i = 0; i < t->num; i++) { 00639 e = &(t->e[i]); 00640 if (IS_NOT_NULL(e->name)) { 00641 xfree(e->name); 00642 e->name = NULL; 00643 e->name_len = 0; 00644 e->back_num = 0; 00645 e->back_alloc = 0; 00646 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); 00647 e->back_refs = (int* )NULL; 00648 } 00649 } 00650 if (IS_NOT_NULL(t->e)) { 00651 xfree(t->e); 00652 t->e = NULL; 00653 } 00654 t->num = 0; 00655 } 00656 return 0; 00657 } 00658 00659 extern int 00660 onig_names_free(regex_t* reg) 00661 { 00662 int r; 00663 NameTable* t; 00664 00665 r = names_clear(reg); 00666 if (r) return r; 00667 00668 t = (NameTable* )reg->name_table; 00669 if (IS_NOT_NULL(t)) xfree(t); 00670 reg->name_table = NULL; 00671 return 0; 00672 } 00673 00674 static NameEntry* 00675 name_find(regex_t* reg, UChar* name, UChar* name_end) 00676 { 00677 int i, len; 00678 NameEntry* e; 00679 NameTable* t = (NameTable* )reg->name_table; 00680 00681 if (IS_NOT_NULL(t)) { 00682 len = name_end - name; 00683 for (i = 0; i < t->num; i++) { 00684 e = &(t->e[i]); 00685 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0) 00686 return e; 00687 } 00688 } 00689 return (NameEntry* )NULL; 00690 } 00691 00692 extern int 00693 onig_foreach_name(regex_t* reg, 00694 int (*func)(const UChar*, const 
UChar*,int,int*,regex_t*,void*), void* arg) 00695 { 00696 int i, r; 00697 NameEntry* e; 00698 NameTable* t = (NameTable* )reg->name_table; 00699 00700 if (IS_NOT_NULL(t)) { 00701 for (i = 0; i < t->num; i++) { 00702 e = &(t->e[i]); 00703 r = (*func)(e->name, e->name + e->name_len, e->back_num, 00704 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), 00705 reg, arg); 00706 if (r != 0) return r; 00707 } 00708 } 00709 return 0; 00710 } 00711 00712 extern int 00713 onig_number_of_names(regex_t* reg) 00714 { 00715 NameTable* t = (NameTable* )reg->name_table; 00716 00717 if (IS_NOT_NULL(t)) 00718 return t->num; 00719 else 00720 return 0; 00721 } 00722 00723 #endif /* else USE_ST_LIBRARY */ 00724 00725 static int 00726 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) 00727 { 00728 int alloc; 00729 NameEntry* e; 00730 NameTable* t = (NameTable* )reg->name_table; 00731 00732 if (name_end - name <= 0) 00733 return ONIGERR_EMPTY_GROUP_NAME; 00734 00735 e = name_find(reg, name, name_end); 00736 if (IS_NULL(e)) { 00737 #ifdef USE_ST_LIBRARY 00738 if (IS_NULL(t)) { 00739 t = onig_st_init_strend_table_with_size(5); 00740 reg->name_table = (void* )t; 00741 } 00742 e = (NameEntry* )xmalloc(sizeof(NameEntry)); 00743 CHECK_NULL_RETURN_MEMERR(e); 00744 00745 e->name = strdup_with_null(reg->enc, name, name_end); 00746 if (IS_NULL(e->name)) { 00747 xfree(e); 00748 return ONIGERR_MEMORY; 00749 } 00750 onig_st_insert_strend(t, e->name, (e->name + (name_end - name)), 00751 (HashDataType )e); 00752 00753 e->name_len = name_end - name; 00754 e->back_num = 0; 00755 e->back_alloc = 0; 00756 e->back_refs = (int* )NULL; 00757 00758 #else 00759 00760 if (IS_NULL(t)) { 00761 alloc = INIT_NAMES_ALLOC_NUM; 00762 t = (NameTable* )xmalloc(sizeof(NameTable)); 00763 CHECK_NULL_RETURN_MEMERR(t); 00764 t->e = NULL; 00765 t->alloc = 0; 00766 t->num = 0; 00767 00768 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc); 00769 if (IS_NULL(t->e)) { 00770 xfree(t); 00771 return 
ONIGERR_MEMORY; 00772 } 00773 t->alloc = alloc; 00774 reg->name_table = t; 00775 goto clear; 00776 } 00777 else if (t->num == t->alloc) { 00778 int i; 00779 00780 alloc = t->alloc * 2; 00781 t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc); 00782 CHECK_NULL_RETURN_MEMERR(t->e); 00783 t->alloc = alloc; 00784 00785 clear: 00786 for (i = t->num; i < t->alloc; i++) { 00787 t->e[i].name = NULL; 00788 t->e[i].name_len = 0; 00789 t->e[i].back_num = 0; 00790 t->e[i].back_alloc = 0; 00791 t->e[i].back_refs = (int* )NULL; 00792 } 00793 } 00794 e = &(t->e[t->num]); 00795 t->num++; 00796 e->name = strdup_with_null(reg->enc, name, name_end); 00797 if (IS_NULL(e->name)) return ONIGERR_MEMORY; 00798 e->name_len = name_end - name; 00799 #endif 00800 } 00801 00802 if (e->back_num >= 1 && 00803 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) { 00804 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME, 00805 name, name_end); 00806 return ONIGERR_MULTIPLEX_DEFINED_NAME; 00807 } 00808 00809 e->back_num++; 00810 if (e->back_num == 1) { 00811 e->back_ref1 = backref; 00812 } 00813 else { 00814 if (e->back_num == 2) { 00815 alloc = INIT_NAME_BACKREFS_ALLOC_NUM; 00816 e->back_refs = (int* )xmalloc(sizeof(int) * alloc); 00817 CHECK_NULL_RETURN_MEMERR(e->back_refs); 00818 e->back_alloc = alloc; 00819 e->back_refs[0] = e->back_ref1; 00820 e->back_refs[1] = backref; 00821 } 00822 else { 00823 if (e->back_num > e->back_alloc) { 00824 alloc = e->back_alloc * 2; 00825 e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc); 00826 CHECK_NULL_RETURN_MEMERR(e->back_refs); 00827 e->back_alloc = alloc; 00828 } 00829 e->back_refs[e->back_num - 1] = backref; 00830 } 00831 } 00832 00833 return 0; 00834 } 00835 00836 extern int 00837 onig_name_to_group_numbers(regex_t* reg, const UChar* name, 00838 const UChar* name_end, int** nums) 00839 { 00840 NameEntry* e = name_find(reg, name, name_end); 00841 00842 if (IS_NULL(e)) return 
ONIGERR_UNDEFINED_NAME_REFERENCE; 00843 00844 switch (e->back_num) { 00845 case 0: 00846 *nums = 0; 00847 break; 00848 case 1: 00849 *nums = &(e->back_ref1); 00850 break; 00851 default: 00852 *nums = e->back_refs; 00853 break; 00854 } 00855 return e->back_num; 00856 } 00857 00858 extern int 00859 onig_name_to_backref_number(regex_t* reg, const UChar* name, 00860 const UChar* name_end, OnigRegion *region) 00861 { 00862 int i, n, *nums; 00863 00864 n = onig_name_to_group_numbers(reg, name, name_end, &nums); 00865 if (n < 0) 00866 return n; 00867 else if (n == 0) 00868 return ONIGERR_PARSER_BUG; 00869 else if (n == 1) 00870 return nums[0]; 00871 else { 00872 if (IS_NOT_NULL(region)) { 00873 for (i = n - 1; i >= 0; i--) { 00874 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS) 00875 return nums[i]; 00876 } 00877 } 00878 return nums[n - 1]; 00879 } 00880 } 00881 00882 #else /* USE_NAMED_GROUP */ 00883 00884 extern int 00885 onig_name_to_group_numbers(regex_t* reg, const UChar* name, 00886 const UChar* name_end, int** nums) 00887 { 00888 return ONIG_NO_SUPPORT_CONFIG; 00889 } 00890 00891 extern int 00892 onig_name_to_backref_number(regex_t* reg, const UChar* name, 00893 const UChar* name_end, OnigRegion* region) 00894 { 00895 return ONIG_NO_SUPPORT_CONFIG; 00896 } 00897 00898 extern int 00899 onig_foreach_name(regex_t* reg, 00900 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) 00901 { 00902 return ONIG_NO_SUPPORT_CONFIG; 00903 } 00904 00905 extern int 00906 onig_number_of_names(regex_t* reg) 00907 { 00908 return 0; 00909 } 00910 #endif /* else USE_NAMED_GROUP */ 00911 00912 extern int 00913 onig_noname_group_capture_is_active(regex_t* reg) 00914 { 00915 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP)) 00916 return 0; 00917 00918 #ifdef USE_NAMED_GROUP 00919 if (onig_number_of_names(reg) > 0 && 00920 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && 00921 !ONIG_IS_OPTION_ON(reg->options, 
ONIG_OPTION_CAPTURE_GROUP)) { 00922 return 0; 00923 } 00924 #endif 00925 00926 return 1; 00927 } 00928 00929 00930 #define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16 00931 00932 static void 00933 scan_env_clear(ScanEnv* env) 00934 { 00935 int i; 00936 00937 BIT_STATUS_CLEAR(env->capture_history); 00938 BIT_STATUS_CLEAR(env->bt_mem_start); 00939 BIT_STATUS_CLEAR(env->bt_mem_end); 00940 BIT_STATUS_CLEAR(env->backrefed_mem); 00941 env->error = (UChar* )NULL; 00942 env->error_end = (UChar* )NULL; 00943 env->num_call = 0; 00944 env->num_mem = 0; 00945 #ifdef USE_NAMED_GROUP 00946 env->num_named = 0; 00947 #endif 00948 env->mem_alloc = 0; 00949 env->mem_nodes_dynamic = (Node** )NULL; 00950 00951 for (i = 0; i < SCANENV_MEMNODES_SIZE; i++) 00952 env->mem_nodes_static[i] = NULL_NODE; 00953 00954 #ifdef USE_COMBINATION_EXPLOSION_CHECK 00955 env->num_comb_exp_check = 0; 00956 env->comb_exp_max_regnum = 0; 00957 env->curr_max_regnum = 0; 00958 env->has_recursion = 0; 00959 #endif 00960 env->warnings_flag = 0; 00961 } 00962 00963 static int 00964 scan_env_add_mem_entry(ScanEnv* env) 00965 { 00966 int i, need, alloc; 00967 Node** p; 00968 00969 need = env->num_mem + 1; 00970 if (need >= SCANENV_MEMNODES_SIZE) { 00971 if (env->mem_alloc <= need) { 00972 if (IS_NULL(env->mem_nodes_dynamic)) { 00973 alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE; 00974 p = (Node** )xmalloc(sizeof(Node*) * alloc); 00975 xmemcpy(p, env->mem_nodes_static, 00976 sizeof(Node*) * SCANENV_MEMNODES_SIZE); 00977 } 00978 else { 00979 alloc = env->mem_alloc * 2; 00980 p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc); 00981 } 00982 CHECK_NULL_RETURN_MEMERR(p); 00983 00984 for (i = env->num_mem + 1; i < alloc; i++) 00985 p[i] = NULL_NODE; 00986 00987 env->mem_nodes_dynamic = p; 00988 env->mem_alloc = alloc; 00989 } 00990 } 00991 00992 env->num_mem++; 00993 return env->num_mem; 00994 } 00995 00996 static int 00997 scan_env_set_mem_node(ScanEnv* env, int num, Node* node) 00998 { 00999 if (env->num_mem >= 
num) 01000 SCANENV_MEM_NODES(env)[num] = node; 01001 else 01002 return ONIGERR_PARSER_BUG; 01003 return 0; 01004 } 01005 01006 01007 #ifdef USE_PARSE_TREE_NODE_RECYCLE 01008 typedef struct _FreeNode { 01009 struct _FreeNode* next; 01010 } FreeNode; 01011 01012 static FreeNode* FreeNodeList = (FreeNode* )NULL; 01013 #endif 01014 01015 extern void 01016 onig_node_free(Node* node) 01017 { 01018 start: 01019 if (IS_NULL(node)) return ; 01020 01021 switch (NTYPE(node)) { 01022 case NT_STR: 01023 if (NSTR(node)->capa != 0 && 01024 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) { 01025 xfree(NSTR(node)->s); 01026 } 01027 break; 01028 01029 case NT_LIST: 01030 case NT_ALT: 01031 onig_node_free(NCAR(node)); 01032 { 01033 Node* next_node = NCDR(node); 01034 01035 #ifdef USE_PARSE_TREE_NODE_RECYCLE 01036 { 01037 FreeNode* n = (FreeNode* )node; 01038 01039 THREAD_ATOMIC_START; 01040 n->next = FreeNodeList; 01041 FreeNodeList = n; 01042 THREAD_ATOMIC_END; 01043 } 01044 #else 01045 xfree(node); 01046 #endif 01047 node = next_node; 01048 goto start; 01049 } 01050 break; 01051 01052 case NT_CCLASS: 01053 { 01054 CClassNode* cc = NCCLASS(node); 01055 01056 if (IS_NCCLASS_SHARE(cc)) return ; 01057 if (cc->mbuf) 01058 bbuf_free(cc->mbuf); 01059 } 01060 break; 01061 01062 case NT_QTFR: 01063 if (NQTFR(node)->target) 01064 onig_node_free(NQTFR(node)->target); 01065 break; 01066 01067 case NT_ENCLOSE: 01068 if (NENCLOSE(node)->target) 01069 onig_node_free(NENCLOSE(node)->target); 01070 break; 01071 01072 case NT_BREF: 01073 if (IS_NOT_NULL(NBREF(node)->back_dynamic)) 01074 xfree(NBREF(node)->back_dynamic); 01075 break; 01076 01077 case NT_ANCHOR: 01078 if (NANCHOR(node)->target) 01079 onig_node_free(NANCHOR(node)->target); 01080 break; 01081 } 01082 01083 #ifdef USE_PARSE_TREE_NODE_RECYCLE 01084 { 01085 FreeNode* n = (FreeNode* )node; 01086 01087 THREAD_ATOMIC_START; 01088 n->next = FreeNodeList; 01089 FreeNodeList = n; 01090 THREAD_ATOMIC_END; 01091 } 01092 #else 01093 
xfree(node); 01094 #endif 01095 } 01096 01097 #ifdef USE_PARSE_TREE_NODE_RECYCLE 01098 extern int 01099 onig_free_node_list(void) 01100 { 01101 FreeNode* n; 01102 01103 /* THREAD_ATOMIC_START; */ 01104 while (IS_NOT_NULL(FreeNodeList)) { 01105 n = FreeNodeList; 01106 FreeNodeList = FreeNodeList->next; 01107 xfree(n); 01108 } 01109 /* THREAD_ATOMIC_END; */ 01110 return 0; 01111 } 01112 #endif 01113 01114 static Node* 01115 node_new(void) 01116 { 01117 Node* node; 01118 01119 #ifdef USE_PARSE_TREE_NODE_RECYCLE 01120 THREAD_ATOMIC_START; 01121 if (IS_NOT_NULL(FreeNodeList)) { 01122 node = (Node* )FreeNodeList; 01123 FreeNodeList = FreeNodeList->next; 01124 THREAD_ATOMIC_END; 01125 return node; 01126 } 01127 THREAD_ATOMIC_END; 01128 #endif 01129 01130 node = (Node* )xmalloc(sizeof(Node)); 01131 /* xmemset(node, 0, sizeof(Node)); */ 01132 return node; 01133 } 01134 01135 01136 static void 01137 initialize_cclass(CClassNode* cc) 01138 { 01139 BITSET_CLEAR(cc->bs); 01140 /* cc->base.flags = 0; */ 01141 cc->flags = 0; 01142 cc->mbuf = NULL; 01143 } 01144 01145 static Node* 01146 node_new_cclass(void) 01147 { 01148 Node* node = node_new(); 01149 CHECK_NULL_RETURN(node); 01150 01151 SET_NTYPE(node, NT_CCLASS); 01152 initialize_cclass(NCCLASS(node)); 01153 return node; 01154 } 01155 01156 static Node* 01157 node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out, 01158 const OnigCodePoint ranges[]) 01159 { 01160 int n, i; 01161 CClassNode* cc; 01162 OnigCodePoint j; 01163 01164 Node* node = node_new_cclass(); 01165 CHECK_NULL_RETURN(node); 01166 01167 cc = NCCLASS(node); 01168 if (not != 0) NCCLASS_SET_NOT(cc); 01169 01170 BITSET_CLEAR(cc->bs); 01171 if (sb_out > 0 && IS_NOT_NULL(ranges)) { 01172 n = ONIGENC_CODE_RANGE_NUM(ranges); 01173 for (i = 0; i < n; i++) { 01174 for (j = ONIGENC_CODE_RANGE_FROM(ranges, i); 01175 j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) { 01176 if (j >= sb_out) goto sb_end; 01177 01178 BITSET_SET_BIT(cc->bs, j); 01179 } 01180 
} 01181 } 01182 01183 sb_end: 01184 if (IS_NULL(ranges)) { 01185 is_null: 01186 cc->mbuf = NULL; 01187 } 01188 else { 01189 BBuf* bbuf; 01190 01191 n = ONIGENC_CODE_RANGE_NUM(ranges); 01192 if (n == 0) goto is_null; 01193 01194 bbuf = (BBuf* )xmalloc(sizeof(BBuf)); 01195 CHECK_NULL_RETURN(bbuf); 01196 bbuf->alloc = n + 1; 01197 bbuf->used = n + 1; 01198 bbuf->p = (UChar* )((void* )ranges); 01199 01200 cc->mbuf = bbuf; 01201 } 01202 01203 return node; 01204 } 01205 01206 static Node* 01207 node_new_ctype(int type, int not) 01208 { 01209 Node* node = node_new(); 01210 CHECK_NULL_RETURN(node); 01211 01212 SET_NTYPE(node, NT_CTYPE); 01213 NCTYPE(node)->ctype = type; 01214 NCTYPE(node)->not = not; 01215 return node; 01216 } 01217 01218 static Node* 01219 node_new_anychar(void) 01220 { 01221 Node* node = node_new(); 01222 CHECK_NULL_RETURN(node); 01223 01224 SET_NTYPE(node, NT_CANY); 01225 return node; 01226 } 01227 01228 static Node* 01229 node_new_list(Node* left, Node* right) 01230 { 01231 Node* node = node_new(); 01232 CHECK_NULL_RETURN(node); 01233 01234 SET_NTYPE(node, NT_LIST); 01235 NCAR(node) = left; 01236 NCDR(node) = right; 01237 return node; 01238 } 01239 01240 extern Node* 01241 onig_node_new_list(Node* left, Node* right) 01242 { 01243 return node_new_list(left, right); 01244 } 01245 01246 extern Node* 01247 onig_node_list_add(Node* list, Node* x) 01248 { 01249 Node *n; 01250 01251 n = onig_node_new_list(x, NULL); 01252 if (IS_NULL(n)) return NULL_NODE; 01253 01254 if (IS_NOT_NULL(list)) { 01255 while (IS_NOT_NULL(NCDR(list))) 01256 list = NCDR(list); 01257 01258 NCDR(list) = n; 01259 } 01260 01261 return n; 01262 } 01263 01264 extern Node* 01265 onig_node_new_alt(Node* left, Node* right) 01266 { 01267 Node* node = node_new(); 01268 CHECK_NULL_RETURN(node); 01269 01270 SET_NTYPE(node, NT_ALT); 01271 NCAR(node) = left; 01272 NCDR(node) = right; 01273 return node; 01274 } 01275 01276 extern Node* 01277 onig_node_new_anchor(int type) 01278 { 01279 Node* node = 
node_new(); 01280 CHECK_NULL_RETURN(node); 01281 01282 SET_NTYPE(node, NT_ANCHOR); 01283 NANCHOR(node)->type = type; 01284 NANCHOR(node)->target = NULL; 01285 NANCHOR(node)->char_len = -1; 01286 return node; 01287 } 01288 01289 static Node* 01290 node_new_backref(int back_num, int* backrefs, int by_name, 01291 #ifdef USE_BACKREF_WITH_LEVEL 01292 int exist_level, int nest_level, 01293 #endif 01294 ScanEnv* env) 01295 { 01296 int i; 01297 Node* node = node_new(); 01298 01299 CHECK_NULL_RETURN(node); 01300 01301 SET_NTYPE(node, NT_BREF); 01302 NBREF(node)->state = 0; 01303 NBREF(node)->back_num = back_num; 01304 NBREF(node)->back_dynamic = (int* )NULL; 01305 if (by_name != 0) 01306 NBREF(node)->state |= NST_NAME_REF; 01307 01308 #ifdef USE_BACKREF_WITH_LEVEL 01309 if (exist_level != 0) { 01310 NBREF(node)->state |= NST_NEST_LEVEL; 01311 NBREF(node)->nest_level = nest_level; 01312 } 01313 #endif 01314 01315 for (i = 0; i < back_num; i++) { 01316 if (backrefs[i] <= env->num_mem && 01317 IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) { 01318 NBREF(node)->state |= NST_RECURSION; /* /...(\1).../ */ 01319 break; 01320 } 01321 } 01322 01323 if (back_num <= NODE_BACKREFS_SIZE) { 01324 for (i = 0; i < back_num; i++) 01325 NBREF(node)->back_static[i] = backrefs[i]; 01326 } 01327 else { 01328 int* p = (int* )xmalloc(sizeof(int) * back_num); 01329 if (IS_NULL(p)) { 01330 onig_node_free(node); 01331 return NULL; 01332 } 01333 NBREF(node)->back_dynamic = p; 01334 for (i = 0; i < back_num; i++) 01335 p[i] = backrefs[i]; 01336 } 01337 return node; 01338 } 01339 01340 #ifdef USE_SUBEXP_CALL 01341 static Node* 01342 node_new_call(UChar* name, UChar* name_end, int gnum) 01343 { 01344 Node* node = node_new(); 01345 CHECK_NULL_RETURN(node); 01346 01347 SET_NTYPE(node, NT_CALL); 01348 NCALL(node)->state = 0; 01349 NCALL(node)->target = NULL_NODE; 01350 NCALL(node)->name = name; 01351 NCALL(node)->name_end = name_end; 01352 NCALL(node)->group_num = gnum; /* call by number if gnum != 0 */ 
01353 return node; 01354 } 01355 #endif 01356 01357 static Node* 01358 node_new_quantifier(int lower, int upper, int by_number) 01359 { 01360 Node* node = node_new(); 01361 CHECK_NULL_RETURN(node); 01362 01363 SET_NTYPE(node, NT_QTFR); 01364 NQTFR(node)->state = 0; 01365 NQTFR(node)->target = NULL; 01366 NQTFR(node)->lower = lower; 01367 NQTFR(node)->upper = upper; 01368 NQTFR(node)->greedy = 1; 01369 NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY; 01370 NQTFR(node)->head_exact = NULL_NODE; 01371 NQTFR(node)->next_head_exact = NULL_NODE; 01372 NQTFR(node)->is_refered = 0; 01373 if (by_number != 0) 01374 NQTFR(node)->state |= NST_BY_NUMBER; 01375 01376 #ifdef USE_COMBINATION_EXPLOSION_CHECK 01377 NQTFR(node)->comb_exp_check_num = 0; 01378 #endif 01379 01380 return node; 01381 } 01382 01383 static Node* 01384 node_new_enclose(int type) 01385 { 01386 Node* node = node_new(); 01387 CHECK_NULL_RETURN(node); 01388 01389 SET_NTYPE(node, NT_ENCLOSE); 01390 NENCLOSE(node)->type = type; 01391 NENCLOSE(node)->state = 0; 01392 NENCLOSE(node)->regnum = 0; 01393 NENCLOSE(node)->option = 0; 01394 NENCLOSE(node)->target = NULL; 01395 NENCLOSE(node)->call_addr = -1; 01396 NENCLOSE(node)->opt_count = 0; 01397 return node; 01398 } 01399 01400 extern Node* 01401 onig_node_new_enclose(int type) 01402 { 01403 return node_new_enclose(type); 01404 } 01405 01406 static Node* 01407 node_new_enclose_memory(OnigOptionType option, int is_named) 01408 { 01409 Node* node = node_new_enclose(ENCLOSE_MEMORY); 01410 CHECK_NULL_RETURN(node); 01411 if (is_named != 0) 01412 SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP); 01413 01414 #ifdef USE_SUBEXP_CALL 01415 NENCLOSE(node)->option = option; 01416 #endif 01417 return node; 01418 } 01419 01420 static Node* 01421 node_new_option(OnigOptionType option) 01422 { 01423 Node* node = node_new_enclose(ENCLOSE_OPTION); 01424 CHECK_NULL_RETURN(node); 01425 NENCLOSE(node)->option = option; 01426 return node; 01427 } 01428 01429 extern int 01430 
onig_node_str_cat(Node* node, const UChar* s, const UChar* end) 01431 { 01432 ptrdiff_t addlen = end - s; 01433 01434 if (addlen > 0) { 01435 ptrdiff_t len = NSTR(node)->end - NSTR(node)->s; 01436 01437 if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) { 01438 UChar* p; 01439 ptrdiff_t capa = len + addlen + NODE_STR_MARGIN; 01440 01441 if (capa <= NSTR(node)->capa) { 01442 onig_strcpy(NSTR(node)->s + len, s, end); 01443 } 01444 else { 01445 if (NSTR(node)->s == NSTR(node)->buf) 01446 p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end, 01447 s, end, capa); 01448 else 01449 p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa); 01450 01451 CHECK_NULL_RETURN_MEMERR(p); 01452 NSTR(node)->s = p; 01453 NSTR(node)->capa = (int)capa; 01454 } 01455 } 01456 else { 01457 onig_strcpy(NSTR(node)->s + len, s, end); 01458 } 01459 NSTR(node)->end = NSTR(node)->s + len + addlen; 01460 } 01461 01462 return 0; 01463 } 01464 01465 extern int 01466 onig_node_str_set(Node* node, const UChar* s, const UChar* end) 01467 { 01468 onig_node_str_clear(node); 01469 return onig_node_str_cat(node, s, end); 01470 } 01471 01472 static int 01473 node_str_cat_char(Node* node, UChar c) 01474 { 01475 UChar s[1]; 01476 01477 s[0] = c; 01478 return onig_node_str_cat(node, s, s + 1); 01479 } 01480 01481 extern void 01482 onig_node_conv_to_str_node(Node* node, int flag) 01483 { 01484 SET_NTYPE(node, NT_STR); 01485 NSTR(node)->flag = flag; 01486 NSTR(node)->capa = 0; 01487 NSTR(node)->s = NSTR(node)->buf; 01488 NSTR(node)->end = NSTR(node)->buf; 01489 } 01490 01491 extern void 01492 onig_node_str_clear(Node* node) 01493 { 01494 if (NSTR(node)->capa != 0 && 01495 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) { 01496 xfree(NSTR(node)->s); 01497 } 01498 01499 NSTR(node)->capa = 0; 01500 NSTR(node)->flag = 0; 01501 NSTR(node)->s = NSTR(node)->buf; 01502 NSTR(node)->end = NSTR(node)->buf; 01503 } 01504 01505 static Node* 01506 node_new_str(const UChar* s, 
const UChar* end) 01507 { 01508 Node* node = node_new(); 01509 CHECK_NULL_RETURN(node); 01510 01511 SET_NTYPE(node, NT_STR); 01512 NSTR(node)->capa = 0; 01513 NSTR(node)->flag = 0; 01514 NSTR(node)->s = NSTR(node)->buf; 01515 NSTR(node)->end = NSTR(node)->buf; 01516 if (onig_node_str_cat(node, s, end)) { 01517 onig_node_free(node); 01518 return NULL; 01519 } 01520 return node; 01521 } 01522 01523 extern Node* 01524 onig_node_new_str(const UChar* s, const UChar* end) 01525 { 01526 return node_new_str(s, end); 01527 } 01528 01529 static Node* 01530 node_new_str_raw(UChar* s, UChar* end) 01531 { 01532 Node* node = node_new_str(s, end); 01533 NSTRING_SET_RAW(node); 01534 return node; 01535 } 01536 01537 static Node* 01538 node_new_empty(void) 01539 { 01540 return node_new_str(NULL, NULL); 01541 } 01542 01543 static Node* 01544 node_new_str_raw_char(UChar c) 01545 { 01546 UChar p[1]; 01547 01548 p[0] = c; 01549 return node_new_str_raw(p, p + 1); 01550 } 01551 01552 static Node* 01553 str_node_split_last_char(StrNode* sn, OnigEncoding enc) 01554 { 01555 const UChar *p; 01556 Node* n = NULL_NODE; 01557 01558 if (sn->end > sn->s) { 01559 p = onigenc_get_prev_char_head(enc, sn->s, sn->end, sn->end); 01560 if (p && p > sn->s) { /* can be splitted. */ 01561 n = node_new_str(p, sn->end); 01562 if ((sn->flag & NSTR_RAW) != 0) 01563 NSTRING_SET_RAW(n); 01564 sn->end = (UChar* )p; 01565 } 01566 } 01567 return n; 01568 } 01569 01570 static int 01571 str_node_can_be_split(StrNode* sn, OnigEncoding enc) 01572 { 01573 if (sn->end > sn->s) { 01574 return ((enclen(enc, sn->s, sn->end) < sn->end - sn->s) ? 
1 : 0); 01575 } 01576 return 0; 01577 } 01578 01579 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR 01580 static int 01581 node_str_head_pad(StrNode* sn, int num, UChar val) 01582 { 01583 UChar buf[NODE_STR_BUF_SIZE]; 01584 int i, len; 01585 01586 len = sn->end - sn->s; 01587 onig_strcpy(buf, sn->s, sn->end); 01588 onig_strcpy(&(sn->s[num]), buf, buf + len); 01589 sn->end += num; 01590 01591 for (i = 0; i < num; i++) { 01592 sn->s[i] = val; 01593 } 01594 } 01595 #endif 01596 01597 extern int 01598 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) 01599 { 01600 unsigned int num, val; 01601 OnigCodePoint c; 01602 UChar* p = *src; 01603 PFETCH_READY; 01604 01605 num = 0; 01606 while (!PEND) { 01607 PFETCH(c); 01608 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 01609 val = (unsigned int )DIGITVAL(c); 01610 if ((INT_MAX_LIMIT - val) / 10UL < num) 01611 return -1; /* overflow */ 01612 01613 num = num * 10 + val; 01614 } 01615 else { 01616 PUNFETCH; 01617 break; 01618 } 01619 } 01620 *src = p; 01621 return num; 01622 } 01623 01624 static int 01625 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen, 01626 OnigEncoding enc) 01627 { 01628 OnigCodePoint c; 01629 unsigned int num, val; 01630 UChar* p = *src; 01631 PFETCH_READY; 01632 01633 num = 0; 01634 while (!PEND && maxlen-- != 0) { 01635 PFETCH(c); 01636 if (ONIGENC_IS_CODE_XDIGIT(enc, c)) { 01637 val = (unsigned int )XDIGITVAL(enc,c); 01638 if ((INT_MAX_LIMIT - val) / 16UL < num) 01639 return -1; /* overflow */ 01640 01641 num = (num << 4) + XDIGITVAL(enc,c); 01642 } 01643 else { 01644 PUNFETCH; 01645 break; 01646 } 01647 } 01648 *src = p; 01649 return num; 01650 } 01651 01652 static int 01653 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen, 01654 OnigEncoding enc) 01655 { 01656 OnigCodePoint c; 01657 unsigned int num, val; 01658 UChar* p = *src; 01659 PFETCH_READY; 01660 01661 num = 0; 01662 while (!PEND && maxlen-- != 0) { 01663 PFETCH(c); 01664 if (ONIGENC_IS_CODE_DIGIT(enc, c) && c 
< '8') { 01665 val = ODIGITVAL(c); 01666 if ((INT_MAX_LIMIT - val) / 8UL < num) 01667 return -1; /* overflow */ 01668 01669 num = (num << 3) + val; 01670 } 01671 else { 01672 PUNFETCH; 01673 break; 01674 } 01675 } 01676 *src = p; 01677 return num; 01678 } 01679 01680 01681 #define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \ 01682 BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT) 01683 01684 /* data format: 01685 [n][from-1][to-1][from-2][to-2] ... [from-n][to-n] 01686 (all data size is OnigCodePoint) 01687 */ 01688 static int 01689 new_code_range(BBuf** pbuf) 01690 { 01691 #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5) 01692 int r; 01693 OnigCodePoint n; 01694 BBuf* bbuf; 01695 01696 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf)); 01697 CHECK_NULL_RETURN_MEMERR(*pbuf); 01698 r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE); 01699 if (r) return r; 01700 01701 n = 0; 01702 BBUF_WRITE_CODE_POINT(bbuf, 0, n); 01703 return 0; 01704 } 01705 01706 static int 01707 add_code_range_to_buf0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to, 01708 int checkdup) 01709 { 01710 int r, inc_n, pos; 01711 int low, high, bound, x; 01712 OnigCodePoint n, *data; 01713 BBuf* bbuf; 01714 01715 if (from > to) { 01716 n = from; from = to; to = n; 01717 } 01718 01719 if (IS_NULL(*pbuf)) { 01720 r = new_code_range(pbuf); 01721 if (r) return r; 01722 bbuf = *pbuf; 01723 n = 0; 01724 } 01725 else { 01726 bbuf = *pbuf; 01727 GET_CODE_POINT(n, bbuf->p); 01728 } 01729 data = (OnigCodePoint* )(bbuf->p); 01730 data++; 01731 01732 for (low = 0, bound = n; low < bound; ) { 01733 x = (low + bound) >> 1; 01734 if (from > data[x*2 + 1]) 01735 low = x + 1; 01736 else 01737 bound = x; 01738 } 01739 01740 for (high = low, bound = n; high < bound; ) { 01741 x = (high + bound) >> 1; 01742 if (to >= data[x*2] - 1) 01743 high = x + 1; 01744 else 01745 bound = x; 01746 } 01747 /* data[(low-1)*2+1] << from <= data[low*2] 01748 * data[(high-1)*2+1] <= to << data[high*2] 01749 */ 01750 01751 inc_n = 
low + 1 - high; 01752 if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM) 01753 return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES; 01754 01755 if (inc_n != 1) { 01756 if (checkdup && from <= data[low*2+1] 01757 && (data[low*2] <= from || data[low*2+1] <= to)) 01758 CC_DUP_WARN(env); 01759 if (from > data[low*2]) 01760 from = data[low*2]; 01761 if (to < data[(high - 1)*2 + 1]) 01762 to = data[(high - 1)*2 + 1]; 01763 } 01764 01765 if (inc_n != 0 && (OnigCodePoint )high < n) { 01766 int from_pos = SIZE_CODE_POINT * (1 + high * 2); 01767 int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2); 01768 int size = (n - high) * 2 * SIZE_CODE_POINT; 01769 01770 if (inc_n > 0) { 01771 BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size); 01772 } 01773 else { 01774 BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos); 01775 } 01776 } 01777 01778 pos = SIZE_CODE_POINT * (1 + low * 2); 01779 BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2); 01780 BBUF_WRITE_CODE_POINT(bbuf, pos, from); 01781 BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to); 01782 n += inc_n; 01783 BBUF_WRITE_CODE_POINT(bbuf, 0, n); 01784 01785 return 0; 01786 } 01787 01788 static int 01789 add_code_range_to_buf(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) 01790 { 01791 return add_code_range_to_buf0(pbuf, env, from, to, 1); 01792 } 01793 01794 static int 01795 add_code_range0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to, int checkdup) 01796 { 01797 if (from > to) { 01798 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) 01799 return 0; 01800 else 01801 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; 01802 } 01803 01804 return add_code_range_to_buf0(pbuf, env, from, to, checkdup); 01805 } 01806 01807 static int 01808 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) 01809 { 01810 return add_code_range0(pbuf, env, from, to, 1); 01811 } 01812 01813 static int 01814 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf, ScanEnv* env) 01815 { 01816 int r, 
i, n; 01817 OnigCodePoint pre, from, *data, to = 0; 01818 01819 *pbuf = (BBuf* )NULL; 01820 if (IS_NULL(bbuf)) { 01821 set_all: 01822 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); 01823 } 01824 01825 data = (OnigCodePoint* )(bbuf->p); 01826 GET_CODE_POINT(n, data); 01827 data++; 01828 if (n <= 0) goto set_all; 01829 01830 r = 0; 01831 pre = MBCODE_START_POS(enc); 01832 for (i = 0; i < n; i++) { 01833 from = data[i*2]; 01834 to = data[i*2+1]; 01835 if (pre <= from - 1) { 01836 r = add_code_range_to_buf(pbuf, env, pre, from - 1); 01837 if (r != 0) return r; 01838 } 01839 if (to == ~((OnigCodePoint )0)) break; 01840 pre = to + 1; 01841 } 01842 if (to < ~((OnigCodePoint )0)) { 01843 r = add_code_range_to_buf(pbuf, env, to + 1, ~((OnigCodePoint )0)); 01844 } 01845 return r; 01846 } 01847 01848 #define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\ 01849 BBuf *tbuf; \ 01850 int tnot; \ 01851 tnot = not1; not1 = not2; not2 = tnot; \ 01852 tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \ 01853 } while (0) 01854 01855 static int 01856 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1, 01857 BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env) 01858 { 01859 int r; 01860 OnigCodePoint i, n1, *data1; 01861 OnigCodePoint from, to; 01862 01863 *pbuf = (BBuf* )NULL; 01864 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) { 01865 if (not1 != 0 || not2 != 0) 01866 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); 01867 return 0; 01868 } 01869 01870 r = 0; 01871 if (IS_NULL(bbuf2)) 01872 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); 01873 01874 if (IS_NULL(bbuf1)) { 01875 if (not1 != 0) { 01876 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); 01877 } 01878 else { 01879 if (not2 == 0) { 01880 return bbuf_clone(pbuf, bbuf2); 01881 } 01882 else { 01883 return not_code_range_buf(enc, bbuf2, pbuf, env); 01884 } 01885 } 01886 } 01887 01888 if (not1 != 0) 01889 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); 01890 01891 data1 = (OnigCodePoint* )(bbuf1->p); 01892 GET_CODE_POINT(n1, data1); 01893 data1++; 01894 01895 if (not2 
== 0 && not1 == 0) { /* 1 OR 2 */ 01896 r = bbuf_clone(pbuf, bbuf2); 01897 } 01898 else if (not1 == 0) { /* 1 OR (not 2) */ 01899 r = not_code_range_buf(enc, bbuf2, pbuf, env); 01900 } 01901 if (r != 0) return r; 01902 01903 for (i = 0; i < n1; i++) { 01904 from = data1[i*2]; 01905 to = data1[i*2+1]; 01906 r = add_code_range_to_buf(pbuf, env, from, to); 01907 if (r != 0) return r; 01908 } 01909 return 0; 01910 } 01911 01912 static int 01913 and_code_range1(BBuf** pbuf, ScanEnv* env, OnigCodePoint from1, OnigCodePoint to1, 01914 OnigCodePoint* data, int n) 01915 { 01916 int i, r; 01917 OnigCodePoint from2, to2; 01918 01919 for (i = 0; i < n; i++) { 01920 from2 = data[i*2]; 01921 to2 = data[i*2+1]; 01922 if (from2 < from1) { 01923 if (to2 < from1) continue; 01924 else { 01925 from1 = to2 + 1; 01926 } 01927 } 01928 else if (from2 <= to1) { 01929 if (to2 < to1) { 01930 if (from1 <= from2 - 1) { 01931 r = add_code_range_to_buf(pbuf, env, from1, from2-1); 01932 if (r != 0) return r; 01933 } 01934 from1 = to2 + 1; 01935 } 01936 else { 01937 to1 = from2 - 1; 01938 } 01939 } 01940 else { 01941 from1 = from2; 01942 } 01943 if (from1 > to1) break; 01944 } 01945 if (from1 <= to1) { 01946 r = add_code_range_to_buf(pbuf, env, from1, to1); 01947 if (r != 0) return r; 01948 } 01949 return 0; 01950 } 01951 01952 static int 01953 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env) 01954 { 01955 int r; 01956 OnigCodePoint i, j, n1, n2, *data1, *data2; 01957 OnigCodePoint from, to, from1, to1, from2, to2; 01958 01959 *pbuf = (BBuf* )NULL; 01960 if (IS_NULL(bbuf1)) { 01961 if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */ 01962 return bbuf_clone(pbuf, bbuf2); 01963 return 0; 01964 } 01965 else if (IS_NULL(bbuf2)) { 01966 if (not2 != 0) 01967 return bbuf_clone(pbuf, bbuf1); 01968 return 0; 01969 } 01970 01971 if (not1 != 0) 01972 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); 01973 01974 data1 = (OnigCodePoint* )(bbuf1->p); 01975 data2 = 
(OnigCodePoint* )(bbuf2->p); 01976 GET_CODE_POINT(n1, data1); 01977 GET_CODE_POINT(n2, data2); 01978 data1++; 01979 data2++; 01980 01981 if (not2 == 0 && not1 == 0) { /* 1 AND 2 */ 01982 for (i = 0; i < n1; i++) { 01983 from1 = data1[i*2]; 01984 to1 = data1[i*2+1]; 01985 for (j = 0; j < n2; j++) { 01986 from2 = data2[j*2]; 01987 to2 = data2[j*2+1]; 01988 if (from2 > to1) break; 01989 if (to2 < from1) continue; 01990 from = MAX(from1, from2); 01991 to = MIN(to1, to2); 01992 r = add_code_range_to_buf(pbuf, env, from, to); 01993 if (r != 0) return r; 01994 } 01995 } 01996 } 01997 else if (not1 == 0) { /* 1 AND (not 2) */ 01998 for (i = 0; i < n1; i++) { 01999 from1 = data1[i*2]; 02000 to1 = data1[i*2+1]; 02001 r = and_code_range1(pbuf, env, from1, to1, data2, n2); 02002 if (r != 0) return r; 02003 } 02004 } 02005 02006 return 0; 02007 } 02008 02009 static int 02010 and_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env) 02011 { 02012 OnigEncoding enc = env->enc; 02013 int r, not1, not2; 02014 BBuf *buf1, *buf2, *pbuf = 0; 02015 BitSetRef bsr1, bsr2; 02016 BitSet bs1, bs2; 02017 02018 not1 = IS_NCCLASS_NOT(dest); 02019 bsr1 = dest->bs; 02020 buf1 = dest->mbuf; 02021 not2 = IS_NCCLASS_NOT(cc); 02022 bsr2 = cc->bs; 02023 buf2 = cc->mbuf; 02024 02025 if (not1 != 0) { 02026 bitset_invert_to(bsr1, bs1); 02027 bsr1 = bs1; 02028 } 02029 if (not2 != 0) { 02030 bitset_invert_to(bsr2, bs2); 02031 bsr2 = bs2; 02032 } 02033 bitset_and(bsr1, bsr2); 02034 if (bsr1 != dest->bs) { 02035 bitset_copy(dest->bs, bsr1); 02036 bsr1 = dest->bs; 02037 } 02038 if (not1 != 0) { 02039 bitset_invert(dest->bs); 02040 } 02041 02042 if (! 
ONIGENC_IS_SINGLEBYTE(enc)) { 02043 if (not1 != 0 && not2 != 0) { 02044 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf, env); 02045 } 02046 else { 02047 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf, env); 02048 if (r == 0 && not1 != 0) { 02049 BBuf *tbuf = 0; 02050 r = not_code_range_buf(enc, pbuf, &tbuf, env); 02051 bbuf_free(pbuf); 02052 pbuf = tbuf; 02053 } 02054 } 02055 if (r != 0) { 02056 bbuf_free(pbuf); 02057 return r; 02058 } 02059 02060 dest->mbuf = pbuf; 02061 bbuf_free(buf1); 02062 return r; 02063 } 02064 return 0; 02065 } 02066 02067 static int 02068 or_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env) 02069 { 02070 OnigEncoding enc = env->enc; 02071 int r, not1, not2; 02072 BBuf *buf1, *buf2, *pbuf = 0; 02073 BitSetRef bsr1, bsr2; 02074 BitSet bs1, bs2; 02075 02076 not1 = IS_NCCLASS_NOT(dest); 02077 bsr1 = dest->bs; 02078 buf1 = dest->mbuf; 02079 not2 = IS_NCCLASS_NOT(cc); 02080 bsr2 = cc->bs; 02081 buf2 = cc->mbuf; 02082 02083 if (not1 != 0) { 02084 bitset_invert_to(bsr1, bs1); 02085 bsr1 = bs1; 02086 } 02087 if (not2 != 0) { 02088 bitset_invert_to(bsr2, bs2); 02089 bsr2 = bs2; 02090 } 02091 bitset_or(bsr1, bsr2); 02092 if (bsr1 != dest->bs) { 02093 bitset_copy(dest->bs, bsr1); 02094 bsr1 = dest->bs; 02095 } 02096 if (not1 != 0) { 02097 bitset_invert(dest->bs); 02098 } 02099 02100 if (! 
ONIGENC_IS_SINGLEBYTE(enc)) { 02101 if (not1 != 0 && not2 != 0) { 02102 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf, env); 02103 } 02104 else { 02105 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf, env); 02106 if (r == 0 && not1 != 0) { 02107 BBuf *tbuf = 0; 02108 r = not_code_range_buf(enc, pbuf, &tbuf, env); 02109 bbuf_free(pbuf); 02110 pbuf = tbuf; 02111 } 02112 } 02113 if (r != 0) { 02114 bbuf_free(pbuf); 02115 return r; 02116 } 02117 02118 dest->mbuf = pbuf; 02119 bbuf_free(buf1); 02120 return r; 02121 } 02122 else 02123 return 0; 02124 } 02125 02126 static void UNKNOWN_ESC_WARN(ScanEnv *env, int c); 02127 02128 static int 02129 conv_backslash_value(int c, ScanEnv* env) 02130 { 02131 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) { 02132 switch (c) { 02133 case 'n': return '\n'; 02134 case 't': return '\t'; 02135 case 'r': return '\r'; 02136 case 'f': return '\f'; 02137 case 'a': return '\007'; 02138 case 'b': return '\010'; 02139 case 'e': return '\033'; 02140 case 'v': 02141 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB)) 02142 return '\v'; 02143 break; 02144 02145 default: 02146 if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) 02147 UNKNOWN_ESC_WARN(env, c); 02148 break; 02149 } 02150 } 02151 return c; 02152 } 02153 02154 #if 0 /* no invalid quantifier */ 02155 static int 02156 is_invalid_quantifier_target(Node* node) 02157 { 02158 switch (NTYPE(node)) { 02159 case NT_ANCHOR: 02160 return 1; 02161 break; 02162 02163 case NT_ENCLOSE: 02164 /* allow enclosed elements */ 02165 /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */ 02166 break; 02167 02168 case NT_LIST: 02169 do { 02170 if (! 
is_invalid_quantifier_target(NCAR(node))) return 0; 02171 } while (IS_NOT_NULL(node = NCDR(node))); 02172 return 0; 02173 break; 02174 02175 case NT_ALT: 02176 do { 02177 if (is_invalid_quantifier_target(NCAR(node))) return 1; 02178 } while (IS_NOT_NULL(node = NCDR(node))); 02179 break; 02180 02181 default: 02182 break; 02183 } 02184 return 0; 02185 } 02186 #else 02187 #define is_invalid_quantifier_target(node) 0 02188 #endif 02189 02190 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */ 02191 static int 02192 popular_quantifier_num(QtfrNode* q) 02193 { 02194 if (q->greedy) { 02195 if (q->lower == 0) { 02196 if (q->upper == 1) return 0; 02197 else if (IS_REPEAT_INFINITE(q->upper)) return 1; 02198 } 02199 else if (q->lower == 1) { 02200 if (IS_REPEAT_INFINITE(q->upper)) return 2; 02201 } 02202 } 02203 else { 02204 if (q->lower == 0) { 02205 if (q->upper == 1) return 3; 02206 else if (IS_REPEAT_INFINITE(q->upper)) return 4; 02207 } 02208 else if (q->lower == 1) { 02209 if (IS_REPEAT_INFINITE(q->upper)) return 5; 02210 } 02211 } 02212 return -1; 02213 } 02214 02215 02216 enum ReduceType { 02217 RQ_ASIS = 0, /* as is */ 02218 RQ_DEL = 1, /* delete parent */ 02219 RQ_A, /* to '*' */ 02220 RQ_AQ, /* to '*?' */ 02221 RQ_QQ, /* to '??' */ 02222 RQ_P_QQ, /* to '+)??' */ 02223 RQ_PQ_Q /* to '+?)?' */ 02224 }; 02225 02226 static enum ReduceType const ReduceTypeTable[6][6] = { 02227 {RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS}, /* '?' */ 02228 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL}, /* '*' */ 02229 {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */ 02230 {RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */ 02231 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */ 02232 {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' 
*/ 02233 }; 02234 02235 extern void 02236 onig_reduce_nested_quantifier(Node* pnode, Node* cnode) 02237 { 02238 int pnum, cnum; 02239 QtfrNode *p, *c; 02240 02241 p = NQTFR(pnode); 02242 c = NQTFR(cnode); 02243 pnum = popular_quantifier_num(p); 02244 cnum = popular_quantifier_num(c); 02245 if (pnum < 0 || cnum < 0) return ; 02246 02247 switch(ReduceTypeTable[cnum][pnum]) { 02248 case RQ_DEL: 02249 *pnode = *cnode; 02250 break; 02251 case RQ_A: 02252 p->target = c->target; 02253 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1; 02254 break; 02255 case RQ_AQ: 02256 p->target = c->target; 02257 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0; 02258 break; 02259 case RQ_QQ: 02260 p->target = c->target; 02261 p->lower = 0; p->upper = 1; p->greedy = 0; 02262 break; 02263 case RQ_P_QQ: 02264 p->target = cnode; 02265 p->lower = 0; p->upper = 1; p->greedy = 0; 02266 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1; 02267 return ; 02268 break; 02269 case RQ_PQ_Q: 02270 p->target = cnode; 02271 p->lower = 0; p->upper = 1; p->greedy = 1; 02272 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0; 02273 return ; 02274 break; 02275 case RQ_ASIS: 02276 p->target = cnode; 02277 return ; 02278 break; 02279 } 02280 02281 c->target = NULL_NODE; 02282 onig_node_free(cnode); 02283 } 02284 02285 02286 enum TokenSyms { 02287 TK_EOT = 0, /* end of token */ 02288 TK_RAW_BYTE = 1, 02289 TK_CHAR, 02290 TK_STRING, 02291 TK_CODE_POINT, 02292 TK_ANYCHAR, 02293 TK_CHAR_TYPE, 02294 TK_BACKREF, 02295 TK_CALL, 02296 TK_ANCHOR, 02297 TK_OP_REPEAT, 02298 TK_INTERVAL, 02299 TK_ANYCHAR_ANYTIME, /* SQL '%' == .* */ 02300 TK_ALT, 02301 TK_SUBEXP_OPEN, 02302 TK_SUBEXP_CLOSE, 02303 TK_CC_OPEN, 02304 TK_QUOTE_OPEN, 02305 TK_CHAR_PROPERTY, /* \p{...}, \P{...} */ 02306 /* in cc */ 02307 TK_CC_CLOSE, 02308 TK_CC_RANGE, 02309 TK_POSIX_BRACKET_OPEN, 02310 TK_CC_AND, /* && */ 02311 TK_CC_CC_OPEN /* [ */ 02312 }; 02313 02314 typedef struct { 02315 enum TokenSyms type; 02316 int escaped; 
02317 int base; /* is number: 8, 16 (used in [....]) */ 02318 UChar* backp; 02319 union { 02320 UChar* s; 02321 int c; 02322 OnigCodePoint code; 02323 int anchor; 02324 int subtype; 02325 struct { 02326 int lower; 02327 int upper; 02328 int greedy; 02329 int possessive; 02330 } repeat; 02331 struct { 02332 int num; 02333 int ref1; 02334 int* refs; 02335 int by_name; 02336 #ifdef USE_BACKREF_WITH_LEVEL 02337 int exist_level; 02338 int level; /* \k<name+n> */ 02339 #endif 02340 } backref; 02341 struct { 02342 UChar* name; 02343 UChar* name_end; 02344 int gnum; 02345 } call; 02346 struct { 02347 int ctype; 02348 int not; 02349 } prop; 02350 } u; 02351 } OnigToken; 02352 02353 02354 static int 02355 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) 02356 { 02357 int low, up, syn_allow, non_low = 0; 02358 int r = 0; 02359 OnigCodePoint c; 02360 OnigEncoding enc = env->enc; 02361 UChar* p = *src; 02362 PFETCH_READY; 02363 02364 syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL); 02365 02366 if (PEND) { 02367 if (syn_allow) 02368 return 1; /* "....{" : OK! */ 02369 else 02370 return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */ 02371 } 02372 02373 if (! 
syn_allow) { 02374 c = PPEEK; 02375 if (c == ')' || c == '(' || c == '|') { 02376 return ONIGERR_END_PATTERN_AT_LEFT_BRACE; 02377 } 02378 } 02379 02380 low = onig_scan_unsigned_number(&p, end, env->enc); 02381 if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; 02382 if (low > ONIG_MAX_REPEAT_NUM) 02383 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; 02384 02385 if (p == *src) { /* can't read low */ 02386 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) { 02387 /* allow {,n} as {0,n} */ 02388 low = 0; 02389 non_low = 1; 02390 } 02391 else 02392 goto invalid; 02393 } 02394 02395 if (PEND) goto invalid; 02396 PFETCH(c); 02397 if (c == ',') { 02398 UChar* prev = p; 02399 up = onig_scan_unsigned_number(&p, end, env->enc); 02400 if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; 02401 if (up > ONIG_MAX_REPEAT_NUM) 02402 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; 02403 02404 if (p == prev) { 02405 if (non_low != 0) 02406 goto invalid; 02407 up = REPEAT_INFINITE; /* {n,} : {n,infinite} */ 02408 } 02409 } 02410 else { 02411 if (non_low != 0) 02412 goto invalid; 02413 02414 PUNFETCH; 02415 up = low; /* {n} : exact n times */ 02416 r = 2; /* fixed */ 02417 } 02418 02419 if (PEND) goto invalid; 02420 PFETCH(c); 02421 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) { 02422 if (c != MC_ESC(env->syntax)) goto invalid; 02423 PFETCH(c); 02424 } 02425 if (c != '}') goto invalid; 02426 02427 if (!IS_REPEAT_INFINITE(up) && low > up) { 02428 return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE; 02429 } 02430 02431 tok->type = TK_INTERVAL; 02432 tok->u.repeat.lower = low; 02433 tok->u.repeat.upper = up; 02434 *src = p; 02435 return r; /* 0: normal {n,m}, 2: fixed {n} */ 02436 02437 invalid: 02438 if (syn_allow) 02439 return 1; /* OK */ 02440 else 02441 return ONIGERR_INVALID_REPEAT_RANGE_PATTERN; 02442 } 02443 02444 /* \M-, \C-, \c, or \... 
*/ 02445 static int 02446 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) 02447 { 02448 int v; 02449 OnigCodePoint c; 02450 OnigEncoding enc = env->enc; 02451 UChar* p = *src; 02452 PFETCH_READY; 02453 02454 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; 02455 02456 PFETCH(c); 02457 switch (c) { 02458 case 'M': 02459 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) { 02460 if (PEND) return ONIGERR_END_PATTERN_AT_META; 02461 PFETCH(c); 02462 if (c != '-') return ONIGERR_META_CODE_SYNTAX; 02463 if (PEND) return ONIGERR_END_PATTERN_AT_META; 02464 PFETCH(c); 02465 if (c == MC_ESC(env->syntax)) { 02466 v = fetch_escaped_value(&p, end, env); 02467 if (v < 0) return v; 02468 c = (OnigCodePoint )v; 02469 } 02470 c = ((c & 0xff) | 0x80); 02471 } 02472 else 02473 goto backslash; 02474 break; 02475 02476 case 'C': 02477 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) { 02478 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL; 02479 PFETCH(c); 02480 if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX; 02481 goto control; 02482 } 02483 else 02484 goto backslash; 02485 02486 case 'c': 02487 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) { 02488 control: 02489 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL; 02490 PFETCH(c); 02491 if (c == '?') { 02492 c = 0177; 02493 } 02494 else { 02495 if (c == MC_ESC(env->syntax)) { 02496 v = fetch_escaped_value(&p, end, env); 02497 if (v < 0) return v; 02498 c = (OnigCodePoint )v; 02499 } 02500 c &= 0x9f; 02501 } 02502 break; 02503 } 02504 /* fall through */ 02505 02506 default: 02507 { 02508 backslash: 02509 c = conv_backslash_value(c, env); 02510 } 02511 break; 02512 } 02513 02514 *src = p; 02515 return c; 02516 } 02517 02518 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env); 02519 02520 static OnigCodePoint 02521 get_name_end_code_point(OnigCodePoint start) 02522 { 02523 switch (start) { 02524 case '<': return (OnigCodePoint )'>'; break; 02525 case 
'\'': return (OnigCodePoint )'\''; break; 02526 default: 02527 break; 02528 } 02529 02530 return (OnigCodePoint )0; 02531 } 02532 02533 #ifdef USE_NAMED_GROUP 02534 #ifdef USE_BACKREF_WITH_LEVEL 02535 /* 02536 \k<name+n>, \k<name-n> 02537 \k<num+n>, \k<num-n> 02538 \k<-num+n>, \k<-num-n> 02539 */ 02540 static int 02541 fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, 02542 UChar** rname_end, ScanEnv* env, 02543 int* rback_num, int* rlevel) 02544 { 02545 int r, sign, is_num, exist_level; 02546 OnigCodePoint end_code; 02547 OnigCodePoint c = 0; 02548 OnigEncoding enc = env->enc; 02549 UChar *name_end; 02550 UChar *pnum_head; 02551 UChar *p = *src; 02552 PFETCH_READY; 02553 02554 *rback_num = 0; 02555 is_num = exist_level = 0; 02556 sign = 1; 02557 pnum_head = *src; 02558 02559 end_code = get_name_end_code_point(start_code); 02560 02561 name_end = end; 02562 r = 0; 02563 if (PEND) { 02564 return ONIGERR_EMPTY_GROUP_NAME; 02565 } 02566 else { 02567 PFETCH(c); 02568 if (c == end_code) 02569 return ONIGERR_EMPTY_GROUP_NAME; 02570 02571 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 02572 is_num = 1; 02573 } 02574 else if (c == '-') { 02575 is_num = 2; 02576 sign = -1; 02577 pnum_head = p; 02578 } 02579 else if (!ONIGENC_IS_CODE_WORD(enc, c)) { 02580 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 02581 } 02582 } 02583 02584 while (!PEND) { 02585 name_end = p; 02586 PFETCH(c); 02587 if (c == end_code || c == ')' || c == '+' || c == '-') { 02588 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME; 02589 break; 02590 } 02591 02592 if (is_num != 0) { 02593 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 02594 is_num = 1; 02595 } 02596 else { 02597 r = ONIGERR_INVALID_GROUP_NAME; 02598 is_num = 0; 02599 } 02600 } 02601 else if (!ONIGENC_IS_CODE_WORD(enc, c)) { 02602 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 02603 } 02604 } 02605 02606 if (r == 0 && c != end_code) { 02607 if (c == '+' || c == '-') { 02608 int level; 02609 int flag = (c == '-' ? -1 : 1); 02610 02611 PFETCH(c); 02612 if (! 
ONIGENC_IS_CODE_DIGIT(enc, c)) goto err; 02613 PUNFETCH; 02614 level = onig_scan_unsigned_number(&p, end, enc); 02615 if (level < 0) return ONIGERR_TOO_BIG_NUMBER; 02616 *rlevel = (level * flag); 02617 exist_level = 1; 02618 02619 PFETCH(c); 02620 if (c == end_code) 02621 goto end; 02622 } 02623 02624 err: 02625 r = ONIGERR_INVALID_GROUP_NAME; 02626 name_end = end; 02627 } 02628 02629 end: 02630 if (r == 0) { 02631 if (is_num != 0) { 02632 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); 02633 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; 02634 else if (*rback_num == 0) goto err; 02635 02636 *rback_num *= sign; 02637 } 02638 02639 *rname_end = name_end; 02640 *src = p; 02641 return (exist_level ? 1 : 0); 02642 } 02643 else { 02644 onig_scan_env_set_error_string(env, r, *src, name_end); 02645 return r; 02646 } 02647 } 02648 #endif /* USE_BACKREF_WITH_LEVEL */ 02649 02650 /* 02651 def: 0 -> define name (don't allow number name) 02652 1 -> reference name (allow number name) 02653 */ 02654 static int 02655 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, 02656 UChar** rname_end, ScanEnv* env, int* rback_num, int ref) 02657 { 02658 int r, is_num, sign; 02659 OnigCodePoint end_code; 02660 OnigCodePoint c = 0; 02661 OnigEncoding enc = env->enc; 02662 UChar *name_end; 02663 UChar *pnum_head; 02664 UChar *p = *src; 02665 PFETCH_READY; 02666 02667 *rback_num = 0; 02668 02669 end_code = get_name_end_code_point(start_code); 02670 02671 name_end = end; 02672 pnum_head = *src; 02673 r = 0; 02674 is_num = 0; 02675 sign = 1; 02676 if (PEND) { 02677 return ONIGERR_EMPTY_GROUP_NAME; 02678 } 02679 else { 02680 PFETCH(c); 02681 if (c == end_code) 02682 return ONIGERR_EMPTY_GROUP_NAME; 02683 02684 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 02685 if (ref == 1) 02686 is_num = 1; 02687 else { 02688 r = ONIGERR_INVALID_GROUP_NAME; 02689 is_num = 0; 02690 } 02691 } 02692 else if (c == '-') { 02693 if (ref == 1) { 02694 is_num = 2; 02695 sign = -1; 02696 
pnum_head = p; 02697 } 02698 else { 02699 r = ONIGERR_INVALID_GROUP_NAME; 02700 is_num = 0; 02701 } 02702 } 02703 else if (!ONIGENC_IS_CODE_WORD(enc, c)) { 02704 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 02705 } 02706 } 02707 02708 if (r == 0) { 02709 while (!PEND) { 02710 name_end = p; 02711 PFETCH(c); 02712 if (c == end_code || c == ')') { 02713 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME; 02714 break; 02715 } 02716 02717 if (is_num != 0) { 02718 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 02719 is_num = 1; 02720 } 02721 else { 02722 if (!ONIGENC_IS_CODE_WORD(enc, c)) 02723 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 02724 else 02725 r = ONIGERR_INVALID_GROUP_NAME; 02726 02727 is_num = 0; 02728 } 02729 } 02730 else { 02731 if (!ONIGENC_IS_CODE_WORD(enc, c)) { 02732 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 02733 } 02734 } 02735 } 02736 02737 if (c != end_code) { 02738 r = ONIGERR_INVALID_GROUP_NAME; 02739 name_end = end; 02740 } 02741 02742 if (is_num != 0) { 02743 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); 02744 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; 02745 else if (*rback_num == 0) { 02746 r = ONIGERR_INVALID_GROUP_NAME; 02747 goto err; 02748 } 02749 02750 *rback_num *= sign; 02751 } 02752 02753 *rname_end = name_end; 02754 *src = p; 02755 return 0; 02756 } 02757 else { 02758 while (!PEND) { 02759 name_end = p; 02760 PFETCH(c); 02761 if (c == end_code || c == ')') 02762 break; 02763 } 02764 if (PEND) 02765 name_end = end; 02766 02767 err: 02768 onig_scan_env_set_error_string(env, r, *src, name_end); 02769 return r; 02770 } 02771 } 02772 #else 02773 static int 02774 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, 02775 UChar** rname_end, ScanEnv* env, int* rback_num, int ref) 02776 { 02777 int r, is_num, sign; 02778 OnigCodePoint end_code; 02779 OnigCodePoint c = 0; 02780 UChar *name_end; 02781 OnigEncoding enc = env->enc; 02782 UChar *pnum_head; 02783 UChar *p = *src; 02784 PFETCH_READY; 02785 02786 *rback_num = 0; 02787 
02788 end_code = get_name_end_code_point(start_code); 02789 02790 *rname_end = name_end = end; 02791 r = 0; 02792 pnum_head = *src; 02793 is_num = 0; 02794 sign = 1; 02795 02796 if (PEND) { 02797 return ONIGERR_EMPTY_GROUP_NAME; 02798 } 02799 else { 02800 PFETCH(c); 02801 if (c == end_code) 02802 return ONIGERR_EMPTY_GROUP_NAME; 02803 02804 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 02805 is_num = 1; 02806 } 02807 else if (c == '-') { 02808 is_num = 2; 02809 sign = -1; 02810 pnum_head = p; 02811 } 02812 else { 02813 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 02814 } 02815 } 02816 02817 while (!PEND) { 02818 name_end = p; 02819 02820 PFETCH(c); 02821 if (c == end_code || c == ')') break; 02822 if (! ONIGENC_IS_CODE_DIGIT(enc, c)) 02823 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 02824 } 02825 if (r == 0 && c != end_code) { 02826 r = ONIGERR_INVALID_GROUP_NAME; 02827 name_end = end; 02828 } 02829 02830 if (r == 0) { 02831 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); 02832 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; 02833 else if (*rback_num == 0) { 02834 r = ONIGERR_INVALID_GROUP_NAME; 02835 goto err; 02836 } 02837 *rback_num *= sign; 02838 02839 *rname_end = name_end; 02840 *src = p; 02841 return 0; 02842 } 02843 else { 02844 err: 02845 onig_scan_env_set_error_string(env, r, *src, name_end); 02846 return r; 02847 } 02848 } 02849 #endif /* USE_NAMED_GROUP */ 02850 02851 void onig_vsnprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc, 02852 UChar* pat, UChar* pat_end, const UChar *fmt, va_list args); 02853 02854 static void 02855 onig_syntax_warn(ScanEnv *env, const char *fmt, ...) 
02856 { 02857 va_list args; 02858 UChar buf[WARN_BUFSIZE]; 02859 va_start(args, fmt); 02860 onig_vsnprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, 02861 env->pattern, env->pattern_end, 02862 (const UChar *)fmt, args); 02863 va_end(args); 02864 if (env->sourcefile == NULL) 02865 rb_warn("%s", (char *)buf); 02866 else 02867 rb_compile_warn(env->sourcefile, env->sourceline, "%s", (char *)buf); 02868 } 02869 02870 static void 02871 CC_ESC_WARN(ScanEnv *env, UChar *c) 02872 { 02873 if (onig_warn == onig_null_warn) return ; 02874 02875 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) && 02876 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) { 02877 onig_syntax_warn(env, "character class has '%s' without escape", c); 02878 } 02879 } 02880 02881 static void 02882 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c) 02883 { 02884 if (onig_warn == onig_null_warn) return ; 02885 02886 if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) { 02887 onig_syntax_warn(env, "regular expression has '%s' without escape", c); 02888 } 02889 } 02890 02891 static void 02892 CC_DUP_WARN(ScanEnv *env) 02893 { 02894 if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ; 02895 02896 if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_DUP) && 02897 !((env)->warnings_flag & ONIG_SYN_WARN_CC_DUP)) { 02898 (env)->warnings_flag |= ONIG_SYN_WARN_CC_DUP; 02899 onig_syntax_warn(env, "character class has duplicated range"); 02900 } 02901 } 02902 02903 static void 02904 UNKNOWN_ESC_WARN(ScanEnv *env, int c) 02905 { 02906 if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ; 02907 onig_syntax_warn(env, "Unknown escape \\%c is ignored", c); 02908 } 02909 02910 static UChar* 02911 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to, 02912 UChar **next, OnigEncoding enc) 02913 { 02914 int i; 02915 OnigCodePoint x; 02916 UChar *q; 02917 UChar *p = from; 02918 02919 while (p < to) { 02920 x = ONIGENC_MBC_TO_CODE(enc, p, to); 02921 q 
= p + enclen(enc, p, to); 02922 if (x == s[0]) { 02923 for (i = 1; i < n && q < to; i++) { 02924 x = ONIGENC_MBC_TO_CODE(enc, q, to); 02925 if (x != s[i]) break; 02926 q += enclen(enc, q, to); 02927 } 02928 if (i >= n) { 02929 if (IS_NOT_NULL(next)) 02930 *next = q; 02931 return p; 02932 } 02933 } 02934 p = q; 02935 } 02936 return NULL_UCHARP; 02937 } 02938 02939 static int 02940 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, 02941 OnigCodePoint bad, OnigEncoding enc, const OnigSyntaxType* syn) 02942 { 02943 int i, in_esc; 02944 OnigCodePoint x; 02945 UChar *q; 02946 UChar *p = from; 02947 02948 in_esc = 0; 02949 while (p < to) { 02950 if (in_esc) { 02951 in_esc = 0; 02952 p += enclen(enc, p, to); 02953 } 02954 else { 02955 x = ONIGENC_MBC_TO_CODE(enc, p, to); 02956 q = p + enclen(enc, p, to); 02957 if (x == s[0]) { 02958 for (i = 1; i < n && q < to; i++) { 02959 x = ONIGENC_MBC_TO_CODE(enc, q, to); 02960 if (x != s[i]) break; 02961 q += enclen(enc, q, to); 02962 } 02963 if (i >= n) return 1; 02964 p += enclen(enc, p, to); 02965 } 02966 else { 02967 x = ONIGENC_MBC_TO_CODE(enc, p, to); 02968 if (x == bad) return 0; 02969 else if (x == MC_ESC(syn)) in_esc = 1; 02970 p = q; 02971 } 02972 } 02973 } 02974 return 0; 02975 } 02976 02977 static int 02978 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) 02979 { 02980 int num; 02981 OnigCodePoint c, c2; 02982 const OnigSyntaxType* syn = env->syntax; 02983 OnigEncoding enc = env->enc; 02984 UChar* prev; 02985 UChar* p = *src; 02986 PFETCH_READY; 02987 02988 if (PEND) { 02989 tok->type = TK_EOT; 02990 return tok->type; 02991 } 02992 02993 PFETCH(c); 02994 tok->type = TK_CHAR; 02995 tok->base = 0; 02996 tok->u.c = c; 02997 tok->escaped = 0; 02998 02999 if (c == ']') { 03000 tok->type = TK_CC_CLOSE; 03001 } 03002 else if (c == '-') { 03003 tok->type = TK_CC_RANGE; 03004 } 03005 else if (c == MC_ESC(syn)) { 03006 if (! 
IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) 03007 goto end; 03008 03009 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; 03010 03011 PFETCH(c); 03012 tok->escaped = 1; 03013 tok->u.c = c; 03014 switch (c) { 03015 case 'w': 03016 tok->type = TK_CHAR_TYPE; 03017 tok->u.prop.ctype = ONIGENC_CTYPE_W; 03018 tok->u.prop.not = 0; 03019 break; 03020 case 'W': 03021 tok->type = TK_CHAR_TYPE; 03022 tok->u.prop.ctype = ONIGENC_CTYPE_W; 03023 tok->u.prop.not = 1; 03024 break; 03025 case 'd': 03026 tok->type = TK_CHAR_TYPE; 03027 tok->u.prop.ctype = ONIGENC_CTYPE_D; 03028 tok->u.prop.not = 0; 03029 break; 03030 case 'D': 03031 tok->type = TK_CHAR_TYPE; 03032 tok->u.prop.ctype = ONIGENC_CTYPE_D; 03033 tok->u.prop.not = 1; 03034 break; 03035 case 's': 03036 tok->type = TK_CHAR_TYPE; 03037 tok->u.prop.ctype = ONIGENC_CTYPE_S; 03038 tok->u.prop.not = 0; 03039 break; 03040 case 'S': 03041 tok->type = TK_CHAR_TYPE; 03042 tok->u.prop.ctype = ONIGENC_CTYPE_S; 03043 tok->u.prop.not = 1; 03044 break; 03045 case 'h': 03046 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; 03047 tok->type = TK_CHAR_TYPE; 03048 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; 03049 tok->u.prop.not = 0; 03050 break; 03051 case 'H': 03052 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; 03053 tok->type = TK_CHAR_TYPE; 03054 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; 03055 tok->u.prop.not = 1; 03056 break; 03057 03058 case 'p': 03059 case 'P': 03060 c2 = PPEEK; 03061 if (c2 == '{' && 03062 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { 03063 PINC; 03064 tok->type = TK_CHAR_PROPERTY; 03065 tok->u.prop.not = (c == 'P' ? 1 : 0); 03066 03067 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { 03068 PFETCH(c2); 03069 if (c2 == '^') { 03070 tok->u.prop.not = (tok->u.prop.not == 0 ? 
1 : 0); 03071 } 03072 else 03073 PUNFETCH; 03074 } 03075 } 03076 else { 03077 onig_syntax_warn(env, "invalid Unicode Property \\%c", c); 03078 } 03079 break; 03080 03081 case 'x': 03082 if (PEND) break; 03083 03084 prev = p; 03085 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { 03086 PINC; 03087 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); 03088 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; 03089 if (!PEND) { 03090 c2 = PPEEK; 03091 if (ONIGENC_IS_CODE_XDIGIT(enc, c2)) 03092 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; 03093 } 03094 03095 if (p > prev + enclen(enc, prev, end) && !PEND && (PPEEK_IS('}'))) { 03096 PINC; 03097 tok->type = TK_CODE_POINT; 03098 tok->base = 16; 03099 tok->u.code = (OnigCodePoint )num; 03100 } 03101 else { 03102 /* can't read nothing or invalid format */ 03103 p = prev; 03104 } 03105 } 03106 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { 03107 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc); 03108 if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 03109 if (p == prev) { /* can't read nothing. */ 03110 num = 0; /* but, it's not error */ 03111 } 03112 tok->type = TK_RAW_BYTE; 03113 tok->base = 16; 03114 tok->u.c = num; 03115 } 03116 break; 03117 03118 case 'u': 03119 if (PEND) break; 03120 03121 prev = p; 03122 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { 03123 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc); 03124 if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 03125 if (p == prev) { /* can't read nothing. 
*/ 03126 num = 0; /* but, it's not error */ 03127 } 03128 tok->type = TK_CODE_POINT; 03129 tok->base = 16; 03130 tok->u.code = (OnigCodePoint )num; 03131 } 03132 break; 03133 03134 case '0': 03135 case '1': case '2': case '3': case '4': case '5': case '6': case '7': 03136 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { 03137 PUNFETCH; 03138 prev = p; 03139 num = scan_unsigned_octal_number(&p, end, 3, enc); 03140 if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 03141 if (p == prev) { /* can't read nothing. */ 03142 num = 0; /* but, it's not error */ 03143 } 03144 tok->type = TK_RAW_BYTE; 03145 tok->base = 8; 03146 tok->u.c = num; 03147 } 03148 break; 03149 03150 default: 03151 PUNFETCH; 03152 num = fetch_escaped_value(&p, end, env); 03153 if (num < 0) return num; 03154 if (tok->u.c != num) { 03155 tok->u.code = (OnigCodePoint )num; 03156 tok->type = TK_CODE_POINT; 03157 } 03158 break; 03159 } 03160 } 03161 else if (c == '[') { 03162 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) { 03163 OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' }; 03164 tok->backp = p; /* point at '[' is readed */ 03165 PINC; 03166 if (str_exist_check_with_esc(send, 2, p, end, 03167 (OnigCodePoint )']', enc, syn)) { 03168 tok->type = TK_POSIX_BRACKET_OPEN; 03169 } 03170 else { 03171 PUNFETCH; 03172 goto cc_in_cc; 03173 } 03174 } 03175 else { 03176 cc_in_cc: 03177 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) { 03178 tok->type = TK_CC_CC_OPEN; 03179 } 03180 else { 03181 CC_ESC_WARN(env, (UChar* )"["); 03182 } 03183 } 03184 } 03185 else if (c == '&') { 03186 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) && 03187 !PEND && (PPEEK_IS('&'))) { 03188 PINC; 03189 tok->type = TK_CC_AND; 03190 } 03191 } 03192 03193 end: 03194 *src = p; 03195 return tok->type; 03196 } 03197 03198 static int 03199 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) 03200 { 03201 int r, num; 03202 OnigCodePoint c; 03203 OnigEncoding enc = env->enc; 03204 const 
OnigSyntaxType* syn = env->syntax; 03205 UChar* prev; 03206 UChar* p = *src; 03207 PFETCH_READY; 03208 03209 start: 03210 if (PEND) { 03211 tok->type = TK_EOT; 03212 return tok->type; 03213 } 03214 03215 tok->type = TK_STRING; 03216 tok->base = 0; 03217 tok->backp = p; 03218 03219 PFETCH(c); 03220 if (IS_MC_ESC_CODE(c, syn)) { 03221 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; 03222 03223 tok->backp = p; 03224 PFETCH(c); 03225 03226 tok->u.c = c; 03227 tok->escaped = 1; 03228 switch (c) { 03229 case '*': 03230 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break; 03231 tok->type = TK_OP_REPEAT; 03232 tok->u.repeat.lower = 0; 03233 tok->u.repeat.upper = REPEAT_INFINITE; 03234 goto greedy_check; 03235 break; 03236 03237 case '+': 03238 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break; 03239 tok->type = TK_OP_REPEAT; 03240 tok->u.repeat.lower = 1; 03241 tok->u.repeat.upper = REPEAT_INFINITE; 03242 goto greedy_check; 03243 break; 03244 03245 case '?': 03246 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break; 03247 tok->type = TK_OP_REPEAT; 03248 tok->u.repeat.lower = 0; 03249 tok->u.repeat.upper = 1; 03250 greedy_check: 03251 if (!PEND && PPEEK_IS('?') && 03252 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) { 03253 PFETCH(c); 03254 tok->u.repeat.greedy = 0; 03255 tok->u.repeat.possessive = 0; 03256 } 03257 else { 03258 possessive_check: 03259 if (!PEND && PPEEK_IS('+') && 03260 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) && 03261 tok->type != TK_INTERVAL) || 03262 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) && 03263 tok->type == TK_INTERVAL))) { 03264 PFETCH(c); 03265 tok->u.repeat.greedy = 1; 03266 tok->u.repeat.possessive = 1; 03267 } 03268 else { 03269 tok->u.repeat.greedy = 1; 03270 tok->u.repeat.possessive = 0; 03271 } 03272 } 03273 break; 03274 03275 case '{': 03276 if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break; 03277 r = fetch_range_quantifier(&p, end, tok, env); 03278 if (r < 0) return r; /* error */ 03279 if (r == 0) goto greedy_check; 03280 else if (r == 2) { /* {n} */ 03281 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) 03282 goto possessive_check; 03283 03284 goto greedy_check; 03285 } 03286 /* r == 1 : normal char */ 03287 break; 03288 03289 case '|': 03290 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break; 03291 tok->type = TK_ALT; 03292 break; 03293 03294 case '(': 03295 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break; 03296 tok->type = TK_SUBEXP_OPEN; 03297 break; 03298 03299 case ')': 03300 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break; 03301 tok->type = TK_SUBEXP_CLOSE; 03302 break; 03303 03304 case 'w': 03305 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; 03306 tok->type = TK_CHAR_TYPE; 03307 tok->u.prop.ctype = ONIGENC_CTYPE_W; 03308 tok->u.prop.not = 0; 03309 break; 03310 03311 case 'W': 03312 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; 03313 tok->type = TK_CHAR_TYPE; 03314 tok->u.prop.ctype = ONIGENC_CTYPE_W; 03315 tok->u.prop.not = 1; 03316 break; 03317 03318 case 'b': 03319 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; 03320 tok->type = TK_ANCHOR; 03321 tok->u.anchor = ANCHOR_WORD_BOUND; 03322 break; 03323 03324 case 'B': 03325 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; 03326 tok->type = TK_ANCHOR; 03327 tok->u.anchor = ANCHOR_NOT_WORD_BOUND; 03328 break; 03329 03330 #ifdef USE_WORD_BEGIN_END 03331 case '<': 03332 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; 03333 tok->type = TK_ANCHOR; 03334 tok->u.anchor = ANCHOR_WORD_BEGIN; 03335 break; 03336 03337 case '>': 03338 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; 03339 tok->type = TK_ANCHOR; 03340 tok->u.anchor = ANCHOR_WORD_END; 03341 break; 03342 #endif 03343 03344 case 's': 03345 if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; 03346 tok->type = TK_CHAR_TYPE; 03347 tok->u.prop.ctype = ONIGENC_CTYPE_S; 03348 tok->u.prop.not = 0; 03349 break; 03350 03351 case 'S': 03352 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; 03353 tok->type = TK_CHAR_TYPE; 03354 tok->u.prop.ctype = ONIGENC_CTYPE_S; 03355 tok->u.prop.not = 1; 03356 break; 03357 03358 case 'd': 03359 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; 03360 tok->type = TK_CHAR_TYPE; 03361 tok->u.prop.ctype = ONIGENC_CTYPE_D; 03362 tok->u.prop.not = 0; 03363 break; 03364 03365 case 'D': 03366 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; 03367 tok->type = TK_CHAR_TYPE; 03368 tok->u.prop.ctype = ONIGENC_CTYPE_D; 03369 tok->u.prop.not = 1; 03370 break; 03371 03372 case 'h': 03373 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; 03374 tok->type = TK_CHAR_TYPE; 03375 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; 03376 tok->u.prop.not = 0; 03377 break; 03378 03379 case 'H': 03380 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; 03381 tok->type = TK_CHAR_TYPE; 03382 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; 03383 tok->u.prop.not = 1; 03384 break; 03385 03386 case 'A': 03387 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; 03388 begin_buf: 03389 tok->type = TK_ANCHOR; 03390 tok->u.subtype = ANCHOR_BEGIN_BUF; 03391 break; 03392 03393 case 'Z': 03394 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; 03395 tok->type = TK_ANCHOR; 03396 tok->u.subtype = ANCHOR_SEMI_END_BUF; 03397 break; 03398 03399 case 'z': 03400 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; 03401 end_buf: 03402 tok->type = TK_ANCHOR; 03403 tok->u.subtype = ANCHOR_END_BUF; 03404 break; 03405 03406 case 'G': 03407 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break; 03408 tok->type = TK_ANCHOR; 03409 tok->u.subtype = ANCHOR_BEGIN_POSITION; 03410 break; 03411 03412 case '`': 03413 if (! 
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break; 03414 goto begin_buf; 03415 break; 03416 03417 case '\'': 03418 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break; 03419 goto end_buf; 03420 break; 03421 03422 case 'x': 03423 if (PEND) break; 03424 03425 prev = p; 03426 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { 03427 PINC; 03428 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); 03429 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; 03430 if (!PEND) { 03431 if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK)) 03432 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; 03433 } 03434 03435 if ((p > prev + enclen(enc, prev, end)) && !PEND && PPEEK_IS('}')) { 03436 PINC; 03437 tok->type = TK_CODE_POINT; 03438 tok->u.code = (OnigCodePoint )num; 03439 } 03440 else { 03441 /* can't read nothing or invalid format */ 03442 p = prev; 03443 } 03444 } 03445 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { 03446 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc); 03447 if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 03448 if (p == prev) { /* can't read nothing. */ 03449 num = 0; /* but, it's not error */ 03450 } 03451 tok->type = TK_RAW_BYTE; 03452 tok->base = 16; 03453 tok->u.c = num; 03454 } 03455 break; 03456 03457 case 'u': 03458 if (PEND) break; 03459 03460 prev = p; 03461 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { 03462 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc); 03463 if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 03464 if (p == prev) { /* can't read nothing. 
*/ 03465 num = 0; /* but, it's not error */ 03466 } 03467 tok->type = TK_CODE_POINT; 03468 tok->base = 16; 03469 tok->u.code = (OnigCodePoint )num; 03470 } 03471 break; 03472 03473 case '1': case '2': case '3': case '4': 03474 case '5': case '6': case '7': case '8': case '9': 03475 PUNFETCH; 03476 prev = p; 03477 num = onig_scan_unsigned_number(&p, end, enc); 03478 if (num < 0 || num > ONIG_MAX_BACKREF_NUM) { 03479 goto skip_backref; 03480 } 03481 03482 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && 03483 (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */ 03484 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { 03485 if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num])) 03486 return ONIGERR_INVALID_BACKREF; 03487 } 03488 03489 tok->type = TK_BACKREF; 03490 tok->u.backref.num = 1; 03491 tok->u.backref.ref1 = num; 03492 tok->u.backref.by_name = 0; 03493 #ifdef USE_BACKREF_WITH_LEVEL 03494 tok->u.backref.exist_level = 0; 03495 #endif 03496 break; 03497 } 03498 03499 skip_backref: 03500 if (c == '8' || c == '9') { 03501 /* normal char */ 03502 p = prev; PINC; 03503 break; 03504 } 03505 03506 p = prev; 03507 /* fall through */ 03508 case '0': 03509 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { 03510 prev = p; 03511 num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc); 03512 if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 03513 if (p == prev) { /* can't read nothing. */ 03514 num = 0; /* but, it's not error */ 03515 } 03516 tok->type = TK_RAW_BYTE; 03517 tok->base = 8; 03518 tok->u.c = num; 03519 } 03520 else if (c != '0') { 03521 PINC; 03522 } 03523 break; 03524 03525 #ifdef USE_NAMED_GROUP 03526 case 'k': 03527 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) { 03528 PFETCH(c); 03529 if (c == '<' || c == '\'') { 03530 UChar* name_end; 03531 int* backs; 03532 int back_num; 03533 03534 prev = p; 03535 03536 #ifdef USE_BACKREF_WITH_LEVEL 03537 name_end = NULL_UCHARP; /* no need. escape gcc warning. 
*/ 03538 r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end, 03539 env, &back_num, &tok->u.backref.level); 03540 if (r == 1) tok->u.backref.exist_level = 1; 03541 else tok->u.backref.exist_level = 0; 03542 #else 03543 r = fetch_name(&p, end, &name_end, env, &back_num, 1); 03544 #endif 03545 if (r < 0) return r; 03546 03547 if (back_num != 0) { 03548 if (back_num < 0) { 03549 back_num = BACKREF_REL_TO_ABS(back_num, env); 03550 if (back_num <= 0) 03551 return ONIGERR_INVALID_BACKREF; 03552 } 03553 03554 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { 03555 if (back_num > env->num_mem || 03556 IS_NULL(SCANENV_MEM_NODES(env)[back_num])) 03557 return ONIGERR_INVALID_BACKREF; 03558 } 03559 tok->type = TK_BACKREF; 03560 tok->u.backref.by_name = 0; 03561 tok->u.backref.num = 1; 03562 tok->u.backref.ref1 = back_num; 03563 } 03564 else { 03565 num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); 03566 if (num <= 0) { 03567 onig_scan_env_set_error_string(env, 03568 ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); 03569 return ONIGERR_UNDEFINED_NAME_REFERENCE; 03570 } 03571 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { 03572 int i; 03573 for (i = 0; i < num; i++) { 03574 if (backs[i] > env->num_mem || 03575 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]])) 03576 return ONIGERR_INVALID_BACKREF; 03577 } 03578 } 03579 03580 tok->type = TK_BACKREF; 03581 tok->u.backref.by_name = 1; 03582 if (num == 1) { 03583 tok->u.backref.num = 1; 03584 tok->u.backref.ref1 = backs[0]; 03585 } 03586 else { 03587 tok->u.backref.num = num; 03588 tok->u.backref.refs = backs; 03589 } 03590 } 03591 } 03592 else { 03593 PUNFETCH; 03594 onig_syntax_warn(env, "invalid back reference"); 03595 } 03596 } 03597 break; 03598 #endif 03599 03600 #ifdef USE_SUBEXP_CALL 03601 case 'g': 03602 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) { 03603 PFETCH(c); 03604 if (c == '<' || c == '\'') { 03605 int gnum; 03606 UChar* name_end; 03607 03608 prev = p; 03609 r = 
fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1); 03610 if (r < 0) return r; 03611 03612 tok->type = TK_CALL; 03613 tok->u.call.name = prev; 03614 tok->u.call.name_end = name_end; 03615 tok->u.call.gnum = gnum; 03616 } 03617 else { 03618 onig_syntax_warn(env, "invalid subexp call"); 03619 PUNFETCH; 03620 } 03621 } 03622 break; 03623 #endif 03624 03625 case 'Q': 03626 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) { 03627 tok->type = TK_QUOTE_OPEN; 03628 } 03629 break; 03630 03631 case 'p': 03632 case 'P': 03633 if (PPEEK_IS('{') && 03634 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { 03635 PINC; 03636 tok->type = TK_CHAR_PROPERTY; 03637 tok->u.prop.not = (c == 'P' ? 1 : 0); 03638 03639 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { 03640 PFETCH(c); 03641 if (c == '^') { 03642 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0); 03643 } 03644 else 03645 PUNFETCH; 03646 } 03647 } 03648 else { 03649 onig_syntax_warn(env, "invalid Unicode Property \\%c", c); 03650 } 03651 break; 03652 03653 default: 03654 PUNFETCH; 03655 num = fetch_escaped_value(&p, end, env); 03656 if (num < 0) return num; 03657 /* set_raw: */ 03658 if (tok->u.c != num) { 03659 tok->type = TK_CODE_POINT; 03660 tok->u.code = (OnigCodePoint )num; 03661 } 03662 else { /* string */ 03663 p = tok->backp + enclen(enc, tok->backp, end); 03664 } 03665 break; 03666 } 03667 } 03668 else { 03669 tok->u.c = c; 03670 tok->escaped = 0; 03671 03672 #ifdef USE_VARIABLE_META_CHARS 03673 if ((c != ONIG_INEFFECTIVE_META_CHAR) && 03674 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) { 03675 if (c == MC_ANYCHAR(syn)) 03676 goto any_char; 03677 else if (c == MC_ANYTIME(syn)) 03678 goto anytime; 03679 else if (c == MC_ZERO_OR_ONE_TIME(syn)) 03680 goto zero_or_one_time; 03681 else if (c == MC_ONE_OR_MORE_TIME(syn)) 03682 goto one_or_more_time; 03683 else if (c == MC_ANYCHAR_ANYTIME(syn)) { 03684 tok->type = TK_ANYCHAR_ANYTIME; 03685 goto out; 03686 } 03687 
} 03688 #endif 03689 03690 switch (c) { 03691 case '.': 03692 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break; 03693 #ifdef USE_VARIABLE_META_CHARS 03694 any_char: 03695 #endif 03696 tok->type = TK_ANYCHAR; 03697 break; 03698 03699 case '*': 03700 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break; 03701 #ifdef USE_VARIABLE_META_CHARS 03702 anytime: 03703 #endif 03704 tok->type = TK_OP_REPEAT; 03705 tok->u.repeat.lower = 0; 03706 tok->u.repeat.upper = REPEAT_INFINITE; 03707 goto greedy_check; 03708 break; 03709 03710 case '+': 03711 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break; 03712 #ifdef USE_VARIABLE_META_CHARS 03713 one_or_more_time: 03714 #endif 03715 tok->type = TK_OP_REPEAT; 03716 tok->u.repeat.lower = 1; 03717 tok->u.repeat.upper = REPEAT_INFINITE; 03718 goto greedy_check; 03719 break; 03720 03721 case '?': 03722 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break; 03723 #ifdef USE_VARIABLE_META_CHARS 03724 zero_or_one_time: 03725 #endif 03726 tok->type = TK_OP_REPEAT; 03727 tok->u.repeat.lower = 0; 03728 tok->u.repeat.upper = 1; 03729 goto greedy_check; 03730 break; 03731 03732 case '{': 03733 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break; 03734 r = fetch_range_quantifier(&p, end, tok, env); 03735 if (r < 0) return r; /* error */ 03736 if (r == 0) goto greedy_check; 03737 else if (r == 2) { /* {n} */ 03738 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) 03739 goto possessive_check; 03740 03741 goto greedy_check; 03742 } 03743 /* r == 1 : normal char */ 03744 break; 03745 03746 case '|': 03747 if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break; 03748 tok->type = TK_ALT; 03749 break; 03750 03751 case '(': 03752 if (PPEEK_IS('?') && 03753 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { 03754 PINC; 03755 if (PPEEK_IS('#')) { 03756 PFETCH(c); 03757 while (1) { 03758 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 03759 PFETCH(c); 03760 if (c == MC_ESC(syn)) { 03761 if (!PEND) PFETCH(c); 03762 } 03763 else { 03764 if (c == ')') break; 03765 } 03766 } 03767 goto start; 03768 } 03769 PUNFETCH; 03770 } 03771 03772 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; 03773 tok->type = TK_SUBEXP_OPEN; 03774 break; 03775 03776 case ')': 03777 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; 03778 tok->type = TK_SUBEXP_CLOSE; 03779 break; 03780 03781 case '^': 03782 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; 03783 tok->type = TK_ANCHOR; 03784 tok->u.subtype = (IS_SINGLELINE(env->option) 03785 ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE); 03786 break; 03787 03788 case '$': 03789 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; 03790 tok->type = TK_ANCHOR; 03791 tok->u.subtype = (IS_SINGLELINE(env->option) 03792 ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE); 03793 break; 03794 03795 case '[': 03796 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break; 03797 tok->type = TK_CC_OPEN; 03798 break; 03799 03800 case ']': 03801 if (*src > env->pattern) /* /].../ is allowed. 
*/ 03802 CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]"); 03803 break; 03804 03805 case '#': 03806 if (IS_EXTEND(env->option)) { 03807 while (!PEND) { 03808 PFETCH(c); 03809 if (ONIGENC_IS_CODE_NEWLINE(enc, c)) 03810 break; 03811 } 03812 goto start; 03813 break; 03814 } 03815 break; 03816 03817 case ' ': case '\t': case '\n': case '\r': case '\f': 03818 if (IS_EXTEND(env->option)) 03819 goto start; 03820 break; 03821 03822 default: 03823 /* string */ 03824 break; 03825 } 03826 } 03827 03828 #ifdef USE_VARIABLE_META_CHARS 03829 out: 03830 #endif 03831 *src = p; 03832 return tok->type; 03833 } 03834 03835 static int 03836 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not, 03837 ScanEnv* env, 03838 OnigCodePoint sb_out, const OnigCodePoint mbr[]) 03839 { 03840 int i, r; 03841 OnigCodePoint j; 03842 03843 int n = ONIGENC_CODE_RANGE_NUM(mbr); 03844 03845 if (not == 0) { 03846 for (i = 0; i < n; i++) { 03847 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i); 03848 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) { 03849 if (j >= sb_out) { 03850 if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) { 03851 r = add_code_range_to_buf(&(cc->mbuf), env, j, 03852 ONIGENC_CODE_RANGE_TO(mbr, i)); 03853 if (r != 0) return r; 03854 i++; 03855 } 03856 03857 goto sb_end; 03858 } 03859 BITSET_SET_BIT_CHKDUP(cc->bs, j); 03860 } 03861 } 03862 03863 sb_end: 03864 for ( ; i < n; i++) { 03865 r = add_code_range_to_buf(&(cc->mbuf), env, 03866 ONIGENC_CODE_RANGE_FROM(mbr, i), 03867 ONIGENC_CODE_RANGE_TO(mbr, i)); 03868 if (r != 0) return r; 03869 } 03870 } 03871 else { 03872 OnigCodePoint prev = 0; 03873 03874 for (i = 0; i < n; i++) { 03875 for (j = prev; 03876 j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) { 03877 if (j >= sb_out) { 03878 goto sb_end2; 03879 } 03880 BITSET_SET_BIT_CHKDUP(cc->bs, j); 03881 } 03882 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1; 03883 } 03884 for (j = prev; j < sb_out; j++) { 03885 BITSET_SET_BIT_CHKDUP(cc->bs, j); 03886 } 03887 03888 sb_end2: 03889 prev = sb_out; 03890 
03891 for (i = 0; i < n; i++) { 03892 if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) { 03893 r = add_code_range_to_buf(&(cc->mbuf), env, prev, 03894 ONIGENC_CODE_RANGE_FROM(mbr, i) - 1); 03895 if (r != 0) return r; 03896 } 03897 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1; 03898 } 03899 if (prev < 0x7fffffff) { 03900 r = add_code_range_to_buf(&(cc->mbuf), env, prev, 0x7fffffff); 03901 if (r != 0) return r; 03902 } 03903 } 03904 03905 return 0; 03906 } 03907 03908 static int 03909 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) 03910 { 03911 int c, r; 03912 const OnigCodePoint *ranges; 03913 OnigCodePoint sb_out; 03914 OnigEncoding enc = env->enc; 03915 03916 switch (ctype) { 03917 case ONIGENC_CTYPE_D: 03918 case ONIGENC_CTYPE_S: 03919 case ONIGENC_CTYPE_W: 03920 ctype ^= ONIGENC_CTYPE_SPECIAL_MASK; 03921 if (not != 0) { 03922 for (c = 0; c < SINGLE_BYTE_SIZE; c++) { 03923 if (! ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype)) 03924 BITSET_SET_BIT_CHKDUP(cc->bs, c); 03925 } 03926 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); 03927 } 03928 else { 03929 for (c = 0; c < SINGLE_BYTE_SIZE; c++) { 03930 if (ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype)) 03931 BITSET_SET_BIT_CHKDUP(cc->bs, c); 03932 } 03933 } 03934 return 0; 03935 break; 03936 } 03937 03938 r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges); 03939 if (r == 0) { 03940 return add_ctype_to_cc_by_range(cc, ctype, not, env, sb_out, ranges); 03941 } 03942 else if (r != ONIG_NO_SUPPORT_CONFIG) { 03943 return r; 03944 } 03945 03946 r = 0; 03947 switch (ctype) { 03948 case ONIGENC_CTYPE_ALPHA: 03949 case ONIGENC_CTYPE_BLANK: 03950 case ONIGENC_CTYPE_CNTRL: 03951 case ONIGENC_CTYPE_DIGIT: 03952 case ONIGENC_CTYPE_LOWER: 03953 case ONIGENC_CTYPE_PUNCT: 03954 case ONIGENC_CTYPE_SPACE: 03955 case ONIGENC_CTYPE_UPPER: 03956 case ONIGENC_CTYPE_XDIGIT: 03957 case ONIGENC_CTYPE_ASCII: 03958 case ONIGENC_CTYPE_ALNUM: 03959 if (not != 0) { 03960 for (c = 0; c < SINGLE_BYTE_SIZE; c++) { 03961 
if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) 03962 BITSET_SET_BIT_CHKDUP(cc->bs, c); 03963 } 03964 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); 03965 } 03966 else { 03967 for (c = 0; c < SINGLE_BYTE_SIZE; c++) { 03968 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) 03969 BITSET_SET_BIT_CHKDUP(cc->bs, c); 03970 } 03971 } 03972 break; 03973 03974 case ONIGENC_CTYPE_GRAPH: 03975 case ONIGENC_CTYPE_PRINT: 03976 if (not != 0) { 03977 for (c = 0; c < SINGLE_BYTE_SIZE; c++) { 03978 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) 03979 BITSET_SET_BIT_CHKDUP(cc->bs, c); 03980 } 03981 } 03982 else { 03983 for (c = 0; c < SINGLE_BYTE_SIZE; c++) { 03984 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) 03985 BITSET_SET_BIT_CHKDUP(cc->bs, c); 03986 } 03987 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); 03988 } 03989 break; 03990 03991 case ONIGENC_CTYPE_WORD: 03992 if (not == 0) { 03993 for (c = 0; c < SINGLE_BYTE_SIZE; c++) { 03994 if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT_CHKDUP(cc->bs, c); 03995 } 03996 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); 03997 } 03998 else { 03999 for (c = 0; c < SINGLE_BYTE_SIZE; c++) { 04000 if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */ 04001 && ! 
ONIGENC_IS_CODE_WORD(enc, c)) 04002 BITSET_SET_BIT_CHKDUP(cc->bs, c); 04003 } 04004 } 04005 break; 04006 04007 default: 04008 return ONIGERR_PARSER_BUG; 04009 break; 04010 } 04011 04012 return r; 04013 } 04014 04015 static int 04016 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) 04017 { 04018 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20 04019 #define POSIX_BRACKET_NAME_MIN_LEN 4 04020 04021 static const PosixBracketEntryType PBS[] = { 04022 { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 }, 04023 { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 }, 04024 { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 }, 04025 { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 }, 04026 { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 }, 04027 { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 }, 04028 { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 }, 04029 { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 }, 04030 { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 }, 04031 { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 }, 04032 { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 }, 04033 { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 }, 04034 { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 }, 04035 { (UChar* )"word", ONIGENC_CTYPE_WORD, 4 }, 04036 { (UChar* )NULL, -1, 0 } 04037 }; 04038 04039 const PosixBracketEntryType *pb; 04040 int not, i, r; 04041 OnigCodePoint c; 04042 OnigEncoding enc = env->enc; 04043 UChar *p = *src; 04044 PFETCH_READY; 04045 04046 if (PPEEK_IS('^')) { 04047 PINC; 04048 not = 1; 04049 } 04050 else 04051 not = 0; 04052 04053 if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3) 04054 goto not_posix_bracket; 04055 04056 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { 04057 if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) { 04058 p = (UChar* )onigenc_step(enc, p, end, pb->len); 04059 if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0) 04060 return ONIGERR_INVALID_POSIX_BRACKET_TYPE; 04061 04062 r = add_ctype_to_cc(cc, pb->ctype, not, env); 04063 if (r != 0) return 
r; 04064 04065 PINC; PINC; 04066 *src = p; 04067 return 0; 04068 } 04069 } 04070 04071 not_posix_bracket: 04072 c = 0; 04073 i = 0; 04074 while (!PEND && ((c = PPEEK) != ':') && c != ']') { 04075 PINC; 04076 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break; 04077 } 04078 if (c == ':' && ! PEND) { 04079 PINC; 04080 if (! PEND) { 04081 PFETCH(c); 04082 if (c == ']') 04083 return ONIGERR_INVALID_POSIX_BRACKET_TYPE; 04084 } 04085 } 04086 04087 return 1; /* 1: is not POSIX bracket, but no error. */ 04088 } 04089 04090 static int 04091 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) 04092 { 04093 int r; 04094 OnigCodePoint c; 04095 OnigEncoding enc = env->enc; 04096 UChar *prev, *start, *p = *src; 04097 PFETCH_READY; 04098 04099 r = 0; 04100 start = prev = p; 04101 04102 while (!PEND) { 04103 prev = p; 04104 PFETCH(c); 04105 if (c == '}') { 04106 r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev); 04107 if (r < 0) break; 04108 04109 *src = p; 04110 return r; 04111 } 04112 else if (c == '(' || c == ')' || c == '{' || c == '|') { 04113 r = ONIGERR_INVALID_CHAR_PROPERTY_NAME; 04114 break; 04115 } 04116 } 04117 04118 onig_scan_env_set_error_string(env, r, *src, prev); 04119 return r; 04120 } 04121 04122 static int 04123 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end, 04124 ScanEnv* env) 04125 { 04126 int r, ctype; 04127 CClassNode* cc; 04128 04129 ctype = fetch_char_property_to_ctype(src, end, env); 04130 if (ctype < 0) return ctype; 04131 04132 *np = node_new_cclass(); 04133 CHECK_NULL_RETURN_MEMERR(*np); 04134 cc = NCCLASS(*np); 04135 r = add_ctype_to_cc(cc, ctype, 0, env); 04136 if (r != 0) return r; 04137 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); 04138 04139 return 0; 04140 } 04141 04142 04143 enum CCSTATE { 04144 CCS_VALUE, 04145 CCS_RANGE, 04146 CCS_COMPLETE, 04147 CCS_START 04148 }; 04149 04150 enum CCVALTYPE { 04151 CCV_SB, 04152 CCV_CODE_POINT, 04153 CCV_CLASS 04154 }; 04155 04156 static int 04157 
next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type, 04158 enum CCSTATE* state, ScanEnv* env) 04159 { 04160 int r; 04161 04162 if (*state == CCS_RANGE) 04163 return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE; 04164 04165 if (*state == CCS_VALUE && *type != CCV_CLASS) { 04166 if (*type == CCV_SB) 04167 BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*vs)); 04168 else if (*type == CCV_CODE_POINT) { 04169 r = add_code_range(&(cc->mbuf), env, *vs, *vs); 04170 if (r < 0) return r; 04171 } 04172 } 04173 04174 *state = CCS_VALUE; 04175 *type = CCV_CLASS; 04176 return 0; 04177 } 04178 04179 static int 04180 next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, 04181 int* vs_israw, int v_israw, 04182 enum CCVALTYPE intype, enum CCVALTYPE* type, 04183 enum CCSTATE* state, ScanEnv* env) 04184 { 04185 int r; 04186 04187 switch (*state) { 04188 case CCS_VALUE: 04189 if (*type == CCV_SB) 04190 BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*vs)); 04191 else if (*type == CCV_CODE_POINT) { 04192 r = add_code_range(&(cc->mbuf), env, *vs, *vs); 04193 if (r < 0) return r; 04194 } 04195 break; 04196 04197 case CCS_RANGE: 04198 if (intype == *type) { 04199 if (intype == CCV_SB) { 04200 if (*vs > 0xff || v > 0xff) 04201 return ONIGERR_INVALID_CODE_POINT_VALUE; 04202 04203 if (*vs > v) { 04204 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) 04205 goto ccs_range_end; 04206 else 04207 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; 04208 } 04209 bitset_set_range(env, cc->bs, (int )*vs, (int )v); 04210 } 04211 else { 04212 r = add_code_range(&(cc->mbuf), env, *vs, v); 04213 if (r < 0) return r; 04214 } 04215 } 04216 else { 04217 #if 0 04218 if (intype == CCV_CODE_POINT && *type == CCV_SB) { 04219 #endif 04220 if (*vs > v) { 04221 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) 04222 goto ccs_range_end; 04223 else 04224 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; 04225 } 04226 bitset_set_range(env, cc->bs, (int )*vs, (int )(v < 0xff ? 
v : 0xff)); 04227 r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v); 04228 if (r < 0) return r; 04229 #if 0 04230 } 04231 else 04232 return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE; 04233 #endif 04234 } 04235 ccs_range_end: 04236 *state = CCS_COMPLETE; 04237 break; 04238 04239 case CCS_COMPLETE: 04240 case CCS_START: 04241 *state = CCS_VALUE; 04242 break; 04243 04244 default: 04245 break; 04246 } 04247 04248 *vs_israw = v_israw; 04249 *vs = v; 04250 *type = intype; 04251 return 0; 04252 } 04253 04254 static int 04255 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, 04256 ScanEnv* env) 04257 { 04258 int in_esc; 04259 OnigCodePoint code; 04260 OnigEncoding enc = env->enc; 04261 UChar* p = from; 04262 PFETCH_READY; 04263 04264 in_esc = 0; 04265 while (! PEND) { 04266 if (ignore_escaped && in_esc) { 04267 in_esc = 0; 04268 } 04269 else { 04270 PFETCH(code); 04271 if (code == c) return 1; 04272 if (code == MC_ESC(env->syntax)) in_esc = 1; 04273 } 04274 } 04275 return 0; 04276 } 04277 04278 static int 04279 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, 04280 ScanEnv* env) 04281 { 04282 int r, neg, len, fetched, and_start; 04283 OnigCodePoint v, vs; 04284 UChar *p; 04285 Node* node; 04286 CClassNode *cc, *prev_cc; 04287 CClassNode work_cc; 04288 04289 enum CCSTATE state; 04290 enum CCVALTYPE val_type, in_type; 04291 int val_israw, in_israw; 04292 04293 prev_cc = (CClassNode* )NULL; 04294 *np = NULL_NODE; 04295 r = fetch_token_in_cc(tok, src, end, env); 04296 if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) { 04297 neg = 1; 04298 r = fetch_token_in_cc(tok, src, end, env); 04299 } 04300 else { 04301 neg = 0; 04302 } 04303 04304 if (r < 0) return r; 04305 if (r == TK_CC_CLOSE) { 04306 if (! code_exist_check((OnigCodePoint )']', 04307 *src, env->pattern_end, 1, env)) 04308 return ONIGERR_EMPTY_CHAR_CLASS; 04309 04310 CC_ESC_WARN(env, (UChar* )"]"); 04311 r = tok->type = TK_CHAR; /* allow []...] 
*/ 04312 } 04313 04314 *np = node = node_new_cclass(); 04315 CHECK_NULL_RETURN_MEMERR(node); 04316 cc = NCCLASS(node); 04317 04318 and_start = 0; 04319 state = CCS_START; 04320 p = *src; 04321 while (r != TK_CC_CLOSE) { 04322 fetched = 0; 04323 switch (r) { 04324 case TK_CHAR: 04325 if ((tok->u.code >= SINGLE_BYTE_SIZE) || 04326 (len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c)) > 1) { 04327 in_type = CCV_CODE_POINT; 04328 } 04329 else if (len < 0) { 04330 r = len; 04331 goto err; 04332 } 04333 else { 04334 sb_char: 04335 in_type = CCV_SB; 04336 } 04337 v = (OnigCodePoint )tok->u.c; 04338 in_israw = 0; 04339 goto val_entry2; 04340 break; 04341 04342 case TK_RAW_BYTE: 04343 /* tok->base != 0 : octal or hexadec. */ 04344 if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { 04345 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; 04346 UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN; 04347 UChar* psave = p; 04348 int i, base = tok->base; 04349 04350 buf[0] = tok->u.c; 04351 for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { 04352 r = fetch_token_in_cc(tok, &p, end, env); 04353 if (r < 0) goto err; 04354 if (r != TK_RAW_BYTE || tok->base != base) { 04355 fetched = 1; 04356 break; 04357 } 04358 buf[i] = tok->u.c; 04359 } 04360 04361 if (i < ONIGENC_MBC_MINLEN(env->enc)) { 04362 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; 04363 goto err; 04364 } 04365 04366 len = enclen(env->enc, buf, buf+i); 04367 if (i < len) { 04368 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; 04369 goto err; 04370 } 04371 else if (i > len) { /* fetch back */ 04372 p = psave; 04373 for (i = 1; i < len; i++) { 04374 r = fetch_token_in_cc(tok, &p, end, env); 04375 } 04376 fetched = 0; 04377 } 04378 04379 if (i == 1) { 04380 v = (OnigCodePoint )buf[0]; 04381 goto raw_single; 04382 } 04383 else { 04384 v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); 04385 in_type = CCV_CODE_POINT; 04386 } 04387 } 04388 else { 04389 v = (OnigCodePoint )tok->u.c; 04390 raw_single: 04391 in_type = CCV_SB; 04392 } 04393 in_israw = 1; 04394 
goto val_entry2; 04395 break; 04396 04397 case TK_CODE_POINT: 04398 v = tok->u.code; 04399 in_israw = 1; 04400 val_entry: 04401 len = ONIGENC_CODE_TO_MBCLEN(env->enc, v); 04402 if (len < 0) { 04403 r = len; 04404 goto err; 04405 } 04406 in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT); 04407 val_entry2: 04408 r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type, 04409 &state, env); 04410 if (r != 0) goto err; 04411 break; 04412 04413 case TK_POSIX_BRACKET_OPEN: 04414 r = parse_posix_bracket(cc, &p, end, env); 04415 if (r < 0) goto err; 04416 if (r == 1) { /* is not POSIX bracket */ 04417 CC_ESC_WARN(env, (UChar* )"["); 04418 p = tok->backp; 04419 v = (OnigCodePoint )tok->u.c; 04420 in_israw = 0; 04421 goto val_entry; 04422 } 04423 goto next_class; 04424 break; 04425 04426 case TK_CHAR_TYPE: 04427 r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env); 04428 if (r != 0) return r; 04429 04430 next_class: 04431 r = next_state_class(cc, &vs, &val_type, &state, env); 04432 if (r != 0) goto err; 04433 break; 04434 04435 case TK_CHAR_PROPERTY: 04436 { 04437 int ctype; 04438 04439 ctype = fetch_char_property_to_ctype(&p, end, env); 04440 if (ctype < 0) return ctype; 04441 r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env); 04442 if (r != 0) return r; 04443 goto next_class; 04444 } 04445 break; 04446 04447 case TK_CC_RANGE: 04448 if (state == CCS_VALUE) { 04449 r = fetch_token_in_cc(tok, &p, end, env); 04450 if (r < 0) goto err; 04451 fetched = 1; 04452 if (r == TK_CC_CLOSE) { /* allow [x-] */ 04453 range_end_val: 04454 v = (OnigCodePoint )'-'; 04455 in_israw = 0; 04456 goto val_entry; 04457 } 04458 else if (r == TK_CC_AND) { 04459 CC_ESC_WARN(env, (UChar* )"-"); 04460 goto range_end_val; 04461 } 04462 state = CCS_RANGE; 04463 } 04464 else if (state == CCS_START) { 04465 /* [-xa] is allowed */ 04466 v = (OnigCodePoint )tok->u.c; 04467 in_israw = 0; 04468 04469 r = fetch_token_in_cc(tok, &p, end, env); 04470 if (r < 0) goto err; 04471 fetched = 
1; 04472 /* [--x] or [a&&-x] is warned. */ 04473 if (r == TK_CC_RANGE || and_start != 0) 04474 CC_ESC_WARN(env, (UChar* )"-"); 04475 04476 goto val_entry; 04477 } 04478 else if (state == CCS_RANGE) { 04479 CC_ESC_WARN(env, (UChar* )"-"); 04480 goto sb_char; /* [!--x] is allowed */ 04481 } 04482 else { /* CCS_COMPLETE */ 04483 r = fetch_token_in_cc(tok, &p, end, env); 04484 if (r < 0) goto err; 04485 fetched = 1; 04486 if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */ 04487 else if (r == TK_CC_AND) { 04488 CC_ESC_WARN(env, (UChar* )"-"); 04489 goto range_end_val; 04490 } 04491 04492 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) { 04493 CC_ESC_WARN(env, (UChar* )"-"); 04494 goto range_end_val; /* [0-9-a] is allowed as [0-9\-a] */ 04495 } 04496 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; 04497 goto err; 04498 } 04499 break; 04500 04501 case TK_CC_CC_OPEN: /* [ */ 04502 { 04503 Node *anode; 04504 CClassNode* acc; 04505 04506 r = parse_char_class(&anode, tok, &p, end, env); 04507 if (r == 0) { 04508 acc = NCCLASS(anode); 04509 r = or_cclass(cc, acc, env); 04510 } 04511 onig_node_free(anode); 04512 if (r != 0) goto err; 04513 } 04514 break; 04515 04516 case TK_CC_AND: /* && */ 04517 { 04518 if (state == CCS_VALUE) { 04519 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type, 04520 &val_type, &state, env); 04521 if (r != 0) goto err; 04522 } 04523 /* initialize local variables */ 04524 and_start = 1; 04525 state = CCS_START; 04526 04527 if (IS_NOT_NULL(prev_cc)) { 04528 r = and_cclass(prev_cc, cc, env); 04529 if (r != 0) goto err; 04530 bbuf_free(cc->mbuf); 04531 } 04532 else { 04533 prev_cc = cc; 04534 cc = &work_cc; 04535 } 04536 initialize_cclass(cc); 04537 } 04538 break; 04539 04540 case TK_EOT: 04541 r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS; 04542 goto err; 04543 break; 04544 default: 04545 r = ONIGERR_PARSER_BUG; 04546 goto err; 04547 break; 04548 } 04549 04550 if (fetched) 04551 r = tok->type; 04552 else { 04553 r = 
fetch_token_in_cc(tok, &p, end, env); 04554 if (r < 0) goto err; 04555 } 04556 } 04557 04558 if (state == CCS_VALUE) { 04559 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type, 04560 &val_type, &state, env); 04561 if (r != 0) goto err; 04562 } 04563 04564 if (IS_NOT_NULL(prev_cc)) { 04565 r = and_cclass(prev_cc, cc, env); 04566 if (r != 0) goto err; 04567 bbuf_free(cc->mbuf); 04568 cc = prev_cc; 04569 } 04570 04571 if (neg != 0) 04572 NCCLASS_SET_NOT(cc); 04573 else 04574 NCCLASS_CLEAR_NOT(cc); 04575 if (IS_NCCLASS_NOT(cc) && 04576 IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) { 04577 int is_empty; 04578 04579 is_empty = (IS_NULL(cc->mbuf) ? 1 : 0); 04580 if (is_empty != 0) 04581 BITSET_IS_EMPTY(cc->bs, is_empty); 04582 04583 if (is_empty == 0) { 04584 #define NEWLINE_CODE 0x0a 04585 04586 if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) { 04587 if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1) 04588 BITSET_SET_BIT_CHKDUP(cc->bs, NEWLINE_CODE); 04589 else 04590 add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE); 04591 } 04592 } 04593 } 04594 *src = p; 04595 return 0; 04596 04597 err: 04598 if (cc != NCCLASS(*np)) 04599 bbuf_free(cc->mbuf); 04600 return r; 04601 } 04602 04603 static int parse_subexp(Node** top, OnigToken* tok, int term, 04604 UChar** src, UChar* end, ScanEnv* env); 04605 04606 static int 04607 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, 04608 ScanEnv* env) 04609 { 04610 int r, num; 04611 Node *target; 04612 OnigOptionType option; 04613 OnigCodePoint c; 04614 OnigEncoding enc = env->enc; 04615 04616 #ifdef USE_NAMED_GROUP 04617 int list_capture; 04618 #endif 04619 04620 UChar* p = *src; 04621 PFETCH_READY; 04622 04623 *np = NULL; 04624 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; 04625 04626 option = env->option; 04627 if (PPEEK_IS('?') && 04628 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { 04629 PINC; 04630 if (PEND) return 
ONIGERR_END_PATTERN_IN_GROUP; 04631 04632 PFETCH(c); 04633 switch (c) { 04634 case ':': /* (?:...) grouping only */ 04635 group: 04636 r = fetch_token(tok, &p, end, env); 04637 if (r < 0) return r; 04638 r = parse_subexp(np, tok, term, &p, end, env); 04639 if (r < 0) return r; 04640 *src = p; 04641 return 1; /* group */ 04642 break; 04643 04644 case '=': 04645 *np = onig_node_new_anchor(ANCHOR_PREC_READ); 04646 break; 04647 case '!': /* preceding read */ 04648 *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT); 04649 break; 04650 case '>': /* (?>...) stop backtrack */ 04651 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK); 04652 break; 04653 04654 #ifdef USE_NAMED_GROUP 04655 case '\'': 04656 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { 04657 goto named_group1; 04658 } 04659 else 04660 return ONIGERR_UNDEFINED_GROUP_OPTION; 04661 break; 04662 #endif 04663 04664 case '<': /* look behind (?<=...), (?<!...) */ 04665 PFETCH(c); 04666 if (c == '=') 04667 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND); 04668 else if (c == '!') 04669 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT); 04670 #ifdef USE_NAMED_GROUP 04671 else { 04672 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { 04673 UChar *name; 04674 UChar *name_end; 04675 04676 PUNFETCH; 04677 c = '<'; 04678 04679 named_group1: 04680 list_capture = 0; 04681 04682 named_group2: 04683 name = p; 04684 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0); 04685 if (r < 0) return r; 04686 04687 num = scan_env_add_mem_entry(env); 04688 if (num < 0) return num; 04689 if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM) 04690 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; 04691 04692 r = name_add(env->reg, name, name_end, num, env); 04693 if (r != 0) return r; 04694 *np = node_new_enclose_memory(env->option, 1); 04695 CHECK_NULL_RETURN_MEMERR(*np); 04696 NENCLOSE(*np)->regnum = num; 04697 if (list_capture != 0) 04698 
BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num); 04699 env->num_named++; 04700 } 04701 else { 04702 return ONIGERR_UNDEFINED_GROUP_OPTION; 04703 } 04704 } 04705 #else 04706 else { 04707 return ONIGERR_UNDEFINED_GROUP_OPTION; 04708 } 04709 #endif 04710 break; 04711 04712 case '@': 04713 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) { 04714 #ifdef USE_NAMED_GROUP 04715 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { 04716 PFETCH(c); 04717 if (c == '<' || c == '\'') { 04718 list_capture = 1; 04719 goto named_group2; /* (?@<name>...) */ 04720 } 04721 PUNFETCH; 04722 } 04723 #endif 04724 *np = node_new_enclose_memory(env->option, 0); 04725 CHECK_NULL_RETURN_MEMERR(*np); 04726 num = scan_env_add_mem_entry(env); 04727 if (num < 0) { 04728 onig_node_free(*np); 04729 return num; 04730 } 04731 else if (num >= (int )BIT_STATUS_BITS_NUM) { 04732 onig_node_free(*np); 04733 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; 04734 } 04735 NENCLOSE(*np)->regnum = num; 04736 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num); 04737 } 04738 else { 04739 return ONIGERR_UNDEFINED_GROUP_OPTION; 04740 } 04741 break; 04742 04743 #ifdef USE_POSIXLINE_OPTION 04744 case 'p': 04745 #endif 04746 case '-': case 'i': case 'm': case 's': case 'x': 04747 { 04748 int neg = 0; 04749 04750 while (1) { 04751 switch (c) { 04752 case ':': 04753 case ')': 04754 break; 04755 04756 case '-': neg = 1; break; 04757 case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break; 04758 case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break; 04759 case 's': 04760 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { 04761 ONOFF(option, ONIG_OPTION_MULTILINE, neg); 04762 } 04763 else 04764 return ONIGERR_UNDEFINED_GROUP_OPTION; 04765 break; 04766 04767 case 'm': 04768 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { 04769 ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 
1 : 0)); 04770 } 04771 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) { 04772 ONOFF(option, ONIG_OPTION_MULTILINE, neg); 04773 } 04774 else 04775 return ONIGERR_UNDEFINED_GROUP_OPTION; 04776 break; 04777 #ifdef USE_POSIXLINE_OPTION 04778 case 'p': 04779 ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg); 04780 break; 04781 #endif 04782 default: 04783 return ONIGERR_UNDEFINED_GROUP_OPTION; 04784 } 04785 04786 if (c == ')') { 04787 *np = node_new_option(option); 04788 CHECK_NULL_RETURN_MEMERR(*np); 04789 *src = p; 04790 return 2; /* option only */ 04791 } 04792 else if (c == ':') { 04793 OnigOptionType prev = env->option; 04794 04795 env->option = option; 04796 r = fetch_token(tok, &p, end, env); 04797 if (r < 0) return r; 04798 r = parse_subexp(&target, tok, term, &p, end, env); 04799 env->option = prev; 04800 if (r < 0) return r; 04801 *np = node_new_option(option); 04802 CHECK_NULL_RETURN_MEMERR(*np); 04803 NENCLOSE(*np)->target = target; 04804 *src = p; 04805 return 0; 04806 } 04807 04808 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 04809 PFETCH(c); 04810 } 04811 } 04812 break; 04813 04814 default: 04815 return ONIGERR_UNDEFINED_GROUP_OPTION; 04816 } 04817 } 04818 else { 04819 if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP)) 04820 goto group; 04821 04822 *np = node_new_enclose_memory(env->option, 0); 04823 CHECK_NULL_RETURN_MEMERR(*np); 04824 num = scan_env_add_mem_entry(env); 04825 if (num < 0) return num; 04826 NENCLOSE(*np)->regnum = num; 04827 } 04828 04829 CHECK_NULL_RETURN_MEMERR(*np); 04830 r = fetch_token(tok, &p, end, env); 04831 if (r < 0) return r; 04832 r = parse_subexp(&target, tok, term, &p, end, env); 04833 if (r < 0) { 04834 onig_node_free(target); 04835 return r; 04836 } 04837 04838 if (NTYPE(*np) == NT_ANCHOR) 04839 NANCHOR(*np)->target = target; 04840 else { 04841 NENCLOSE(*np)->target = target; 04842 if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) { 04843 /* Don't move this to previous of 
parse_subexp() */ 04844 r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np); 04845 if (r != 0) return r; 04846 } 04847 } 04848 04849 *src = p; 04850 return 0; 04851 } 04852 04853 static const char* const PopularQStr[] = { 04854 "?", "*", "+", "??", "*?", "+?" 04855 }; 04856 04857 static const char* const ReduceQStr[] = { 04858 "", "", "*", "*?", "??", "+ and ??", "+? and ?" 04859 }; 04860 04861 static int 04862 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) 04863 { 04864 QtfrNode* qn; 04865 04866 qn = NQTFR(qnode); 04867 if (qn->lower == 1 && qn->upper == 1) { 04868 return 1; 04869 } 04870 04871 switch (NTYPE(target)) { 04872 case NT_STR: 04873 if (! group) { 04874 StrNode* sn = NSTR(target); 04875 if (str_node_can_be_split(sn, env->enc)) { 04876 Node* n = str_node_split_last_char(sn, env->enc); 04877 if (IS_NOT_NULL(n)) { 04878 qn->target = n; 04879 return 2; 04880 } 04881 } 04882 } 04883 break; 04884 04885 case NT_QTFR: 04886 { /* check redundant double repeat. */ 04887 /* verbose warn (?:.?)? etc... but not warn (.?)? etc... 
*/ 04888 QtfrNode* qnt = NQTFR(target); 04889 int nestq_num = popular_quantifier_num(qn); 04890 int targetq_num = popular_quantifier_num(qnt); 04891 04892 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR 04893 if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) && 04894 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) { 04895 UChar buf[WARN_BUFSIZE]; 04896 04897 switch(ReduceTypeTable[targetq_num][nestq_num]) { 04898 case RQ_ASIS: 04899 break; 04900 04901 case RQ_DEL: 04902 if (onig_verb_warn != onig_null_warn) { 04903 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, 04904 env->pattern, env->pattern_end, 04905 (UChar* )"redundant nested repeat operator"); 04906 (*onig_verb_warn)((char* )buf); 04907 } 04908 goto warn_exit; 04909 break; 04910 04911 default: 04912 if (onig_verb_warn != onig_null_warn) { 04913 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, 04914 env->pattern, env->pattern_end, 04915 (UChar* )"nested repeat operator %s and %s was replaced with '%s'", 04916 PopularQStr[targetq_num], PopularQStr[nestq_num], 04917 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]); 04918 (*onig_verb_warn)((char* )buf); 04919 } 04920 goto warn_exit; 04921 break; 04922 } 04923 } 04924 04925 warn_exit: 04926 #endif 04927 if (targetq_num >= 0) { 04928 if (nestq_num >= 0) { 04929 onig_reduce_nested_quantifier(qnode, target); 04930 goto q_exit; 04931 } 04932 else if (targetq_num == 1 || targetq_num == 2) { /* * or + */ 04933 /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */ 04934 if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) { 04935 qn->upper = (qn->lower == 0 ? 
1 : qn->lower); 04936 } 04937 } 04938 } 04939 } 04940 break; 04941 04942 default: 04943 break; 04944 } 04945 04946 qn->target = target; 04947 q_exit: 04948 return 0; 04949 } 04950 04951 04952 #ifdef USE_SHARED_CCLASS_TABLE 04953 04954 #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8 04955 04956 /* for ctype node hash table */ 04957 04958 typedef struct { 04959 OnigEncoding enc; 04960 int not; 04961 int type; 04962 } type_cclass_key; 04963 04964 static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y) 04965 { 04966 if (x->type != y->type) return 1; 04967 if (x->enc != y->enc) return 1; 04968 if (x->not != y->not) return 1; 04969 return 0; 04970 } 04971 04972 static st_index_t type_cclass_hash(type_cclass_key* key) 04973 { 04974 int i, val; 04975 UChar *p; 04976 04977 val = 0; 04978 04979 p = (UChar* )&(key->enc); 04980 for (i = 0; i < (int )sizeof(key->enc); i++) { 04981 val = val * 997 + (int )*p++; 04982 } 04983 04984 p = (UChar* )(&key->type); 04985 for (i = 0; i < (int )sizeof(key->type); i++) { 04986 val = val * 997 + (int )*p++; 04987 } 04988 04989 val += key->not; 04990 return val + (val >> 5); 04991 } 04992 04993 static const struct st_hash_type type_type_cclass_hash = { 04994 type_cclass_cmp, 04995 type_cclass_hash, 04996 }; 04997 04998 static st_table* OnigTypeCClassTable; 04999 05000 05001 static int 05002 i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED) 05003 { 05004 if (IS_NOT_NULL(node)) { 05005 CClassNode* cc = NCCLASS(node); 05006 if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf); 05007 xfree(node); 05008 } 05009 05010 if (IS_NOT_NULL(key)) xfree(key); 05011 return ST_DELETE; 05012 } 05013 05014 extern int 05015 onig_free_shared_cclass_table(void) 05016 { 05017 THREAD_ATOMIC_START; 05018 if (IS_NOT_NULL(OnigTypeCClassTable)) { 05019 onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0); 05020 onig_st_free_table(OnigTypeCClassTable); 05021 OnigTypeCClassTable = NULL; 05022 } 05023 THREAD_ATOMIC_END; 05024 05025 
return 0; 05026 } 05027 05028 #endif /* USE_SHARED_CCLASS_TABLE */ 05029 05030 05031 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS 05032 static int 05033 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) 05034 { 05035 BBuf *tbuf; 05036 int r; 05037 05038 if (IS_NCCLASS_NOT(cc)) { 05039 bitset_invert(cc->bs); 05040 05041 if (! ONIGENC_IS_SINGLEBYTE(enc)) { 05042 r = not_code_range_buf(enc, cc->mbuf, &tbuf); 05043 if (r != 0) return r; 05044 05045 bbuf_free(cc->mbuf); 05046 cc->mbuf = tbuf; 05047 } 05048 05049 NCCLASS_CLEAR_NOT(cc); 05050 } 05051 05052 return 0; 05053 } 05054 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ 05055 05056 typedef struct { 05057 ScanEnv* env; 05058 CClassNode* cc; 05059 Node* alt_root; 05060 Node** ptail; 05061 } IApplyCaseFoldArg; 05062 05063 static int 05064 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], 05065 int to_len, void* arg) 05066 { 05067 IApplyCaseFoldArg* iarg; 05068 ScanEnv* env; 05069 CClassNode* cc; 05070 BitSetRef bs; 05071 05072 iarg = (IApplyCaseFoldArg* )arg; 05073 env = iarg->env; 05074 cc = iarg->cc; 05075 bs = cc->bs; 05076 05077 if (to_len == 1) { 05078 int is_in = onig_is_code_in_cc(env->enc, from, cc); 05079 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS 05080 if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) || 05081 (is_in == 0 && IS_NCCLASS_NOT(cc))) { 05082 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { 05083 add_code_range0(&(cc->mbuf), env, *to, *to, 0); 05084 } 05085 else { 05086 BITSET_SET_BIT(bs, *to); 05087 } 05088 } 05089 #else 05090 if (is_in != 0) { 05091 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { 05092 if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc); 05093 add_code_range0(&(cc->mbuf), env, *to, *to, 0); 05094 } 05095 else { 05096 if (IS_NCCLASS_NOT(cc)) { 05097 BITSET_CLEAR_BIT(bs, *to); 05098 } 05099 else 05100 BITSET_SET_BIT(bs, *to); 05101 } 05102 } 05103 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ 05104 
} 05105 else { 05106 int r, i, len; 05107 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; 05108 Node *snode = NULL_NODE; 05109 05110 if (onig_is_code_in_cc(env->enc, from, cc) 05111 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS 05112 && !IS_NCCLASS_NOT(cc) 05113 #endif 05114 ) { 05115 for (i = 0; i < to_len; i++) { 05116 len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); 05117 if (i == 0) { 05118 snode = onig_node_new_str(buf, buf + len); 05119 CHECK_NULL_RETURN_MEMERR(snode); 05120 05121 /* char-class expanded multi-char only 05122 compare with string folded at match time. */ 05123 NSTRING_SET_AMBIG(snode); 05124 } 05125 else { 05126 r = onig_node_str_cat(snode, buf, buf + len); 05127 if (r < 0) { 05128 onig_node_free(snode); 05129 return r; 05130 } 05131 } 05132 } 05133 05134 *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE); 05135 CHECK_NULL_RETURN_MEMERR(*(iarg->ptail)); 05136 iarg->ptail = &(NCDR((*(iarg->ptail)))); 05137 } 05138 } 05139 05140 return 0; 05141 } 05142 05143 static int 05144 parse_exp(Node** np, OnigToken* tok, int term, 05145 UChar** src, UChar* end, ScanEnv* env) 05146 { 05147 int r, len, group = 0; 05148 Node* qn; 05149 Node** targetp; 05150 05151 *np = NULL; 05152 if (tok->type == (enum TokenSyms )term) 05153 goto end_of_token; 05154 05155 switch (tok->type) { 05156 case TK_ALT: 05157 case TK_EOT: 05158 end_of_token: 05159 *np = node_new_empty(); 05160 return tok->type; 05161 05162 case TK_SUBEXP_OPEN: 05163 r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env); 05164 if (r < 0) return r; 05165 if (r == 1) group = 1; 05166 else if (r == 2) { /* option only */ 05167 Node* target; 05168 OnigOptionType prev = env->option; 05169 05170 env->option = NENCLOSE(*np)->option; 05171 r = fetch_token(tok, src, end, env); 05172 if (r < 0) return r; 05173 r = parse_subexp(&target, tok, term, src, end, env); 05174 env->option = prev; 05175 if (r < 0) { 05176 onig_node_free(target); 05177 return r; 05178 } 05179 NENCLOSE(*np)->target = target; 05180 
return tok->type; 05181 } 05182 break; 05183 05184 case TK_SUBEXP_CLOSE: 05185 if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP)) 05186 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS; 05187 05188 if (tok->escaped) goto tk_raw_byte; 05189 else goto tk_byte; 05190 break; 05191 05192 case TK_STRING: 05193 tk_byte: 05194 { 05195 *np = node_new_str(tok->backp, *src); 05196 CHECK_NULL_RETURN_MEMERR(*np); 05197 05198 while (1) { 05199 r = fetch_token(tok, src, end, env); 05200 if (r < 0) return r; 05201 if (r != TK_STRING) break; 05202 05203 r = onig_node_str_cat(*np, tok->backp, *src); 05204 if (r < 0) return r; 05205 } 05206 05207 string_end: 05208 targetp = np; 05209 goto repeat; 05210 } 05211 break; 05212 05213 case TK_RAW_BYTE: 05214 tk_raw_byte: 05215 { 05216 *np = node_new_str_raw_char((UChar )tok->u.c); 05217 CHECK_NULL_RETURN_MEMERR(*np); 05218 len = 1; 05219 while (1) { 05220 if (len >= ONIGENC_MBC_MINLEN(env->enc)) { 05221 if (len == enclen(env->enc, NSTR(*np)->s, NSTR(*np)->end)) { 05222 r = fetch_token(tok, src, end, env); 05223 NSTRING_CLEAR_RAW(*np); 05224 goto string_end; 05225 } 05226 } 05227 05228 r = fetch_token(tok, src, end, env); 05229 if (r < 0) return r; 05230 if (r != TK_RAW_BYTE) { 05231 /* Don't use this, it is wrong for little endian encodings. 
*/ 05232 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR 05233 int rem; 05234 if (len < ONIGENC_MBC_MINLEN(env->enc)) { 05235 rem = ONIGENC_MBC_MINLEN(env->enc) - len; 05236 (void )node_str_head_pad(NSTR(*np), rem, (UChar )0); 05237 if (len + rem == enclen(env->enc, NSTR(*np)->s)) { 05238 NSTRING_CLEAR_RAW(*np); 05239 goto string_end; 05240 } 05241 } 05242 #endif 05243 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; 05244 } 05245 05246 r = node_str_cat_char(*np, (UChar )tok->u.c); 05247 if (r < 0) return r; 05248 05249 len++; 05250 } 05251 } 05252 break; 05253 05254 case TK_CODE_POINT: 05255 { 05256 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; 05257 int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf); 05258 if (num < 0) return num; 05259 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG 05260 *np = node_new_str_raw(buf, buf + num); 05261 #else 05262 *np = node_new_str(buf, buf + num); 05263 #endif 05264 CHECK_NULL_RETURN_MEMERR(*np); 05265 } 05266 break; 05267 05268 case TK_QUOTE_OPEN: 05269 { 05270 OnigCodePoint end_op[2]; 05271 UChar *qstart, *qend, *nextp; 05272 05273 end_op[0] = (OnigCodePoint )MC_ESC(env->syntax); 05274 end_op[1] = (OnigCodePoint )'E'; 05275 qstart = *src; 05276 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc); 05277 if (IS_NULL(qend)) { 05278 nextp = qend = end; 05279 } 05280 *np = node_new_str(qstart, qend); 05281 CHECK_NULL_RETURN_MEMERR(*np); 05282 *src = nextp; 05283 } 05284 break; 05285 05286 case TK_CHAR_TYPE: 05287 { 05288 switch (tok->u.prop.ctype) { 05289 case ONIGENC_CTYPE_D: 05290 case ONIGENC_CTYPE_S: 05291 case ONIGENC_CTYPE_W: 05292 { 05293 CClassNode* cc; 05294 *np = node_new_cclass(); 05295 CHECK_NULL_RETURN_MEMERR(*np); 05296 cc = NCCLASS(*np); 05297 add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env); 05298 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); 05299 } 05300 break; 05301 05302 case ONIGENC_CTYPE_WORD: 05303 *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not); 05304 CHECK_NULL_RETURN_MEMERR(*np); 05305 break; 05306 05307 case 
ONIGENC_CTYPE_SPACE: 05308 case ONIGENC_CTYPE_DIGIT: 05309 case ONIGENC_CTYPE_XDIGIT: 05310 { 05311 CClassNode* cc; 05312 05313 #ifdef USE_SHARED_CCLASS_TABLE 05314 const OnigCodePoint *mbr; 05315 OnigCodePoint sb_out; 05316 05317 r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype, 05318 &sb_out, &mbr); 05319 if (r == 0 && 05320 ONIGENC_CODE_RANGE_NUM(mbr) 05321 >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) { 05322 type_cclass_key key; 05323 type_cclass_key* new_key; 05324 05325 key.enc = env->enc; 05326 key.not = tok->u.prop.not; 05327 key.type = tok->u.prop.ctype; 05328 05329 THREAD_ATOMIC_START; 05330 05331 if (IS_NULL(OnigTypeCClassTable)) { 05332 OnigTypeCClassTable 05333 = onig_st_init_table_with_size(&type_type_cclass_hash, 10); 05334 if (IS_NULL(OnigTypeCClassTable)) { 05335 THREAD_ATOMIC_END; 05336 return ONIGERR_MEMORY; 05337 } 05338 } 05339 else { 05340 if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key, 05341 (st_data_t* )np)) { 05342 THREAD_ATOMIC_END; 05343 break; 05344 } 05345 } 05346 05347 *np = node_new_cclass_by_codepoint_range(tok->u.prop.not, 05348 sb_out, mbr); 05349 if (IS_NULL(*np)) { 05350 THREAD_ATOMIC_END; 05351 return ONIGERR_MEMORY; 05352 } 05353 05354 cc = NCCLASS(*np); 05355 NCCLASS_SET_SHARE(cc); 05356 new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key)); 05357 xmemcpy(new_key, &key, sizeof(type_cclass_key)); 05358 onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key, 05359 (st_data_t )*np); 05360 05361 THREAD_ATOMIC_END; 05362 } 05363 else { 05364 #endif 05365 *np = node_new_cclass(); 05366 CHECK_NULL_RETURN_MEMERR(*np); 05367 cc = NCCLASS(*np); 05368 add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env); 05369 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); 05370 #ifdef USE_SHARED_CCLASS_TABLE 05371 } 05372 #endif 05373 } 05374 break; 05375 05376 default: 05377 return ONIGERR_PARSER_BUG; 05378 break; 05379 } 05380 } 05381 break; 05382 05383 case TK_CHAR_PROPERTY: 05384 r = parse_char_property(np, tok, src, end, 
env); 05385 if (r != 0) return r; 05386 break; 05387 05388 case TK_CC_OPEN: 05389 { 05390 CClassNode* cc; 05391 05392 r = parse_char_class(np, tok, src, end, env); 05393 if (r != 0) return r; 05394 05395 cc = NCCLASS(*np); 05396 if (IS_IGNORECASE(env->option)) { 05397 IApplyCaseFoldArg iarg; 05398 05399 iarg.env = env; 05400 iarg.cc = cc; 05401 iarg.alt_root = NULL_NODE; 05402 iarg.ptail = &(iarg.alt_root); 05403 05404 r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag, 05405 i_apply_case_fold, &iarg); 05406 if (r != 0) { 05407 onig_node_free(iarg.alt_root); 05408 return r; 05409 } 05410 if (IS_NOT_NULL(iarg.alt_root)) { 05411 Node* work = onig_node_new_alt(*np, iarg.alt_root); 05412 if (IS_NULL(work)) { 05413 onig_node_free(iarg.alt_root); 05414 return ONIGERR_MEMORY; 05415 } 05416 *np = work; 05417 } 05418 } 05419 } 05420 break; 05421 05422 case TK_ANYCHAR: 05423 *np = node_new_anychar(); 05424 CHECK_NULL_RETURN_MEMERR(*np); 05425 break; 05426 05427 case TK_ANYCHAR_ANYTIME: 05428 *np = node_new_anychar(); 05429 CHECK_NULL_RETURN_MEMERR(*np); 05430 qn = node_new_quantifier(0, REPEAT_INFINITE, 0); 05431 CHECK_NULL_RETURN_MEMERR(qn); 05432 NQTFR(qn)->target = *np; 05433 *np = qn; 05434 break; 05435 05436 case TK_BACKREF: 05437 len = tok->u.backref.num; 05438 *np = node_new_backref(len, 05439 (len > 1 ? 
tok->u.backref.refs : &(tok->u.backref.ref1)), 05440 tok->u.backref.by_name, 05441 #ifdef USE_BACKREF_WITH_LEVEL 05442 tok->u.backref.exist_level, 05443 tok->u.backref.level, 05444 #endif 05445 env); 05446 CHECK_NULL_RETURN_MEMERR(*np); 05447 break; 05448 05449 #ifdef USE_SUBEXP_CALL 05450 case TK_CALL: 05451 { 05452 int gnum = tok->u.call.gnum; 05453 05454 if (gnum < 0) { 05455 gnum = BACKREF_REL_TO_ABS(gnum, env); 05456 if (gnum <= 0) 05457 return ONIGERR_INVALID_BACKREF; 05458 } 05459 *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum); 05460 CHECK_NULL_RETURN_MEMERR(*np); 05461 env->num_call++; 05462 } 05463 break; 05464 #endif 05465 05466 case TK_ANCHOR: 05467 *np = onig_node_new_anchor(tok->u.anchor); 05468 break; 05469 05470 case TK_OP_REPEAT: 05471 case TK_INTERVAL: 05472 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) { 05473 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS)) 05474 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED; 05475 else 05476 *np = node_new_empty(); 05477 } 05478 else { 05479 goto tk_byte; 05480 } 05481 break; 05482 05483 default: 05484 return ONIGERR_PARSER_BUG; 05485 break; 05486 } 05487 05488 { 05489 targetp = np; 05490 05491 re_entry: 05492 r = fetch_token(tok, src, end, env); 05493 if (r < 0) return r; 05494 05495 repeat: 05496 if (r == TK_OP_REPEAT || r == TK_INTERVAL) { 05497 if (is_invalid_quantifier_target(*targetp)) 05498 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; 05499 05500 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper, 05501 (r == TK_INTERVAL ? 
1 : 0)); 05502 CHECK_NULL_RETURN_MEMERR(qn); 05503 NQTFR(qn)->greedy = tok->u.repeat.greedy; 05504 r = set_quantifier(qn, *targetp, group, env); 05505 if (r < 0) { 05506 onig_node_free(qn); 05507 return r; 05508 } 05509 05510 if (tok->u.repeat.possessive != 0) { 05511 Node* en; 05512 en = node_new_enclose(ENCLOSE_STOP_BACKTRACK); 05513 if (IS_NULL(en)) { 05514 onig_node_free(qn); 05515 return ONIGERR_MEMORY; 05516 } 05517 NENCLOSE(en)->target = qn; 05518 qn = en; 05519 } 05520 05521 if (r == 0) { 05522 *targetp = qn; 05523 } 05524 else if (r == 1) { 05525 onig_node_free(qn); 05526 } 05527 else if (r == 2) { /* split case: /abc+/ */ 05528 Node *tmp; 05529 05530 *targetp = node_new_list(*targetp, NULL); 05531 if (IS_NULL(*targetp)) { 05532 onig_node_free(qn); 05533 return ONIGERR_MEMORY; 05534 } 05535 tmp = NCDR(*targetp) = node_new_list(qn, NULL); 05536 if (IS_NULL(tmp)) { 05537 onig_node_free(qn); 05538 return ONIGERR_MEMORY; 05539 } 05540 targetp = &(NCAR(tmp)); 05541 } 05542 goto re_entry; 05543 } 05544 } 05545 05546 return r; 05547 } 05548 05549 static int 05550 parse_branch(Node** top, OnigToken* tok, int term, 05551 UChar** src, UChar* end, ScanEnv* env) 05552 { 05553 int r; 05554 Node *node, **headp; 05555 05556 *top = NULL; 05557 r = parse_exp(&node, tok, term, src, end, env); 05558 if (r < 0) { 05559 onig_node_free(node); 05560 return r; 05561 } 05562 05563 if (r == TK_EOT || r == term || r == TK_ALT) { 05564 *top = node; 05565 } 05566 else { 05567 *top = node_new_list(node, NULL); 05568 headp = &(NCDR(*top)); 05569 while (r != TK_EOT && r != term && r != TK_ALT) { 05570 r = parse_exp(&node, tok, term, src, end, env); 05571 if (r < 0) { 05572 onig_node_free(node); 05573 return r; 05574 } 05575 05576 if (NTYPE(node) == NT_LIST) { 05577 *headp = node; 05578 while (IS_NOT_NULL(NCDR(node))) node = NCDR(node); 05579 headp = &(NCDR(node)); 05580 } 05581 else { 05582 *headp = node_new_list(node, NULL); 05583 headp = &(NCDR(*headp)); 05584 } 05585 } 05586 } 05587 
05588 return r; 05589 } 05590 05591 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ 05592 static int 05593 parse_subexp(Node** top, OnigToken* tok, int term, 05594 UChar** src, UChar* end, ScanEnv* env) 05595 { 05596 int r; 05597 Node *node, **headp; 05598 05599 *top = NULL; 05600 r = parse_branch(&node, tok, term, src, end, env); 05601 if (r < 0) { 05602 onig_node_free(node); 05603 return r; 05604 } 05605 05606 if (r == term) { 05607 *top = node; 05608 } 05609 else if (r == TK_ALT) { 05610 *top = onig_node_new_alt(node, NULL); 05611 headp = &(NCDR(*top)); 05612 while (r == TK_ALT) { 05613 r = fetch_token(tok, src, end, env); 05614 if (r < 0) return r; 05615 r = parse_branch(&node, tok, term, src, end, env); 05616 if (r < 0) { 05617 onig_node_free(node); 05618 return r; 05619 } 05620 05621 *headp = onig_node_new_alt(node, NULL); 05622 headp = &(NCDR(*headp)); 05623 } 05624 05625 if (tok->type != (enum TokenSyms )term) 05626 goto err; 05627 } 05628 else { 05629 onig_node_free(node); 05630 err: 05631 if (term == TK_SUBEXP_CLOSE) 05632 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; 05633 else 05634 return ONIGERR_PARSER_BUG; 05635 } 05636 05637 return r; 05638 } 05639 05640 static int 05641 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) 05642 { 05643 int r; 05644 OnigToken tok; 05645 05646 r = fetch_token(&tok, src, end, env); 05647 if (r < 0) return r; 05648 r = parse_subexp(top, &tok, TK_EOT, src, end, env); 05649 if (r < 0) return r; 05650 return 0; 05651 } 05652 05653 extern int 05654 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, 05655 regex_t* reg, ScanEnv* env) 05656 { 05657 int r; 05658 UChar* p; 05659 05660 #ifdef USE_NAMED_GROUP 05661 names_clear(reg); 05662 #endif 05663 05664 scan_env_clear(env); 05665 env->option = reg->options; 05666 env->case_fold_flag = reg->case_fold_flag; 05667 env->enc = reg->enc; 05668 env->syntax = reg->syntax; 05669 env->pattern = (UChar* )pattern; 05670 env->pattern_end = (UChar* 
)end; 05671 env->reg = reg; 05672 05673 *root = NULL; 05674 p = (UChar* )pattern; 05675 r = parse_regexp(root, &p, (UChar* )end, env); 05676 reg->num_mem = env->num_mem; 05677 return r; 05678 } 05679 05680 extern void 05681 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED, 05682 UChar* arg, UChar* arg_end) 05683 { 05684 env->error = arg; 05685 env->error_end = arg_end; 05686 } 05687