Ruby 1.9.3p327(2012-11-10revision37606)
|
00001 /********************************************************************** 00002 00003 re.c - 00004 00005 $Author: akr $ 00006 created at: Mon Aug 9 18:24:49 JST 1993 00007 00008 Copyright (C) 1993-2007 Yukihiro Matsumoto 00009 00010 **********************************************************************/ 00011 00012 #include "ruby/ruby.h" 00013 #include "ruby/re.h" 00014 #include "ruby/encoding.h" 00015 #include "ruby/util.h" 00016 #include "internal.h" 00017 #include "regint.h" 00018 #include <ctype.h> 00019 00020 VALUE rb_eRegexpError; 00021 00022 typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN]; 00023 #define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN) 00024 00025 #define BEG(no) (regs->beg[(no)]) 00026 #define END(no) (regs->end[(no)]) 00027 00028 #if 'a' == 97 /* it's ascii */ 00029 static const char casetable[] = { 00030 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', 00031 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', 00032 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', 00033 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', 00034 /* ' ' '!' '"' '#' '$' '%' '&' ''' */ 00035 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', 00036 /* '(' ')' '*' '+' ',' '-' '.' '/' */ 00037 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', 00038 /* '0' '1' '2' '3' '4' '5' '6' '7' */ 00039 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', 00040 /* '8' '9' ':' ';' '<' '=' '>' '?' */ 00041 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', 00042 /* '@' 'A' 'B' 'C' 'D' 'E' 'F' 'G' */ 00043 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', 00044 /* 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' */ 00045 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', 00046 /* 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' */ 00047 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', 00048 /* 'X' 'Y' 'Z' '[' '\' ']' '^' '_' */ 00049 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', 00050 /* '`' 'a' 'b' 'c' 'd' 'e' 'f' 'g' */ 00051 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147', 00052 /* 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' */ 00053 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', 00054 /* 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' */ 00055 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', 00056 /* 'x' 'y' 'z' '{' '|' '}' '~' */ 00057 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177', 00058 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', 00059 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', 00060 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', 00061 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', 00062 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', 00063 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', 00064 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', 00065 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', 00066 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307', 00067 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317', 00068 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327', 00069 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337', 00070 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', 00071 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', 00072 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', 00073 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377', 00074 }; 00075 #else 00076 # error >>> "You lose. You will need a translation table for your character set." <<< 00077 #endif 00078 00079 int 00080 rb_memcicmp(const void *x, const void *y, long len) 00081 { 00082 const unsigned char *p1 = x, *p2 = y; 00083 int tmp; 00084 00085 while (len--) { 00086 if ((tmp = casetable[(unsigned)*p1++] - casetable[(unsigned)*p2++])) 00087 return tmp; 00088 } 00089 return 0; 00090 } 00091 00092 #undef rb_memcmp 00093 00094 int 00095 rb_memcmp(const void *p1, const void *p2, long len) 00096 { 00097 return memcmp(p1, p2, len); 00098 } 00099 00100 static inline long 00101 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n) 00102 { 00103 const unsigned char *x = xs, *xe = xs + m; 00104 const unsigned char *y = ys, *ye = ys + n; 00105 #ifndef VALUE_MAX 00106 # if SIZEOF_VALUE == 8 00107 # define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL 00108 # elif SIZEOF_VALUE == 4 00109 # define VALUE_MAX 0xFFFFFFFFUL 00110 # endif 00111 #endif 00112 VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT); 00113 00114 if (m > SIZEOF_VALUE) 00115 rb_bug("!!too long pattern string!!"); 00116 00117 /* Prepare hash value */ 00118 for (hx = *x++, hy = *y++; x < xe; ++x, ++y) { 00119 hx <<= CHAR_BIT; 00120 hy <<= CHAR_BIT; 00121 hx |= *x; 00122 hy |= *y; 00123 } 00124 /* Searching */ 00125 while (hx != hy) { 00126 if (y == ye) 00127 return -1; 00128 hy <<= CHAR_BIT; 00129 hy |= *y; 00130 hy &= mask; 00131 y++; 00132 } 00133 return y - ys - m; 00134 } 00135 00136 static inline long 00137 rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n) 00138 { 00139 const unsigned char *x = xs, *xe = xs + m; 00140 const unsigned char *y = ys; 00141 VALUE i, qstable[256]; 00142 00143 /* Preprocessing */ 00144 for (i = 0; i < 256; ++i) 00145 qstable[i] = m + 1; 00146 for (; x < xe; ++x) 00147 qstable[*x] = xe - x; 00148 /* Searching */ 00149 for (; y + m <= ys + n; y += *(qstable + y[m])) { 00150 if (*xs == *y && memcmp(xs, y, m) == 0) 00151 return y - ys; 00152 } 00153 return -1; 00154 } 00155 00156 static inline unsigned int 00157 rb_memsearch_qs_utf8_hash(const unsigned char *x) 00158 { 00159 register const unsigned int mix = 8353; 00160 register unsigned int h = *x; 00161 if (h < 0xC0) { 00162 return h + 256; 00163 } 00164 else if (h < 0xE0) { 00165 h *= mix; 00166 h += x[1]; 00167 } 00168 else if (h < 0xF0) { 00169 h *= mix; 00170 h += x[1]; 00171 h *= mix; 00172 h += x[2]; 00173 } 00174 else if (h < 0xF5) { 00175 h *= mix; 00176 h += x[1]; 00177 h *= mix; 00178 h += x[2]; 00179 h *= mix; 00180 h += x[3]; 00181 } 00182 else { 00183 return h + 256; 00184 } 00185 return (unsigned char)h; 00186 } 00187 00188 static inline long 00189 rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n) 00190 { 00191 const unsigned char *x = xs, *xe = xs + m; 00192 const unsigned char *y = ys; 00193 VALUE i, qstable[512]; 00194 00195 /* Preprocessing */ 00196 for (i = 0; i < 512; ++i) { 00197 qstable[i] = m + 1; 00198 } 00199 for (; x < xe; ++x) { 00200 qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x; 00201 } 00202 /* Searching */ 00203 for (; y + m <= ys + n; y += qstable[rb_memsearch_qs_utf8_hash(y+m)]) { 00204 if (*xs == *y && memcmp(xs, y, m) == 0) 00205 return y - ys; 00206 } 00207 return -1; 00208 } 00209 00210 long 00211 rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc) 00212 { 00213 const unsigned char *x = x0, *y = y0; 00214 00215 if (m > n) return -1; 00216 else if (m == n) { 00217 return memcmp(x0, y0, m) == 0 ? 0 : -1; 00218 } 00219 else if (m < 1) { 00220 return 0; 00221 } 00222 else if (m == 1) { 00223 const unsigned char *ys = y, *ye = ys + n; 00224 for (; y < ye; ++y) { 00225 if (*x == *y) 00226 return y - ys; 00227 } 00228 return -1; 00229 } 00230 else if (m <= SIZEOF_VALUE) { 00231 return rb_memsearch_ss(x0, m, y0, n); 00232 } 00233 else if (enc == rb_utf8_encoding()){ 00234 return rb_memsearch_qs_utf8(x0, m, y0, n); 00235 } 00236 else { 00237 return rb_memsearch_qs(x0, m, y0, n); 00238 } 00239 } 00240 00241 #define REG_LITERAL FL_USER5 00242 #define REG_ENCODING_NONE FL_USER6 00243 00244 #define KCODE_FIXED FL_USER4 00245 00246 #define ARG_REG_OPTION_MASK \ 00247 (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND) 00248 #define ARG_ENCODING_FIXED 16 00249 #define ARG_ENCODING_NONE 32 00250 00251 static int 00252 char_to_option(int c) 00253 { 00254 int val; 00255 00256 switch (c) { 00257 case 'i': 00258 val = ONIG_OPTION_IGNORECASE; 00259 break; 00260 case 'x': 00261 val = ONIG_OPTION_EXTEND; 00262 break; 00263 case 'm': 00264 val = ONIG_OPTION_MULTILINE; 00265 break; 00266 default: 00267 val = 0; 00268 break; 00269 } 00270 return val; 00271 } 00272 00273 static char * 00274 option_to_str(char str[4], int options) 00275 { 00276 char *p = str; 00277 if (options & ONIG_OPTION_MULTILINE) *p++ = 'm'; 00278 if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i'; 00279 if (options & ONIG_OPTION_EXTEND) *p++ = 'x'; 00280 *p = 0; 00281 return str; 00282 } 00283 00284 extern int 00285 rb_char_to_option_kcode(int c, int *option, int *kcode) 00286 { 00287 *option = 0; 00288 00289 switch (c) { 00290 case 'n': 00291 *kcode = rb_ascii8bit_encindex(); 00292 return (*option = ARG_ENCODING_NONE); 00293 case 'e': 00294 *kcode = rb_enc_find_index("EUC-JP"); 00295 break; 00296 case 's': 00297 *kcode = rb_enc_find_index("Windows-31J"); 00298 break; 00299 case 'u': 00300 *kcode = rb_utf8_encindex(); 00301 break; 00302 default: 00303 *kcode = -1; 00304 return (*option = char_to_option(c)); 00305 } 00306 *option = ARG_ENCODING_FIXED; 00307 return 1; 00308 } 00309 00310 static void 00311 rb_reg_check(VALUE re) 00312 { 00313 if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) { 00314 rb_raise(rb_eTypeError, "uninitialized Regexp"); 00315 } 00316 } 00317 00318 static void 00319 rb_reg_expr_str(VALUE str, const char *s, long len, 00320 rb_encoding *enc, rb_encoding *resenc) 00321 { 00322 const char *p, *pend; 00323 int cr = ENC_CODERANGE_UNKNOWN; 00324 int need_escape = 0; 00325 int c, clen; 00326 00327 p = s; pend = p + len; 00328 rb_str_coderange_scan_restartable(p, pend, enc, &cr); 00329 if (rb_enc_asciicompat(enc) && 00330 (cr == ENC_CODERANGE_VALID || cr == ENC_CODERANGE_7BIT)) { 00331 while (p < pend) { 00332 c = rb_enc_ascget(p, pend, &clen, enc); 00333 if (c == -1) { 00334 if (enc == resenc) { 00335 p += mbclen(p, pend, enc); 00336 } 00337 else { 00338 need_escape = 1; 00339 break; 00340 } 00341 } 00342 else if (c != '/' && rb_enc_isprint(c, enc)) { 00343 p += clen; 00344 } 00345 else { 00346 need_escape = 1; 00347 break; 00348 } 00349 } 00350 } 00351 else { 00352 need_escape = 1; 00353 } 00354 00355 if (!need_escape) { 00356 rb_str_buf_cat(str, s, len); 00357 } 00358 else { 00359 int unicode_p = rb_enc_unicode_p(enc); 00360 p = s; 00361 while (p<pend) { 00362 c = rb_enc_ascget(p, pend, &clen, enc); 00363 if (c == '\\' && p+clen < pend) { 00364 int n = clen + mbclen(p+clen, pend, enc); 00365 rb_str_buf_cat(str, p, n); 00366 p += n; 00367 continue; 00368 } 00369 else if (c == '/') { 00370 char c = '\\'; 00371 rb_str_buf_cat(str, &c, 1); 00372 rb_str_buf_cat(str, p, clen); 00373 } 00374 else if (c == -1) { 00375 clen = rb_enc_precise_mbclen(p, pend, enc); 00376 if (!MBCLEN_CHARFOUND_P(clen)) { 00377 c = (unsigned char)*p; 00378 clen = 1; 00379 goto hex; 00380 } 00381 if (resenc) { 00382 unsigned int c = rb_enc_mbc_to_codepoint(p, pend, enc); 00383 rb_str_buf_cat_escaped_char(str, c, unicode_p); 00384 } 00385 else { 00386 clen = MBCLEN_CHARFOUND_LEN(clen); 00387 rb_str_buf_cat(str, p, clen); 00388 } 00389 } 00390 else if (rb_enc_isprint(c, enc)) { 00391 rb_str_buf_cat(str, p, clen); 00392 } 00393 else if (!rb_enc_isspace(c, enc)) { 00394 char b[8]; 00395 00396 hex: 00397 snprintf(b, sizeof(b), "\\x%02X", c); 00398 rb_str_buf_cat(str, b, 4); 00399 } 00400 else { 00401 rb_str_buf_cat(str, p, clen); 00402 } 00403 p += clen; 00404 } 00405 } 00406 } 00407 00408 static VALUE 00409 rb_reg_desc(const char *s, long len, VALUE re) 00410 { 00411 rb_encoding *enc = rb_enc_get(re); 00412 VALUE str = rb_str_buf_new2("/"); 00413 rb_encoding *resenc = rb_default_internal_encoding(); 00414 if (resenc == NULL) resenc = rb_default_external_encoding(); 00415 00416 if (re && rb_enc_asciicompat(enc)) { 00417 rb_enc_copy(str, re); 00418 } 00419 else { 00420 rb_enc_associate(str, rb_usascii_encoding()); 00421 } 00422 rb_reg_expr_str(str, s, len, enc, resenc); 00423 rb_str_buf_cat2(str, "/"); 00424 if (re) { 00425 char opts[4]; 00426 rb_reg_check(re); 00427 if (*option_to_str(opts, RREGEXP(re)->ptr->options)) 00428 rb_str_buf_cat2(str, opts); 00429 if (RBASIC(re)->flags & REG_ENCODING_NONE) 00430 rb_str_buf_cat2(str, "n"); 00431 } 00432 OBJ_INFECT(str, re); 00433 return str; 00434 } 00435 00436 00437 /* 00438 * call-seq: 00439 * rxp.source -> str 00440 * 00441 * Returns the original string of the pattern. 00442 * 00443 * /ab+c/ix.source #=> "ab+c" 00444 * 00445 * Note that escape sequences are retained as is. 00446 * 00447 * /\x20\+/.source #=> "\\x20\\+" 00448 * 00449 */ 00450 00451 static VALUE 00452 rb_reg_source(VALUE re) 00453 { 00454 VALUE str; 00455 00456 rb_reg_check(re); 00457 str = rb_enc_str_new(RREGEXP_SRC_PTR(re),RREGEXP_SRC_LEN(re), rb_enc_get(re)); 00458 if (OBJ_TAINTED(re)) OBJ_TAINT(str); 00459 return str; 00460 } 00461 00462 /* 00463 * call-seq: 00464 * rxp.inspect -> string 00465 * 00466 * Produce a nicely formatted string-version of _rxp_. Perhaps surprisingly, 00467 * <code>#inspect</code> actually produces the more natural version of 00468 * the string than <code>#to_s</code>. 00469 * 00470 * /ab+c/ix.inspect #=> "/ab+c/ix" 00471 * 00472 */ 00473 00474 static VALUE 00475 rb_reg_inspect(VALUE re) 00476 { 00477 if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) { 00478 return rb_any_to_s(re); 00479 } 00480 return rb_reg_desc(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), re); 00481 } 00482 00483 00484 /* 00485 * call-seq: 00486 * rxp.to_s -> str 00487 * 00488 * Returns a string containing the regular expression and its options (using the 00489 * <code>(?opts:source)</code> notation. This string can be fed back in to 00490 * <code>Regexp::new</code> to a regular expression with the same semantics as 00491 * the original. (However, <code>Regexp#==</code> may not return true when 00492 * comparing the two, as the source of the regular expression itself may 00493 * differ, as the example shows). <code>Regexp#inspect</code> produces a 00494 * generally more readable version of <i>rxp</i>. 00495 * 00496 * r1 = /ab+c/ix #=> /ab+c/ix 00497 * s1 = r1.to_s #=> "(?ix-m:ab+c)" 00498 * r2 = Regexp.new(s1) #=> /(?ix-m:ab+c)/ 00499 * r1 == r2 #=> false 00500 * r1.source #=> "ab+c" 00501 * r2.source #=> "(?ix-m:ab+c)" 00502 */ 00503 00504 static VALUE 00505 rb_reg_to_s(VALUE re) 00506 { 00507 int options, opt; 00508 const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND; 00509 long len; 00510 const UChar* ptr; 00511 VALUE str = rb_str_buf_new2("(?"); 00512 char optbuf[5]; 00513 rb_encoding *enc = rb_enc_get(re); 00514 00515 rb_reg_check(re); 00516 00517 rb_enc_copy(str, re); 00518 options = RREGEXP(re)->ptr->options; 00519 ptr = (UChar*)RREGEXP_SRC_PTR(re); 00520 len = RREGEXP_SRC_LEN(re); 00521 again: 00522 if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') { 00523 int err = 1; 00524 ptr += 2; 00525 if ((len -= 2) > 0) { 00526 do { 00527 opt = char_to_option((int )*ptr); 00528 if (opt != 0) { 00529 options |= opt; 00530 } 00531 else { 00532 break; 00533 } 00534 ++ptr; 00535 } while (--len > 0); 00536 } 00537 if (len > 1 && *ptr == '-') { 00538 ++ptr; 00539 --len; 00540 do { 00541 opt = char_to_option((int )*ptr); 00542 if (opt != 0) { 00543 options &= ~opt; 00544 } 00545 else { 00546 break; 00547 } 00548 ++ptr; 00549 } while (--len > 0); 00550 } 00551 if (*ptr == ')') { 00552 --len; 00553 ++ptr; 00554 goto again; 00555 } 00556 if (*ptr == ':' && ptr[len-1] == ')') { 00557 Regexp *rp; 00558 00559 ++ptr; 00560 len -= 2; 00561 err = onig_new(&rp, ptr, ptr + len, ONIG_OPTION_DEFAULT, 00562 enc, OnigDefaultSyntax, NULL); 00563 onig_free(rp); 00564 } 00565 if (err) { 00566 options = RREGEXP(re)->ptr->options; 00567 ptr = (UChar*)RREGEXP_SRC_PTR(re); 00568 len = RREGEXP_SRC_LEN(re); 00569 } 00570 } 00571 00572 if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf); 00573 00574 if ((options & embeddable) != embeddable) { 00575 optbuf[0] = '-'; 00576 option_to_str(optbuf + 1, ~options); 00577 rb_str_buf_cat2(str, optbuf); 00578 } 00579 00580 rb_str_buf_cat2(str, ":"); 00581 rb_reg_expr_str(str, (char*)ptr, len, enc, NULL); 00582 rb_str_buf_cat2(str, ")"); 00583 rb_enc_copy(str, re); 00584 00585 OBJ_INFECT(str, re); 00586 return str; 00587 } 00588 00589 static void 00590 rb_reg_raise(const char *s, long len, const char *err, VALUE re) 00591 { 00592 volatile VALUE desc = rb_reg_desc(s, len, re); 00593 00594 rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING_PTR(desc)); 00595 } 00596 00597 static VALUE 00598 rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err) 00599 { 00600 char opts[6]; 00601 VALUE desc = rb_str_buf_new2(err); 00602 rb_encoding *resenc = rb_default_internal_encoding(); 00603 if (resenc == NULL) resenc = rb_default_external_encoding(); 00604 00605 rb_enc_associate(desc, enc); 00606 rb_str_buf_cat2(desc, ": /"); 00607 rb_reg_expr_str(desc, s, len, enc, resenc); 00608 opts[0] = '/'; 00609 option_to_str(opts + 1, options); 00610 rb_str_buf_cat2(desc, opts); 00611 return rb_exc_new3(rb_eRegexpError, desc); 00612 } 00613 00614 static void 00615 rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err) 00616 { 00617 rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err)); 00618 } 00619 00620 static VALUE 00621 rb_reg_error_desc(VALUE str, int options, const char *err) 00622 { 00623 return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str), 00624 rb_enc_get(str), options, err); 00625 } 00626 00627 static void 00628 rb_reg_raise_str(VALUE str, int options, const char *err) 00629 { 00630 rb_exc_raise(rb_reg_error_desc(str, options, err)); 00631 } 00632 00633 00634 /* 00635 * call-seq: 00636 * rxp.casefold? -> true or false 00637 * 00638 * Returns the value of the case-insensitive flag. 00639 * 00640 * /a/.casefold? #=> false 00641 * /a/i.casefold? #=> true 00642 * /(?i:a)/.casefold? #=> false 00643 */ 00644 00645 static VALUE 00646 rb_reg_casefold_p(VALUE re) 00647 { 00648 rb_reg_check(re); 00649 if (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE) return Qtrue; 00650 return Qfalse; 00651 } 00652 00653 00654 /* 00655 * call-seq: 00656 * rxp.options -> fixnum 00657 * 00658 * Returns the set of bits corresponding to the options used when creating this 00659 * Regexp (see <code>Regexp::new</code> for details. Note that additional bits 00660 * may be set in the returned options: these are used internally by the regular 00661 * expression code. These extra bits are ignored if the options are passed to 00662 * <code>Regexp::new</code>. 00663 * 00664 * Regexp::IGNORECASE #=> 1 00665 * Regexp::EXTENDED #=> 2 00666 * Regexp::MULTILINE #=> 4 00667 * 00668 * /cat/.options #=> 0 00669 * /cat/ix.options #=> 3 00670 * Regexp.new('cat', true).options #=> 1 00671 * /\xa1\xa2/e.options #=> 16 00672 * 00673 * r = /cat/ix 00674 * Regexp.new(r.source, r.options) #=> /cat/ix 00675 */ 00676 00677 static VALUE 00678 rb_reg_options_m(VALUE re) 00679 { 00680 int options = rb_reg_options(re); 00681 return INT2NUM(options); 00682 } 00683 00684 static int 00685 reg_names_iter(const OnigUChar *name, const OnigUChar *name_end, 00686 int back_num, int *back_refs, OnigRegex regex, void *arg) 00687 { 00688 VALUE ary = (VALUE)arg; 00689 rb_ary_push(ary, rb_str_new((const char *)name, name_end-name)); 00690 return 0; 00691 } 00692 00693 /* 00694 * call-seq: 00695 * rxp.names -> [name1, name2, ...] 00696 * 00697 * Returns a list of names of captures as an array of strings. 00698 * 00699 * /(?<foo>.)(?<bar>.)(?<baz>.)/.names 00700 * #=> ["foo", "bar", "baz"] 00701 * 00702 * /(?<foo>.)(?<foo>.)/.names 00703 * #=> ["foo"] 00704 * 00705 * /(.)(.)/.names 00706 * #=> [] 00707 */ 00708 00709 static VALUE 00710 rb_reg_names(VALUE re) 00711 { 00712 VALUE ary = rb_ary_new(); 00713 rb_reg_check(re); 00714 onig_foreach_name(RREGEXP(re)->ptr, reg_names_iter, (void*)ary); 00715 return ary; 00716 } 00717 00718 static int 00719 reg_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end, 00720 int back_num, int *back_refs, OnigRegex regex, void *arg) 00721 { 00722 VALUE hash = (VALUE)arg; 00723 VALUE ary = rb_ary_new2(back_num); 00724 int i; 00725 00726 for(i = 0; i < back_num; i++) 00727 rb_ary_store(ary, i, INT2NUM(back_refs[i])); 00728 00729 rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary); 00730 00731 return 0; 00732 } 00733 00734 /* 00735 * call-seq: 00736 * rxp.named_captures -> hash 00737 * 00738 * Returns a hash representing information about named captures of <i>rxp</i>. 00739 * 00740 * A key of the hash is a name of the named captures. 00741 * A value of the hash is an array which is list of indexes of corresponding 00742 * named captures. 00743 * 00744 * /(?<foo>.)(?<bar>.)/.named_captures 00745 * #=> {"foo"=>[1], "bar"=>[2]} 00746 * 00747 * /(?<foo>.)(?<foo>.)/.named_captures 00748 * #=> {"foo"=>[1, 2]} 00749 * 00750 * If there are no named captures, an empty hash is returned. 00751 * 00752 * /(.)(.)/.named_captures 00753 * #=> {} 00754 */ 00755 00756 static VALUE 00757 rb_reg_named_captures(VALUE re) 00758 { 00759 VALUE hash = rb_hash_new(); 00760 rb_reg_check(re); 00761 onig_foreach_name(RREGEXP(re)->ptr, reg_named_captures_iter, (void*)hash); 00762 return hash; 00763 } 00764 00765 static int 00766 onig_new_with_source(regex_t** reg, const UChar* pattern, const UChar* pattern_end, 00767 OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax, 00768 OnigErrorInfo* einfo, const char *sourcefile, int sourceline) 00769 { 00770 int r; 00771 00772 *reg = (regex_t* )xmalloc(sizeof(regex_t)); 00773 if (IS_NULL(*reg)) return ONIGERR_MEMORY; 00774 00775 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax); 00776 if (r) goto err; 00777 00778 r = onig_compile(*reg, pattern, pattern_end, einfo, sourcefile, sourceline); 00779 if (r) { 00780 err: 00781 onig_free(*reg); 00782 *reg = NULL; 00783 } 00784 return r; 00785 } 00786 00787 static Regexp* 00788 make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err, 00789 const char *sourcefile, int sourceline) 00790 { 00791 Regexp *rp; 00792 int r; 00793 OnigErrorInfo einfo; 00794 00795 /* Handle escaped characters first. */ 00796 00797 /* Build a copy of the string (in dest) with the 00798 escaped characters translated, and generate the regex 00799 from that. 00800 */ 00801 00802 r = onig_new_with_source(&rp, (UChar*)s, (UChar*)(s + len), flags, 00803 enc, OnigDefaultSyntax, &einfo, sourcefile, sourceline); 00804 if (r) { 00805 onig_error_code_to_str((UChar*)err, r, &einfo); 00806 return 0; 00807 } 00808 return rp; 00809 } 00810 00811 00812 /* 00813 * Document-class: MatchData 00814 * 00815 * <code>MatchData</code> is the type of the special variable <code>$~</code>, 00816 * and is the type of the object returned by <code>Regexp#match</code> and 00817 * <code>Regexp.last_match</code>. It encapsulates all the results of a pattern 00818 * match, results normally accessed through the special variables 00819 * <code>$&</code>, <code>$'</code>, <code>$`</code>, <code>$1</code>, 00820 * <code>$2</code>, and so on. 00821 * 00822 */ 00823 00824 VALUE rb_cMatch; 00825 00826 static VALUE 00827 match_alloc(VALUE klass) 00828 { 00829 NEWOBJ(match, struct RMatch); 00830 OBJSETUP(match, klass, T_MATCH); 00831 00832 match->str = 0; 00833 match->rmatch = 0; 00834 match->regexp = 0; 00835 match->rmatch = ALLOC(struct rmatch); 00836 MEMZERO(match->rmatch, struct rmatch, 1); 00837 00838 return (VALUE)match; 00839 } 00840 00841 typedef struct { 00842 long byte_pos; 00843 long char_pos; 00844 } pair_t; 00845 00846 static int 00847 pair_byte_cmp(const void *pair1, const void *pair2) 00848 { 00849 long diff = ((pair_t*)pair1)->byte_pos - ((pair_t*)pair2)->byte_pos; 00850 #if SIZEOF_LONG > SIZEOF_INT 00851 return diff ? diff > 0 ? 1 : -1 : 0; 00852 #else 00853 return (int)diff; 00854 #endif 00855 } 00856 00857 static void 00858 update_char_offset(VALUE match) 00859 { 00860 struct rmatch *rm = RMATCH(match)->rmatch; 00861 struct re_registers *regs; 00862 int i, num_regs, num_pos; 00863 long c; 00864 char *s, *p, *q; 00865 rb_encoding *enc; 00866 pair_t *pairs; 00867 00868 if (rm->char_offset_updated) 00869 return; 00870 00871 regs = &rm->regs; 00872 num_regs = rm->regs.num_regs; 00873 00874 if (rm->char_offset_num_allocated < num_regs) { 00875 REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs); 00876 rm->char_offset_num_allocated = num_regs; 00877 } 00878 00879 enc = rb_enc_get(RMATCH(match)->str); 00880 if (rb_enc_mbmaxlen(enc) == 1) { 00881 for (i = 0; i < num_regs; i++) { 00882 rm->char_offset[i].beg = BEG(i); 00883 rm->char_offset[i].end = END(i); 00884 } 00885 rm->char_offset_updated = 1; 00886 return; 00887 } 00888 00889 pairs = ALLOCA_N(pair_t, num_regs*2); 00890 num_pos = 0; 00891 for (i = 0; i < num_regs; i++) { 00892 if (BEG(i) < 0) 00893 continue; 00894 pairs[num_pos++].byte_pos = BEG(i); 00895 pairs[num_pos++].byte_pos = END(i); 00896 } 00897 qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp); 00898 00899 s = p = RSTRING_PTR(RMATCH(match)->str); 00900 c = 0; 00901 for (i = 0; i < num_pos; i++) { 00902 q = s + pairs[i].byte_pos; 00903 c += rb_enc_strlen(p, q, enc); 00904 pairs[i].char_pos = c; 00905 p = q; 00906 } 00907 00908 for (i = 0; i < num_regs; i++) { 00909 pair_t key, *found; 00910 if (BEG(i) < 0) { 00911 rm->char_offset[i].beg = -1; 00912 rm->char_offset[i].end = -1; 00913 continue; 00914 } 00915 00916 key.byte_pos = BEG(i); 00917 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp); 00918 rm->char_offset[i].beg = found->char_pos; 00919 00920 key.byte_pos = END(i); 00921 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp); 00922 rm->char_offset[i].end = found->char_pos; 00923 } 00924 00925 rm->char_offset_updated = 1; 00926 } 00927 00928 static void 00929 match_check(VALUE match) 00930 { 00931 if (!RMATCH(match)->regexp) { 00932 rb_raise(rb_eTypeError, "uninitialized Match"); 00933 } 00934 } 00935 00936 /* :nodoc: */ 00937 static VALUE 00938 match_init_copy(VALUE obj, VALUE orig) 00939 { 00940 struct rmatch *rm; 00941 00942 if (obj == orig) return obj; 00943 00944 if (!rb_obj_is_instance_of(orig, rb_obj_class(obj))) { 00945 rb_raise(rb_eTypeError, "wrong argument class"); 00946 } 00947 RMATCH(obj)->str = RMATCH(orig)->str; 00948 RMATCH(obj)->regexp = RMATCH(orig)->regexp; 00949 00950 rm = RMATCH(obj)->rmatch; 00951 onig_region_copy(&rm->regs, RMATCH_REGS(orig)); 00952 00953 if (!RMATCH(orig)->rmatch->char_offset_updated) { 00954 rm->char_offset_updated = 0; 00955 } 00956 else { 00957 if (rm->char_offset_num_allocated < rm->regs.num_regs) { 00958 REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs); 00959 rm->char_offset_num_allocated = rm->regs.num_regs; 00960 } 00961 MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset, 00962 struct rmatch_offset, rm->regs.num_regs); 00963 rm->char_offset_updated = 1; 00964 } 00965 00966 return obj; 00967 } 00968 00969 00970 /* 00971 * call-seq: 00972 * mtch.regexp -> regexp 00973 * 00974 * Returns the regexp. 00975 * 00976 * m = /a.*b/.match("abc") 00977 * m.regexp #=> /a.*b/ 00978 */ 00979 00980 static VALUE 00981 match_regexp(VALUE match) 00982 { 00983 match_check(match); 00984 return RMATCH(match)->regexp; 00985 } 00986 00987 /* 00988 * call-seq: 00989 * mtch.names -> [name1, name2, ...] 00990 * 00991 * Returns a list of names of captures as an array of strings. 00992 * It is same as mtch.regexp.names. 00993 * 00994 * /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").names 00995 * #=> ["foo", "bar", "baz"] 00996 * 00997 * m = /(?<x>.)(?<y>.)?/.match("a") #=> #<MatchData "a" x:"a" y:nil> 00998 * m.names #=> ["x", "y"] 00999 */ 01000 01001 static VALUE 01002 match_names(VALUE match) 01003 { 01004 match_check(match); 01005 return rb_reg_names(RMATCH(match)->regexp); 01006 } 01007 01008 /* 01009 * call-seq: 01010 * mtch.length -> integer 01011 * mtch.size -> integer 01012 * 01013 * Returns the number of elements in the match array. 01014 * 01015 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01016 * m.length #=> 5 01017 * m.size #=> 5 01018 */ 01019 01020 static VALUE 01021 match_size(VALUE match) 01022 { 01023 match_check(match); 01024 return INT2FIX(RMATCH_REGS(match)->num_regs); 01025 } 01026 01027 static int 01028 match_backref_number(VALUE match, VALUE backref) 01029 { 01030 const char *name; 01031 int num; 01032 01033 struct re_registers *regs = RMATCH_REGS(match); 01034 VALUE regexp = RMATCH(match)->regexp; 01035 01036 match_check(match); 01037 switch(TYPE(backref)) { 01038 default: 01039 return NUM2INT(backref); 01040 01041 case T_SYMBOL: 01042 name = rb_id2name(SYM2ID(backref)); 01043 break; 01044 01045 case T_STRING: 01046 name = StringValueCStr(backref); 01047 break; 01048 } 01049 01050 num = onig_name_to_backref_number(RREGEXP(regexp)->ptr, 01051 (const unsigned char*)name, 01052 (const unsigned char*)name + strlen(name), 01053 regs); 01054 01055 if (num < 1) { 01056 rb_raise(rb_eIndexError, "undefined group name reference: %s", name); 01057 } 01058 01059 return num; 01060 } 01061 01062 int 01063 rb_reg_backref_number(VALUE match, VALUE backref) 01064 { 01065 return match_backref_number(match, backref); 01066 } 01067 01068 /* 01069 * call-seq: 01070 * mtch.offset(n) -> array 01071 * 01072 * Returns a two-element array containing the beginning and ending offsets of 01073 * the <em>n</em>th match. 01074 * <em>n</em> can be a string or symbol to reference a named capture. 01075 * 01076 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01077 * m.offset(0) #=> [1, 7] 01078 * m.offset(4) #=> [6, 7] 01079 * 01080 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge") 01081 * p m.offset(:foo) #=> [0, 1] 01082 * p m.offset(:bar) #=> [2, 3] 01083 * 01084 */ 01085 01086 static VALUE 01087 match_offset(VALUE match, VALUE n) 01088 { 01089 int i = match_backref_number(match, n); 01090 struct re_registers *regs = RMATCH_REGS(match); 01091 01092 match_check(match); 01093 if (i < 0 || regs->num_regs <= i) 01094 rb_raise(rb_eIndexError, "index %d out of matches", i); 01095 01096 if (BEG(i) < 0) 01097 return rb_assoc_new(Qnil, Qnil); 01098 01099 update_char_offset(match); 01100 return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg), 01101 INT2FIX(RMATCH(match)->rmatch->char_offset[i].end)); 01102 } 01103 01104 01105 /* 01106 * call-seq: 01107 * mtch.begin(n) -> integer 01108 * 01109 * Returns the offset of the start of the <em>n</em>th element of the match 01110 * array in the string. 01111 * <em>n</em> can be a string or symbol to reference a named capture. 01112 * 01113 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01114 * m.begin(0) #=> 1 01115 * m.begin(2) #=> 2 01116 * 01117 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge") 01118 * p m.begin(:foo) #=> 0 01119 * p m.begin(:bar) #=> 2 01120 */ 01121 01122 static VALUE 01123 match_begin(VALUE match, VALUE n) 01124 { 01125 int i = match_backref_number(match, n); 01126 struct re_registers *regs = RMATCH_REGS(match); 01127 01128 match_check(match); 01129 if (i < 0 || regs->num_regs <= i) 01130 rb_raise(rb_eIndexError, "index %d out of matches", i); 01131 01132 if (BEG(i) < 0) 01133 return Qnil; 01134 01135 update_char_offset(match); 01136 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg); 01137 } 01138 01139 01140 /* 01141 * call-seq: 01142 * mtch.end(n) -> integer 01143 * 01144 * Returns the offset of the character immediately following the end of the 01145 * <em>n</em>th element of the match array in the string. 01146 * <em>n</em> can be a string or symbol to reference a named capture. 01147 * 01148 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01149 * m.end(0) #=> 7 01150 * m.end(2) #=> 3 01151 * 01152 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge") 01153 * p m.end(:foo) #=> 1 01154 * p m.end(:bar) #=> 3 01155 */ 01156 01157 static VALUE 01158 match_end(VALUE match, VALUE n) 01159 { 01160 int i = match_backref_number(match, n); 01161 struct re_registers *regs = RMATCH_REGS(match); 01162 01163 match_check(match); 01164 if (i < 0 || regs->num_regs <= i) 01165 rb_raise(rb_eIndexError, "index %d out of matches", i); 01166 01167 if (BEG(i) < 0) 01168 return Qnil; 01169 01170 update_char_offset(match); 01171 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end); 01172 } 01173 01174 #define MATCH_BUSY FL_USER2 01175 01176 void 01177 rb_match_busy(VALUE match) 01178 { 01179 FL_SET(match, MATCH_BUSY); 01180 } 01181 01182 /* 01183 * call-seq: 01184 * rxp.fixed_encoding? -> true or false 01185 * 01186 * Returns false if rxp is applicable to 01187 * a string with any ASCII compatible encoding. 01188 * Returns true otherwise. 01189 * 01190 * r = /a/ 01191 * r.fixed_encoding? #=> false 01192 * r =~ "\u{6666} a" #=> 2 01193 * r =~ "\xa1\xa2 a".force_encoding("euc-jp") #=> 2 01194 * r =~ "abc".force_encoding("euc-jp") #=> 0 01195 * 01196 * r = /a/u 01197 * r.fixed_encoding? #=> true 01198 * r.encoding #=> #<Encoding:UTF-8> 01199 * r =~ "\u{6666} a" #=> 2 01200 * r =~ "\xa1\xa2".force_encoding("euc-jp") #=> ArgumentError 01201 * r =~ "abc".force_encoding("euc-jp") #=> 0 01202 * 01203 * r = /\u{6666}/ 01204 * r.fixed_encoding? #=> true 01205 * r.encoding #=> #<Encoding:UTF-8> 01206 * r =~ "\u{6666} a" #=> 0 01207 * r =~ "\xa1\xa2".force_encoding("euc-jp") #=> ArgumentError 01208 * r =~ "abc".force_encoding("euc-jp") #=> nil 01209 */ 01210 01211 static VALUE 01212 rb_reg_fixed_encoding_p(VALUE re) 01213 { 01214 if (FL_TEST(re, KCODE_FIXED)) 01215 return Qtrue; 01216 else 01217 return Qfalse; 01218 } 01219 01220 static VALUE 01221 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc, 01222 rb_encoding **fixed_enc, onig_errmsg_buffer err); 01223 01224 01225 static void 01226 reg_enc_error(VALUE re, VALUE str) 01227 { 01228 rb_raise(rb_eEncCompatError, 01229 "incompatible encoding regexp match (%s regexp with %s string)", 01230 rb_enc_name(rb_enc_get(re)), 01231 rb_enc_name(rb_enc_get(str))); 01232 } 01233 01234 static rb_encoding* 01235 rb_reg_prepare_enc(VALUE re, VALUE str, int warn) 01236 { 01237 rb_encoding *enc = 0; 01238 01239 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) { 01240 rb_raise(rb_eArgError, 01241 "invalid byte sequence in %s", 01242 rb_enc_name(rb_enc_get(str))); 01243 } 01244 01245 rb_reg_check(re); 01246 enc = rb_enc_get(str); 01247 if (!rb_enc_str_asciicompat_p(str)) { 01248 if (RREGEXP(re)->ptr->enc != enc) { 01249 reg_enc_error(re, str); 01250 } 01251 } 01252 else if (rb_reg_fixed_encoding_p(re)) { 01253 if (RREGEXP(re)->ptr->enc != enc && 01254 (!rb_enc_asciicompat(RREGEXP(re)->ptr->enc) || 01255 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)) { 01256 reg_enc_error(re, str); 01257 } 01258 enc = RREGEXP(re)->ptr->enc; 01259 } 01260 if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) && 01261 enc != rb_ascii8bit_encoding() && 01262 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) { 01263 rb_warn("regexp match /.../n against to %s string", 01264 rb_enc_name(enc)); 01265 } 01266 return enc; 01267 } 01268 01269 regex_t * 01270 rb_reg_prepare_re(VALUE re, VALUE str) 01271 { 01272 regex_t *reg = RREGEXP(re)->ptr; 01273 onig_errmsg_buffer err = ""; 01274 int r; 01275 OnigErrorInfo einfo; 01276 const char *pattern; 01277 VALUE unescaped; 01278 rb_encoding *fixed_enc = 0; 01279 rb_encoding *enc = rb_reg_prepare_enc(re, str, 1); 01280 01281 if (reg->enc == enc) return reg; 01282 01283 rb_reg_check(re); 01284 reg = RREGEXP(re)->ptr; 01285 pattern = RREGEXP_SRC_PTR(re); 01286 01287 unescaped = rb_reg_preprocess( 01288 pattern, pattern + RREGEXP_SRC_LEN(re), enc, 01289 &fixed_enc, err); 01290 01291 if (unescaped == Qnil) { 01292 rb_raise(rb_eArgError, "regexp preprocess failed: %s", err); 01293 } 01294 01295 r = onig_new(®, (UChar* )RSTRING_PTR(unescaped), 01296 (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)), 01297 reg->options, enc, 01298 OnigDefaultSyntax, &einfo); 01299 if (r) { 01300 onig_error_code_to_str((UChar*)err, r, &einfo); 01301 rb_reg_raise(pattern, RREGEXP_SRC_LEN(re), err, re); 01302 } 01303 01304 RB_GC_GUARD(unescaped); 01305 return reg; 01306 } 01307 01308 long 01309 rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse) 01310 { 01311 long range; 01312 rb_encoding *enc; 01313 UChar *p, *string; 01314 01315 enc = rb_reg_prepare_enc(re, str, 0); 01316 01317 if (reverse) { 01318 range = -pos; 01319 } 01320 else { 01321 range = RSTRING_LEN(str) - pos; 01322 } 01323 01324 if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(str)) { 01325 string = (UChar*)RSTRING_PTR(str); 01326 01327 if (range > 0) { 01328 p = onigenc_get_right_adjust_char_head(enc, string, string + pos, string + RSTRING_LEN(str)); 01329 } 01330 else { 01331 p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, string, string + pos, string + RSTRING_LEN(str)); 01332 } 01333 return p - string; 01334 } 01335 01336 return pos; 01337 } 01338 01339 long 01340 rb_reg_search(VALUE re, VALUE str, long pos, int reverse) 01341 { 01342 long result; 01343 VALUE match; 01344 struct re_registers regi, *regs = ®i; 01345 char *range = RSTRING_PTR(str); 01346 regex_t *reg; 01347 int tmpreg; 01348 01349 if (pos > RSTRING_LEN(str) || pos < 0) { 01350 rb_backref_set(Qnil); 01351 return -1; 01352 } 01353 01354 reg = rb_reg_prepare_re(re, str); 01355 tmpreg = reg != RREGEXP(re)->ptr; 01356 if (!tmpreg) RREGEXP(re)->usecnt++; 01357 01358 match = rb_backref_get(); 01359 if (!NIL_P(match)) { 01360 if (FL_TEST(match, MATCH_BUSY)) { 01361 match = Qnil; 01362 } 01363 else { 01364 regs = RMATCH_REGS(match); 01365 } 01366 } 01367 if (NIL_P(match)) { 01368 MEMZERO(regs, struct re_registers, 1); 01369 } 01370 if (!reverse) { 01371 range += RSTRING_LEN(str); 01372 } 01373 result = onig_search(reg, 01374 (UChar*)(RSTRING_PTR(str)), 01375 ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)), 01376 ((UChar*)(RSTRING_PTR(str)) + pos), 01377 ((UChar*)range), 01378 regs, ONIG_OPTION_NONE); 01379 if (!tmpreg) RREGEXP(re)->usecnt--; 01380 if (tmpreg) { 01381 if (RREGEXP(re)->usecnt) { 01382 onig_free(reg); 01383 } 01384 else { 01385 onig_free(RREGEXP(re)->ptr); 01386 RREGEXP(re)->ptr = reg; 01387 } 01388 } 01389 if (result < 0) { 01390 if (regs == ®i) 01391 onig_region_free(regs, 0); 01392 if (result == ONIG_MISMATCH) { 01393 rb_backref_set(Qnil); 01394 return result; 01395 } 01396 else { 01397 onig_errmsg_buffer err = ""; 01398 onig_error_code_to_str((UChar*)err, (int)result); 01399 rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re); 01400 } 01401 } 01402 01403 if (NIL_P(match)) { 01404 match = match_alloc(rb_cMatch); 01405 onig_region_copy(RMATCH_REGS(match), regs); 01406 onig_region_free(regs, 0); 01407 } 01408 else { 01409 if (rb_safe_level() >= 3) 01410 OBJ_TAINT(match); 01411 else 01412 FL_UNSET(match, FL_TAINT); 01413 } 01414 01415 RMATCH(match)->str = rb_str_new4(str); 01416 RMATCH(match)->regexp = re; 01417 RMATCH(match)->rmatch->char_offset_updated = 0; 01418 rb_backref_set(match); 01419 01420 OBJ_INFECT(match, re); 01421 OBJ_INFECT(match, str); 01422 01423 return result; 01424 } 01425 01426 VALUE 01427 rb_reg_nth_defined(int nth, VALUE match) 01428 { 01429 struct re_registers *regs; 01430 if (NIL_P(match)) return Qnil; 01431 match_check(match); 01432 regs = RMATCH_REGS(match); 01433 if (nth >= regs->num_regs) { 01434 return Qnil; 01435 } 01436 if (nth < 0) { 01437 nth += regs->num_regs; 01438 if (nth <= 0) return Qnil; 01439 } 01440 if (BEG(nth) == -1) return Qfalse; 01441 return Qtrue; 01442 } 01443 01444 VALUE 01445 rb_reg_nth_match(int nth, VALUE match) 01446 { 01447 VALUE str; 01448 long start, end, len; 01449 struct re_registers *regs; 01450 01451 if (NIL_P(match)) return Qnil; 01452 match_check(match); 01453 regs = RMATCH_REGS(match); 01454 if (nth >= regs->num_regs) { 01455 return Qnil; 01456 } 01457 if (nth < 0) { 01458 nth += regs->num_regs; 01459 if (nth <= 0) return Qnil; 01460 } 01461 start = BEG(nth); 01462 if (start == -1) return Qnil; 01463 end = END(nth); 01464 len = end - start; 01465 str = rb_str_subseq(RMATCH(match)->str, start, len); 01466 OBJ_INFECT(str, match); 01467 return str; 01468 } 01469 01470 VALUE 01471 rb_reg_last_match(VALUE match) 01472 { 01473 return rb_reg_nth_match(0, match); 01474 } 01475 01476 01477 /* 01478 * call-seq: 01479 * mtch.pre_match -> str 01480 * 01481 * Returns the portion of the original string before the current match. 01482 * Equivalent to the special variable <code>$`</code>. 01483 * 01484 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01485 * m.pre_match #=> "T" 01486 */ 01487 01488 VALUE 01489 rb_reg_match_pre(VALUE match) 01490 { 01491 VALUE str; 01492 struct re_registers *regs; 01493 01494 if (NIL_P(match)) return Qnil; 01495 match_check(match); 01496 regs = RMATCH_REGS(match); 01497 if (BEG(0) == -1) return Qnil; 01498 str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0)); 01499 if (OBJ_TAINTED(match)) OBJ_TAINT(str); 01500 return str; 01501 } 01502 01503 01504 /* 01505 * call-seq: 01506 * mtch.post_match -> str 01507 * 01508 * Returns the portion of the original string after the current match. 01509 * Equivalent to the special variable <code>$'</code>. 01510 * 01511 * m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie") 01512 * m.post_match #=> ": The Movie" 01513 */ 01514 01515 VALUE 01516 rb_reg_match_post(VALUE match) 01517 { 01518 VALUE str; 01519 long pos; 01520 struct re_registers *regs; 01521 01522 if (NIL_P(match)) return Qnil; 01523 match_check(match); 01524 regs = RMATCH_REGS(match); 01525 if (BEG(0) == -1) return Qnil; 01526 str = RMATCH(match)->str; 01527 pos = END(0); 01528 str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos); 01529 if (OBJ_TAINTED(match)) OBJ_TAINT(str); 01530 return str; 01531 } 01532 01533 VALUE 01534 rb_reg_match_last(VALUE match) 01535 { 01536 int i; 01537 struct re_registers *regs; 01538 01539 if (NIL_P(match)) return Qnil; 01540 match_check(match); 01541 regs = RMATCH_REGS(match); 01542 if (BEG(0) == -1) return Qnil; 01543 01544 for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--) 01545 ; 01546 if (i == 0) return Qnil; 01547 return rb_reg_nth_match(i, match); 01548 } 01549 01550 static VALUE 01551 last_match_getter(void) 01552 { 01553 return rb_reg_last_match(rb_backref_get()); 01554 } 01555 01556 static VALUE 01557 prematch_getter(void) 01558 { 01559 return rb_reg_match_pre(rb_backref_get()); 01560 } 01561 01562 static VALUE 01563 postmatch_getter(void) 01564 { 01565 return rb_reg_match_post(rb_backref_get()); 01566 } 01567 01568 static VALUE 01569 last_paren_match_getter(void) 01570 { 01571 return rb_reg_match_last(rb_backref_get()); 01572 } 01573 01574 static VALUE 01575 match_array(VALUE match, int start) 01576 { 01577 struct re_registers *regs; 01578 VALUE ary; 01579 VALUE target; 01580 int i; 01581 int taint = OBJ_TAINTED(match); 01582 01583 match_check(match); 01584 regs = RMATCH_REGS(match); 01585 ary = rb_ary_new2(regs->num_regs); 01586 target = RMATCH(match)->str; 01587 01588 for (i=start; i<regs->num_regs; i++) { 01589 if (regs->beg[i] == -1) { 01590 rb_ary_push(ary, Qnil); 01591 } 01592 else { 01593 VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]); 01594 if (taint) OBJ_TAINT(str); 01595 rb_ary_push(ary, str); 01596 } 01597 } 01598 return ary; 01599 } 01600 01601 01602 /* [MG]:FIXME: I put parens around the /.../.match() in the first line of the 01603 second example to prevent the '*' followed by a '/' from ending the 01604 comment. */ 01605 01606 /* 01607 * call-seq: 01608 * mtch.to_a -> anArray 01609 * 01610 * Returns the array of matches. 01611 * 01612 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01613 * m.to_a #=> ["HX1138", "H", "X", "113", "8"] 01614 * 01615 * Because <code>to_a</code> is called when expanding 01616 * <code>*</code><em>variable</em>, there's a useful assignment 01617 * shortcut for extracting matched fields. This is slightly slower than 01618 * accessing the fields directly (as an intermediate array is 01619 * generated). 01620 * 01621 * all,f1,f2,f3 = *(/(.)(.)(\d+)(\d)/.match("THX1138.")) 01622 * all #=> "HX1138" 01623 * f1 #=> "H" 01624 * f2 #=> "X" 01625 * f3 #=> "113" 01626 */ 01627 01628 static VALUE 01629 match_to_a(VALUE match) 01630 { 01631 return match_array(match, 0); 01632 } 01633 01634 01635 /* 01636 * call-seq: 01637 * mtch.captures -> array 01638 * 01639 * Returns the array of captures; equivalent to <code>mtch.to_a[1..-1]</code>. 01640 * 01641 * f1,f2,f3,f4 = /(.)(.)(\d+)(\d)/.match("THX1138.").captures 01642 * f1 #=> "H" 01643 * f2 #=> "X" 01644 * f3 #=> "113" 01645 * f4 #=> "8" 01646 */ 01647 static VALUE 01648 match_captures(VALUE match) 01649 { 01650 return match_array(match, 1); 01651 } 01652 01653 static int 01654 name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end) 01655 { 01656 int num; 01657 01658 num = onig_name_to_backref_number(RREGEXP(regexp)->ptr, 01659 (const unsigned char* )name, (const unsigned char* )name_end, regs); 01660 if (num >= 1) { 01661 return num; 01662 } 01663 else { 01664 VALUE s = rb_str_new(name, (long )(name_end - name)); 01665 rb_raise(rb_eIndexError, "undefined group name reference: %s", 01666 StringValuePtr(s)); 01667 } 01668 } 01669 01670 /* 01671 * call-seq: 01672 * mtch[i] -> str or nil 01673 * mtch[start, length] -> array 01674 * mtch[range] -> array 01675 * mtch[name] -> str or nil 01676 * 01677 * Match Reference---<code>MatchData</code> acts as an array, and may be 01678 * accessed using the normal array indexing techniques. <i>mtch</i>[0] is 01679 * equivalent to the special variable <code>$&</code>, and returns the entire 01680 * matched string. <i>mtch</i>[1], <i>mtch</i>[2], and so on return the values 01681 * of the matched backreferences (portions of the pattern between parentheses). 01682 * 01683 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01684 * m #=> #<MatchData "HX1138" 1:"H" 2:"X" 3:"113" 4:"8"> 01685 * m[0] #=> "HX1138" 01686 * m[1, 2] #=> ["H", "X"] 01687 * m[1..3] #=> ["H", "X", "113"] 01688 * m[-3, 2] #=> ["X", "113"] 01689 * 01690 * m = /(?<foo>a+)b/.match("ccaaab") 01691 * m #=> #<MatchData "aaab" foo:"aaa"> 01692 * m["foo"] #=> "aaa" 01693 * m[:foo] #=> "aaa" 01694 */ 01695 01696 static VALUE 01697 match_aref(int argc, VALUE *argv, VALUE match) 01698 { 01699 VALUE idx, rest; 01700 01701 match_check(match); 01702 rb_scan_args(argc, argv, "11", &idx, &rest); 01703 01704 if (NIL_P(rest)) { 01705 if (FIXNUM_P(idx)) { 01706 if (FIX2INT(idx) >= 0) { 01707 return rb_reg_nth_match(FIX2INT(idx), match); 01708 } 01709 } 01710 else { 01711 const char *p; 01712 int num; 01713 01714 switch (TYPE(idx)) { 01715 case T_SYMBOL: 01716 p = rb_id2name(SYM2ID(idx)); 01717 goto name_to_backref; 01718 break; 01719 case T_STRING: 01720 p = StringValuePtr(idx); 01721 01722 name_to_backref: 01723 num = name_to_backref_number(RMATCH_REGS(match), 01724 RMATCH(match)->regexp, p, p + strlen(p)); 01725 return rb_reg_nth_match(num, match); 01726 break; 01727 01728 default: 01729 break; 01730 } 01731 } 01732 } 01733 01734 return rb_ary_aref(argc, argv, match_to_a(match)); 01735 } 01736 01737 static VALUE 01738 match_entry(VALUE match, long n) 01739 { 01740 /* n should not exceed num_regs */ 01741 return rb_reg_nth_match((int)n, match); 01742 } 01743 01744 01745 /* 01746 * call-seq: 01747 * 01748 * mtch.values_at([index]*) -> array 01749 * 01750 * Uses each <i>index</i> to access the matching values, returning an array of 01751 * the corresponding matches. 01752 * 01753 * m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie") 01754 * m.to_a #=> ["HX1138", "H", "X", "113", "8"] 01755 * m.values_at(0, 2, -2) #=> ["HX1138", "X", "113"] 01756 */ 01757 01758 static VALUE 01759 match_values_at(int argc, VALUE *argv, VALUE match) 01760 { 01761 struct re_registers *regs; 01762 01763 match_check(match); 01764 regs = RMATCH_REGS(match); 01765 return rb_get_values_at(match, regs->num_regs, argc, argv, match_entry); 01766 } 01767 01768 01769 /* 01770 * call-seq: 01771 * mtch.to_s -> str 01772 * 01773 * Returns the entire matched string. 01774 * 01775 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01776 * m.to_s #=> "HX1138" 01777 */ 01778 01779 static VALUE 01780 match_to_s(VALUE match) 01781 { 01782 VALUE str = rb_reg_last_match(match); 01783 01784 match_check(match); 01785 if (NIL_P(str)) str = rb_str_new(0,0); 01786 if (OBJ_TAINTED(match)) OBJ_TAINT(str); 01787 if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str); 01788 return str; 01789 } 01790 01791 01792 /* 01793 * call-seq: 01794 * mtch.string -> str 01795 * 01796 * Returns a frozen copy of the string passed in to <code>match</code>. 01797 * 01798 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01799 * m.string #=> "THX1138." 01800 */ 01801 01802 static VALUE 01803 match_string(VALUE match) 01804 { 01805 match_check(match); 01806 return RMATCH(match)->str; /* str is frozen */ 01807 } 01808 01809 struct backref_name_tag { 01810 const UChar *name; 01811 long len; 01812 }; 01813 01814 static int 01815 match_inspect_name_iter(const OnigUChar *name, const OnigUChar *name_end, 01816 int back_num, int *back_refs, OnigRegex regex, void *arg0) 01817 { 01818 struct backref_name_tag *arg = (struct backref_name_tag *)arg0; 01819 int i; 01820 01821 for (i = 0; i < back_num; i++) { 01822 arg[back_refs[i]].name = name; 01823 arg[back_refs[i]].len = name_end - name; 01824 } 01825 return 0; 01826 } 01827 01828 /* 01829 * call-seq: 01830 * mtch.inspect -> str 01831 * 01832 * Returns a printable version of <i>mtch</i>. 01833 * 01834 * puts /.$/.match("foo").inspect 01835 * #=> #<MatchData "o"> 01836 * 01837 * puts /(.)(.)(.)/.match("foo").inspect 01838 * #=> #<MatchData "foo" 1:"f" 2:"o" 3:"o"> 01839 * 01840 * puts /(.)(.)?(.)/.match("fo").inspect 01841 * #=> #<MatchData "fo" 1:"f" 2:nil 3:"o"> 01842 * 01843 * puts /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").inspect 01844 * #=> #<MatchData "hog" foo:"h" bar:"o" baz:"g"> 01845 * 01846 */ 01847 01848 static VALUE 01849 match_inspect(VALUE match) 01850 { 01851 const char *cname = rb_obj_classname(match); 01852 VALUE str; 01853 int i; 01854 struct re_registers *regs = RMATCH_REGS(match); 01855 int num_regs = regs->num_regs; 01856 struct backref_name_tag *names; 01857 VALUE regexp = RMATCH(match)->regexp; 01858 01859 if (regexp == 0) { 01860 return rb_sprintf("#<%s:%p>", cname, (void*)match); 01861 } 01862 01863 names = ALLOCA_N(struct backref_name_tag, num_regs); 01864 MEMZERO(names, struct backref_name_tag, num_regs); 01865 01866 onig_foreach_name(RREGEXP(regexp)->ptr, 01867 match_inspect_name_iter, names); 01868 01869 str = rb_str_buf_new2("#<"); 01870 rb_str_buf_cat2(str, cname); 01871 01872 for (i = 0; i < num_regs; i++) { 01873 VALUE v; 01874 rb_str_buf_cat2(str, " "); 01875 if (0 < i) { 01876 if (names[i].name) 01877 rb_str_buf_cat(str, (const char *)names[i].name, names[i].len); 01878 else { 01879 rb_str_catf(str, "%d", i); 01880 } 01881 rb_str_buf_cat2(str, ":"); 01882 } 01883 v = rb_reg_nth_match(i, match); 01884 if (v == Qnil) 01885 rb_str_buf_cat2(str, "nil"); 01886 else 01887 rb_str_buf_append(str, rb_str_inspect(v)); 01888 } 01889 rb_str_buf_cat2(str, ">"); 01890 01891 return str; 01892 } 01893 01894 VALUE rb_cRegexp; 01895 01896 static int 01897 read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err) 01898 { 01899 const char *p = *pp; 01900 int code; 01901 int meta_prefix = 0, ctrl_prefix = 0; 01902 size_t len; 01903 01904 if (p == end || *p++ != '\\') { 01905 errcpy(err, "too short escaped multibyte character"); 01906 return -1; 01907 } 01908 01909 again: 01910 if (p == end) { 01911 errcpy(err, "too short escape sequence"); 01912 return -1; 01913 } 01914 switch (*p++) { 01915 case '\\': code = '\\'; break; 01916 case 'n': code = '\n'; break; 01917 case 't': code = '\t'; break; 01918 case 'r': code = '\r'; break; 01919 case 'f': code = '\f'; break; 01920 case 'v': code = '\013'; break; 01921 case 'a': code = '\007'; break; 01922 case 'e': code = '\033'; break; 01923 01924 /* \OOO */ 01925 case '0': case '1': case '2': case '3': 01926 case '4': case '5': case '6': case '7': 01927 p--; 01928 code = scan_oct(p, end < p+3 ? end-p : 3, &len); 01929 p += len; 01930 break; 01931 01932 case 'x': /* \xHH */ 01933 code = scan_hex(p, end < p+2 ? end-p : 2, &len); 01934 if (len < 1) { 01935 errcpy(err, "invalid hex escape"); 01936 return -1; 01937 } 01938 p += len; 01939 break; 01940 01941 case 'M': /* \M-X, \M-\C-X, \M-\cX */ 01942 if (meta_prefix) { 01943 errcpy(err, "duplicate meta escape"); 01944 return -1; 01945 } 01946 meta_prefix = 1; 01947 if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) { 01948 if (*p == '\\') { 01949 p++; 01950 goto again; 01951 } 01952 else { 01953 code = *p++; 01954 break; 01955 } 01956 } 01957 errcpy(err, "too short meta escape"); 01958 return -1; 01959 01960 case 'C': /* \C-X, \C-\M-X */ 01961 if (p == end || *p++ != '-') { 01962 errcpy(err, "too short control escape"); 01963 return -1; 01964 } 01965 case 'c': /* \cX, \c\M-X */ 01966 if (ctrl_prefix) { 01967 errcpy(err, "duplicate control escape"); 01968 return -1; 01969 } 01970 ctrl_prefix = 1; 01971 if (p < end && (*p & 0x80) == 0) { 01972 if (*p == '\\') { 01973 p++; 01974 goto again; 01975 } 01976 else { 01977 code = *p++; 01978 break; 01979 } 01980 } 01981 errcpy(err, "too short control escape"); 01982 return -1; 01983 01984 default: 01985 errcpy(err, "unexpected escape sequence"); 01986 return -1; 01987 } 01988 if (code < 0 || 0xff < code) { 01989 errcpy(err, "invalid escape code"); 01990 return -1; 01991 } 01992 01993 if (ctrl_prefix) 01994 code &= 0x1f; 01995 if (meta_prefix) 01996 code |= 0x80; 01997 01998 *pp = p; 01999 return code; 02000 } 02001 02002 static int 02003 unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc, 02004 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err) 02005 { 02006 const char *p = *pp; 02007 int chmaxlen = rb_enc_mbmaxlen(enc); 02008 char *chbuf = ALLOCA_N(char, chmaxlen); 02009 int chlen = 0; 02010 int byte; 02011 int l; 02012 02013 memset(chbuf, 0, chmaxlen); 02014 02015 byte = read_escaped_byte(&p, end, err); 02016 if (byte == -1) { 02017 return -1; 02018 } 02019 02020 chbuf[chlen++] = byte; 02021 while (chlen < chmaxlen && 02022 MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) { 02023 byte = read_escaped_byte(&p, end, err); 02024 if (byte == -1) { 02025 return -1; 02026 } 02027 chbuf[chlen++] = byte; 02028 } 02029 02030 l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc); 02031 if (MBCLEN_INVALID_P(l)) { 02032 errcpy(err, "invalid multibyte escape"); 02033 return -1; 02034 } 02035 if (1 < chlen || (chbuf[0] & 0x80)) { 02036 rb_str_buf_cat(buf, chbuf, chlen); 02037 02038 if (*encp == 0) 02039 *encp = enc; 02040 else if (*encp != enc) { 02041 errcpy(err, "escaped non ASCII character in UTF-8 regexp"); 02042 return -1; 02043 } 02044 } 02045 else { 02046 char escbuf[5]; 02047 snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff); 02048 rb_str_buf_cat(buf, escbuf, 4); 02049 } 02050 *pp = p; 02051 return 0; 02052 } 02053 02054 static int 02055 check_unicode_range(unsigned long code, onig_errmsg_buffer err) 02056 { 02057 if ((0xd800 <= code && code <= 0xdfff) || /* Surrogates */ 02058 0x10ffff < code) { 02059 errcpy(err, "invalid Unicode range"); 02060 return -1; 02061 } 02062 return 0; 02063 } 02064 02065 static int 02066 append_utf8(unsigned long uv, 02067 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err) 02068 { 02069 if (check_unicode_range(uv, err) != 0) 02070 return -1; 02071 if (uv < 0x80) { 02072 char escbuf[5]; 02073 snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv); 02074 rb_str_buf_cat(buf, escbuf, 4); 02075 } 02076 else { 02077 int len; 02078 char utf8buf[6]; 02079 len = rb_uv_to_utf8(utf8buf, uv); 02080 rb_str_buf_cat(buf, utf8buf, len); 02081 02082 if (*encp == 0) 02083 *encp = rb_utf8_encoding(); 02084 else if (*encp != rb_utf8_encoding()) { 02085 errcpy(err, "UTF-8 character in non UTF-8 regexp"); 02086 return -1; 02087 } 02088 } 02089 return 0; 02090 } 02091 02092 static int 02093 unescape_unicode_list(const char **pp, const char *end, 02094 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err) 02095 { 02096 const char *p = *pp; 02097 int has_unicode = 0; 02098 unsigned long code; 02099 size_t len; 02100 02101 while (p < end && ISSPACE(*p)) p++; 02102 02103 while (1) { 02104 code = ruby_scan_hex(p, end-p, &len); 02105 if (len == 0) 02106 break; 02107 if (6 < len) { /* max 10FFFF */ 02108 errcpy(err, "invalid Unicode range"); 02109 return -1; 02110 } 02111 p += len; 02112 if (append_utf8(code, buf, encp, err) != 0) 02113 return -1; 02114 has_unicode = 1; 02115 02116 while (p < end && ISSPACE(*p)) p++; 02117 } 02118 02119 if (has_unicode == 0) { 02120 errcpy(err, "invalid Unicode list"); 02121 return -1; 02122 } 02123 02124 *pp = p; 02125 02126 return 0; 02127 } 02128 02129 static int 02130 unescape_unicode_bmp(const char **pp, const char *end, 02131 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err) 02132 { 02133 const char *p = *pp; 02134 size_t len; 02135 unsigned long code; 02136 02137 if (end < p+4) { 02138 errcpy(err, "invalid Unicode escape"); 02139 return -1; 02140 } 02141 code = ruby_scan_hex(p, 4, &len); 02142 if (len != 4) { 02143 errcpy(err, "invalid Unicode escape"); 02144 return -1; 02145 } 02146 if (append_utf8(code, buf, encp, err) != 0) 02147 return -1; 02148 *pp = p + 4; 02149 return 0; 02150 } 02151 02152 static int 02153 unescape_nonascii(const char *p, const char *end, rb_encoding *enc, 02154 VALUE buf, rb_encoding **encp, int *has_property, 02155 onig_errmsg_buffer err) 02156 { 02157 char c; 02158 char smallbuf[2]; 02159 02160 while (p < end) { 02161 int chlen = rb_enc_precise_mbclen(p, end, enc); 02162 if (!MBCLEN_CHARFOUND_P(chlen)) { 02163 errcpy(err, "invalid multibyte character"); 02164 return -1; 02165 } 02166 chlen = MBCLEN_CHARFOUND_LEN(chlen); 02167 if (1 < chlen || (*p & 0x80)) { 02168 rb_str_buf_cat(buf, p, chlen); 02169 p += chlen; 02170 if (*encp == 0) 02171 *encp = enc; 02172 else if (*encp != enc) { 02173 errcpy(err, "non ASCII character in UTF-8 regexp"); 02174 return -1; 02175 } 02176 continue; 02177 } 02178 02179 switch (c = *p++) { 02180 case '\\': 02181 if (p == end) { 02182 errcpy(err, "too short escape sequence"); 02183 return -1; 02184 } 02185 switch (c = *p++) { 02186 case '1': case '2': case '3': 02187 case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */ 02188 { 02189 size_t octlen; 02190 if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) { 02191 /* backref or 7bit octal. 02192 no need to unescape anyway. 02193 re-escaping may break backref */ 02194 goto escape_asis; 02195 } 02196 } 02197 /* xxx: How about more than 199 subexpressions? */ 02198 02199 case '0': /* \0, \0O, \0OO */ 02200 02201 case 'x': /* \xHH */ 02202 case 'c': /* \cX, \c\M-X */ 02203 case 'C': /* \C-X, \C-\M-X */ 02204 case 'M': /* \M-X, \M-\C-X, \M-\cX */ 02205 p = p-2; 02206 if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0) 02207 return -1; 02208 break; 02209 02210 case 'u': 02211 if (p == end) { 02212 errcpy(err, "too short escape sequence"); 02213 return -1; 02214 } 02215 if (*p == '{') { 02216 /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */ 02217 p++; 02218 if (unescape_unicode_list(&p, end, buf, encp, err) != 0) 02219 return -1; 02220 if (p == end || *p++ != '}') { 02221 errcpy(err, "invalid Unicode list"); 02222 return -1; 02223 } 02224 break; 02225 } 02226 else { 02227 /* \uHHHH */ 02228 if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0) 02229 return -1; 02230 break; 02231 } 02232 02233 case 'p': /* \p{Hiragana} */ 02234 case 'P': 02235 if (!*encp) { 02236 *has_property = 1; 02237 } 02238 goto escape_asis; 02239 02240 default: /* \n, \\, \d, \9, etc. */ 02241 escape_asis: 02242 smallbuf[0] = '\\'; 02243 smallbuf[1] = c; 02244 rb_str_buf_cat(buf, smallbuf, 2); 02245 break; 02246 } 02247 break; 02248 02249 default: 02250 rb_str_buf_cat(buf, &c, 1); 02251 break; 02252 } 02253 } 02254 02255 return 0; 02256 } 02257 02258 static VALUE 02259 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc, 02260 rb_encoding **fixed_enc, onig_errmsg_buffer err) 02261 { 02262 VALUE buf; 02263 int has_property = 0; 02264 02265 buf = rb_str_buf_new(0); 02266 02267 if (rb_enc_asciicompat(enc)) 02268 *fixed_enc = 0; 02269 else { 02270 *fixed_enc = enc; 02271 rb_enc_associate(buf, enc); 02272 } 02273 02274 if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err) != 0) 02275 return Qnil; 02276 02277 if (has_property && !*fixed_enc) { 02278 *fixed_enc = enc; 02279 } 02280 02281 if (*fixed_enc) { 02282 rb_enc_associate(buf, *fixed_enc); 02283 } 02284 02285 return buf; 02286 } 02287 02288 VALUE 02289 rb_reg_check_preprocess(VALUE str) 02290 { 02291 rb_encoding *fixed_enc = 0; 02292 onig_errmsg_buffer err = ""; 02293 VALUE buf; 02294 char *p, *end; 02295 rb_encoding *enc; 02296 02297 StringValue(str); 02298 p = RSTRING_PTR(str); 02299 end = p + RSTRING_LEN(str); 02300 enc = rb_enc_get(str); 02301 02302 buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err); 02303 RB_GC_GUARD(str); 02304 02305 if (buf == Qnil) { 02306 return rb_reg_error_desc(str, 0, err); 02307 } 02308 return Qnil; 02309 } 02310 02311 static VALUE 02312 rb_reg_preprocess_dregexp(VALUE ary, int options) 02313 { 02314 rb_encoding *fixed_enc = 0; 02315 rb_encoding *regexp_enc = 0; 02316 onig_errmsg_buffer err = ""; 02317 int i; 02318 VALUE result = 0; 02319 rb_encoding *ascii8bit = rb_ascii8bit_encoding(); 02320 02321 if (RARRAY_LEN(ary) == 0) { 02322 rb_raise(rb_eArgError, "no arguments given"); 02323 } 02324 02325 for (i = 0; i < RARRAY_LEN(ary); i++) { 02326 VALUE str = RARRAY_PTR(ary)[i]; 02327 VALUE buf; 02328 char *p, *end; 02329 rb_encoding *src_enc; 02330 02331 src_enc = rb_enc_get(str); 02332 if (options & ARG_ENCODING_NONE && 02333 src_enc != ascii8bit) { 02334 if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) 02335 rb_raise(rb_eRegexpError, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script"); 02336 else 02337 src_enc = ascii8bit; 02338 } 02339 02340 StringValue(str); 02341 p = RSTRING_PTR(str); 02342 end = p + RSTRING_LEN(str); 02343 02344 buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err); 02345 02346 if (buf == Qnil) 02347 rb_raise(rb_eArgError, "%s", err); 02348 02349 if (fixed_enc != 0) { 02350 if (regexp_enc != 0 && regexp_enc != fixed_enc) { 02351 rb_raise(rb_eRegexpError, "encoding mismatch in dynamic regexp : %s and %s", 02352 rb_enc_name(regexp_enc), rb_enc_name(fixed_enc)); 02353 } 02354 regexp_enc = fixed_enc; 02355 } 02356 02357 if (!result) 02358 result = rb_str_new3(str); 02359 else 02360 rb_str_buf_append(result, str); 02361 } 02362 if (regexp_enc) { 02363 rb_enc_associate(result, regexp_enc); 02364 } 02365 02366 return result; 02367 } 02368 02369 static int 02370 rb_reg_initialize(VALUE obj, const char *s, long len, rb_encoding *enc, 02371 int options, onig_errmsg_buffer err, 02372 const char *sourcefile, int sourceline) 02373 { 02374 struct RRegexp *re = RREGEXP(obj); 02375 VALUE unescaped; 02376 rb_encoding *fixed_enc = 0; 02377 rb_encoding *a_enc = rb_ascii8bit_encoding(); 02378 02379 if (!OBJ_UNTRUSTED(obj) && rb_safe_level() >= 4) 02380 rb_raise(rb_eSecurityError, "Insecure: can't modify regexp"); 02381 rb_check_frozen(obj); 02382 if (FL_TEST(obj, REG_LITERAL)) 02383 rb_raise(rb_eSecurityError, "can't modify literal regexp"); 02384 if (re->ptr) 02385 rb_raise(rb_eTypeError, "already initialized regexp"); 02386 re->ptr = 0; 02387 02388 if (rb_enc_dummy_p(enc)) { 02389 errcpy(err, "can't make regexp with dummy encoding"); 02390 return -1; 02391 } 02392 02393 unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err); 02394 if (unescaped == Qnil) 02395 return -1; 02396 02397 if (fixed_enc) { 02398 if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) || 02399 (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) { 02400 errcpy(err, "incompatible character encoding"); 02401 return -1; 02402 } 02403 if (fixed_enc != a_enc) { 02404 options |= ARG_ENCODING_FIXED; 02405 enc = fixed_enc; 02406 } 02407 } 02408 else if (!(options & ARG_ENCODING_FIXED)) { 02409 enc = rb_usascii_encoding(); 02410 } 02411 02412 rb_enc_associate((VALUE)re, enc); 02413 if ((options & ARG_ENCODING_FIXED) || fixed_enc) { 02414 re->basic.flags |= KCODE_FIXED; 02415 } 02416 if (options & ARG_ENCODING_NONE) { 02417 re->basic.flags |= REG_ENCODING_NONE; 02418 } 02419 02420 re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc, 02421 options & ARG_REG_OPTION_MASK, err, 02422 sourcefile, sourceline); 02423 if (!re->ptr) return -1; 02424 re->src = rb_enc_str_new(s, len, enc); 02425 OBJ_FREEZE(re->src); 02426 RB_GC_GUARD(unescaped); 02427 return 0; 02428 } 02429 02430 static int 02431 rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err, 02432 const char *sourcefile, int sourceline) 02433 { 02434 int ret; 02435 rb_encoding *enc = rb_enc_get(str); 02436 if (options & ARG_ENCODING_NONE) { 02437 rb_encoding *ascii8bit = rb_ascii8bit_encoding(); 02438 if (enc != ascii8bit) { 02439 if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) { 02440 errcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script"); 02441 return -1; 02442 } 02443 enc = ascii8bit; 02444 } 02445 } 02446 ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc, 02447 options, err, sourcefile, sourceline); 02448 OBJ_INFECT(obj, str); 02449 RB_GC_GUARD(str); 02450 return ret; 02451 } 02452 02453 static VALUE 02454 rb_reg_s_alloc(VALUE klass) 02455 { 02456 NEWOBJ(re, struct RRegexp); 02457 OBJSETUP(re, klass, T_REGEXP); 02458 02459 re->ptr = 0; 02460 re->src = 0; 02461 re->usecnt = 0; 02462 02463 return (VALUE)re; 02464 } 02465 02466 VALUE 02467 rb_reg_alloc(void) 02468 { 02469 return rb_reg_s_alloc(rb_cRegexp); 02470 } 02471 02472 VALUE 02473 rb_reg_new_str(VALUE s, int options) 02474 { 02475 return rb_reg_init_str(rb_reg_alloc(), s, options); 02476 } 02477 02478 VALUE 02479 rb_reg_init_str(VALUE re, VALUE s, int options) 02480 { 02481 onig_errmsg_buffer err = ""; 02482 02483 if (rb_reg_initialize_str(re, s, options, err, NULL, 0) != 0) { 02484 rb_reg_raise_str(s, options, err); 02485 } 02486 02487 return re; 02488 } 02489 02490 VALUE 02491 rb_reg_new_ary(VALUE ary, int opt) 02492 { 02493 return rb_reg_new_str(rb_reg_preprocess_dregexp(ary, opt), opt); 02494 } 02495 02496 VALUE 02497 rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options) 02498 { 02499 VALUE re = rb_reg_alloc(); 02500 onig_errmsg_buffer err = ""; 02501 02502 if (rb_reg_initialize(re, s, len, enc, options, err, NULL, 0) != 0) { 02503 rb_enc_reg_raise(s, len, enc, options, err); 02504 } 02505 02506 return re; 02507 } 02508 02509 VALUE 02510 rb_reg_new(const char *s, long len, int options) 02511 { 02512 return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options); 02513 } 02514 02515 VALUE 02516 rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline) 02517 { 02518 VALUE re = rb_reg_alloc(); 02519 onig_errmsg_buffer err = ""; 02520 02521 if (!str) str = rb_str_new(0,0); 02522 if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) { 02523 rb_set_errinfo(rb_reg_error_desc(str, options, err)); 02524 return Qnil; 02525 } 02526 FL_SET(re, REG_LITERAL); 02527 return re; 02528 } 02529 02530 static VALUE reg_cache; 02531 02532 VALUE 02533 rb_reg_regcomp(VALUE str) 02534 { 02535 volatile VALUE save_str = str; 02536 if (reg_cache && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str) 02537 && ENCODING_GET(reg_cache) == ENCODING_GET(str) 02538 && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0) 02539 return reg_cache; 02540 02541 return reg_cache = rb_reg_new_str(save_str, 0); 02542 } 02543 02544 static st_index_t reg_hash(VALUE re); 02545 /* 02546 * call-seq: 02547 * rxp.hash -> fixnum 02548 * 02549 * Produce a hash based on the text and options of this regular expression. 02550 */ 02551 02552 static VALUE 02553 rb_reg_hash(VALUE re) 02554 { 02555 st_index_t hashval = reg_hash(re); 02556 return LONG2FIX(hashval); 02557 } 02558 02559 static st_index_t 02560 reg_hash(VALUE re) 02561 { 02562 st_index_t hashval; 02563 02564 rb_reg_check(re); 02565 hashval = RREGEXP(re)->ptr->options; 02566 hashval = rb_hash_uint(hashval, rb_memhash(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re))); 02567 return rb_hash_end(hashval); 02568 } 02569 02570 02571 /* 02572 * call-seq: 02573 * rxp == other_rxp -> true or false 02574 * rxp.eql?(other_rxp) -> true or false 02575 * 02576 * Equality---Two regexps are equal if their patterns are identical, they have 02577 * the same character set code, and their <code>casefold?</code> values are the 02578 * same. 02579 * 02580 * /abc/ == /abc/x #=> false 02581 * /abc/ == /abc/i #=> false 02582 * /abc/ == /abc/n #=> false 02583 * /abc/u == /abc/n #=> false 02584 */ 02585 02586 static VALUE 02587 rb_reg_equal(VALUE re1, VALUE re2) 02588 { 02589 if (re1 == re2) return Qtrue; 02590 if (TYPE(re2) != T_REGEXP) return Qfalse; 02591 rb_reg_check(re1); rb_reg_check(re2); 02592 if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse; 02593 if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) return Qfalse; 02594 if (RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2)) return Qfalse; 02595 if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse; 02596 if (memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0) { 02597 return Qtrue; 02598 } 02599 return Qfalse; 02600 } 02601 02602 /* 02603 * call-seq: 02604 * mtch.hash -> integer 02605 * 02606 * Produce a hash based on the target string, regexp and matched 02607 * positions of this matchdata. 02608 */ 02609 02610 static VALUE 02611 match_hash(VALUE match) 02612 { 02613 const struct re_registers *regs; 02614 st_index_t hashval = rb_hash_start(rb_str_hash(RMATCH(match)->str)); 02615 02616 rb_hash_uint(hashval, reg_hash(RMATCH(match)->regexp)); 02617 regs = RMATCH_REGS(match); 02618 hashval = rb_hash_uint(hashval, regs->num_regs); 02619 hashval = rb_hash_uint(hashval, rb_memhash(regs->beg, regs->num_regs * sizeof(*regs->beg))); 02620 hashval = rb_hash_uint(hashval, rb_memhash(regs->end, regs->num_regs * sizeof(*regs->end))); 02621 hashval = rb_hash_end(hashval); 02622 return LONG2FIX(hashval); 02623 } 02624 02625 /* 02626 * call-seq: 02627 * mtch == mtch2 -> true or false 02628 * 02629 * Equality---Two matchdata are equal if their target strings, 02630 * patterns, and matched positions are identical. 02631 */ 02632 02633 static VALUE 02634 match_equal(VALUE match1, VALUE match2) 02635 { 02636 const struct re_registers *regs1, *regs2; 02637 if (match1 == match2) return Qtrue; 02638 if (TYPE(match2) != T_MATCH) return Qfalse; 02639 if (!rb_str_equal(RMATCH(match1)->str, RMATCH(match2)->str)) return Qfalse; 02640 if (!rb_reg_equal(RMATCH(match1)->regexp, RMATCH(match2)->regexp)) return Qfalse; 02641 regs1 = RMATCH_REGS(match1); 02642 regs2 = RMATCH_REGS(match2); 02643 if (regs1->num_regs != regs2->num_regs) return Qfalse; 02644 if (memcmp(regs1->beg, regs2->beg, regs1->num_regs * sizeof(*regs1->beg))) return Qfalse; 02645 if (memcmp(regs1->end, regs2->end, regs1->num_regs * sizeof(*regs1->end))) return Qfalse; 02646 return Qtrue; 02647 } 02648 02649 static VALUE 02650 reg_operand(VALUE s, int check) 02651 { 02652 if (SYMBOL_P(s)) { 02653 return rb_sym_to_s(s); 02654 } 02655 else { 02656 VALUE tmp = rb_check_string_type(s); 02657 if (check && NIL_P(tmp)) { 02658 rb_raise(rb_eTypeError, "can't convert %s to String", 02659 rb_obj_classname(s)); 02660 } 02661 return tmp; 02662 } 02663 } 02664 02665 static long 02666 reg_match_pos(VALUE re, VALUE *strp, long pos) 02667 { 02668 VALUE str = *strp; 02669 02670 if (NIL_P(str)) { 02671 rb_backref_set(Qnil); 02672 return -1; 02673 } 02674 *strp = str = reg_operand(str, TRUE); 02675 if (pos != 0) { 02676 if (pos < 0) { 02677 VALUE l = rb_str_length(str); 02678 pos += NUM2INT(l); 02679 if (pos < 0) { 02680 return pos; 02681 } 02682 } 02683 pos = rb_str_offset(str, pos); 02684 } 02685 return rb_reg_search(re, str, pos, 0); 02686 } 02687 02688 /* 02689 * call-seq: 02690 * rxp =~ str -> integer or nil 02691 * 02692 * Match---Matches <i>rxp</i> against <i>str</i>. 02693 * 02694 * /at/ =~ "input data" #=> 7 02695 * /ax/ =~ "input data" #=> nil 02696 * 02697 * If <code>=~</code> is used with a regexp literal with named captures, 02698 * captured strings (or nil) is assigned to local variables named by 02699 * the capture names. 02700 * 02701 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = y " 02702 * p lhs #=> "x" 02703 * p rhs #=> "y" 02704 * 02705 * If it is not matched, nil is assigned for the variables. 02706 * 02707 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = " 02708 * p lhs #=> nil 02709 * p rhs #=> nil 02710 * 02711 * This assignment is implemented in the Ruby parser. 02712 * The parser detects 'regexp-literal =~ expression' for the assignment. 02713 * The regexp must be a literal without interpolation and placed at left hand side. 02714 * 02715 * The assignment does not occur if the regexp is not a literal. 02716 * 02717 * re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ 02718 * re =~ " x = y " 02719 * p lhs # undefined local variable 02720 * p rhs # undefined local variable 02721 * 02722 * A regexp interpolation, <code>#{}</code>, also disables 02723 * the assignment. 02724 * 02725 * rhs_pat = /(?<rhs>\w+)/ 02726 * /(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y" 02727 * p lhs # undefined local variable 02728 * 02729 * The assignment does not occur if the regexp is placed at the right hand side. 02730 * 02731 * " x = y " =~ /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ 02732 * p lhs, rhs # undefined local variable 02733 * 02734 */ 02735 02736 VALUE 02737 rb_reg_match(VALUE re, VALUE str) 02738 { 02739 long pos = reg_match_pos(re, &str, 0); 02740 if (pos < 0) return Qnil; 02741 pos = rb_str_sublen(str, pos); 02742 return LONG2FIX(pos); 02743 } 02744 02745 /* 02746 * call-seq: 02747 * rxp === str -> true or false 02748 * 02749 * Case Equality---Synonym for <code>Regexp#=~</code> used in case statements. 02750 * 02751 * a = "HELLO" 02752 * case a 02753 * when /^[a-z]*$/; print "Lower case\n" 02754 * when /^[A-Z]*$/; print "Upper case\n" 02755 * else; print "Mixed case\n" 02756 * end 02757 * 02758 * <em>produces:</em> 02759 * 02760 * Upper case 02761 */ 02762 02763 VALUE 02764 rb_reg_eqq(VALUE re, VALUE str) 02765 { 02766 long start; 02767 02768 str = reg_operand(str, FALSE); 02769 if (NIL_P(str)) { 02770 rb_backref_set(Qnil); 02771 return Qfalse; 02772 } 02773 start = rb_reg_search(re, str, 0, 0); 02774 if (start < 0) { 02775 return Qfalse; 02776 } 02777 return Qtrue; 02778 } 02779 02780 02781 /* 02782 * call-seq: 02783 * ~ rxp -> integer or nil 02784 * 02785 * Match---Matches <i>rxp</i> against the contents of <code>$_</code>. 02786 * Equivalent to <code><i>rxp</i> =~ $_</code>. 02787 * 02788 * $_ = "input data" 02789 * ~ /at/ #=> 7 02790 */ 02791 02792 VALUE 02793 rb_reg_match2(VALUE re) 02794 { 02795 long start; 02796 VALUE line = rb_lastline_get(); 02797 02798 if (TYPE(line) != T_STRING) { 02799 rb_backref_set(Qnil); 02800 return Qnil; 02801 } 02802 02803 start = rb_reg_search(re, line, 0, 0); 02804 if (start < 0) { 02805 return Qnil; 02806 } 02807 start = rb_str_sublen(line, start); 02808 return LONG2FIX(start); 02809 } 02810 02811 02812 /* 02813 * call-seq: 02814 * rxp.match(str) -> matchdata or nil 02815 * rxp.match(str,pos) -> matchdata or nil 02816 * 02817 * Returns a <code>MatchData</code> object describing the match, or 02818 * <code>nil</code> if there was no match. This is equivalent to retrieving the 02819 * value of the special variable <code>$~</code> following a normal match. 02820 * If the second parameter is present, it specifies the position in the string 02821 * to begin the search. 02822 * 02823 * /(.)(.)(.)/.match("abc")[2] #=> "b" 02824 * /(.)(.)/.match("abc", 1)[2] #=> "c" 02825 * 02826 * If a block is given, invoke the block with MatchData if match succeed, so 02827 * that you can write 02828 * 02829 * pat.match(str) {|m| ...} 02830 * 02831 * instead of 02832 * 02833 * if m = pat.match(str) 02834 * ... 02835 * end 02836 * 02837 * The return value is a value from block execution in this case. 02838 */ 02839 02840 static VALUE 02841 rb_reg_match_m(int argc, VALUE *argv, VALUE re) 02842 { 02843 VALUE result, str, initpos; 02844 long pos; 02845 02846 if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) { 02847 pos = NUM2LONG(initpos); 02848 } 02849 else { 02850 pos = 0; 02851 } 02852 02853 pos = reg_match_pos(re, &str, pos); 02854 if (pos < 0) { 02855 rb_backref_set(Qnil); 02856 return Qnil; 02857 } 02858 result = rb_backref_get(); 02859 rb_match_busy(result); 02860 if (!NIL_P(result) && rb_block_given_p()) { 02861 return rb_yield(result); 02862 } 02863 return result; 02864 } 02865 02866 /* 02867 * Document-method: compile 02868 * 02869 * Synonym for <code>Regexp.new</code> 02870 */ 02871 02872 02873 /* 02874 * call-seq: 02875 * Regexp.new(string, [options [, lang]]) -> regexp 02876 * Regexp.new(regexp) -> regexp 02877 * Regexp.compile(string, [options [, lang]]) -> regexp 02878 * Regexp.compile(regexp) -> regexp 02879 * 02880 * Constructs a new regular expression from <i>pattern</i>, which can be either 02881 * a <code>String</code> or a <code>Regexp</code> (in which case that regexp's 02882 * options are propagated, and new options may not be specified (a change as of 02883 * Ruby 1.8). If <i>options</i> is a <code>Fixnum</code>, it should be one or 02884 * more of the constants <code>Regexp::EXTENDED</code>, 02885 * <code>Regexp::IGNORECASE</code>, and <code>Regexp::MULTILINE</code>, 02886 * <em>or</em>-ed together. Otherwise, if <i>options</i> is not 02887 * <code>nil</code>, the regexp will be case insensitive. 02888 * When the <i>lang</i> parameter is `n' or `N' sets the regexp no encoding. 02889 * 02890 * r1 = Regexp.new('^a-z+:\\s+\w+') #=> /^a-z+:\s+\w+/ 02891 * r2 = Regexp.new('cat', true) #=> /cat/i 02892 * r3 = Regexp.new('dog', Regexp::EXTENDED) #=> /dog/x 02893 * r4 = Regexp.new(r2) #=> /cat/i 02894 */ 02895 02896 static VALUE 02897 rb_reg_initialize_m(int argc, VALUE *argv, VALUE self) 02898 { 02899 onig_errmsg_buffer err = ""; 02900 int flags = 0; 02901 VALUE str; 02902 rb_encoding *enc; 02903 const char *ptr; 02904 long len; 02905 02906 if (argc == 0 || argc > 3) { 02907 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..3)", argc); 02908 } 02909 if (TYPE(argv[0]) == T_REGEXP) { 02910 VALUE re = argv[0]; 02911 02912 if (argc > 1) { 02913 rb_warn("flags ignored"); 02914 } 02915 rb_reg_check(re); 02916 flags = rb_reg_options(re); 02917 ptr = RREGEXP_SRC_PTR(re); 02918 len = RREGEXP_SRC_LEN(re); 02919 enc = rb_enc_get(re); 02920 if (rb_reg_initialize(self, ptr, len, enc, flags, err, NULL, 0)) { 02921 str = rb_enc_str_new(ptr, len, enc); 02922 rb_reg_raise_str(str, flags, err); 02923 } 02924 } 02925 else { 02926 if (argc >= 2) { 02927 if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]); 02928 else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE; 02929 } 02930 enc = 0; 02931 if (argc == 3 && !NIL_P(argv[2])) { 02932 char *kcode = StringValuePtr(argv[2]); 02933 if (kcode[0] == 'n' || kcode[0] == 'N') { 02934 enc = rb_ascii8bit_encoding(); 02935 flags |= ARG_ENCODING_NONE; 02936 } 02937 else { 02938 rb_warn("encoding option is ignored - %s", kcode); 02939 } 02940 } 02941 str = argv[0]; 02942 ptr = StringValuePtr(str); 02943 if (enc 02944 ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err, NULL, 0) 02945 : rb_reg_initialize_str(self, str, flags, err, NULL, 0)) { 02946 rb_reg_raise_str(str, flags, err); 02947 } 02948 } 02949 return self; 02950 } 02951 02952 VALUE 02953 rb_reg_quote(VALUE str) 02954 { 02955 rb_encoding *enc = rb_enc_get(str); 02956 char *s, *send, *t; 02957 VALUE tmp; 02958 int c, clen; 02959 int ascii_only = rb_enc_str_asciionly_p(str); 02960 02961 s = RSTRING_PTR(str); 02962 send = s + RSTRING_LEN(str); 02963 while (s < send) { 02964 c = rb_enc_ascget(s, send, &clen, enc); 02965 if (c == -1) { 02966 s += mbclen(s, send, enc); 02967 continue; 02968 } 02969 switch (c) { 02970 case '[': case ']': case '{': case '}': 02971 case '(': case ')': case '|': case '-': 02972 case '*': case '.': case '\\': 02973 case '?': case '+': case '^': case '$': 02974 case ' ': case '#': 02975 case '\t': case '\f': case '\v': case '\n': case '\r': 02976 goto meta_found; 02977 } 02978 s += clen; 02979 } 02980 tmp = rb_str_new3(str); 02981 if (ascii_only) { 02982 rb_enc_associate(tmp, rb_usascii_encoding()); 02983 } 02984 return tmp; 02985 02986 meta_found: 02987 tmp = rb_str_new(0, RSTRING_LEN(str)*2); 02988 if (ascii_only) { 02989 rb_enc_associate(tmp, rb_usascii_encoding()); 02990 } 02991 else { 02992 rb_enc_copy(tmp, str); 02993 } 02994 t = RSTRING_PTR(tmp); 02995 /* copy upto metacharacter */ 02996 memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str)); 02997 t += s - RSTRING_PTR(str); 02998 02999 while (s < send) { 03000 c = rb_enc_ascget(s, send, &clen, enc); 03001 if (c == -1) { 03002 int n = mbclen(s, send, enc); 03003 03004 while (n--) 03005 *t++ = *s++; 03006 continue; 03007 } 03008 s += clen; 03009 switch (c) { 03010 case '[': case ']': case '{': case '}': 03011 case '(': case ')': case '|': case '-': 03012 case '*': case '.': case '\\': 03013 case '?': case '+': case '^': case '$': 03014 case '#': 03015 t += rb_enc_mbcput('\\', t, enc); 03016 break; 03017 case ' ': 03018 t += rb_enc_mbcput('\\', t, enc); 03019 t += rb_enc_mbcput(' ', t, enc); 03020 continue; 03021 case '\t': 03022 t += rb_enc_mbcput('\\', t, enc); 03023 t += rb_enc_mbcput('t', t, enc); 03024 continue; 03025 case '\n': 03026 t += rb_enc_mbcput('\\', t, enc); 03027 t += rb_enc_mbcput('n', t, enc); 03028 continue; 03029 case '\r': 03030 t += rb_enc_mbcput('\\', t, enc); 03031 t += rb_enc_mbcput('r', t, enc); 03032 continue; 03033 case '\f': 03034 t += rb_enc_mbcput('\\', t, enc); 03035 t += rb_enc_mbcput('f', t, enc); 03036 continue; 03037 case '\v': 03038 t += rb_enc_mbcput('\\', t, enc); 03039 t += rb_enc_mbcput('v', t, enc); 03040 continue; 03041 } 03042 t += rb_enc_mbcput(c, t, enc); 03043 } 03044 rb_str_resize(tmp, t - RSTRING_PTR(tmp)); 03045 OBJ_INFECT(tmp, str); 03046 return tmp; 03047 } 03048 03049 03050 /* 03051 * call-seq: 03052 * Regexp.escape(str) -> string 03053 * Regexp.quote(str) -> string 03054 * 03055 * Escapes any characters that would have special meaning in a regular 03056 * expression. Returns a new escaped string, or self if no characters are 03057 * escaped. For any string, 03058 * <code>Regexp.new(Regexp.escape(<i>str</i>))=~<i>str</i></code> will be true. 03059 * 03060 * Regexp.escape('\*?{}.') #=> \\\*\?\{\}\. 03061 * 03062 */ 03063 03064 static VALUE 03065 rb_reg_s_quote(VALUE c, VALUE str) 03066 { 03067 return rb_reg_quote(reg_operand(str, TRUE)); 03068 } 03069 03070 int 03071 rb_reg_options(VALUE re) 03072 { 03073 int options; 03074 03075 rb_reg_check(re); 03076 options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK; 03077 if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED; 03078 if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE; 03079 return options; 03080 } 03081 03082 VALUE 03083 rb_check_regexp_type(VALUE re) 03084 { 03085 return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp"); 03086 } 03087 03088 /* 03089 * call-seq: 03090 * Regexp.try_convert(obj) -> re or nil 03091 * 03092 * Try to convert <i>obj</i> into a Regexp, using to_regexp method. 03093 * Returns converted regexp or nil if <i>obj</i> cannot be converted 03094 * for any reason. 03095 * 03096 * Regexp.try_convert(/re/) #=> /re/ 03097 * Regexp.try_convert("re") #=> nil 03098 * 03099 * o = Object.new 03100 * Regexp.try_convert(o) #=> nil 03101 * def o.to_regexp() /foo/ end 03102 * Regexp.try_convert(o) #=> /foo/ 03103 * 03104 */ 03105 static VALUE 03106 rb_reg_s_try_convert(VALUE dummy, VALUE re) 03107 { 03108 return rb_check_regexp_type(re); 03109 } 03110 03111 static VALUE 03112 rb_reg_s_union(VALUE self, VALUE args0) 03113 { 03114 long argc = RARRAY_LEN(args0); 03115 03116 if (argc == 0) { 03117 VALUE args[1]; 03118 args[0] = rb_str_new2("(?!)"); 03119 return rb_class_new_instance(1, args, rb_cRegexp); 03120 } 03121 else if (argc == 1) { 03122 VALUE arg = rb_ary_entry(args0, 0); 03123 VALUE re = rb_check_regexp_type(arg); 03124 if (!NIL_P(re)) 03125 return re; 03126 else { 03127 VALUE quoted; 03128 quoted = rb_reg_s_quote(Qnil, arg); 03129 return rb_reg_new_str(quoted, 0); 03130 } 03131 } 03132 else { 03133 int i; 03134 VALUE source = rb_str_buf_new(0); 03135 rb_encoding *result_enc; 03136 03137 int has_asciionly = 0; 03138 rb_encoding *has_ascii_compat_fixed = 0; 03139 rb_encoding *has_ascii_incompat = 0; 03140 03141 for (i = 0; i < argc; i++) { 03142 volatile VALUE v; 03143 VALUE e = rb_ary_entry(args0, i); 03144 03145 if (0 < i) 03146 rb_str_buf_cat_ascii(source, "|"); 03147 03148 v = rb_check_regexp_type(e); 03149 if (!NIL_P(v)) { 03150 rb_encoding *enc = rb_enc_get(v); 03151 if (!rb_enc_asciicompat(enc)) { 03152 if (!has_ascii_incompat) 03153 has_ascii_incompat = enc; 03154 else if (has_ascii_incompat != enc) 03155 rb_raise(rb_eArgError, "incompatible encodings: %s and %s", 03156 rb_enc_name(has_ascii_incompat), rb_enc_name(enc)); 03157 } 03158 else if (rb_reg_fixed_encoding_p(v)) { 03159 if (!has_ascii_compat_fixed) 03160 has_ascii_compat_fixed = enc; 03161 else if (has_ascii_compat_fixed != enc) 03162 rb_raise(rb_eArgError, "incompatible encodings: %s and %s", 03163 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc)); 03164 } 03165 else { 03166 has_asciionly = 1; 03167 } 03168 v = rb_reg_to_s(v); 03169 } 03170 else { 03171 rb_encoding *enc; 03172 StringValue(e); 03173 enc = rb_enc_get(e); 03174 if (!rb_enc_str_asciicompat_p(e)) { 03175 if (!has_ascii_incompat) 03176 has_ascii_incompat = enc; 03177 else if (has_ascii_incompat != enc) 03178 rb_raise(rb_eArgError, "incompatible encodings: %s and %s", 03179 rb_enc_name(has_ascii_incompat), rb_enc_name(enc)); 03180 } 03181 else if (rb_enc_str_asciionly_p(e)) { 03182 has_asciionly = 1; 03183 } 03184 else { 03185 if (!has_ascii_compat_fixed) 03186 has_ascii_compat_fixed = enc; 03187 else if (has_ascii_compat_fixed != enc) 03188 rb_raise(rb_eArgError, "incompatible encodings: %s and %s", 03189 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc)); 03190 } 03191 v = rb_reg_s_quote(Qnil, e); 03192 } 03193 if (has_ascii_incompat) { 03194 if (has_asciionly) { 03195 rb_raise(rb_eArgError, "ASCII incompatible encoding: %s", 03196 rb_enc_name(has_ascii_incompat)); 03197 } 03198 if (has_ascii_compat_fixed) { 03199 rb_raise(rb_eArgError, "incompatible encodings: %s and %s", 03200 rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed)); 03201 } 03202 } 03203 03204 if (i == 0) { 03205 rb_enc_copy(source, v); 03206 } 03207 rb_str_append(source, v); 03208 } 03209 03210 if (has_ascii_incompat) { 03211 result_enc = has_ascii_incompat; 03212 } 03213 else if (has_ascii_compat_fixed) { 03214 result_enc = has_ascii_compat_fixed; 03215 } 03216 else { 03217 result_enc = rb_ascii8bit_encoding(); 03218 } 03219 03220 rb_enc_associate(source, result_enc); 03221 return rb_class_new_instance(1, &source, rb_cRegexp); 03222 } 03223 } 03224 03225 /* 03226 * call-seq: 03227 * Regexp.union(pat1, pat2, ...) -> new_regexp 03228 * Regexp.union(pats_ary) -> new_regexp 03229 * 03230 * Return a <code>Regexp</code> object that is the union of the given 03231 * <em>pattern</em>s, i.e., will match any of its parts. The <em>pattern</em>s 03232 * can be Regexp objects, in which case their options will be preserved, or 03233 * Strings. If no patterns are given, returns <code>/(?!)/</code>. 03234 * The behavior is unspecified if any given <em>pattern</em> contains capture. 03235 * 03236 * Regexp.union #=> /(?!)/ 03237 * Regexp.union("penzance") #=> /penzance/ 03238 * Regexp.union("a+b*c") #=> /a\+b\*c/ 03239 * Regexp.union("skiing", "sledding") #=> /skiing|sledding/ 03240 * Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/ 03241 * Regexp.union(/dogs/, /cats/i) #=> /(?-mix:dogs)|(?i-mx:cats)/ 03242 */ 03243 static VALUE 03244 rb_reg_s_union_m(VALUE self, VALUE args) 03245 { 03246 VALUE v; 03247 if (RARRAY_LEN(args) == 1 && 03248 !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) { 03249 return rb_reg_s_union(self, v); 03250 } 03251 return rb_reg_s_union(self, args); 03252 } 03253 03254 /* :nodoc: */ 03255 static VALUE 03256 rb_reg_init_copy(VALUE copy, VALUE re) 03257 { 03258 onig_errmsg_buffer err = ""; 03259 const char *s; 03260 long len; 03261 03262 if (copy == re) return copy; 03263 rb_check_frozen(copy); 03264 /* need better argument type check */ 03265 if (!rb_obj_is_instance_of(re, rb_obj_class(copy))) { 03266 rb_raise(rb_eTypeError, "wrong argument type"); 03267 } 03268 rb_reg_check(re); 03269 s = RREGEXP_SRC_PTR(re); 03270 len = RREGEXP_SRC_LEN(re); 03271 if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re), 03272 err, NULL, 0) != 0) { 03273 rb_reg_raise(s, len, err, re); 03274 } 03275 return copy; 03276 } 03277 03278 VALUE 03279 rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) 03280 { 03281 VALUE val = 0; 03282 char *p, *s, *e; 03283 int no, clen; 03284 rb_encoding *str_enc = rb_enc_get(str); 03285 rb_encoding *src_enc = rb_enc_get(src); 03286 int acompat = rb_enc_asciicompat(str_enc); 03287 #define ASCGET(s,e,cl) (acompat ? (*(cl)=1,ISASCII((s)[0])?(s)[0]:-1) : rb_enc_ascget((s), (e), (cl), str_enc)) 03288 03289 p = s = RSTRING_PTR(str); 03290 e = s + RSTRING_LEN(str); 03291 03292 while (s < e) { 03293 int c = ASCGET(s, e, &clen); 03294 char *ss; 03295 03296 if (c == -1) { 03297 s += mbclen(s, e, str_enc); 03298 continue; 03299 } 03300 ss = s; 03301 s += clen; 03302 03303 if (c != '\\' || s == e) continue; 03304 03305 if (!val) { 03306 val = rb_str_buf_new(ss-p); 03307 } 03308 rb_enc_str_buf_cat(val, p, ss-p, str_enc); 03309 03310 c = ASCGET(s, e, &clen); 03311 if (c == -1) { 03312 s += mbclen(s, e, str_enc); 03313 rb_enc_str_buf_cat(val, ss, s-ss, str_enc); 03314 p = s; 03315 continue; 03316 } 03317 s += clen; 03318 03319 p = s; 03320 switch (c) { 03321 case '1': case '2': case '3': case '4': 03322 case '5': case '6': case '7': case '8': case '9': 03323 if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) { 03324 no = c - '0'; 03325 } 03326 else { 03327 continue; 03328 } 03329 break; 03330 03331 case 'k': 03332 if (s < e && ASCGET(s, e, &clen) == '<') { 03333 char *name, *name_end; 03334 03335 name_end = name = s + clen; 03336 while (name_end < e) { 03337 c = ASCGET(name_end, e, &clen); 03338 if (c == '>') break; 03339 name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen; 03340 } 03341 if (name_end < e) { 03342 no = name_to_backref_number(regs, regexp, name, name_end); 03343 p = s = name_end + clen; 03344 break; 03345 } 03346 else { 03347 rb_raise(rb_eRuntimeError, "invalid group name reference format"); 03348 } 03349 } 03350 03351 rb_enc_str_buf_cat(val, ss, s-ss, str_enc); 03352 continue; 03353 03354 case '0': 03355 case '&': 03356 no = 0; 03357 break; 03358 03359 case '`': 03360 rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc); 03361 continue; 03362 03363 case '\'': 03364 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc); 03365 continue; 03366 03367 case '+': 03368 no = regs->num_regs-1; 03369 while (BEG(no) == -1 && no > 0) no--; 03370 if (no == 0) continue; 03371 break; 03372 03373 case '\\': 03374 rb_enc_str_buf_cat(val, s-clen, clen, str_enc); 03375 continue; 03376 03377 default: 03378 rb_enc_str_buf_cat(val, ss, s-ss, str_enc); 03379 continue; 03380 } 03381 03382 if (no >= 0) { 03383 if (no >= regs->num_regs) continue; 03384 if (BEG(no) == -1) continue; 03385 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc); 03386 } 03387 } 03388 03389 if (!val) return str; 03390 if (p < e) { 03391 rb_enc_str_buf_cat(val, p, e-p, str_enc); 03392 } 03393 03394 return val; 03395 } 03396 03397 static VALUE 03398 kcode_getter(void) 03399 { 03400 rb_warn("variable $KCODE is no longer effective"); 03401 return Qnil; 03402 } 03403 03404 static void 03405 kcode_setter(VALUE val, ID id) 03406 { 03407 rb_warn("variable $KCODE is no longer effective; ignored"); 03408 } 03409 03410 static VALUE 03411 ignorecase_getter(void) 03412 { 03413 rb_warn("variable $= is no longer effective"); 03414 return Qfalse; 03415 } 03416 03417 static void 03418 ignorecase_setter(VALUE val, ID id) 03419 { 03420 rb_warn("variable $= is no longer effective; ignored"); 03421 } 03422 03423 static VALUE 03424 match_getter(void) 03425 { 03426 VALUE match = rb_backref_get(); 03427 03428 if (NIL_P(match)) return Qnil; 03429 rb_match_busy(match); 03430 return match; 03431 } 03432 03433 static void 03434 match_setter(VALUE val) 03435 { 03436 if (!NIL_P(val)) { 03437 Check_Type(val, T_MATCH); 03438 } 03439 rb_backref_set(val); 03440 } 03441 03442 /* 03443 * call-seq: 03444 * Regexp.last_match -> matchdata 03445 * Regexp.last_match(n) -> str 03446 * 03447 * The first form returns the <code>MatchData</code> object generated by the 03448 * last successful pattern match. Equivalent to reading the global variable 03449 * <code>$~</code>. The second form returns the <i>n</i>th field in this 03450 * <code>MatchData</code> object. 03451 * <em>n</em> can be a string or symbol to reference a named capture. 03452 * 03453 * Note that the <code>last_match</code> is local to the thread and method scope 03454 * of the method that did the pattern match. 03455 * 03456 * /c(.)t/ =~ 'cat' #=> 0 03457 * Regexp.last_match #=> #<MatchData "cat" 1:"a"> 03458 * Regexp.last_match(0) #=> "cat" 03459 * Regexp.last_match(1) #=> "a" 03460 * Regexp.last_match(2) #=> nil 03461 * 03462 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val" 03463 * Regexp.last_match #=> #<MatchData "var = val" lhs:"var" rhs:"val"> 03464 * Regexp.last_match(:lhs) #=> "var" 03465 * Regexp.last_match(:rhs) #=> "val" 03466 */ 03467 03468 static VALUE 03469 rb_reg_s_last_match(int argc, VALUE *argv) 03470 { 03471 VALUE nth; 03472 03473 if (argc > 0 && rb_scan_args(argc, argv, "01", &nth) == 1) { 03474 VALUE match = rb_backref_get(); 03475 int n; 03476 if (NIL_P(match)) return Qnil; 03477 n = match_backref_number(match, nth); 03478 return rb_reg_nth_match(n, match); 03479 } 03480 return match_getter(); 03481 } 03482 03483 static void 03484 re_warn(const char *s) 03485 { 03486 rb_warn("%s", s); 03487 } 03488 03489 /* 03490 * Document-class: RegexpError 03491 * 03492 * Raised when given an invalid regexp expression. 03493 * 03494 * Regexp.new("?") 03495 * 03496 * <em>raises the exception:</em> 03497 * 03498 * RegexpError: target of repeat operator is not specified: /?/ 03499 */ 03500 03501 /* 03502 * Document-class: Regexp 03503 * 03504 * A <code>Regexp</code> holds a regular expression, used to match a pattern 03505 * against strings. Regexps are created using the <code>/.../</code> and 03506 * <code>%r{...}</code> literals, and by the <code>Regexp::new</code> 03507 * constructor. 03508 * 03509 * :include: doc/re.rdoc 03510 */ 03511 03512 void 03513 Init_Regexp(void) 03514 { 03515 rb_eRegexpError = rb_define_class("RegexpError", rb_eStandardError); 03516 03517 onigenc_set_default_caseconv_table((UChar*)casetable); 03518 onigenc_set_default_encoding(ONIG_ENCODING_ASCII); 03519 onig_set_warn_func(re_warn); 03520 onig_set_verb_warn_func(re_warn); 03521 03522 rb_define_virtual_variable("$~", match_getter, match_setter); 03523 rb_define_virtual_variable("$&", last_match_getter, 0); 03524 rb_define_virtual_variable("$`", prematch_getter, 0); 03525 rb_define_virtual_variable("$'", postmatch_getter, 0); 03526 rb_define_virtual_variable("$+", last_paren_match_getter, 0); 03527 03528 rb_define_virtual_variable("$=", ignorecase_getter, ignorecase_setter); 03529 rb_define_virtual_variable("$KCODE", kcode_getter, kcode_setter); 03530 rb_define_virtual_variable("$-K", kcode_getter, kcode_setter); 03531 03532 rb_cRegexp = rb_define_class("Regexp", rb_cObject); 03533 rb_define_alloc_func(rb_cRegexp, rb_reg_s_alloc); 03534 rb_define_singleton_method(rb_cRegexp, "compile", rb_class_new_instance, -1); 03535 rb_define_singleton_method(rb_cRegexp, "quote", rb_reg_s_quote, 1); 03536 rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, 1); 03537 rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union_m, -2); 03538 rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1); 03539 rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1); 03540 03541 rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1); 03542 rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1); 03543 rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0); 03544 rb_define_method(rb_cRegexp, "eql?", rb_reg_equal, 1); 03545 rb_define_method(rb_cRegexp, "==", rb_reg_equal, 1); 03546 rb_define_method(rb_cRegexp, "=~", rb_reg_match, 1); 03547 rb_define_method(rb_cRegexp, "===", rb_reg_eqq, 1); 03548 rb_define_method(rb_cRegexp, "~", rb_reg_match2, 0); 03549 rb_define_method(rb_cRegexp, "match", rb_reg_match_m, -1); 03550 rb_define_method(rb_cRegexp, "to_s", rb_reg_to_s, 0); 03551 rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0); 03552 rb_define_method(rb_cRegexp, "source", rb_reg_source, 0); 03553 rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0); 03554 rb_define_method(rb_cRegexp, "options", rb_reg_options_m, 0); 03555 rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0); /* in encoding.c */ 03556 rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0); 03557 rb_define_method(rb_cRegexp, "names", rb_reg_names, 0); 03558 rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0); 03559 03560 /* see Regexp.options and Regexp.new */ 03561 rb_define_const(rb_cRegexp, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE)); 03562 /* see Regexp.options and Regexp.new */ 03563 rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND)); 03564 /* see Regexp.options and Regexp.new */ 03565 rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE)); 03566 /* see Regexp.options and Regexp.new */ 03567 rb_define_const(rb_cRegexp, "FIXEDENCODING", INT2FIX(ARG_ENCODING_FIXED)); 03568 /* see Regexp.options and Regexp.new */ 03569 rb_define_const(rb_cRegexp, "NOENCODING", INT2FIX(ARG_ENCODING_NONE)); 03570 03571 rb_global_variable(®_cache); 03572 03573 rb_cMatch = rb_define_class("MatchData", rb_cObject); 03574 rb_define_alloc_func(rb_cMatch, match_alloc); 03575 rb_undef_method(CLASS_OF(rb_cMatch), "new"); 03576 03577 rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1); 03578 rb_define_method(rb_cMatch, "regexp", match_regexp, 0); 03579 rb_define_method(rb_cMatch, "names", match_names, 0); 03580 rb_define_method(rb_cMatch, "size", match_size, 0); 03581 rb_define_method(rb_cMatch, "length", match_size, 0); 03582 rb_define_method(rb_cMatch, "offset", match_offset, 1); 03583 rb_define_method(rb_cMatch, "begin", match_begin, 1); 03584 rb_define_method(rb_cMatch, "end", match_end, 1); 03585 rb_define_method(rb_cMatch, "to_a", match_to_a, 0); 03586 rb_define_method(rb_cMatch, "[]", match_aref, -1); 03587 rb_define_method(rb_cMatch, "captures", match_captures, 0); 03588 rb_define_method(rb_cMatch, "values_at", match_values_at, -1); 03589 rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0); 03590 rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0); 03591 rb_define_method(rb_cMatch, "to_s", match_to_s, 0); 03592 rb_define_method(rb_cMatch, "inspect", match_inspect, 0); 03593 rb_define_method(rb_cMatch, "string", match_string, 0); 03594 rb_define_method(rb_cMatch, "hash", match_hash, 0); 03595 rb_define_method(rb_cMatch, "eql?", match_equal, 1); 03596 rb_define_method(rb_cMatch, "==", match_equal, 1); 03597 } 03598