Ruby 1.9.3p327(2012-11-10revision37606)
re.c
Go to the documentation of this file.
00001 /**********************************************************************
00002 
00003   re.c -
00004 
00005   $Author: akr $
00006   created at: Mon Aug  9 18:24:49 JST 1993
00007 
00008   Copyright (C) 1993-2007 Yukihiro Matsumoto
00009 
00010 **********************************************************************/
00011 
00012 #include "ruby/ruby.h"
00013 #include "ruby/re.h"
00014 #include "ruby/encoding.h"
00015 #include "ruby/util.h"
00016 #include "internal.h"
00017 #include "regint.h"
00018 #include <ctype.h>
00019 
/* Error class handed to rb_raise() and rb_exc_new3() by the regexp error
 * helpers in this file. */
00020 VALUE rb_eRegexpError;
00021 
/* Fixed-size character buffer sized for an Oniguruma error message. */
00022 typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN];
/* Copy msg into err with strlcpy(), bounded by the buffer size above. */
00023 #define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN)
00024 
/* beg/end entry number `no` of the register set.  Both macros expand to an
 * access through a variable named `regs`, which must be in scope wherever
 * they are used. */
00025 #define BEG(no) (regs->beg[(no)])
00026 #define END(no) (regs->end[(no)])
00027 
/* Case-folding table used by rb_memcicmp(): indexed by byte value, it maps
 * 'A'..'Z' to 'a'..'z' and every other byte to itself.  It is only defined
 * when the execution character set is ASCII ('a' == 97); otherwise the build
 * stops at the #error below. */
00028 #if 'a' == 97   /* it's ascii */
00029 static const char casetable[] = {
00030         '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
00031         '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
00032         '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
00033         '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
00034         /* ' '     '!'     '"'     '#'     '$'     '%'     '&'     ''' */
00035         '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
00036         /* '('     ')'     '*'     '+'     ','     '-'     '.'     '/' */
00037         '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
00038         /* '0'     '1'     '2'     '3'     '4'     '5'     '6'     '7' */
00039         '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
00040         /* '8'     '9'     ':'     ';'     '<'     '='     '>'     '?' */
00041         '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
00042         /* '@'     'A'     'B'     'C'     'D'     'E'     'F'     'G' */
00043         '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
00044         /* 'H'     'I'     'J'     'K'     'L'     'M'     'N'     'O' */
00045         '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
00046         /* 'P'     'Q'     'R'     'S'     'T'     'U'     'V'     'W' */
00047         '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
00048         /* 'X'     'Y'     'Z'     '['     '\'     ']'     '^'     '_' */
00049         '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
00050         /* '`'     'a'     'b'     'c'     'd'     'e'     'f'     'g' */
00051         '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
00052         /* 'h'     'i'     'j'     'k'     'l'     'm'     'n'     'o' */
00053         '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
00054         /* 'p'     'q'     'r'     's'     't'     'u'     'v'     'w' */
00055         '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
00056         /* 'x'     'y'     'z'     '{'     '|'     '}'     '~' */
00057         '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
        /* 0x80..0xFF: every byte maps to itself */
00058         '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
00059         '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
00060         '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
00061         '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
00062         '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
00063         '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
00064         '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
00065         '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
00066         '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
00067         '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
00068         '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
00069         '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
00070         '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
00071         '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
00072         '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
00073         '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
00074 };
00075 #else
00076 # error >>> "You lose. You will need a translation table for your character set." <<<
00077 #endif
00078 
00079 int
00080 rb_memcicmp(const void *x, const void *y, long len)
00081 {
00082     const unsigned char *p1 = x, *p2 = y;
00083     int tmp;
00084 
00085     while (len--) {
00086         if ((tmp = casetable[(unsigned)*p1++] - casetable[(unsigned)*p2++]))
00087             return tmp;
00088     }
00089     return 0;
00090 }
00091 
/* Drop any macro of the same name so that a real function is defined. */
#undef rb_memcmp

/*
 * Function form of rb_memcmp: compares len bytes of p1 and p2 with memcmp()
 * and returns its result unchanged.
 */
int
rb_memcmp(const void *p1, const void *p2, long len)
{
    int order = memcmp(p1, p2, len);

    return order;
}
00099 
00100 static inline long
00101 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
00102 {
00103     const unsigned char *x = xs, *xe = xs + m;
00104     const unsigned char *y = ys, *ye = ys + n;
00105 #ifndef VALUE_MAX
00106 # if SIZEOF_VALUE == 8
00107 #  define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL
00108 # elif SIZEOF_VALUE == 4
00109 #  define VALUE_MAX 0xFFFFFFFFUL
00110 # endif
00111 #endif
00112     VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT);
00113 
00114     if (m > SIZEOF_VALUE)
00115         rb_bug("!!too long pattern string!!");
00116 
00117     /* Prepare hash value */
00118     for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
00119         hx <<= CHAR_BIT;
00120         hy <<= CHAR_BIT;
00121         hx |= *x;
00122         hy |= *y;
00123     }
00124     /* Searching */
00125     while (hx != hy) {
00126         if (y == ye)
00127             return -1;
00128         hy <<= CHAR_BIT;
00129         hy |= *y;
00130         hy &= mask;
00131         y++;
00132     }
00133     return y - ys - m;
00134 }
00135 
/*
 * Quick Search over raw bytes: finds xs[0..m) inside ys[0..n).
 *
 * A 256-entry table gives, for the byte just past the current window, how
 * far the window may be moved (m + 1 when that byte does not occur in the
 * pattern).  Expects m >= 1, since the first pattern byte is always read.
 *
 * Returns the byte offset of the first match, or -1 when there is none.
 */
static inline long
rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    const unsigned char *x = xs, *xe = xs + m;
    const unsigned char *y = ys, *ye = ys + n;
    long qstable[256];  /* shift distances: small positive counts */
    int i;

    /* Preprocessing */
    for (i = 0; i < 256; ++i)
        qstable[i] = m + 1;
    for (; x < xe; ++x)
        qstable[*x] = xe - x;
    /* Searching */
    while (ye - y >= m) {
        if (*xs == *y && memcmp(xs, y, m) == 0)
            return y - ys;
        /* The last window has no byte after it: stop here instead of
           reading y[m], which would be one past the end of the haystack. */
        if (ye - y == m)
            break;
        y += qstable[y[m]];
    }
    return -1;
}
00155 
/*
 * Table index for the UTF-8 Quick Search.
 *
 * Bytes below 0xC0 and bytes from 0xF5 up are mapped to 256 + byte.  A lead
 * byte of a 2-, 3- or 4-byte sequence (0xC0..0xF4) is mixed with the 1, 2
 * or 3 bytes that follow it and reduced to the range 0..255, so up to three
 * bytes after x are read.
 */
static inline unsigned int
rb_memsearch_qs_utf8_hash(const unsigned char *x)
{
    const unsigned int mix = 8353;
    unsigned int h = x[0];
    int trailing, k;

    if (h < 0xC0 || h >= 0xF5)
        return h + 256;

    if (h < 0xE0)
        trailing = 1;
    else if (h < 0xF0)
        trailing = 2;
    else
        trailing = 3;

    for (k = 1; k <= trailing; k++)
        h = h * mix + x[k];

    return (unsigned char)h;
}
00187 
00188 static inline long
00189 rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n)
00190 {
00191     const unsigned char *x = xs, *xe = xs + m;
00192     const unsigned char *y = ys;
00193     VALUE i, qstable[512];
00194 
00195     /* Preprocessing */
00196     for (i = 0; i < 512; ++i) {
00197         qstable[i] = m + 1;
00198     }
00199     for (; x < xe; ++x) {
00200         qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x;
00201     }
00202     /* Searching */
00203     for (; y + m <= ys + n; y += qstable[rb_memsearch_qs_utf8_hash(y+m)]) {
00204         if (*xs == *y && memcmp(xs, y, m) == 0)
00205             return y - ys;
00206     }
00207     return -1;
00208 }
00209 
00210 long
00211 rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
00212 {
00213     const unsigned char *x = x0, *y = y0;
00214 
00215     if (m > n) return -1;
00216     else if (m == n) {
00217         return memcmp(x0, y0, m) == 0 ? 0 : -1;
00218     }
00219     else if (m < 1) {
00220         return 0;
00221     }
00222     else if (m == 1) {
00223         const unsigned char *ys = y, *ye = ys + n;
00224         for (; y < ye; ++y) {
00225             if (*x == *y)
00226                 return y - ys;
00227         }
00228         return -1;
00229     }
00230     else if (m <= SIZEOF_VALUE) {
00231         return rb_memsearch_ss(x0, m, y0, n);
00232     }
00233     else if (enc == rb_utf8_encoding()){
00234         return rb_memsearch_qs_utf8(x0, m, y0, n);
00235     }
00236     else {
00237         return rb_memsearch_qs(x0, m, y0, n);
00238     }
00239 }
00240 
/* Flag bits (FL_USER*) reserved for Regexp objects.  In the code visible
 * here REG_ENCODING_NONE is tested against RBASIC(re)->flags. */
00241 #define REG_LITERAL FL_USER5
00242 #define REG_ENCODING_NONE FL_USER6
00243 
00244 #define KCODE_FIXED FL_USER4
00245 
/* The three Oniguruma option bits that have a one-letter form (i, m, x). */
00246 #define ARG_REG_OPTION_MASK \
00247     (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
/* Extra option values that rb_char_to_option_kcode() stores in *option for
 * the encoding letters: 'e', 's', 'u' give FIXED, 'n' gives NONE. */
00248 #define ARG_ENCODING_FIXED    16
00249 #define ARG_ENCODING_NONE     32
00250 
00251 static int
00252 char_to_option(int c)
00253 {
00254     int val;
00255 
00256     switch (c) {
00257       case 'i':
00258         val = ONIG_OPTION_IGNORECASE;
00259         break;
00260       case 'x':
00261         val = ONIG_OPTION_EXTEND;
00262         break;
00263       case 'm':
00264         val = ONIG_OPTION_MULTILINE;
00265         break;
00266       default:
00267         val = 0;
00268         break;
00269     }
00270     return val;
00271 }
00272 
00273 static char *
00274 option_to_str(char str[4], int options)
00275 {
00276     char *p = str;
00277     if (options & ONIG_OPTION_MULTILINE) *p++ = 'm';
00278     if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i';
00279     if (options & ONIG_OPTION_EXTEND) *p++ = 'x';
00280     *p = 0;
00281     return str;
00282 }
00283 
00284 extern int
00285 rb_char_to_option_kcode(int c, int *option, int *kcode)
00286 {
00287     *option = 0;
00288 
00289     switch (c) {
00290       case 'n':
00291         *kcode = rb_ascii8bit_encindex();
00292         return (*option = ARG_ENCODING_NONE);
00293       case 'e':
00294         *kcode = rb_enc_find_index("EUC-JP");
00295         break;
00296       case 's':
00297         *kcode = rb_enc_find_index("Windows-31J");
00298         break;
00299       case 'u':
00300         *kcode = rb_utf8_encindex();
00301         break;
00302       default:
00303         *kcode = -1;
00304         return (*option = char_to_option(c));
00305     }
00306     *option = ARG_ENCODING_FIXED;
00307     return 1;
00308 }
00309 
00310 static void
00311 rb_reg_check(VALUE re)
00312 {
00313     if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00314         rb_raise(rb_eTypeError, "uninitialized Regexp");
00315     }
00316 }
00317 
/*
 * Appends the regexp source s[0..len), encoded in enc, to str in a form
 * suitable for display between slashes.
 *
 * If the source scans as valid or 7-bit text in an ASCII-compatible encoding
 * and consists only of printable ASCII other than '/' (plus non-ASCII
 * characters when enc == resenc), it is appended unchanged.  Otherwise it is
 * re-emitted piece by piece: a backslash and the character after it are
 * copied as they are, '/' gets a backslash in front, non-ASCII characters
 * are escaped when resenc is non-NULL and copied raw when it is NULL,
 * invalid bytes and non-printable, non-space ASCII characters become \xHH,
 * and everything else is copied.
 */
00318 static void
00319 rb_reg_expr_str(VALUE str, const char *s, long len,
00320         rb_encoding *enc, rb_encoding *resenc)
00321 {
00322     const char *p, *pend;
00323     int cr = ENC_CODERANGE_UNKNOWN;
00324     int need_escape = 0;
00325     int c, clen;
00326 
00327     p = s; pend = p + len;
    /* First pass: decide whether the source can be appended verbatim. */
00328     rb_str_coderange_scan_restartable(p, pend, enc, &cr);
00329     if (rb_enc_asciicompat(enc) &&
00330         (cr == ENC_CODERANGE_VALID || cr == ENC_CODERANGE_7BIT)) {
00331         while (p < pend) {
00332             c = rb_enc_ascget(p, pend, &clen, enc);
            /* c == -1: the character at p is not ASCII */
00333             if (c == -1) {
00334                 if (enc == resenc) {
00335                     p += mbclen(p, pend, enc);
00336                 }
00337                 else {
00338                     need_escape = 1;
00339                     break;
00340                 }
00341             }
00342             else if (c != '/' && rb_enc_isprint(c, enc)) {
00343                 p += clen;
00344             }
00345             else {
00346                 need_escape = 1;
00347                 break;
00348             }
00349         }
00350     }
00351     else {
00352         need_escape = 1;
00353     }
00354 
00355     if (!need_escape) {
00356         rb_str_buf_cat(str, s, len);
00357     }
00358     else {
        /* Second pass: restart from the beginning and escape as needed. */
00359         int unicode_p = rb_enc_unicode_p(enc);
00360         p = s;
00361         while (p<pend) {
00362             c = rb_enc_ascget(p, pend, &clen, enc);
            /* keep an existing escape sequence: backslash plus next char */
00363             if (c == '\\' && p+clen < pend) {
00364                 int n = clen + mbclen(p+clen, pend, enc);
00365                 rb_str_buf_cat(str, p, n);
00366                 p += n;
00367                 continue;
00368             }
00369             else if (c == '/') {
00370                 char c = '\\';
00371                 rb_str_buf_cat(str, &c, 1);
00372                 rb_str_buf_cat(str, p, clen);
00373             }
00374             else if (c == -1) {
00375                 clen = rb_enc_precise_mbclen(p, pend, enc);
                /* invalid byte sequence: emit the single byte as \xHH */
00376                 if (!MBCLEN_CHARFOUND_P(clen)) {
00377                     c = (unsigned char)*p;
00378                     clen = 1;
00379                     goto hex;
00380                 }
00381                 if (resenc) {
00382                     unsigned int c = rb_enc_mbc_to_codepoint(p, pend, enc);
00383                     rb_str_buf_cat_escaped_char(str, c, unicode_p);
00384                 }
00385                 else {
00386                     clen = MBCLEN_CHARFOUND_LEN(clen);
00387                     rb_str_buf_cat(str, p, clen);
00388                 }
00389             }
00390             else if (rb_enc_isprint(c, enc)) {
00391                 rb_str_buf_cat(str, p, clen);
00392             }
00393             else if (!rb_enc_isspace(c, enc)) {
00394                 char b[8];
00395 
                /* also reached by goto from the invalid-byte case above */
00396               hex:
00397                 snprintf(b, sizeof(b), "\\x%02X", c);
00398                 rb_str_buf_cat(str, b, 4);
00399             }
00400             else {
                /* ASCII whitespace is copied unchanged */
00401                 rb_str_buf_cat(str, p, clen);
00402             }
00403             p += clen;
00404         }
00405     }
00406 }
00407 
00408 static VALUE
00409 rb_reg_desc(const char *s, long len, VALUE re)
00410 {
00411     rb_encoding *enc = rb_enc_get(re);
00412     VALUE str = rb_str_buf_new2("/");
00413     rb_encoding *resenc = rb_default_internal_encoding();
00414     if (resenc == NULL) resenc = rb_default_external_encoding();
00415 
00416     if (re && rb_enc_asciicompat(enc)) {
00417         rb_enc_copy(str, re);
00418     }
00419     else {
00420         rb_enc_associate(str, rb_usascii_encoding());
00421     }
00422     rb_reg_expr_str(str, s, len, enc, resenc);
00423     rb_str_buf_cat2(str, "/");
00424     if (re) {
00425         char opts[4];
00426         rb_reg_check(re);
00427         if (*option_to_str(opts, RREGEXP(re)->ptr->options))
00428             rb_str_buf_cat2(str, opts);
00429         if (RBASIC(re)->flags & REG_ENCODING_NONE)
00430             rb_str_buf_cat2(str, "n");
00431     }
00432     OBJ_INFECT(str, re);
00433     return str;
00434 }
00435 
00436 
00437 /*
00438  *  call-seq:
00439  *      rxp.source   -> str
00440  *
00441  *  Returns the original string of the pattern.
00442  *
00443  *      /ab+c/ix.source #=> "ab+c"
00444  *
00445  *  Note that escape sequences are retained as is.
00446  *
00447  *     /\x20\+/.source  #=> "\\x20\\+"
00448  *
00449  */
00450 
00451 static VALUE
00452 rb_reg_source(VALUE re)
00453 {
00454     VALUE str;
00455 
00456     rb_reg_check(re);
00457     str = rb_enc_str_new(RREGEXP_SRC_PTR(re),RREGEXP_SRC_LEN(re), rb_enc_get(re));
00458     if (OBJ_TAINTED(re)) OBJ_TAINT(str);
00459     return str;
00460 }
00461 
00462 /*
00463  * call-seq:
00464  *    rxp.inspect   -> string
00465  *
00466  * Produce a nicely formatted string-version of _rxp_. Perhaps surprisingly,
00467  * <code>#inspect</code> actually produces a more natural version of
00468  * the string than <code>#to_s</code>.
00469  *
00470  *      /ab+c/ix.inspect        #=> "/ab+c/ix"
00471  *
00472  */
00473 
00474 static VALUE
00475 rb_reg_inspect(VALUE re)
00476 {
00477     if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00478         return rb_any_to_s(re);
00479     }
00480     return rb_reg_desc(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), re);
00481 }
00482 
00483 
00484 /*
00485  *  call-seq:
00486  *     rxp.to_s   -> str
00487  *
00488  *  Returns a string containing the regular expression and its options (using the
00489  *  <code>(?opts:source)</code> notation). This string can be passed back to
00490  *  <code>Regexp::new</code> to create a regular expression with the same semantics as
00491  *  the original. (However, <code>Regexp#==</code> may not return true when
00492  *  comparing the two, as the source of the regular expression itself may
00493  *  differ, as the example shows).  <code>Regexp#inspect</code> produces a
00494  *  generally more readable version of <i>rxp</i>.
00495  *
00496  *      r1 = /ab+c/ix           #=> /ab+c/ix
00497  *      s1 = r1.to_s            #=> "(?ix-m:ab+c)"
00498  *      r2 = Regexp.new(s1)     #=> /(?ix-m:ab+c)/
00499  *      r1 == r2                #=> false
00500  *      r1.source               #=> "ab+c"
00501  *      r2.source               #=> "(?ix-m:ab+c)"
00502  */
00503 
/*
 * Implements Regexp#to_s: builds "(?<on>-<off>:<source>)" for re.
 *
 * When the source itself starts with "(?", a leading option group such as
 * "(?i)" is folded into the option set, and a source of the form
 * "(?opts:body)" is unwrapped to its body, provided the body compiles on its
 * own (checked with onig_new).  If the prefix has neither form, or the body
 * does not compile, the original source and options are used unchanged.
 *
 * The result carries re's encoding and is infected from re.  Raises
 * TypeError via rb_reg_check() when re is uninitialized.
 */
00504 static VALUE
00505 rb_reg_to_s(VALUE re)
00506 {
00507     int options, opt;
00508     const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND;
00509     long len;
00510     const UChar* ptr;
00511     VALUE str = rb_str_buf_new2("(?");
    /* room for '-', up to three option letters and the terminating NUL */
00512     char optbuf[5];
00513     rb_encoding *enc = rb_enc_get(re);
00514 
00515     rb_reg_check(re);
00516 
00517     rb_enc_copy(str, re);
00518     options = RREGEXP(re)->ptr->options;
00519     ptr = (UChar*)RREGEXP_SRC_PTR(re);
00520     len = RREGEXP_SRC_LEN(re);
    /* Each pass examines one leading "(?..." construct of the source. */
00521   again:
00522     if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
00523         int err = 1;
00524         ptr += 2;
        /* option letters before '-' are switched on */
00525         if ((len -= 2) > 0) {
00526             do {
00527                 opt = char_to_option((int )*ptr);
00528                 if (opt != 0) {
00529                     options |= opt;
00530                 }
00531                 else {
00532                     break;
00533                 }
00534                 ++ptr;
00535             } while (--len > 0);
00536         }
        /* option letters after '-' are switched off */
00537         if (len > 1 && *ptr == '-') {
00538             ++ptr;
00539             --len;
00540             do {
00541                 opt = char_to_option((int )*ptr);
00542                 if (opt != 0) {
00543                     options &= ~opt;
00544                 }
00545                 else {
00546                     break;
00547                 }
00548                 ++ptr;
00549             } while (--len > 0);
00550         }
        /* "(?opts)" form: skip the ')' and examine what follows */
00551         if (*ptr == ')') {
00552             --len;
00553             ++ptr;
00554             goto again;
00555         }
        /* "(?opts:body)" reaching to the end of the source: accept it only
           if body compiles as a regexp by itself */
00556         if (*ptr == ':' && ptr[len-1] == ')') {
00557             Regexp *rp;
00558 
00559             ++ptr;
00560             len -= 2;
00561             err = onig_new(&rp, ptr, ptr + len, ONIG_OPTION_DEFAULT,
00562                            enc, OnigDefaultSyntax, NULL);
00563             onig_free(rp);
00564         }
        /* otherwise discard the changes made above and use the original */
00565         if (err) {
00566             options = RREGEXP(re)->ptr->options;
00567             ptr = (UChar*)RREGEXP_SRC_PTR(re);
00568             len = RREGEXP_SRC_LEN(re);
00569         }
00570     }
00571 
    /* letters of the options that are on */
00572     if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf);
00573 
    /* unless all three are on, list the ones that are off after a '-' */
00574     if ((options & embeddable) != embeddable) {
00575         optbuf[0] = '-';
00576         option_to_str(optbuf + 1, ~options);
00577         rb_str_buf_cat2(str, optbuf);
00578     }
00579 
00580     rb_str_buf_cat2(str, ":");
00581     rb_reg_expr_str(str, (char*)ptr, len, enc, NULL);
00582     rb_str_buf_cat2(str, ")");
00583     rb_enc_copy(str, re);
00584 
00585     OBJ_INFECT(str, re);
00586     return str;
00587 }
00588 
00589 static void
00590 rb_reg_raise(const char *s, long len, const char *err, VALUE re)
00591 {
00592     volatile VALUE desc = rb_reg_desc(s, len, re);
00593 
00594     rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING_PTR(desc));
00595 }
00596 
00597 static VALUE
00598 rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err)
00599 {
00600     char opts[6];
00601     VALUE desc = rb_str_buf_new2(err);
00602     rb_encoding *resenc = rb_default_internal_encoding();
00603     if (resenc == NULL) resenc = rb_default_external_encoding();
00604 
00605     rb_enc_associate(desc, enc);
00606     rb_str_buf_cat2(desc, ": /");
00607     rb_reg_expr_str(desc, s, len, enc, resenc);
00608     opts[0] = '/';
00609     option_to_str(opts + 1, options);
00610     rb_str_buf_cat2(desc, opts);
00611     return rb_exc_new3(rb_eRegexpError, desc);
00612 }
00613 
00614 static void
00615 rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err)
00616 {
00617     rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err));
00618 }
00619 
00620 static VALUE
00621 rb_reg_error_desc(VALUE str, int options, const char *err)
00622 {
00623     return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str),
00624                                  rb_enc_get(str), options, err);
00625 }
00626 
00627 static void
00628 rb_reg_raise_str(VALUE str, int options, const char *err)
00629 {
00630     rb_exc_raise(rb_reg_error_desc(str, options, err));
00631 }
00632 
00633 
00634 /*
00635  *  call-seq:
00636  *     rxp.casefold?   -> true or false
00637  *
00638  *  Returns the value of the case-insensitive flag.
00639  *
00640  *      /a/.casefold?           #=> false
00641  *      /a/i.casefold?          #=> true
00642  *      /(?i:a)/.casefold?      #=> false
00643  */
00644 
00645 static VALUE
00646 rb_reg_casefold_p(VALUE re)
00647 {
00648     rb_reg_check(re);
00649     if (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE) return Qtrue;
00650     return Qfalse;
00651 }
00652 
00653 
00654 /*
00655  *  call-seq:
00656  *     rxp.options   -> fixnum
00657  *
00658  *  Returns the set of bits corresponding to the options used when creating this
00659  *  Regexp (see <code>Regexp::new</code> for details). Note that additional bits
00660  *  may be set in the returned options: these are used internally by the regular
00661  *  expression code. These extra bits are ignored if the options are passed to
00662  *  <code>Regexp::new</code>.
00663  *
00664  *     Regexp::IGNORECASE                  #=> 1
00665  *     Regexp::EXTENDED                    #=> 2
00666  *     Regexp::MULTILINE                   #=> 4
00667  *
00668  *     /cat/.options                       #=> 0
00669  *     /cat/ix.options                     #=> 3
00670  *     Regexp.new('cat', true).options     #=> 1
00671  *     /\xa1\xa2/e.options                 #=> 16
00672  *
00673  *     r = /cat/ix
00674  *     Regexp.new(r.source, r.options)     #=> /cat/ix
00675  */
00676 
00677 static VALUE
00678 rb_reg_options_m(VALUE re)
00679 {
00680     int options = rb_reg_options(re);
00681     return INT2NUM(options);
00682 }
00683 
00684 static int
00685 reg_names_iter(const OnigUChar *name, const OnigUChar *name_end,
00686           int back_num, int *back_refs, OnigRegex regex, void *arg)
00687 {
00688     VALUE ary = (VALUE)arg;
00689     rb_ary_push(ary, rb_str_new((const char *)name, name_end-name));
00690     return 0;
00691 }
00692 
00693 /*
00694  * call-seq:
00695  *    rxp.names   -> [name1, name2, ...]
00696  *
00697  * Returns a list of names of captures as an array of strings.
00698  *
00699  *     /(?<foo>.)(?<bar>.)(?<baz>.)/.names
00700  *     #=> ["foo", "bar", "baz"]
00701  *
00702  *     /(?<foo>.)(?<foo>.)/.names
00703  *     #=> ["foo"]
00704  *
00705  *     /(.)(.)/.names
00706  *     #=> []
00707  */
00708 
00709 static VALUE
00710 rb_reg_names(VALUE re)
00711 {
00712     VALUE ary = rb_ary_new();
00713     rb_reg_check(re);
00714     onig_foreach_name(RREGEXP(re)->ptr, reg_names_iter, (void*)ary);
00715     return ary;
00716 }
00717 
00718 static int
00719 reg_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end,
00720           int back_num, int *back_refs, OnigRegex regex, void *arg)
00721 {
00722     VALUE hash = (VALUE)arg;
00723     VALUE ary = rb_ary_new2(back_num);
00724     int i;
00725 
00726     for(i = 0; i < back_num; i++)
00727         rb_ary_store(ary, i, INT2NUM(back_refs[i]));
00728 
00729     rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary);
00730 
00731     return 0;
00732 }
00733 
00734 /*
00735  * call-seq:
00736  *    rxp.named_captures  -> hash
00737  *
00738  * Returns a hash representing information about named captures of <i>rxp</i>.
00739  *
00740  * Each key of the hash is the name of a named capture.
00741  * Each value is an array listing the indexes of the corresponding
00742  * named captures.
00743  *
00744  *    /(?<foo>.)(?<bar>.)/.named_captures
00745  *    #=> {"foo"=>[1], "bar"=>[2]}
00746  *
00747  *    /(?<foo>.)(?<foo>.)/.named_captures
00748  *    #=> {"foo"=>[1, 2]}
00749  *
00750  * If there are no named captures, an empty hash is returned.
00751  *
00752  *    /(.)(.)/.named_captures
00753  *    #=> {}
00754  */
00755 
00756 static VALUE
00757 rb_reg_named_captures(VALUE re)
00758 {
00759     VALUE hash = rb_hash_new();
00760     rb_reg_check(re);
00761     onig_foreach_name(RREGEXP(re)->ptr, reg_named_captures_iter, (void*)hash);
00762     return hash;
00763 }
00764 
00765 static int
00766 onig_new_with_source(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
00767           OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
00768           OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
00769 {
00770   int r;
00771 
00772   *reg = (regex_t* )xmalloc(sizeof(regex_t));
00773   if (IS_NULL(*reg)) return ONIGERR_MEMORY;
00774 
00775   r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
00776   if (r) goto err;
00777 
00778   r = onig_compile(*reg, pattern, pattern_end, einfo, sourcefile, sourceline);
00779   if (r) {
00780   err:
00781     onig_free(*reg);
00782     *reg = NULL;
00783   }
00784   return r;
00785 }
00786 
00787 static Regexp*
00788 make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err,
00789         const char *sourcefile, int sourceline)
00790 {
00791     Regexp *rp;
00792     int r;
00793     OnigErrorInfo einfo;
00794 
00795     /* Handle escaped characters first. */
00796 
00797     /* Build a copy of the string (in dest) with the
00798        escaped characters translated,  and generate the regex
00799        from that.
00800     */
00801 
00802     r = onig_new_with_source(&rp, (UChar*)s, (UChar*)(s + len), flags,
00803                  enc, OnigDefaultSyntax, &einfo, sourcefile, sourceline);
00804     if (r) {
00805         onig_error_code_to_str((UChar*)err, r, &einfo);
00806         return 0;
00807     }
00808     return rp;
00809 }
00810 
00811 
00812 /*
00813  *  Document-class: MatchData
00814  *
00815  *  <code>MatchData</code> is the type of the special variable <code>$~</code>,
00816  *  and is the type of the object returned by <code>Regexp#match</code> and
00817  *  <code>Regexp.last_match</code>. It encapsulates all the results of a pattern
00818  *  match, results normally accessed through the special variables
00819  *  <code>$&</code>, <code>$'</code>, <code>$`</code>, <code>$1</code>,
00820  *  <code>$2</code>, and so on.
00821  *
00822  */
00823 
00824 VALUE rb_cMatch;
00825 
00826 static VALUE
00827 match_alloc(VALUE klass)
00828 {
00829     NEWOBJ(match, struct RMatch);
00830     OBJSETUP(match, klass, T_MATCH);
00831 
00832     match->str = 0;
00833     match->rmatch = 0;
00834     match->regexp = 0;
00835     match->rmatch = ALLOC(struct rmatch);
00836     MEMZERO(match->rmatch, struct rmatch, 1);
00837 
00838     return (VALUE)match;
00839 }
00840 
/* A byte offset into a string together with the matching character offset. */
typedef struct {
    long byte_pos;
    long char_pos;
} pair_t;

/*
 * qsort()/bsearch() comparison function ordering pair_t values by byte_pos.
 * Returns a negative, zero or positive value.  Where long is wider than
 * int the result is clamped to -1/0/1 so the difference cannot be
 * truncated; otherwise the difference itself is returned.
 */
static int
pair_byte_cmp(const void *pair1, const void *pair2)
{
    const pair_t *lhs = (const pair_t *)pair1;
    const pair_t *rhs = (const pair_t *)pair2;
    long diff = lhs->byte_pos - rhs->byte_pos;

#if SIZEOF_LONG > SIZEOF_INT
    if (diff > 0)
        return 1;
    if (diff < 0)
        return -1;
    return 0;
#else
    return (int)diff;
#endif
}
00856 
00857 static void
00858 update_char_offset(VALUE match)
00859 {
00860     struct rmatch *rm = RMATCH(match)->rmatch;
00861     struct re_registers *regs;
00862     int i, num_regs, num_pos;
00863     long c;
00864     char *s, *p, *q;
00865     rb_encoding *enc;
00866     pair_t *pairs;
00867 
00868     if (rm->char_offset_updated)
00869         return;
00870 
00871     regs = &rm->regs;
00872     num_regs = rm->regs.num_regs;
00873 
00874     if (rm->char_offset_num_allocated < num_regs) {
00875         REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs);
00876         rm->char_offset_num_allocated = num_regs;
00877     }
00878 
00879     enc = rb_enc_get(RMATCH(match)->str);
00880     if (rb_enc_mbmaxlen(enc) == 1) {
00881         for (i = 0; i < num_regs; i++) {
00882             rm->char_offset[i].beg = BEG(i);
00883             rm->char_offset[i].end = END(i);
00884         }
00885         rm->char_offset_updated = 1;
00886         return;
00887     }
00888 
00889     pairs = ALLOCA_N(pair_t, num_regs*2);
00890     num_pos = 0;
00891     for (i = 0; i < num_regs; i++) {
00892         if (BEG(i) < 0)
00893             continue;
00894         pairs[num_pos++].byte_pos = BEG(i);
00895         pairs[num_pos++].byte_pos = END(i);
00896     }
00897     qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00898 
00899     s = p = RSTRING_PTR(RMATCH(match)->str);
00900     c = 0;
00901     for (i = 0; i < num_pos; i++) {
00902         q = s + pairs[i].byte_pos;
00903         c += rb_enc_strlen(p, q, enc);
00904         pairs[i].char_pos = c;
00905         p = q;
00906     }
00907 
00908     for (i = 0; i < num_regs; i++) {
00909         pair_t key, *found;
00910         if (BEG(i) < 0) {
00911             rm->char_offset[i].beg = -1;
00912             rm->char_offset[i].end = -1;
00913             continue;
00914         }
00915 
00916         key.byte_pos = BEG(i);
00917         found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00918         rm->char_offset[i].beg = found->char_pos;
00919 
00920         key.byte_pos = END(i);
00921         found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00922         rm->char_offset[i].end = found->char_pos;
00923     }
00924 
00925     rm->char_offset_updated = 1;
00926 }
00927 
00928 static void
00929 match_check(VALUE match)
00930 {
00931     if (!RMATCH(match)->regexp) {
00932         rb_raise(rb_eTypeError, "uninitialized Match");
00933     }
00934 }
00935 
00936 /* :nodoc: */
00937 static VALUE
00938 match_init_copy(VALUE obj, VALUE orig)
00939 {
00940     struct rmatch *rm;
00941 
00942     if (obj == orig) return obj;
00943 
00944     if (!rb_obj_is_instance_of(orig, rb_obj_class(obj))) {
00945         rb_raise(rb_eTypeError, "wrong argument class");
00946     }
00947     RMATCH(obj)->str = RMATCH(orig)->str;
00948     RMATCH(obj)->regexp = RMATCH(orig)->regexp;
00949 
00950     rm = RMATCH(obj)->rmatch;
00951     onig_region_copy(&rm->regs, RMATCH_REGS(orig));
00952 
00953     if (!RMATCH(orig)->rmatch->char_offset_updated) {
00954         rm->char_offset_updated = 0;
00955     }
00956     else {
00957         if (rm->char_offset_num_allocated < rm->regs.num_regs) {
00958             REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs);
00959             rm->char_offset_num_allocated = rm->regs.num_regs;
00960         }
00961         MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset,
00962                struct rmatch_offset, rm->regs.num_regs);
00963         rm->char_offset_updated = 1;
00964     }
00965 
00966     return obj;
00967 }
00968 
00969 
00970 /*
00971  * call-seq:
00972  *    mtch.regexp   -> regexp
00973  *
00974  * Returns the regexp.
00975  *
00976  *     m = /a.*b/.match("abc")
00977  *     m.regexp #=> /a.*b/
00978  */
00979 
00980 static VALUE
00981 match_regexp(VALUE match)
00982 {
00983     match_check(match);
00984     return RMATCH(match)->regexp;
00985 }
00986 
00987 /*
00988  * call-seq:
00989  *    mtch.names   -> [name1, name2, ...]
00990  *
00991  * Returns a list of names of captures as an array of strings.
00992  * It is same as mtch.regexp.names.
00993  *
00994  *     /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").names
00995  *     #=> ["foo", "bar", "baz"]
00996  *
00997  *     m = /(?<x>.)(?<y>.)?/.match("a") #=> #<MatchData "a" x:"a" y:nil>
00998  *     m.names                          #=> ["x", "y"]
00999  */
01000 
01001 static VALUE
01002 match_names(VALUE match)
01003 {
01004     match_check(match);
01005     return rb_reg_names(RMATCH(match)->regexp);
01006 }
01007 
01008 /*
01009  *  call-seq:
01010  *     mtch.length   -> integer
01011  *     mtch.size     -> integer
01012  *
01013  *  Returns the number of elements in the match array.
01014  *
01015  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01016  *     m.length   #=> 5
01017  *     m.size     #=> 5
01018  */
01019 
01020 static VALUE
01021 match_size(VALUE match)
01022 {
01023     match_check(match);
01024     return INT2FIX(RMATCH_REGS(match)->num_regs);
01025 }
01026 
01027 static int
01028 match_backref_number(VALUE match, VALUE backref)
01029 {
01030     const char *name;
01031     int num;
01032 
01033     struct re_registers *regs = RMATCH_REGS(match);
01034     VALUE regexp = RMATCH(match)->regexp;
01035 
01036     match_check(match);
01037     switch(TYPE(backref)) {
01038       default:
01039         return NUM2INT(backref);
01040 
01041       case T_SYMBOL:
01042         name = rb_id2name(SYM2ID(backref));
01043         break;
01044 
01045       case T_STRING:
01046         name = StringValueCStr(backref);
01047         break;
01048     }
01049 
01050     num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01051               (const unsigned char*)name,
01052               (const unsigned char*)name + strlen(name),
01053               regs);
01054 
01055     if (num < 1) {
01056         rb_raise(rb_eIndexError, "undefined group name reference: %s", name);
01057     }
01058 
01059     return num;
01060 }
01061 
01062 int
01063 rb_reg_backref_number(VALUE match, VALUE backref)
01064 {
01065     return match_backref_number(match, backref);
01066 }
01067 
01068 /*
01069  *  call-seq:
01070  *     mtch.offset(n)   -> array
01071  *
01072  *  Returns a two-element array containing the beginning and ending offsets of
01073  *  the <em>n</em>th match.
01074  *  <em>n</em> can be a string or symbol to reference a named capture.
01075  *
01076  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01077  *     m.offset(0)      #=> [1, 7]
01078  *     m.offset(4)      #=> [6, 7]
01079  *
01080  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
01081  *     p m.offset(:foo) #=> [0, 1]
01082  *     p m.offset(:bar) #=> [2, 3]
01083  *
01084  */
01085 
01086 static VALUE
01087 match_offset(VALUE match, VALUE n)
01088 {
01089     int i = match_backref_number(match, n);
01090     struct re_registers *regs = RMATCH_REGS(match);
01091 
01092     match_check(match);
01093     if (i < 0 || regs->num_regs <= i)
01094         rb_raise(rb_eIndexError, "index %d out of matches", i);
01095 
01096     if (BEG(i) < 0)
01097         return rb_assoc_new(Qnil, Qnil);
01098 
01099     update_char_offset(match);
01100     return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg),
01101                         INT2FIX(RMATCH(match)->rmatch->char_offset[i].end));
01102 }
01103 
01104 
01105 /*
01106  *  call-seq:
01107  *     mtch.begin(n)   -> integer
01108  *
01109  *  Returns the offset of the start of the <em>n</em>th element of the match
01110  *  array in the string.
01111  *  <em>n</em> can be a string or symbol to reference a named capture.
01112  *
01113  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01114  *     m.begin(0)       #=> 1
01115  *     m.begin(2)       #=> 2
01116  *
01117  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
01118  *     p m.begin(:foo)  #=> 0
01119  *     p m.begin(:bar)  #=> 2
01120  */
01121 
01122 static VALUE
01123 match_begin(VALUE match, VALUE n)
01124 {
01125     int i = match_backref_number(match, n);
01126     struct re_registers *regs = RMATCH_REGS(match);
01127 
01128     match_check(match);
01129     if (i < 0 || regs->num_regs <= i)
01130         rb_raise(rb_eIndexError, "index %d out of matches", i);
01131 
01132     if (BEG(i) < 0)
01133         return Qnil;
01134 
01135     update_char_offset(match);
01136     return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg);
01137 }
01138 
01139 
01140 /*
01141  *  call-seq:
01142  *     mtch.end(n)   -> integer
01143  *
01144  *  Returns the offset of the character immediately following the end of the
01145  *  <em>n</em>th element of the match array in the string.
01146  *  <em>n</em> can be a string or symbol to reference a named capture.
01147  *
01148  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01149  *     m.end(0)         #=> 7
01150  *     m.end(2)         #=> 3
01151  *
01152  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
01153  *     p m.end(:foo)    #=> 1
01154  *     p m.end(:bar)    #=> 3
01155  */
01156 
01157 static VALUE
01158 match_end(VALUE match, VALUE n)
01159 {
01160     int i = match_backref_number(match, n);
01161     struct re_registers *regs = RMATCH_REGS(match);
01162 
01163     match_check(match);
01164     if (i < 0 || regs->num_regs <= i)
01165         rb_raise(rb_eIndexError, "index %d out of matches", i);
01166 
01167     if (BEG(i) < 0)
01168         return Qnil;
01169 
01170     update_char_offset(match);
01171     return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end);
01172 }
01173 
01174 #define MATCH_BUSY FL_USER2
01175 
01176 void
01177 rb_match_busy(VALUE match)
01178 {
01179     FL_SET(match, MATCH_BUSY);
01180 }
01181 
01182 /*
01183  *  call-seq:
01184  *     rxp.fixed_encoding?   -> true or false
01185  *
01186  *  Returns false if rxp is applicable to
01187  *  a string with any ASCII compatible encoding.
01188  *  Returns true otherwise.
01189  *
01190  *      r = /a/
01191  *      r.fixed_encoding?                               #=> false
01192  *      r =~ "\u{6666} a"                               #=> 2
01193  *      r =~ "\xa1\xa2 a".force_encoding("euc-jp")      #=> 2
01194  *      r =~ "abc".force_encoding("euc-jp")             #=> 0
01195  *
01196  *      r = /a/u
01197  *      r.fixed_encoding?                               #=> true
01198  *      r.encoding                                      #=> #<Encoding:UTF-8>
01199  *      r =~ "\u{6666} a"                               #=> 2
01200  *      r =~ "\xa1\xa2".force_encoding("euc-jp")        #=> ArgumentError
01201  *      r =~ "abc".force_encoding("euc-jp")             #=> 0
01202  *
01203  *      r = /\u{6666}/
01204  *      r.fixed_encoding?                               #=> true
01205  *      r.encoding                                      #=> #<Encoding:UTF-8>
01206  *      r =~ "\u{6666} a"                               #=> 0
01207  *      r =~ "\xa1\xa2".force_encoding("euc-jp")        #=> ArgumentError
01208  *      r =~ "abc".force_encoding("euc-jp")             #=> nil
01209  */
01210 
01211 static VALUE
01212 rb_reg_fixed_encoding_p(VALUE re)
01213 {
01214     if (FL_TEST(re, KCODE_FIXED))
01215         return Qtrue;
01216     else
01217         return Qfalse;
01218 }
01219 
01220 static VALUE
01221 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
01222         rb_encoding **fixed_enc, onig_errmsg_buffer err);
01223 
01224 
01225 static void
01226 reg_enc_error(VALUE re, VALUE str)
01227 {
01228     rb_raise(rb_eEncCompatError,
01229              "incompatible encoding regexp match (%s regexp with %s string)",
01230              rb_enc_name(rb_enc_get(re)),
01231              rb_enc_name(rb_enc_get(str)));
01232 }
01233 
01234 static rb_encoding*
01235 rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
01236 {
01237     rb_encoding *enc = 0;
01238 
01239     if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
01240         rb_raise(rb_eArgError,
01241             "invalid byte sequence in %s",
01242             rb_enc_name(rb_enc_get(str)));
01243     }
01244 
01245     rb_reg_check(re);
01246     enc = rb_enc_get(str);
01247     if (!rb_enc_str_asciicompat_p(str)) {
01248         if (RREGEXP(re)->ptr->enc != enc) {
01249             reg_enc_error(re, str);
01250         }
01251     }
01252     else if (rb_reg_fixed_encoding_p(re)) {
01253         if (RREGEXP(re)->ptr->enc != enc &&
01254             (!rb_enc_asciicompat(RREGEXP(re)->ptr->enc) ||
01255              rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)) {
01256             reg_enc_error(re, str);
01257         }
01258         enc = RREGEXP(re)->ptr->enc;
01259     }
01260     if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
01261         enc != rb_ascii8bit_encoding() &&
01262         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
01263         rb_warn("regexp match /.../n against to %s string",
01264                 rb_enc_name(enc));
01265     }
01266     return enc;
01267 }
01268 
01269 regex_t *
01270 rb_reg_prepare_re(VALUE re, VALUE str)
01271 {
01272     regex_t *reg = RREGEXP(re)->ptr;
01273     onig_errmsg_buffer err = "";
01274     int r;
01275     OnigErrorInfo einfo;
01276     const char *pattern;
01277     VALUE unescaped;
01278     rb_encoding *fixed_enc = 0;
01279     rb_encoding *enc = rb_reg_prepare_enc(re, str, 1);
01280 
01281     if (reg->enc == enc) return reg;
01282 
01283     rb_reg_check(re);
01284     reg = RREGEXP(re)->ptr;
01285     pattern = RREGEXP_SRC_PTR(re);
01286 
01287     unescaped = rb_reg_preprocess(
01288         pattern, pattern + RREGEXP_SRC_LEN(re), enc,
01289         &fixed_enc, err);
01290 
01291     if (unescaped == Qnil) {
01292         rb_raise(rb_eArgError, "regexp preprocess failed: %s", err);
01293     }
01294 
01295     r = onig_new(&reg, (UChar* )RSTRING_PTR(unescaped),
01296                  (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
01297                  reg->options, enc,
01298                  OnigDefaultSyntax, &einfo);
01299     if (r) {
01300         onig_error_code_to_str((UChar*)err, r, &einfo);
01301         rb_reg_raise(pattern, RREGEXP_SRC_LEN(re), err, re);
01302     }
01303 
01304     RB_GC_GUARD(unescaped);
01305     return reg;
01306 }
01307 
01308 long
01309 rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse)
01310 {
01311     long range;
01312     rb_encoding *enc;
01313     UChar *p, *string;
01314 
01315     enc = rb_reg_prepare_enc(re, str, 0);
01316 
01317     if (reverse) {
01318         range = -pos;
01319     }
01320     else {
01321         range = RSTRING_LEN(str) - pos;
01322     }
01323 
01324     if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(str)) {
01325          string = (UChar*)RSTRING_PTR(str);
01326 
01327          if (range > 0) {
01328               p = onigenc_get_right_adjust_char_head(enc, string, string + pos, string + RSTRING_LEN(str));
01329          }
01330          else {
01331               p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, string, string + pos, string + RSTRING_LEN(str));
01332          }
01333          return p - string;
01334     }
01335 
01336     return pos;
01337 }
01338 
01339 long
01340 rb_reg_search(VALUE re, VALUE str, long pos, int reverse)
01341 {
01342     long result;
01343     VALUE match;
01344     struct re_registers regi, *regs = &regi;
01345     char *range = RSTRING_PTR(str);
01346     regex_t *reg;
01347     int tmpreg;
01348 
01349     if (pos > RSTRING_LEN(str) || pos < 0) {
01350         rb_backref_set(Qnil);
01351         return -1;
01352     }
01353 
01354     reg = rb_reg_prepare_re(re, str);
01355     tmpreg = reg != RREGEXP(re)->ptr;
01356     if (!tmpreg) RREGEXP(re)->usecnt++;
01357 
01358     match = rb_backref_get();
01359     if (!NIL_P(match)) {
01360         if (FL_TEST(match, MATCH_BUSY)) {
01361             match = Qnil;
01362         }
01363         else {
01364             regs = RMATCH_REGS(match);
01365         }
01366     }
01367     if (NIL_P(match)) {
01368         MEMZERO(regs, struct re_registers, 1);
01369     }
01370     if (!reverse) {
01371         range += RSTRING_LEN(str);
01372     }
01373     result = onig_search(reg,
01374                          (UChar*)(RSTRING_PTR(str)),
01375                          ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
01376                          ((UChar*)(RSTRING_PTR(str)) + pos),
01377                          ((UChar*)range),
01378                          regs, ONIG_OPTION_NONE);
01379     if (!tmpreg) RREGEXP(re)->usecnt--;
01380     if (tmpreg) {
01381         if (RREGEXP(re)->usecnt) {
01382             onig_free(reg);
01383         }
01384         else {
01385             onig_free(RREGEXP(re)->ptr);
01386             RREGEXP(re)->ptr = reg;
01387         }
01388     }
01389     if (result < 0) {
01390         if (regs == &regi)
01391             onig_region_free(regs, 0);
01392         if (result == ONIG_MISMATCH) {
01393             rb_backref_set(Qnil);
01394             return result;
01395         }
01396         else {
01397             onig_errmsg_buffer err = "";
01398             onig_error_code_to_str((UChar*)err, (int)result);
01399             rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
01400         }
01401     }
01402 
01403     if (NIL_P(match)) {
01404         match = match_alloc(rb_cMatch);
01405         onig_region_copy(RMATCH_REGS(match), regs);
01406         onig_region_free(regs, 0);
01407     }
01408     else {
01409         if (rb_safe_level() >= 3)
01410             OBJ_TAINT(match);
01411         else
01412             FL_UNSET(match, FL_TAINT);
01413     }
01414 
01415     RMATCH(match)->str = rb_str_new4(str);
01416     RMATCH(match)->regexp = re;
01417     RMATCH(match)->rmatch->char_offset_updated = 0;
01418     rb_backref_set(match);
01419 
01420     OBJ_INFECT(match, re);
01421     OBJ_INFECT(match, str);
01422 
01423     return result;
01424 }
01425 
01426 VALUE
01427 rb_reg_nth_defined(int nth, VALUE match)
01428 {
01429     struct re_registers *regs;
01430     if (NIL_P(match)) return Qnil;
01431     match_check(match);
01432     regs = RMATCH_REGS(match);
01433     if (nth >= regs->num_regs) {
01434         return Qnil;
01435     }
01436     if (nth < 0) {
01437         nth += regs->num_regs;
01438         if (nth <= 0) return Qnil;
01439     }
01440     if (BEG(nth) == -1) return Qfalse;
01441     return Qtrue;
01442 }
01443 
01444 VALUE
01445 rb_reg_nth_match(int nth, VALUE match)
01446 {
01447     VALUE str;
01448     long start, end, len;
01449     struct re_registers *regs;
01450 
01451     if (NIL_P(match)) return Qnil;
01452     match_check(match);
01453     regs = RMATCH_REGS(match);
01454     if (nth >= regs->num_regs) {
01455         return Qnil;
01456     }
01457     if (nth < 0) {
01458         nth += regs->num_regs;
01459         if (nth <= 0) return Qnil;
01460     }
01461     start = BEG(nth);
01462     if (start == -1) return Qnil;
01463     end = END(nth);
01464     len = end - start;
01465     str = rb_str_subseq(RMATCH(match)->str, start, len);
01466     OBJ_INFECT(str, match);
01467     return str;
01468 }
01469 
01470 VALUE
01471 rb_reg_last_match(VALUE match)
01472 {
01473     return rb_reg_nth_match(0, match);
01474 }
01475 
01476 
01477 /*
01478  *  call-seq:
01479  *     mtch.pre_match   -> str
01480  *
01481  *  Returns the portion of the original string before the current match.
01482  *  Equivalent to the special variable <code>$`</code>.
01483  *
01484  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01485  *     m.pre_match   #=> "T"
01486  */
01487 
01488 VALUE
01489 rb_reg_match_pre(VALUE match)
01490 {
01491     VALUE str;
01492     struct re_registers *regs;
01493 
01494     if (NIL_P(match)) return Qnil;
01495     match_check(match);
01496     regs = RMATCH_REGS(match);
01497     if (BEG(0) == -1) return Qnil;
01498     str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0));
01499     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01500     return str;
01501 }
01502 
01503 
01504 /*
01505  *  call-seq:
01506  *     mtch.post_match   -> str
01507  *
01508  *  Returns the portion of the original string after the current match.
01509  *  Equivalent to the special variable <code>$'</code>.
01510  *
01511  *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
01512  *     m.post_match   #=> ": The Movie"
01513  */
01514 
01515 VALUE
01516 rb_reg_match_post(VALUE match)
01517 {
01518     VALUE str;
01519     long pos;
01520     struct re_registers *regs;
01521 
01522     if (NIL_P(match)) return Qnil;
01523     match_check(match);
01524     regs = RMATCH_REGS(match);
01525     if (BEG(0) == -1) return Qnil;
01526     str = RMATCH(match)->str;
01527     pos = END(0);
01528     str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos);
01529     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01530     return str;
01531 }
01532 
01533 VALUE
01534 rb_reg_match_last(VALUE match)
01535 {
01536     int i;
01537     struct re_registers *regs;
01538 
01539     if (NIL_P(match)) return Qnil;
01540     match_check(match);
01541     regs = RMATCH_REGS(match);
01542     if (BEG(0) == -1) return Qnil;
01543 
01544     for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--)
01545         ;
01546     if (i == 0) return Qnil;
01547     return rb_reg_nth_match(i, match);
01548 }
01549 
01550 static VALUE
01551 last_match_getter(void)
01552 {
01553     return rb_reg_last_match(rb_backref_get());
01554 }
01555 
01556 static VALUE
01557 prematch_getter(void)
01558 {
01559     return rb_reg_match_pre(rb_backref_get());
01560 }
01561 
01562 static VALUE
01563 postmatch_getter(void)
01564 {
01565     return rb_reg_match_post(rb_backref_get());
01566 }
01567 
01568 static VALUE
01569 last_paren_match_getter(void)
01570 {
01571     return rb_reg_match_last(rb_backref_get());
01572 }
01573 
01574 static VALUE
01575 match_array(VALUE match, int start)
01576 {
01577     struct re_registers *regs;
01578     VALUE ary;
01579     VALUE target;
01580     int i;
01581     int taint = OBJ_TAINTED(match);
01582 
01583     match_check(match);
01584     regs = RMATCH_REGS(match);
01585     ary = rb_ary_new2(regs->num_regs);
01586     target = RMATCH(match)->str;
01587 
01588     for (i=start; i<regs->num_regs; i++) {
01589         if (regs->beg[i] == -1) {
01590             rb_ary_push(ary, Qnil);
01591         }
01592         else {
01593             VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]);
01594             if (taint) OBJ_TAINT(str);
01595             rb_ary_push(ary, str);
01596         }
01597     }
01598     return ary;
01599 }
01600 
01601 
01602 /* [MG]:FIXME: I put parens around the /.../.match() in the first line of the
01603    second example to prevent the '*' followed by a '/' from ending the
01604    comment. */
01605 
01606 /*
01607  *  call-seq:
01608  *     mtch.to_a   -> anArray
01609  *
01610  *  Returns the array of matches.
01611  *
01612  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01613  *     m.to_a   #=> ["HX1138", "H", "X", "113", "8"]
01614  *
01615  *  Because <code>to_a</code> is called when expanding
01616  *  <code>*</code><em>variable</em>, there's a useful assignment
01617  *  shortcut for extracting matched fields. This is slightly slower than
01618  *  accessing the fields directly (as an intermediate array is
01619  *  generated).
01620  *
01621  *     all,f1,f2,f3 = *(/(.)(.)(\d+)(\d)/.match("THX1138."))
01622  *     all   #=> "HX1138"
01623  *     f1    #=> "H"
01624  *     f2    #=> "X"
01625  *     f3    #=> "113"
01626  */
01627 
01628 static VALUE
01629 match_to_a(VALUE match)
01630 {
01631     return match_array(match, 0);
01632 }
01633 
01634 
01635 /*
01636  *  call-seq:
01637  *     mtch.captures   -> array
01638  *
01639  *  Returns the array of captures; equivalent to <code>mtch.to_a[1..-1]</code>.
01640  *
01641  *     f1,f2,f3,f4 = /(.)(.)(\d+)(\d)/.match("THX1138.").captures
01642  *     f1    #=> "H"
01643  *     f2    #=> "X"
01644  *     f3    #=> "113"
01645  *     f4    #=> "8"
01646  */
01647 static VALUE
01648 match_captures(VALUE match)
01649 {
01650     return match_array(match, 1);
01651 }
01652 
01653 static int
01654 name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end)
01655 {
01656     int num;
01657 
01658     num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01659         (const unsigned char* )name, (const unsigned char* )name_end, regs);
01660     if (num >= 1) {
01661         return num;
01662     }
01663     else {
01664         VALUE s = rb_str_new(name, (long )(name_end - name));
01665         rb_raise(rb_eIndexError, "undefined group name reference: %s",
01666                                  StringValuePtr(s));
01667     }
01668 }
01669 
01670 /*
01671  *  call-seq:
01672  *     mtch[i]               -> str or nil
01673  *     mtch[start, length]   -> array
01674  *     mtch[range]           -> array
01675  *     mtch[name]            -> str or nil
01676  *
01677  *  Match Reference---<code>MatchData</code> acts as an array, and may be
01678  *  accessed using the normal array indexing techniques.  <i>mtch</i>[0] is
01679  *  equivalent to the special variable <code>$&</code>, and returns the entire
01680  *  matched string.  <i>mtch</i>[1], <i>mtch</i>[2], and so on return the values
01681  *  of the matched backreferences (portions of the pattern between parentheses).
01682  *
01683  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01684  *     m          #=> #<MatchData "HX1138" 1:"H" 2:"X" 3:"113" 4:"8">
01685  *     m[0]       #=> "HX1138"
01686  *     m[1, 2]    #=> ["H", "X"]
01687  *     m[1..3]    #=> ["H", "X", "113"]
01688  *     m[-3, 2]   #=> ["X", "113"]
01689  *
01690  *     m = /(?<foo>a+)b/.match("ccaaab")
01691  *     m          #=> #<MatchData "aaab" foo:"aaa">
01692  *     m["foo"]   #=> "aaa"
01693  *     m[:foo]    #=> "aaa"
01694  */
01695 
01696 static VALUE
01697 match_aref(int argc, VALUE *argv, VALUE match)
01698 {
01699     VALUE idx, rest;
01700 
01701     match_check(match);
01702     rb_scan_args(argc, argv, "11", &idx, &rest);
01703 
01704     if (NIL_P(rest)) {
01705         if (FIXNUM_P(idx)) {
01706             if (FIX2INT(idx) >= 0) {
01707                 return rb_reg_nth_match(FIX2INT(idx), match);
01708             }
01709         }
01710         else {
01711             const char *p;
01712             int num;
01713 
01714             switch (TYPE(idx)) {
01715               case T_SYMBOL:
01716                 p = rb_id2name(SYM2ID(idx));
01717                 goto name_to_backref;
01718                 break;
01719               case T_STRING:
01720                 p = StringValuePtr(idx);
01721 
01722               name_to_backref:
01723                 num = name_to_backref_number(RMATCH_REGS(match),
01724                                              RMATCH(match)->regexp, p, p + strlen(p));
01725                 return rb_reg_nth_match(num, match);
01726                 break;
01727 
01728               default:
01729                 break;
01730             }
01731         }
01732     }
01733 
01734     return rb_ary_aref(argc, argv, match_to_a(match));
01735 }
01736 
01737 static VALUE
01738 match_entry(VALUE match, long n)
01739 {
01740     /* n should not exceed num_regs */
01741     return rb_reg_nth_match((int)n, match);
01742 }
01743 
01744 
01745 /*
01746  *  call-seq:
01747  *
01748  *     mtch.values_at([index]*)   -> array
01749  *
01750  *  Uses each <i>index</i> to access the matching values, returning an array of
01751  *  the corresponding matches.
01752  *
01753  *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
01754  *     m.to_a               #=> ["HX1138", "H", "X", "113", "8"]
01755  *     m.values_at(0, 2, -2)   #=> ["HX1138", "X", "113"]
01756  */
01757 
01758 static VALUE
01759 match_values_at(int argc, VALUE *argv, VALUE match)
01760 {
01761     struct re_registers *regs;
01762 
01763     match_check(match);
01764     regs = RMATCH_REGS(match);
01765     return rb_get_values_at(match, regs->num_regs, argc, argv, match_entry);
01766 }
01767 
01768 
01769 /*
01770  *  call-seq:
01771  *     mtch.to_s   -> str
01772  *
01773  *  Returns the entire matched string.
01774  *
01775  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01776  *     m.to_s   #=> "HX1138"
01777  */
01778 
01779 static VALUE
01780 match_to_s(VALUE match)
01781 {
01782     VALUE str = rb_reg_last_match(match);
01783 
01784     match_check(match);
01785     if (NIL_P(str)) str = rb_str_new(0,0);
01786     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01787     if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str);
01788     return str;
01789 }
01790 
01791 
01792 /*
01793  *  call-seq:
01794  *     mtch.string   -> str
01795  *
01796  *  Returns a frozen copy of the string passed in to <code>match</code>.
01797  *
01798  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01799  *     m.string   #=> "THX1138."
01800  */
01801 
01802 static VALUE
01803 match_string(VALUE match)
01804 {
01805     match_check(match);
01806     return RMATCH(match)->str;  /* str is frozen */
01807 }
01808 
01809 struct backref_name_tag {
01810     const UChar *name;
01811     long len;
01812 };
01813 
01814 static int
01815 match_inspect_name_iter(const OnigUChar *name, const OnigUChar *name_end,
01816           int back_num, int *back_refs, OnigRegex regex, void *arg0)
01817 {
01818     struct backref_name_tag *arg = (struct backref_name_tag *)arg0;
01819     int i;
01820 
01821     for (i = 0; i < back_num; i++) {
01822         arg[back_refs[i]].name = name;
01823         arg[back_refs[i]].len = name_end - name;
01824     }
01825     return 0;
01826 }
01827 
01828 /*
01829  * call-seq:
01830  *    mtch.inspect   -> str
01831  *
01832  * Returns a printable version of <i>mtch</i>.
01833  *
01834  *     puts /.$/.match("foo").inspect
01835  *     #=> #<MatchData "o">
01836  *
01837  *     puts /(.)(.)(.)/.match("foo").inspect
01838  *     #=> #<MatchData "foo" 1:"f" 2:"o" 3:"o">
01839  *
01840  *     puts /(.)(.)?(.)/.match("fo").inspect
01841  *     #=> #<MatchData "fo" 1:"f" 2:nil 3:"o">
01842  *
01843  *     puts /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").inspect
01844  *     #=> #<MatchData "hog" foo:"h" bar:"o" baz:"g">
01845  *
01846  */
01847 
01848 static VALUE
01849 match_inspect(VALUE match)
01850 {
01851     const char *cname = rb_obj_classname(match);
01852     VALUE str;
01853     int i;
01854     struct re_registers *regs = RMATCH_REGS(match);
01855     int num_regs = regs->num_regs;
01856     struct backref_name_tag *names;
01857     VALUE regexp = RMATCH(match)->regexp;
01858 
01859     if (regexp == 0) {
01860         return rb_sprintf("#<%s:%p>", cname, (void*)match);
01861     }
01862 
01863     names = ALLOCA_N(struct backref_name_tag, num_regs);
01864     MEMZERO(names, struct backref_name_tag, num_regs);
01865 
01866     onig_foreach_name(RREGEXP(regexp)->ptr,
01867             match_inspect_name_iter, names);
01868 
01869     str = rb_str_buf_new2("#<");
01870     rb_str_buf_cat2(str, cname);
01871 
01872     for (i = 0; i < num_regs; i++) {
01873         VALUE v;
01874         rb_str_buf_cat2(str, " ");
01875         if (0 < i) {
01876             if (names[i].name)
01877                 rb_str_buf_cat(str, (const char *)names[i].name, names[i].len);
01878             else {
01879                 rb_str_catf(str, "%d", i);
01880             }
01881             rb_str_buf_cat2(str, ":");
01882         }
01883         v = rb_reg_nth_match(i, match);
01884         if (v == Qnil)
01885             rb_str_buf_cat2(str, "nil");
01886         else
01887             rb_str_buf_append(str, rb_str_inspect(v));
01888     }
01889     rb_str_buf_cat2(str, ">");
01890 
01891     return str;
01892 }
01893 
01894 VALUE rb_cRegexp;
01895 
01896 static int
01897 read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
01898 {
01899     const char *p = *pp;
01900     int code;
01901     int meta_prefix = 0, ctrl_prefix = 0;
01902     size_t len;
01903 
01904     if (p == end || *p++ != '\\') {
01905         errcpy(err, "too short escaped multibyte character");
01906         return -1;
01907     }
01908 
01909 again:
01910     if (p == end) {
01911         errcpy(err, "too short escape sequence");
01912         return -1;
01913     }
01914     switch (*p++) {
01915       case '\\': code = '\\'; break;
01916       case 'n': code = '\n'; break;
01917       case 't': code = '\t'; break;
01918       case 'r': code = '\r'; break;
01919       case 'f': code = '\f'; break;
01920       case 'v': code = '\013'; break;
01921       case 'a': code = '\007'; break;
01922       case 'e': code = '\033'; break;
01923 
01924       /* \OOO */
01925       case '0': case '1': case '2': case '3':
01926       case '4': case '5': case '6': case '7':
01927         p--;
01928         code = scan_oct(p, end < p+3 ? end-p : 3, &len);
01929         p += len;
01930         break;
01931 
01932       case 'x': /* \xHH */
01933         code = scan_hex(p, end < p+2 ? end-p : 2, &len);
01934         if (len < 1) {
01935             errcpy(err, "invalid hex escape");
01936             return -1;
01937         }
01938         p += len;
01939         break;
01940 
01941       case 'M': /* \M-X, \M-\C-X, \M-\cX */
01942         if (meta_prefix) {
01943             errcpy(err, "duplicate meta escape");
01944             return -1;
01945         }
01946         meta_prefix = 1;
01947         if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
01948             if (*p == '\\') {
01949                 p++;
01950                 goto again;
01951             }
01952             else {
01953                 code = *p++;
01954                 break;
01955             }
01956         }
01957         errcpy(err, "too short meta escape");
01958         return -1;
01959 
01960       case 'C': /* \C-X, \C-\M-X */
01961         if (p == end || *p++ != '-') {
01962             errcpy(err, "too short control escape");
01963             return -1;
01964         }
01965       case 'c': /* \cX, \c\M-X */
01966         if (ctrl_prefix) {
01967             errcpy(err, "duplicate control escape");
01968             return -1;
01969         }
01970         ctrl_prefix = 1;
01971         if (p < end && (*p & 0x80) == 0) {
01972             if (*p == '\\') {
01973                 p++;
01974                 goto again;
01975             }
01976             else {
01977                 code = *p++;
01978                 break;
01979             }
01980         }
01981         errcpy(err, "too short control escape");
01982         return -1;
01983 
01984       default:
01985         errcpy(err, "unexpected escape sequence");
01986         return -1;
01987     }
01988     if (code < 0 || 0xff < code) {
01989         errcpy(err, "invalid escape code");
01990         return -1;
01991     }
01992 
01993     if (ctrl_prefix)
01994         code &= 0x1f;
01995     if (meta_prefix)
01996         code |= 0x80;
01997 
01998     *pp = p;
01999     return code;
02000 }
02001 
02002 static int
02003 unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
02004         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02005 {
02006     const char *p = *pp;
02007     int chmaxlen = rb_enc_mbmaxlen(enc);
02008     char *chbuf = ALLOCA_N(char, chmaxlen);
02009     int chlen = 0;
02010     int byte;
02011     int l;
02012 
02013     memset(chbuf, 0, chmaxlen);
02014 
02015     byte = read_escaped_byte(&p, end, err);
02016     if (byte == -1) {
02017         return -1;
02018     }
02019 
02020     chbuf[chlen++] = byte;
02021     while (chlen < chmaxlen &&
02022            MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
02023         byte = read_escaped_byte(&p, end, err);
02024         if (byte == -1) {
02025             return -1;
02026         }
02027         chbuf[chlen++] = byte;
02028     }
02029 
02030     l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
02031     if (MBCLEN_INVALID_P(l)) {
02032         errcpy(err, "invalid multibyte escape");
02033         return -1;
02034     }
02035     if (1 < chlen || (chbuf[0] & 0x80)) {
02036         rb_str_buf_cat(buf, chbuf, chlen);
02037 
02038         if (*encp == 0)
02039             *encp = enc;
02040         else if (*encp != enc) {
02041             errcpy(err, "escaped non ASCII character in UTF-8 regexp");
02042             return -1;
02043         }
02044     }
02045     else {
02046         char escbuf[5];
02047         snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff);
02048         rb_str_buf_cat(buf, escbuf, 4);
02049     }
02050     *pp = p;
02051     return 0;
02052 }
02053 
02054 static int
02055 check_unicode_range(unsigned long code, onig_errmsg_buffer err)
02056 {
02057     if ((0xd800 <= code && code <= 0xdfff) || /* Surrogates */
02058         0x10ffff < code) {
02059         errcpy(err, "invalid Unicode range");
02060         return -1;
02061     }
02062     return 0;
02063 }
02064 
02065 static int
02066 append_utf8(unsigned long uv,
02067         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02068 {
02069     if (check_unicode_range(uv, err) != 0)
02070         return -1;
02071     if (uv < 0x80) {
02072         char escbuf[5];
02073         snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv);
02074         rb_str_buf_cat(buf, escbuf, 4);
02075     }
02076     else {
02077         int len;
02078         char utf8buf[6];
02079         len = rb_uv_to_utf8(utf8buf, uv);
02080         rb_str_buf_cat(buf, utf8buf, len);
02081 
02082         if (*encp == 0)
02083             *encp = rb_utf8_encoding();
02084         else if (*encp != rb_utf8_encoding()) {
02085             errcpy(err, "UTF-8 character in non UTF-8 regexp");
02086             return -1;
02087         }
02088     }
02089     return 0;
02090 }
02091 
02092 static int
02093 unescape_unicode_list(const char **pp, const char *end,
02094         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02095 {
02096     const char *p = *pp;
02097     int has_unicode = 0;
02098     unsigned long code;
02099     size_t len;
02100 
02101     while (p < end && ISSPACE(*p)) p++;
02102 
02103     while (1) {
02104         code = ruby_scan_hex(p, end-p, &len);
02105         if (len == 0)
02106             break;
02107         if (6 < len) { /* max 10FFFF */
02108             errcpy(err, "invalid Unicode range");
02109             return -1;
02110         }
02111         p += len;
02112         if (append_utf8(code, buf, encp, err) != 0)
02113             return -1;
02114         has_unicode = 1;
02115 
02116         while (p < end && ISSPACE(*p)) p++;
02117     }
02118 
02119     if (has_unicode == 0) {
02120         errcpy(err, "invalid Unicode list");
02121         return -1;
02122     }
02123 
02124     *pp = p;
02125 
02126     return 0;
02127 }
02128 
02129 static int
02130 unescape_unicode_bmp(const char **pp, const char *end,
02131         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02132 {
02133     const char *p = *pp;
02134     size_t len;
02135     unsigned long code;
02136 
02137     if (end < p+4) {
02138         errcpy(err, "invalid Unicode escape");
02139         return -1;
02140     }
02141     code = ruby_scan_hex(p, 4, &len);
02142     if (len != 4) {
02143         errcpy(err, "invalid Unicode escape");
02144         return -1;
02145     }
02146     if (append_utf8(code, buf, encp, err) != 0)
02147         return -1;
02148     *pp = p + 4;
02149     return 0;
02150 }
02151 
02152 static int
02153 unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
02154         VALUE buf, rb_encoding **encp, int *has_property,
02155         onig_errmsg_buffer err)
02156 {
02157     char c;
02158     char smallbuf[2];
02159 
02160     while (p < end) {
02161         int chlen = rb_enc_precise_mbclen(p, end, enc);
02162         if (!MBCLEN_CHARFOUND_P(chlen)) {
02163             errcpy(err, "invalid multibyte character");
02164             return -1;
02165         }
02166         chlen = MBCLEN_CHARFOUND_LEN(chlen);
02167         if (1 < chlen || (*p & 0x80)) {
02168             rb_str_buf_cat(buf, p, chlen);
02169             p += chlen;
02170             if (*encp == 0)
02171                 *encp = enc;
02172             else if (*encp != enc) {
02173                 errcpy(err, "non ASCII character in UTF-8 regexp");
02174                 return -1;
02175             }
02176             continue;
02177         }
02178 
02179         switch (c = *p++) {
02180           case '\\':
02181             if (p == end) {
02182                 errcpy(err, "too short escape sequence");
02183                 return -1;
02184             }
02185             switch (c = *p++) {
02186               case '1': case '2': case '3':
02187               case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
02188                 {
02189                     size_t octlen;
02190                     if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
02191                         /* backref or 7bit octal.
02192                            no need to unescape anyway.
02193                            re-escaping may break backref */
02194                         goto escape_asis;
02195                     }
02196                 }
02197                 /* xxx: How about more than 199 subexpressions? */
02198 
02199               case '0': /* \0, \0O, \0OO */
02200 
02201               case 'x': /* \xHH */
02202               case 'c': /* \cX, \c\M-X */
02203               case 'C': /* \C-X, \C-\M-X */
02204               case 'M': /* \M-X, \M-\C-X, \M-\cX */
02205                 p = p-2;
02206                 if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
02207                     return -1;
02208                 break;
02209 
02210               case 'u':
02211                 if (p == end) {
02212                     errcpy(err, "too short escape sequence");
02213                     return -1;
02214                 }
02215                 if (*p == '{') {
02216                     /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */
02217                     p++;
02218                     if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
02219                         return -1;
02220                     if (p == end || *p++ != '}') {
02221                         errcpy(err, "invalid Unicode list");
02222                         return -1;
02223                     }
02224                     break;
02225                 }
02226                 else {
02227                     /* \uHHHH */
02228                     if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
02229                         return -1;
02230                     break;
02231                 }
02232 
02233               case 'p': /* \p{Hiragana} */
02234               case 'P':
02235                 if (!*encp) {
02236                     *has_property = 1;
02237                 }
02238                 goto escape_asis;
02239 
02240               default: /* \n, \\, \d, \9, etc. */
02241 escape_asis:
02242                 smallbuf[0] = '\\';
02243                 smallbuf[1] = c;
02244                 rb_str_buf_cat(buf, smallbuf, 2);
02245                 break;
02246             }
02247             break;
02248 
02249           default:
02250             rb_str_buf_cat(buf, &c, 1);
02251             break;
02252         }
02253     }
02254 
02255     return 0;
02256 }
02257 
02258 static VALUE
02259 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
02260         rb_encoding **fixed_enc, onig_errmsg_buffer err)
02261 {
02262     VALUE buf;
02263     int has_property = 0;
02264 
02265     buf = rb_str_buf_new(0);
02266 
02267     if (rb_enc_asciicompat(enc))
02268         *fixed_enc = 0;
02269     else {
02270         *fixed_enc = enc;
02271         rb_enc_associate(buf, enc);
02272     }
02273 
02274     if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err) != 0)
02275         return Qnil;
02276 
02277     if (has_property && !*fixed_enc) {
02278         *fixed_enc = enc;
02279     }
02280 
02281     if (*fixed_enc) {
02282         rb_enc_associate(buf, *fixed_enc);
02283     }
02284 
02285     return buf;
02286 }
02287 
02288 VALUE
02289 rb_reg_check_preprocess(VALUE str)
02290 {
02291     rb_encoding *fixed_enc = 0;
02292     onig_errmsg_buffer err = "";
02293     VALUE buf;
02294     char *p, *end;
02295     rb_encoding *enc;
02296 
02297     StringValue(str);
02298     p = RSTRING_PTR(str);
02299     end = p + RSTRING_LEN(str);
02300     enc = rb_enc_get(str);
02301 
02302     buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err);
02303     RB_GC_GUARD(str);
02304 
02305     if (buf == Qnil) {
02306         return rb_reg_error_desc(str, 0, err);
02307     }
02308     return Qnil;
02309 }
02310 
02311 static VALUE
02312 rb_reg_preprocess_dregexp(VALUE ary, int options)
02313 {
02314     rb_encoding *fixed_enc = 0;
02315     rb_encoding *regexp_enc = 0;
02316     onig_errmsg_buffer err = "";
02317     int i;
02318     VALUE result = 0;
02319     rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02320 
02321     if (RARRAY_LEN(ary) == 0) {
02322         rb_raise(rb_eArgError, "no arguments given");
02323     }
02324 
02325     for (i = 0; i < RARRAY_LEN(ary); i++) {
02326         VALUE str = RARRAY_PTR(ary)[i];
02327         VALUE buf;
02328         char *p, *end;
02329         rb_encoding *src_enc;
02330 
02331         src_enc = rb_enc_get(str);
02332         if (options & ARG_ENCODING_NONE &&
02333                 src_enc != ascii8bit) {
02334             if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)
02335                 rb_raise(rb_eRegexpError, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02336             else
02337                 src_enc = ascii8bit;
02338         }
02339 
02340         StringValue(str);
02341         p = RSTRING_PTR(str);
02342         end = p + RSTRING_LEN(str);
02343 
02344         buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err);
02345 
02346         if (buf == Qnil)
02347             rb_raise(rb_eArgError, "%s", err);
02348 
02349         if (fixed_enc != 0) {
02350             if (regexp_enc != 0 && regexp_enc != fixed_enc) {
02351                 rb_raise(rb_eRegexpError, "encoding mismatch in dynamic regexp : %s and %s",
02352                          rb_enc_name(regexp_enc), rb_enc_name(fixed_enc));
02353             }
02354             regexp_enc = fixed_enc;
02355         }
02356 
02357         if (!result)
02358             result = rb_str_new3(str);
02359         else
02360             rb_str_buf_append(result, str);
02361     }
02362     if (regexp_enc) {
02363         rb_enc_associate(result, regexp_enc);
02364     }
02365 
02366     return result;
02367 }
02368 
02369 static int
02370 rb_reg_initialize(VALUE obj, const char *s, long len, rb_encoding *enc,
02371                   int options, onig_errmsg_buffer err,
02372                   const char *sourcefile, int sourceline)
02373 {
02374     struct RRegexp *re = RREGEXP(obj);
02375     VALUE unescaped;
02376     rb_encoding *fixed_enc = 0;
02377     rb_encoding *a_enc = rb_ascii8bit_encoding();
02378 
02379     if (!OBJ_UNTRUSTED(obj) && rb_safe_level() >= 4)
02380         rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
02381     rb_check_frozen(obj);
02382     if (FL_TEST(obj, REG_LITERAL))
02383         rb_raise(rb_eSecurityError, "can't modify literal regexp");
02384     if (re->ptr)
02385         rb_raise(rb_eTypeError, "already initialized regexp");
02386     re->ptr = 0;
02387 
02388     if (rb_enc_dummy_p(enc)) {
02389             errcpy(err, "can't make regexp with dummy encoding");
02390             return -1;
02391     }
02392 
02393     unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
02394     if (unescaped == Qnil)
02395         return -1;
02396 
02397     if (fixed_enc) {
02398         if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) ||
02399             (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) {
02400             errcpy(err, "incompatible character encoding");
02401             return -1;
02402         }
02403         if (fixed_enc != a_enc) {
02404             options |= ARG_ENCODING_FIXED;
02405             enc = fixed_enc;
02406         }
02407     }
02408     else if (!(options & ARG_ENCODING_FIXED)) {
02409        enc = rb_usascii_encoding();
02410     }
02411 
02412     rb_enc_associate((VALUE)re, enc);
02413     if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
02414         re->basic.flags |= KCODE_FIXED;
02415     }
02416     if (options & ARG_ENCODING_NONE) {
02417         re->basic.flags |= REG_ENCODING_NONE;
02418     }
02419 
02420     re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
02421                           options & ARG_REG_OPTION_MASK, err,
02422                           sourcefile, sourceline);
02423     if (!re->ptr) return -1;
02424     re->src = rb_enc_str_new(s, len, enc);
02425     OBJ_FREEZE(re->src);
02426     RB_GC_GUARD(unescaped);
02427     return 0;
02428 }
02429 
02430 static int
02431 rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err,
02432         const char *sourcefile, int sourceline)
02433 {
02434     int ret;
02435     rb_encoding *enc = rb_enc_get(str);
02436     if (options & ARG_ENCODING_NONE) {
02437         rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02438         if (enc != ascii8bit) {
02439             if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
02440                 errcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02441                 return -1;
02442             }
02443             enc = ascii8bit;
02444         }
02445     }
02446     ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc,
02447                             options, err, sourcefile, sourceline);
02448     OBJ_INFECT(obj, str);
02449     RB_GC_GUARD(str);
02450     return ret;
02451 }
02452 
02453 static VALUE
02454 rb_reg_s_alloc(VALUE klass)
02455 {
02456     NEWOBJ(re, struct RRegexp);
02457     OBJSETUP(re, klass, T_REGEXP);
02458 
02459     re->ptr = 0;
02460     re->src = 0;
02461     re->usecnt = 0;
02462 
02463     return (VALUE)re;
02464 }
02465 
02466 VALUE
02467 rb_reg_alloc(void)
02468 {
02469     return rb_reg_s_alloc(rb_cRegexp);
02470 }
02471 
02472 VALUE
02473 rb_reg_new_str(VALUE s, int options)
02474 {
02475     return rb_reg_init_str(rb_reg_alloc(), s, options);
02476 }
02477 
02478 VALUE
02479 rb_reg_init_str(VALUE re, VALUE s, int options)
02480 {
02481     onig_errmsg_buffer err = "";
02482 
02483     if (rb_reg_initialize_str(re, s, options, err, NULL, 0) != 0) {
02484         rb_reg_raise_str(s, options, err);
02485     }
02486 
02487     return re;
02488 }
02489 
02490 VALUE
02491 rb_reg_new_ary(VALUE ary, int opt)
02492 {
02493     return rb_reg_new_str(rb_reg_preprocess_dregexp(ary, opt), opt);
02494 }
02495 
02496 VALUE
02497 rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options)
02498 {
02499     VALUE re = rb_reg_alloc();
02500     onig_errmsg_buffer err = "";
02501 
02502     if (rb_reg_initialize(re, s, len, enc, options, err, NULL, 0) != 0) {
02503         rb_enc_reg_raise(s, len, enc, options, err);
02504     }
02505 
02506     return re;
02507 }
02508 
02509 VALUE
02510 rb_reg_new(const char *s, long len, int options)
02511 {
02512     return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options);
02513 }
02514 
02515 VALUE
02516 rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline)
02517 {
02518     VALUE re = rb_reg_alloc();
02519     onig_errmsg_buffer err = "";
02520 
02521     if (!str) str = rb_str_new(0,0);
02522     if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) {
02523         rb_set_errinfo(rb_reg_error_desc(str, options, err));
02524         return Qnil;
02525     }
02526     FL_SET(re, REG_LITERAL);
02527     return re;
02528 }
02529 
02530 static VALUE reg_cache;
02531 
02532 VALUE
02533 rb_reg_regcomp(VALUE str)
02534 {
02535     volatile VALUE save_str = str;
02536     if (reg_cache && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str)
02537         && ENCODING_GET(reg_cache) == ENCODING_GET(str)
02538         && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0)
02539         return reg_cache;
02540 
02541     return reg_cache = rb_reg_new_str(save_str, 0);
02542 }
02543 
02544 static st_index_t reg_hash(VALUE re);
02545 /*
02546  * call-seq:
02547  *   rxp.hash   -> fixnum
02548  *
02549  * Produce a hash based on the text and options of this regular expression.
02550  */
02551 
02552 static VALUE
02553 rb_reg_hash(VALUE re)
02554 {
02555     st_index_t hashval = reg_hash(re);
02556     return LONG2FIX(hashval);
02557 }
02558 
02559 static st_index_t
02560 reg_hash(VALUE re)
02561 {
02562     st_index_t hashval;
02563 
02564     rb_reg_check(re);
02565     hashval = RREGEXP(re)->ptr->options;
02566     hashval = rb_hash_uint(hashval, rb_memhash(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re)));
02567     return rb_hash_end(hashval);
02568 }
02569 
02570 
02571 /*
02572  *  call-seq:
02573  *     rxp == other_rxp      -> true or false
02574  *     rxp.eql?(other_rxp)   -> true or false
02575  *
02576  *  Equality---Two regexps are equal if their patterns are identical, they have
02577  *  the same character set code, and their <code>casefold?</code> values are the
02578  *  same.
02579  *
02580  *     /abc/  == /abc/x   #=> false
02581  *     /abc/  == /abc/i   #=> false
02582  *     /abc/  == /abc/n   #=> false
02583  *     /abc/u == /abc/n   #=> false
02584  */
02585 
02586 static VALUE
02587 rb_reg_equal(VALUE re1, VALUE re2)
02588 {
02589     if (re1 == re2) return Qtrue;
02590     if (TYPE(re2) != T_REGEXP) return Qfalse;
02591     rb_reg_check(re1); rb_reg_check(re2);
02592     if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse;
02593     if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) return Qfalse;
02594     if (RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2)) return Qfalse;
02595     if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse;
02596     if (memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0) {
02597         return Qtrue;
02598     }
02599     return Qfalse;
02600 }
02601 
02602 /*
02603  * call-seq:
02604  *    mtch.hash   -> integer
02605  *
02606  * Produce a hash based on the target string, regexp and matched
02607  * positions of this matchdata.
02608  */
02609 
02610 static VALUE
02611 match_hash(VALUE match)
02612 {
02613     const struct re_registers *regs;
02614     st_index_t hashval = rb_hash_start(rb_str_hash(RMATCH(match)->str));
02615 
02616     rb_hash_uint(hashval, reg_hash(RMATCH(match)->regexp));
02617     regs = RMATCH_REGS(match);
02618     hashval = rb_hash_uint(hashval, regs->num_regs);
02619     hashval = rb_hash_uint(hashval, rb_memhash(regs->beg, regs->num_regs * sizeof(*regs->beg)));
02620     hashval = rb_hash_uint(hashval, rb_memhash(regs->end, regs->num_regs * sizeof(*regs->end)));
02621     hashval = rb_hash_end(hashval);
02622     return LONG2FIX(hashval);
02623 }
02624 
02625 /*
02626  * call-seq:
02627  *    mtch == mtch2   -> true or false
02628  *
02629  *  Equality---Two matchdata are equal if their target strings,
02630  *  patterns, and matched positions are identical.
02631  */
02632 
02633 static VALUE
02634 match_equal(VALUE match1, VALUE match2)
02635 {
02636     const struct re_registers *regs1, *regs2;
02637     if (match1 == match2) return Qtrue;
02638     if (TYPE(match2) != T_MATCH) return Qfalse;
02639     if (!rb_str_equal(RMATCH(match1)->str, RMATCH(match2)->str)) return Qfalse;
02640     if (!rb_reg_equal(RMATCH(match1)->regexp, RMATCH(match2)->regexp)) return Qfalse;
02641     regs1 = RMATCH_REGS(match1);
02642     regs2 = RMATCH_REGS(match2);
02643     if (regs1->num_regs != regs2->num_regs) return Qfalse;
02644     if (memcmp(regs1->beg, regs2->beg, regs1->num_regs * sizeof(*regs1->beg))) return Qfalse;
02645     if (memcmp(regs1->end, regs2->end, regs1->num_regs * sizeof(*regs1->end))) return Qfalse;
02646     return Qtrue;
02647 }
02648 
02649 static VALUE
02650 reg_operand(VALUE s, int check)
02651 {
02652     if (SYMBOL_P(s)) {
02653         return rb_sym_to_s(s);
02654     }
02655     else {
02656         VALUE tmp = rb_check_string_type(s);
02657         if (check && NIL_P(tmp)) {
02658             rb_raise(rb_eTypeError, "can't convert %s to String",
02659                      rb_obj_classname(s));
02660         }
02661         return tmp;
02662     }
02663 }
02664 
02665 static long
02666 reg_match_pos(VALUE re, VALUE *strp, long pos)
02667 {
02668     VALUE str = *strp;
02669 
02670     if (NIL_P(str)) {
02671         rb_backref_set(Qnil);
02672         return -1;
02673     }
02674     *strp = str = reg_operand(str, TRUE);
02675     if (pos != 0) {
02676         if (pos < 0) {
02677             VALUE l = rb_str_length(str);
02678             pos += NUM2INT(l);
02679             if (pos < 0) {
02680                 return pos;
02681             }
02682         }
02683         pos = rb_str_offset(str, pos);
02684     }
02685     return rb_reg_search(re, str, pos, 0);
02686 }
02687 
02688 /*
02689  *  call-seq:
02690  *     rxp =~ str    -> integer or nil
02691  *
02692  *  Match---Matches <i>rxp</i> against <i>str</i>.
02693  *
02694  *     /at/ =~ "input data"   #=> 7
02695  *     /ax/ =~ "input data"   #=> nil
02696  *
02697  *  If <code>=~</code> is used with a regexp literal with named captures,
02698  *  captured strings (or nil) are assigned to local variables named by
02699  *  the capture names.
02700  *
02701  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "  x = y  "
02702  *     p lhs    #=> "x"
02703  *     p rhs    #=> "y"
02704  *
02705  *  If it is not matched, nil is assigned for the variables.
02706  *
02707  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "  x = "
02708  *     p lhs    #=> nil
02709  *     p rhs    #=> nil
02710  *
02711  *  This assignment is implemented in the Ruby parser.
02712  *  The parser detects 'regexp-literal =~ expression' for the assignment.
02713  *  The regexp must be a literal without interpolation and placed at left hand side.
02714  *
02715  *  The assignment does not occur if the regexp is not a literal.
02716  *
02717  *     re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
02718  *     re =~ "  x = y  "
02719  *     p lhs    # undefined local variable
02720  *     p rhs    # undefined local variable
02721  *
02722  *  A regexp interpolation, <code>#{}</code>, also disables
02723  *  the assignment.
02724  *
02725  *     rhs_pat = /(?<rhs>\w+)/
02726  *     /(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y"
02727  *     p lhs    # undefined local variable
02728  *
02729  *  The assignment does not occur if the regexp is placed at the right hand side.
02730  *
02731  *    "  x = y  " =~ /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
02732  *    p lhs, rhs # undefined local variable
02733  *
02734  */
02735 
02736 VALUE
02737 rb_reg_match(VALUE re, VALUE str)
02738 {
02739     long pos = reg_match_pos(re, &str, 0);
02740     if (pos < 0) return Qnil;
02741     pos = rb_str_sublen(str, pos);
02742     return LONG2FIX(pos);
02743 }
02744 
02745 /*
02746  *  call-seq:
02747  *     rxp === str   -> true or false
02748  *
02749  *  Case Equality---Synonym for <code>Regexp#=~</code> used in case statements.
02750  *
02751  *     a = "HELLO"
02752  *     case a
02753  *     when /^[a-z]*$/; print "Lower case\n"
02754  *     when /^[A-Z]*$/; print "Upper case\n"
02755  *     else;            print "Mixed case\n"
02756  *     end
02757  *
02758  *  <em>produces:</em>
02759  *
02760  *     Upper case
02761  */
02762 
02763 VALUE
02764 rb_reg_eqq(VALUE re, VALUE str)
02765 {
02766     long start;
02767 
02768     str = reg_operand(str, FALSE);
02769     if (NIL_P(str)) {
02770         rb_backref_set(Qnil);
02771         return Qfalse;
02772     }
02773     start = rb_reg_search(re, str, 0, 0);
02774     if (start < 0) {
02775         return Qfalse;
02776     }
02777     return Qtrue;
02778 }
02779 
02780 
02781 /*
02782  *  call-seq:
02783  *     ~ rxp   -> integer or nil
02784  *
02785  *  Match---Matches <i>rxp</i> against the contents of <code>$_</code>.
02786  *  Equivalent to <code><i>rxp</i> =~ $_</code>.
02787  *
02788  *     $_ = "input data"
02789  *     ~ /at/   #=> 7
02790  */
02791 
02792 VALUE
02793 rb_reg_match2(VALUE re)
02794 {
02795     long start;
02796     VALUE line = rb_lastline_get();
02797 
02798     if (TYPE(line) != T_STRING) {
02799         rb_backref_set(Qnil);
02800         return Qnil;
02801     }
02802 
02803     start = rb_reg_search(re, line, 0, 0);
02804     if (start < 0) {
02805         return Qnil;
02806     }
02807     start = rb_str_sublen(line, start);
02808     return LONG2FIX(start);
02809 }
02810 
02811 
02812 /*
02813  *  call-seq:
02814  *     rxp.match(str)       -> matchdata or nil
02815  *     rxp.match(str,pos)   -> matchdata or nil
02816  *
02817  *  Returns a <code>MatchData</code> object describing the match, or
02818  *  <code>nil</code> if there was no match. This is equivalent to retrieving the
02819  *  value of the special variable <code>$~</code> following a normal match.
02820  *  If the second parameter is present, it specifies the position in the string
02821  *  to begin the search.
02822  *
02823  *     /(.)(.)(.)/.match("abc")[2]   #=> "b"
02824  *     /(.)(.)/.match("abc", 1)[2]   #=> "c"
02825  *
02826  *  If a block is given, invokes the block with the MatchData if the match succeeds, so
02827  *  that you can write
02828  *
02829  *     pat.match(str) {|m| ...}
02830  *
02831  *  instead of
02832  *
02833  *     if m = pat.match(str)
02834  *       ...
02835  *     end
02836  *
02837  *  The return value is a value from block execution in this case.
02838  */
02839 
02840 static VALUE
02841 rb_reg_match_m(int argc, VALUE *argv, VALUE re)
02842 {
02843     VALUE result, str, initpos;
02844     long pos;
02845 
02846     if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) {
02847         pos = NUM2LONG(initpos);
02848     }
02849     else {
02850         pos = 0;
02851     }
02852 
02853     pos = reg_match_pos(re, &str, pos);
02854     if (pos < 0) {
02855         rb_backref_set(Qnil);
02856         return Qnil;
02857     }
02858     result = rb_backref_get();
02859     rb_match_busy(result);
02860     if (!NIL_P(result) && rb_block_given_p()) {
02861         return rb_yield(result);
02862     }
02863     return result;
02864 }
02865 
02866 /*
02867  * Document-method: compile
02868  *
02869  * Synonym for <code>Regexp.new</code>
02870  */
02871 
02872 
02873 /*
02874  *  call-seq:
02875  *     Regexp.new(string, [options [, lang]])        -> regexp
02876  *     Regexp.new(regexp)                            -> regexp
02877  *     Regexp.compile(string, [options [, lang]])    -> regexp
02878  *     Regexp.compile(regexp)                        -> regexp
02879  *
02880  *  Constructs a new regular expression from <i>pattern</i>, which can be either
02881  *  a <code>String</code> or a <code>Regexp</code> (in which case that regexp's
02882  *  options are propagated, and new options may not be specified (a change as of
02883  *  Ruby 1.8)). If <i>options</i> is a <code>Fixnum</code>, it should be one or
02884  *  more of the constants <code>Regexp::EXTENDED</code>,
02885  *  <code>Regexp::IGNORECASE</code>, and <code>Regexp::MULTILINE</code>,
02886  *  <em>or</em>-ed together. Otherwise, if <i>options</i> is not
02887  *  <code>nil</code>, the regexp will be case insensitive.
02888  *  When the <i>lang</i> parameter is `n' or `N', the regexp is set to no encoding.
02889  *
02890  *     r1 = Regexp.new('^a-z+:\\s+\w+')           #=> /^a-z+:\s+\w+/
02891  *     r2 = Regexp.new('cat', true)               #=> /cat/i
02892  *     r3 = Regexp.new('dog', Regexp::EXTENDED)   #=> /dog/x
02893  *     r4 = Regexp.new(r2)                        #=> /cat/i
02894  */
02895 
02896 static VALUE
02897 rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
02898 {
02899     onig_errmsg_buffer err = "";
02900     int flags = 0;
02901     VALUE str;
02902     rb_encoding *enc;
02903     const char *ptr;
02904     long len;
02905 
02906     if (argc == 0 || argc > 3) {
02907         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..3)", argc);
02908     }
02909     if (TYPE(argv[0]) == T_REGEXP) {
02910         VALUE re = argv[0];
02911 
02912         if (argc > 1) {
02913             rb_warn("flags ignored");
02914         }
02915         rb_reg_check(re);
02916         flags = rb_reg_options(re);
02917         ptr = RREGEXP_SRC_PTR(re);
02918         len = RREGEXP_SRC_LEN(re);
02919         enc = rb_enc_get(re);
02920         if (rb_reg_initialize(self, ptr, len, enc, flags, err, NULL, 0)) {
02921             str = rb_enc_str_new(ptr, len, enc);
02922             rb_reg_raise_str(str, flags, err);
02923         }
02924     }
02925     else {
02926         if (argc >= 2) {
02927             if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]);
02928             else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE;
02929         }
02930         enc = 0;
02931         if (argc == 3 && !NIL_P(argv[2])) {
02932             char *kcode = StringValuePtr(argv[2]);
02933             if (kcode[0] == 'n' || kcode[0] == 'N') {
02934                 enc = rb_ascii8bit_encoding();
02935                 flags |= ARG_ENCODING_NONE;
02936             }
02937             else {
02938                 rb_warn("encoding option is ignored - %s", kcode);
02939             }
02940         }
02941         str = argv[0];
02942         ptr = StringValuePtr(str);
02943         if (enc
02944             ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err, NULL, 0)
02945             : rb_reg_initialize_str(self, str, flags, err, NULL, 0)) {
02946             rb_reg_raise_str(str, flags, err);
02947         }
02948     }
02949     return self;
02950 }
02951 
02952 VALUE
02953 rb_reg_quote(VALUE str)
02954 {
02955     rb_encoding *enc = rb_enc_get(str);
02956     char *s, *send, *t;
02957     VALUE tmp;
02958     int c, clen;
02959     int ascii_only = rb_enc_str_asciionly_p(str);
02960 
02961     s = RSTRING_PTR(str);
02962     send = s + RSTRING_LEN(str);
02963     while (s < send) {
02964         c = rb_enc_ascget(s, send, &clen, enc);
02965         if (c == -1) {
02966             s += mbclen(s, send, enc);
02967             continue;
02968         }
02969         switch (c) {
02970           case '[': case ']': case '{': case '}':
02971           case '(': case ')': case '|': case '-':
02972           case '*': case '.': case '\\':
02973           case '?': case '+': case '^': case '$':
02974           case ' ': case '#':
02975           case '\t': case '\f': case '\v': case '\n': case '\r':
02976             goto meta_found;
02977         }
02978         s += clen;
02979     }
02980     tmp = rb_str_new3(str);
02981     if (ascii_only) {
02982         rb_enc_associate(tmp, rb_usascii_encoding());
02983     }
02984     return tmp;
02985 
02986   meta_found:
02987     tmp = rb_str_new(0, RSTRING_LEN(str)*2);
02988     if (ascii_only) {
02989         rb_enc_associate(tmp, rb_usascii_encoding());
02990     }
02991     else {
02992         rb_enc_copy(tmp, str);
02993     }
02994     t = RSTRING_PTR(tmp);
02995     /* copy upto metacharacter */
02996     memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str));
02997     t += s - RSTRING_PTR(str);
02998 
02999     while (s < send) {
03000         c = rb_enc_ascget(s, send, &clen, enc);
03001         if (c == -1) {
03002             int n = mbclen(s, send, enc);
03003 
03004             while (n--)
03005                 *t++ = *s++;
03006             continue;
03007         }
03008         s += clen;
03009         switch (c) {
03010           case '[': case ']': case '{': case '}':
03011           case '(': case ')': case '|': case '-':
03012           case '*': case '.': case '\\':
03013           case '?': case '+': case '^': case '$':
03014           case '#':
03015             t += rb_enc_mbcput('\\', t, enc);
03016             break;
03017           case ' ':
03018             t += rb_enc_mbcput('\\', t, enc);
03019             t += rb_enc_mbcput(' ', t, enc);
03020             continue;
03021           case '\t':
03022             t += rb_enc_mbcput('\\', t, enc);
03023             t += rb_enc_mbcput('t', t, enc);
03024             continue;
03025           case '\n':
03026             t += rb_enc_mbcput('\\', t, enc);
03027             t += rb_enc_mbcput('n', t, enc);
03028             continue;
03029           case '\r':
03030             t += rb_enc_mbcput('\\', t, enc);
03031             t += rb_enc_mbcput('r', t, enc);
03032             continue;
03033           case '\f':
03034             t += rb_enc_mbcput('\\', t, enc);
03035             t += rb_enc_mbcput('f', t, enc);
03036             continue;
03037           case '\v':
03038             t += rb_enc_mbcput('\\', t, enc);
03039             t += rb_enc_mbcput('v', t, enc);
03040             continue;
03041         }
03042         t += rb_enc_mbcput(c, t, enc);
03043     }
03044     rb_str_resize(tmp, t - RSTRING_PTR(tmp));
03045     OBJ_INFECT(tmp, str);
03046     return tmp;
03047 }
03048 
03049 
03050 /*
03051  *  call-seq:
03052  *     Regexp.escape(str)   -> string
03053  *     Regexp.quote(str)    -> string
03054  *
03055  *  Escapes any characters that would have special meaning in a regular
03056  *  expression. Returns a new escaped string, or self if no characters are
03057  *  escaped.  For any string,
03058  *  <code>Regexp.new(Regexp.escape(<i>str</i>))=~<i>str</i></code> will be true.
03059  *
03060  *     Regexp.escape('\*?{}.')   #=> \\\*\?\{\}\.
03061  *
03062  */
03063 
03064 static VALUE
03065 rb_reg_s_quote(VALUE c, VALUE str)
03066 {
03067     return rb_reg_quote(reg_operand(str, TRUE));
03068 }
03069 
03070 int
03071 rb_reg_options(VALUE re)
03072 {
03073     int options;
03074 
03075     rb_reg_check(re);
03076     options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK;
03077     if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
03078     if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE;
03079     return options;
03080 }
03081 
03082 VALUE
03083 rb_check_regexp_type(VALUE re)
03084 {
03085     return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp");
03086 }
03087 
03088 /*
03089  *  call-seq:
03090  *     Regexp.try_convert(obj) -> re or nil
03091  *
03092  *  Try to convert <i>obj</i> into a Regexp, using to_regexp method.
03093  *  Returns converted regexp or nil if <i>obj</i> cannot be converted
03094  *  for any reason.
03095  *
03096  *     Regexp.try_convert(/re/)         #=> /re/
03097  *     Regexp.try_convert("re")         #=> nil
03098  *
03099  *     o = Object.new
03100  *     Regexp.try_convert(o)            #=> nil
03101  *     def o.to_regexp() /foo/ end
03102  *     Regexp.try_convert(o)            #=> /foo/
03103  *
03104  */
03105 static VALUE
03106 rb_reg_s_try_convert(VALUE dummy, VALUE re)
03107 {
03108     return rb_check_regexp_type(re);
03109 }
03110 
03111 static VALUE
03112 rb_reg_s_union(VALUE self, VALUE args0)
03113 {
03114     long argc = RARRAY_LEN(args0);
03115 
03116     if (argc == 0) {
03117         VALUE args[1];
03118         args[0] = rb_str_new2("(?!)");
03119         return rb_class_new_instance(1, args, rb_cRegexp);
03120     }
03121     else if (argc == 1) {
03122         VALUE arg = rb_ary_entry(args0, 0);
03123         VALUE re = rb_check_regexp_type(arg);
03124         if (!NIL_P(re))
03125             return re;
03126         else {
03127             VALUE quoted;
03128             quoted = rb_reg_s_quote(Qnil, arg);
03129             return rb_reg_new_str(quoted, 0);
03130         }
03131     }
03132     else {
03133         int i;
03134         VALUE source = rb_str_buf_new(0);
03135         rb_encoding *result_enc;
03136 
03137         int has_asciionly = 0;
03138         rb_encoding *has_ascii_compat_fixed = 0;
03139         rb_encoding *has_ascii_incompat = 0;
03140 
03141         for (i = 0; i < argc; i++) {
03142             volatile VALUE v;
03143             VALUE e = rb_ary_entry(args0, i);
03144 
03145             if (0 < i)
03146                 rb_str_buf_cat_ascii(source, "|");
03147 
03148             v = rb_check_regexp_type(e);
03149             if (!NIL_P(v)) {
03150                 rb_encoding *enc = rb_enc_get(v);
03151                 if (!rb_enc_asciicompat(enc)) {
03152                     if (!has_ascii_incompat)
03153                         has_ascii_incompat = enc;
03154                     else if (has_ascii_incompat != enc)
03155                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03156                             rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03157                 }
03158                 else if (rb_reg_fixed_encoding_p(v)) {
03159                     if (!has_ascii_compat_fixed)
03160                         has_ascii_compat_fixed = enc;
03161                     else if (has_ascii_compat_fixed != enc)
03162                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03163                             rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03164                 }
03165                 else {
03166                     has_asciionly = 1;
03167                 }
03168                 v = rb_reg_to_s(v);
03169             }
03170             else {
03171                 rb_encoding *enc;
03172                 StringValue(e);
03173                 enc = rb_enc_get(e);
03174                 if (!rb_enc_str_asciicompat_p(e)) {
03175                     if (!has_ascii_incompat)
03176                         has_ascii_incompat = enc;
03177                     else if (has_ascii_incompat != enc)
03178                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03179                             rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03180                 }
03181                 else if (rb_enc_str_asciionly_p(e)) {
03182                     has_asciionly = 1;
03183                 }
03184                 else {
03185                     if (!has_ascii_compat_fixed)
03186                         has_ascii_compat_fixed = enc;
03187                     else if (has_ascii_compat_fixed != enc)
03188                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03189                             rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03190                 }
03191                 v = rb_reg_s_quote(Qnil, e);
03192             }
03193             if (has_ascii_incompat) {
03194                 if (has_asciionly) {
03195                     rb_raise(rb_eArgError, "ASCII incompatible encoding: %s",
03196                         rb_enc_name(has_ascii_incompat));
03197                 }
03198                 if (has_ascii_compat_fixed) {
03199                     rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03200                         rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed));
03201                 }
03202             }
03203 
03204             if (i == 0) {
03205                 rb_enc_copy(source, v);
03206             }
03207             rb_str_append(source, v);
03208         }
03209 
03210         if (has_ascii_incompat) {
03211             result_enc = has_ascii_incompat;
03212         }
03213         else if (has_ascii_compat_fixed) {
03214             result_enc = has_ascii_compat_fixed;
03215         }
03216         else {
03217             result_enc = rb_ascii8bit_encoding();
03218         }
03219 
03220         rb_enc_associate(source, result_enc);
03221         return rb_class_new_instance(1, &source, rb_cRegexp);
03222     }
03223 }
03224 
03225 /*
03226  *  call-seq:
03227  *     Regexp.union(pat1, pat2, ...)            -> new_regexp
03228  *     Regexp.union(pats_ary)                   -> new_regexp
03229  *
03230  *  Return a <code>Regexp</code> object that is the union of the given
03231  *  <em>pattern</em>s, i.e., will match any of its parts. The <em>pattern</em>s
03232  *  can be Regexp objects, in which case their options will be preserved, or
03233  *  Strings. If no patterns are given, returns <code>/(?!)/</code>.
03234  *  The behavior is unspecified if any given <em>pattern</em> contains capture.
03235  *
03236  *     Regexp.union                         #=> /(?!)/
03237  *     Regexp.union("penzance")             #=> /penzance/
03238  *     Regexp.union("a+b*c")                #=> /a\+b\*c/
03239  *     Regexp.union("skiing", "sledding")   #=> /skiing|sledding/
03240  *     Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/
03241  *     Regexp.union(/dogs/, /cats/i)        #=> /(?-mix:dogs)|(?i-mx:cats)/
03242  */
03243 static VALUE
03244 rb_reg_s_union_m(VALUE self, VALUE args)
03245 {
03246     VALUE v;
03247     if (RARRAY_LEN(args) == 1 &&
03248         !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) {
03249         return rb_reg_s_union(self, v);
03250     }
03251     return rb_reg_s_union(self, args);
03252 }
03253 
03254 /* :nodoc: */
03255 static VALUE
03256 rb_reg_init_copy(VALUE copy, VALUE re)
03257 {
03258     onig_errmsg_buffer err = "";
03259     const char *s;
03260     long len;
03261 
03262     if (copy == re) return copy;
03263     rb_check_frozen(copy);
03264     /* need better argument type check */
03265     if (!rb_obj_is_instance_of(re, rb_obj_class(copy))) {
03266         rb_raise(rb_eTypeError, "wrong argument type");
03267     }
03268     rb_reg_check(re);
03269     s = RREGEXP_SRC_PTR(re);
03270     len = RREGEXP_SRC_LEN(re);
03271     if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re),
03272                 err, NULL, 0) != 0) {
03273         rb_reg_raise(s, len, err, re);
03274     }
03275     return copy;
03276 }
03277 
03278 VALUE
03279 rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
03280 {
03281     VALUE val = 0;
03282     char *p, *s, *e;
03283     int no, clen;
03284     rb_encoding *str_enc = rb_enc_get(str);
03285     rb_encoding *src_enc = rb_enc_get(src);
03286     int acompat = rb_enc_asciicompat(str_enc);
03287 #define ASCGET(s,e,cl) (acompat ? (*(cl)=1,ISASCII((s)[0])?(s)[0]:-1) : rb_enc_ascget((s), (e), (cl), str_enc))
03288 
03289     p = s = RSTRING_PTR(str);
03290     e = s + RSTRING_LEN(str);
03291 
03292     while (s < e) {
03293         int c = ASCGET(s, e, &clen);
03294         char *ss;
03295 
03296         if (c == -1) {
03297             s += mbclen(s, e, str_enc);
03298             continue;
03299         }
03300         ss = s;
03301         s += clen;
03302 
03303         if (c != '\\' || s == e) continue;
03304 
03305         if (!val) {
03306             val = rb_str_buf_new(ss-p);
03307         }
03308         rb_enc_str_buf_cat(val, p, ss-p, str_enc);
03309 
03310         c = ASCGET(s, e, &clen);
03311         if (c == -1) {
03312             s += mbclen(s, e, str_enc);
03313             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03314             p = s;
03315             continue;
03316         }
03317         s += clen;
03318 
03319         p = s;
03320         switch (c) {
03321           case '1': case '2': case '3': case '4':
03322           case '5': case '6': case '7': case '8': case '9':
03323             if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) {
03324                 no = c - '0';
03325             }
03326             else {
03327                 continue;
03328             }
03329             break;
03330 
03331           case 'k':
03332             if (s < e && ASCGET(s, e, &clen) == '<') {
03333                 char *name, *name_end;
03334 
03335                 name_end = name = s + clen;
03336                 while (name_end < e) {
03337                     c = ASCGET(name_end, e, &clen);
03338                     if (c == '>') break;
03339                     name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
03340                 }
03341                 if (name_end < e) {
03342                     no = name_to_backref_number(regs, regexp, name, name_end);
03343                     p = s = name_end + clen;
03344                     break;
03345                 }
03346                 else {
03347                     rb_raise(rb_eRuntimeError, "invalid group name reference format");
03348                 }
03349             }
03350 
03351             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03352             continue;
03353 
03354           case '0':
03355           case '&':
03356             no = 0;
03357             break;
03358 
03359           case '`':
03360             rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc);
03361             continue;
03362 
03363           case '\'':
03364             rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc);
03365             continue;
03366 
03367           case '+':
03368             no = regs->num_regs-1;
03369             while (BEG(no) == -1 && no > 0) no--;
03370             if (no == 0) continue;
03371             break;
03372 
03373           case '\\':
03374             rb_enc_str_buf_cat(val, s-clen, clen, str_enc);
03375             continue;
03376 
03377           default:
03378             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03379             continue;
03380         }
03381 
03382         if (no >= 0) {
03383             if (no >= regs->num_regs) continue;
03384             if (BEG(no) == -1) continue;
03385             rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc);
03386         }
03387     }
03388 
03389     if (!val) return str;
03390     if (p < e) {
03391         rb_enc_str_buf_cat(val, p, e-p, str_enc);
03392     }
03393 
03394     return val;
03395 }
03396 
03397 static VALUE
03398 kcode_getter(void)
03399 {
03400     rb_warn("variable $KCODE is no longer effective");
03401     return Qnil;
03402 }
03403 
03404 static void
03405 kcode_setter(VALUE val, ID id)
03406 {
03407     rb_warn("variable $KCODE is no longer effective; ignored");
03408 }
03409 
03410 static VALUE
03411 ignorecase_getter(void)
03412 {
03413     rb_warn("variable $= is no longer effective");
03414     return Qfalse;
03415 }
03416 
03417 static void
03418 ignorecase_setter(VALUE val, ID id)
03419 {
03420     rb_warn("variable $= is no longer effective; ignored");
03421 }
03422 
03423 static VALUE
03424 match_getter(void)
03425 {
03426     VALUE match = rb_backref_get();
03427 
03428     if (NIL_P(match)) return Qnil;
03429     rb_match_busy(match);
03430     return match;
03431 }
03432 
03433 static void
03434 match_setter(VALUE val)
03435 {
03436     if (!NIL_P(val)) {
03437         Check_Type(val, T_MATCH);
03438     }
03439     rb_backref_set(val);
03440 }
03441 
03442 /*
03443  *  call-seq:
03444  *     Regexp.last_match           -> matchdata
03445  *     Regexp.last_match(n)        -> str
03446  *
03447  *  The first form returns the <code>MatchData</code> object generated by the
03448  *  last successful pattern match. Equivalent to reading the global variable
03449  *  <code>$~</code>. The second form returns the <i>n</i>th field in this
03450  *  <code>MatchData</code> object.
03451  *  <em>n</em> can be a string or symbol to reference a named capture.
03452  *
03453  *  Note that the <code>last_match</code> is local to the thread and method scope
03454  *  of the method that did the pattern match.
03455  *
03456  *     /c(.)t/ =~ 'cat'        #=> 0
03457  *     Regexp.last_match       #=> #<MatchData "cat" 1:"a">
03458  *     Regexp.last_match(0)    #=> "cat"
03459  *     Regexp.last_match(1)    #=> "a"
03460  *     Regexp.last_match(2)    #=> nil
03461  *
03462  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val"
03463  *     Regexp.last_match       #=> #<MatchData "var = val" lhs:"var" rhs:"val">
03464  *     Regexp.last_match(:lhs) #=> "var"
03465  *     Regexp.last_match(:rhs) #=> "val"
03466  */
03467 
03468 static VALUE
03469 rb_reg_s_last_match(int argc, VALUE *argv)
03470 {
03471     VALUE nth;
03472 
03473     if (argc > 0 && rb_scan_args(argc, argv, "01", &nth) == 1) {
03474         VALUE match = rb_backref_get();
03475         int n;
03476         if (NIL_P(match)) return Qnil;
03477         n = match_backref_number(match, nth);
03478         return rb_reg_nth_match(n, match);
03479     }
03480     return match_getter();
03481 }
03482 
/*
 * Warning callback: forwards the message +s+ to rb_warn().  The message
 * is passed as a "%s" argument so that it is never used as a format string.
 */
static void
re_warn(const char *s)
{
    rb_warn("%s", s);
}
03488 
03489 /*
03490  *  Document-class: RegexpError
03491  *
03492  *  Raised when given an invalid regular expression.
03493  *
03494  *     Regexp.new("?")
03495  *
03496  *  <em>raises the exception:</em>
03497  *
03498  *     RegexpError: target of repeat operator is not specified: /?/
03499  */
03500 
03501 /*
03502  *  Document-class: Regexp
03503  *
03504  *  A <code>Regexp</code> holds a regular expression, used to match a pattern
03505  *  against strings. Regexps are created using the <code>/.../</code> and
03506  *  <code>%r{...}</code> literals, and by the <code>Regexp::new</code>
03507  *  constructor.
03508  *
03509  *  :include: doc/re.rdoc
03510  */
03511 
/*
 * Sets up everything this file exports to Ruby: the RegexpError class,
 * Oniguruma defaults and warning hooks, the match-related virtual global
 * variables, and the Regexp and MatchData classes with their methods and
 * constants.
 */
03512 void
03513 Init_Regexp(void)
03514 {
03515     rb_eRegexpError = rb_define_class("RegexpError", rb_eStandardError);
03516 
          /* Oniguruma defaults; its warnings are routed through re_warn */
03517     onigenc_set_default_caseconv_table((UChar*)casetable);
03518     onigenc_set_default_encoding(ONIG_ENCODING_ASCII);
03519     onig_set_warn_func(re_warn);
03520     onig_set_verb_warn_func(re_warn);
03521 
          /* match-related variables; a 0 setter means no setter is supplied */
03522     rb_define_virtual_variable("$~", match_getter, match_setter);
03523     rb_define_virtual_variable("$&", last_match_getter, 0);
03524     rb_define_virtual_variable("$`", prematch_getter, 0);
03525     rb_define_virtual_variable("$'", postmatch_getter, 0);
03526     rb_define_virtual_variable("$+", last_paren_match_getter, 0);
03527 
          /* obsolete variables: their accessors only emit a warning */
03528     rb_define_virtual_variable("$=", ignorecase_getter, ignorecase_setter);
03529     rb_define_virtual_variable("$KCODE", kcode_getter, kcode_setter);
03530     rb_define_virtual_variable("$-K", kcode_getter, kcode_setter);
03531 
          /* class Regexp: singleton methods first, then instance methods */
03532     rb_cRegexp = rb_define_class("Regexp", rb_cObject);
03533     rb_define_alloc_func(rb_cRegexp, rb_reg_s_alloc);
03534     rb_define_singleton_method(rb_cRegexp, "compile", rb_class_new_instance, -1);
03535     rb_define_singleton_method(rb_cRegexp, "quote", rb_reg_s_quote, 1);
03536     rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, 1);
03537     rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union_m, -2);
03538     rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
03539     rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
03540 
03541     rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
03542     rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
03543     rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);
03544     rb_define_method(rb_cRegexp, "eql?", rb_reg_equal, 1);
03545     rb_define_method(rb_cRegexp, "==", rb_reg_equal, 1);
03546     rb_define_method(rb_cRegexp, "=~", rb_reg_match, 1);
03547     rb_define_method(rb_cRegexp, "===", rb_reg_eqq, 1);
03548     rb_define_method(rb_cRegexp, "~", rb_reg_match2, 0);
03549     rb_define_method(rb_cRegexp, "match", rb_reg_match_m, -1);
03550     rb_define_method(rb_cRegexp, "to_s", rb_reg_to_s, 0);
03551     rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0);
03552     rb_define_method(rb_cRegexp, "source", rb_reg_source, 0);
03553     rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0);
03554     rb_define_method(rb_cRegexp, "options", rb_reg_options_m, 0);
03555     rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0); /* in encoding.c */
03556     rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0);
03557     rb_define_method(rb_cRegexp, "names", rb_reg_names, 0);
03558     rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0);
03559 
03560     /* see Regexp.options and Regexp.new */
03561     rb_define_const(rb_cRegexp, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE));
03562     /* see Regexp.options and Regexp.new */
03563     rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND));
03564     /* see Regexp.options and Regexp.new */
03565     rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE));
03566     /* see Regexp.options and Regexp.new */
03567     rb_define_const(rb_cRegexp, "FIXEDENCODING", INT2FIX(ARG_ENCODING_FIXED));
03568     /* see Regexp.options and Regexp.new */
03569     rb_define_const(rb_cRegexp, "NOENCODING", INT2FIX(ARG_ENCODING_NONE));
03570 
          /* NOTE(review): presumably registers reg_cache with the GC so the
           * cached object stays alive -- confirm against its definition */
03571     rb_global_variable(&reg_cache);
03572 
          /* class MatchData: "new" is undefined on its singleton class */
03573     rb_cMatch  = rb_define_class("MatchData", rb_cObject);
03574     rb_define_alloc_func(rb_cMatch, match_alloc);
03575     rb_undef_method(CLASS_OF(rb_cMatch), "new");
03576 
03577     rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1);
03578     rb_define_method(rb_cMatch, "regexp", match_regexp, 0);
03579     rb_define_method(rb_cMatch, "names", match_names, 0);
03580     rb_define_method(rb_cMatch, "size", match_size, 0);
03581     rb_define_method(rb_cMatch, "length", match_size, 0);
03582     rb_define_method(rb_cMatch, "offset", match_offset, 1);
03583     rb_define_method(rb_cMatch, "begin", match_begin, 1);
03584     rb_define_method(rb_cMatch, "end", match_end, 1);
03585     rb_define_method(rb_cMatch, "to_a", match_to_a, 0);
03586     rb_define_method(rb_cMatch, "[]", match_aref, -1);
03587     rb_define_method(rb_cMatch, "captures", match_captures, 0);
03588     rb_define_method(rb_cMatch, "values_at", match_values_at, -1);
03589     rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0);
03590     rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0);
03591     rb_define_method(rb_cMatch, "to_s", match_to_s, 0);
03592     rb_define_method(rb_cMatch, "inspect", match_inspect, 0);
03593     rb_define_method(rb_cMatch, "string", match_string, 0);
03594     rb_define_method(rb_cMatch, "hash", match_hash, 0);
03595     rb_define_method(rb_cMatch, "eql?", match_equal, 1);
03596     rb_define_method(rb_cMatch, "==", match_equal, 1);
03597 }
03598