Ruby 1.9.3p327 (2012-11-10 revision 37606)
string.c
Go to the documentation of this file.
00001 /**********************************************************************
00002 
00003   string.c -
00004 
00005   $Author: usa $
00006   created at: Mon Aug  9 17:12:58 JST 1993
00007 
00008   Copyright (C) 1993-2007 Yukihiro Matsumoto
00009   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
00010   Copyright (C) 2000  Information-technology Promotion Agency, Japan
00011 
00012 **********************************************************************/
00013 
00014 #include "ruby/ruby.h"
00015 #include "ruby/re.h"
00016 #include "ruby/encoding.h"
00017 #include "internal.h"
00018 #include <assert.h>
00019 
00020 #define BEG(no) (regs->beg[(no)])
00021 #define END(no) (regs->end[(no)])
00022 
00023 #include <math.h>
00024 #include <ctype.h>
00025 
00026 #ifdef HAVE_UNISTD_H
00027 #include <unistd.h>
00028 #endif
00029 
00030 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
00031 
00032 #undef rb_str_new_cstr
00033 #undef rb_tainted_str_new_cstr
00034 #undef rb_usascii_str_new_cstr
00035 #undef rb_external_str_new_cstr
00036 #undef rb_locale_str_new_cstr
00037 #undef rb_str_new2
00038 #undef rb_str_new3
00039 #undef rb_str_new4
00040 #undef rb_str_new5
00041 #undef rb_tainted_str_new2
00042 #undef rb_usascii_str_new2
00043 #undef rb_str_dup_frozen
00044 #undef rb_str_buf_new_cstr
00045 #undef rb_str_buf_new2
00046 #undef rb_str_buf_cat2
00047 #undef rb_str_cat2
00048 
00049 static VALUE rb_str_clear(VALUE str);
00050 
00051 VALUE rb_cString;
00052 VALUE rb_cSymbol;
00053 
/*
 * Flag bits and predicates for the string representation.
 * NOTE(review): the per-flag remarks below are read off the way the visible
 * code uses each bit; confirm against ruby.h before relying on them.
 */
00054 #define RUBY_MAX_CHAR_LEN 16
00055 #define STR_TMPLOCK FL_USER7
/* STR_NOEMBED: when clear the string is "embedded" (see STR_EMBED_P);
 * when set the as.heap fields are in use. */
00056 #define STR_NOEMBED FL_USER1
00057 #define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
00058 #define STR_ASSOC   FL_USER3
/* Both predicates require STR_NOEMBED in addition to their own bit. */
00059 #define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
00060 #define STR_ASSOC_P(s)  FL_ALL((s), STR_NOEMBED|STR_ASSOC)
/* "NOCAPA": a heap string that is shared or associated, i.e. one whose
 * as.heap.aux slot is used as aux.shared rather than aux.capa. */
00061 #define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
00062 #define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
/* Clears the shared/assoc bits, but only on a heap (non-embedded) string. */
00063 #define STR_UNSET_NOCAPA(s) do {\
00064     if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
00065 } while (0)
00066 
00067 
/* Switches str to the heap representation: sets STR_NOEMBED and zeroes the
 * embedded-length bits kept in the flags word. */
00068 #define STR_SET_NOEMBED(str) do {\
00069     FL_SET((str), STR_NOEMBED);\
00070     STR_SET_EMBED_LEN((str), 0);\
00071 } while (0)
/* Switches str to the embedded representation (flag only; no data is moved). */
00072 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
00073 #define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
/* Stores n in the flags word under RSTRING_EMBED_LEN_MASK; n is evaluated
 * exactly once (captured in tmp_n). */
00074 #define STR_SET_EMBED_LEN(str, n) do { \
00075     long tmp_n = (n);\
00076     RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
00077     RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
00078 } while (0)
00079 
/* Sets the length of str to n, writing either the embedded length bits or
 * as.heap.len depending on the current representation. */
00080 #define STR_SET_LEN(str, n) do { \
00081     if (STR_EMBED_P(str)) {\
00082         STR_SET_EMBED_LEN((str), (n));\
00083     }\
00084     else {\
00085         RSTRING(str)->as.heap.len = (n);\
00086     }\
00087 } while (0)
00088 
/* Decrements the length of str by one in whichever representation is active.
 * No check for a zero length is made here. */
00089 #define STR_DEC_LEN(str) do {\
00090     if (STR_EMBED_P(str)) {\
00091         long n = RSTRING_LEN(str);\
00092         n--;\
00093         STR_SET_EMBED_LEN((str), n);\
00094     }\
00095     else {\
00096         RSTRING(str)->as.heap.len--;\
00097     }\
00098 } while (0)
00099 
/*
 * Changes the buffer capacity of str to `capacity` bytes (plus one extra
 * byte that is always allocated).
 *
 * Embedded string: nothing happens unless capacity exceeds
 * RSTRING_EMBED_LEN_MAX; in that case a heap buffer is allocated, the
 * current bytes are copied into it and the string becomes non-embedded.
 * The length is stored into as.heap.len before STR_SET_NOEMBED because
 * RSTRING_LEN still reads the embedded length at that point.
 *
 * Heap string: the buffer is reallocated; aux.capa is only updated when the
 * string is not STR_NOCAPA_P (otherwise aux is not holding a capacity).
 *
 * Note that `str` and `capacity` are evaluated several times.
 */
00100 #define RESIZE_CAPA(str,capacity) do {\
00101     if (STR_EMBED_P(str)) {\
00102         if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
00103             char *tmp = ALLOC_N(char, (capacity)+1);\
00104             memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
00105             RSTRING(str)->as.heap.ptr = tmp;\
00106             RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
00107             STR_SET_NOEMBED(str);\
00108             RSTRING(str)->as.heap.aux.capa = (capacity);\
00109         }\
00110     }\
00111     else {\
00112         REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
00113         if (!STR_NOCAPA_P(str))\
00114             RSTRING(str)->as.heap.aux.capa = (capacity);\
00115     }\
00116 } while (0)
00117 
00118 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00119 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
00120 
00121 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
00122 
00123 static inline int
00124 single_byte_optimizable(VALUE str)
00125 {
00126     rb_encoding *enc;
00127 
00128     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
00129     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
00130         return 1;
00131 
00132     enc = STR_ENC_GET(str);
00133     if (rb_enc_mbmaxlen(enc) == 1)
00134         return 1;
00135 
00136     /* Conservative.  Possibly single byte.
00137      * "\xa1" in Shift_JIS for example. */
00138     return 0;
00139 }
00140 
00141 VALUE rb_fs;
00142 
/*
 * Returns a pointer to the first non-ASCII byte in [p, e), or NULL when
 * every byte is ASCII.
 *
 * When SIZEOF_VALUE is 4 or 8, NONASCII_MASK is defined (and stays defined
 * for the rest of the file) and long inputs are scanned one VALUE-sized
 * word at a time; the word scan only locates the word containing a hit, the
 * exact byte is then found by the trailing byte loop.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* first word boundary at or after p, last word boundary at or before e */
        const VALUE *word = (const VALUE *)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *word_end = (const VALUE *)(~align_mask & (VALUE)e);

        /* unaligned head, byte by byte */
        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        /* aligned middle: stop at the first word with any high bit set */
        for (; word < word_end; word++) {
            if (*word & NONASCII_MASK)
                break;
        }
        p = (const char *)word;
    }
#endif
    /* tail (or the word that tripped the mask), byte by byte */
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
00179 
00180 static int
00181 coderange_scan(const char *p, long len, rb_encoding *enc)
00182 {
00183     const char *e = p + len;
00184 
00185     if (rb_enc_to_index(enc) == 0) {
00186         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
00187         p = search_nonascii(p, e);
00188         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
00189     }
00190 
00191     if (rb_enc_asciicompat(enc)) {
00192         p = search_nonascii(p, e);
00193         if (!p) {
00194             return ENC_CODERANGE_7BIT;
00195         }
00196         while (p < e) {
00197             int ret = rb_enc_precise_mbclen(p, e, enc);
00198             if (!MBCLEN_CHARFOUND_P(ret)) {
00199                 return ENC_CODERANGE_BROKEN;
00200             }
00201             p += MBCLEN_CHARFOUND_LEN(ret);
00202             if (p < e) {
00203                 p = search_nonascii(p, e);
00204                 if (!p) {
00205                     return ENC_CODERANGE_VALID;
00206                 }
00207             }
00208         }
00209         if (e < p) {
00210             return ENC_CODERANGE_BROKEN;
00211         }
00212         return ENC_CODERANGE_VALID;
00213     }
00214 
00215     while (p < e) {
00216         int ret = rb_enc_precise_mbclen(p, e, enc);
00217 
00218         if (!MBCLEN_CHARFOUND_P(ret)) {
00219             return ENC_CODERANGE_BROKEN;
00220         }
00221         p += MBCLEN_CHARFOUND_LEN(ret);
00222     }
00223     if (e < p) {
00224         return ENC_CODERANGE_BROKEN;
00225     }
00226     return ENC_CODERANGE_VALID;
00227 }
00228 
00229 long
00230 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
00231 {
00232     const char *p = s;
00233 
00234     if (*cr == ENC_CODERANGE_BROKEN)
00235         return e - s;
00236 
00237     if (rb_enc_to_index(enc) == 0) {
00238         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
00239         p = search_nonascii(p, e);
00240         *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
00241         return e - s;
00242     }
00243     else if (rb_enc_asciicompat(enc)) {
00244         p = search_nonascii(p, e);
00245         if (!p) {
00246             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
00247             return e - s;
00248         }
00249         while (p < e) {
00250             int ret = rb_enc_precise_mbclen(p, e, enc);
00251             if (!MBCLEN_CHARFOUND_P(ret)) {
00252                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00253                 return p - s;
00254             }
00255             p += MBCLEN_CHARFOUND_LEN(ret);
00256             if (p < e) {
00257                 p = search_nonascii(p, e);
00258                 if (!p) {
00259                     *cr = ENC_CODERANGE_VALID;
00260                     return e - s;
00261                 }
00262             }
00263         }
00264         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00265         return p - s;
00266     }
00267     else {
00268         while (p < e) {
00269             int ret = rb_enc_precise_mbclen(p, e, enc);
00270             if (!MBCLEN_CHARFOUND_P(ret)) {
00271                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00272                 return p - s;
00273             }
00274             p += MBCLEN_CHARFOUND_LEN(ret);
00275         }
00276         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00277         return p - s;
00278     }
00279 }
00280 
00281 static inline void
00282 str_enc_copy(VALUE str1, VALUE str2)
00283 {
00284     rb_enc_set_index(str1, ENCODING_GET(str2));
00285 }
00286 
00287 static void
00288 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
00289 {
00290     /* this function is designed for copying encoding and coderange
00291      * from src to new string "dest" which is made from the part of src.
00292      */
00293     str_enc_copy(dest, src);
00294     switch (ENC_CODERANGE(src)) {
00295       case ENC_CODERANGE_7BIT:
00296         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00297         break;
00298       case ENC_CODERANGE_VALID:
00299         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
00300             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
00301             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00302         else
00303             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00304         break;
00305       default:
00306         if (RSTRING_LEN(dest) == 0) {
00307             if (!rb_enc_asciicompat(STR_ENC_GET(src)))
00308                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00309             else
00310                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00311         }
00312         break;
00313     }
00314 }
00315 
00316 static void
00317 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
00318 {
00319     str_enc_copy(dest, src);
00320     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
00321 }
00322 
00323 int
00324 rb_enc_str_coderange(VALUE str)
00325 {
00326     int cr = ENC_CODERANGE(str);
00327 
00328     if (cr == ENC_CODERANGE_UNKNOWN) {
00329         rb_encoding *enc = STR_ENC_GET(str);
00330         cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
00331         ENC_CODERANGE_SET(str, cr);
00332     }
00333     return cr;
00334 }
00335 
00336 int
00337 rb_enc_str_asciionly_p(VALUE str)
00338 {
00339     rb_encoding *enc = STR_ENC_GET(str);
00340 
00341     if (!rb_enc_asciicompat(enc))
00342         return FALSE;
00343     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00344         return TRUE;
00345     return FALSE;
00346 }
00347 
00348 static inline void
00349 str_mod_check(VALUE s, const char *p, long len)
00350 {
00351     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
00352         rb_raise(rb_eRuntimeError, "string modified");
00353     }
00354 }
00355 
00356 size_t
00357 rb_str_capacity(VALUE str)
00358 {
00359     if (STR_EMBED_P(str)) {
00360         return RSTRING_EMBED_LEN_MAX;
00361     }
00362     else if (STR_NOCAPA_P(str)) {
00363         return RSTRING(str)->as.heap.len;
00364     }
00365     else {
00366         return RSTRING(str)->as.heap.aux.capa;
00367     }
00368 }
00369 
00370 static inline VALUE
00371 str_alloc(VALUE klass)
00372 {
00373     NEWOBJ(str, struct RString);
00374     OBJSETUP(str, klass, T_STRING);
00375 
00376     str->as.heap.ptr = 0;
00377     str->as.heap.len = 0;
00378     str->as.heap.aux.capa = 0;
00379 
00380     return (VALUE)str;
00381 }
00382 
00383 static VALUE
00384 str_new(VALUE klass, const char *ptr, long len)
00385 {
00386     VALUE str;
00387 
00388     if (len < 0) {
00389         rb_raise(rb_eArgError, "negative string size (or size too big)");
00390     }
00391 
00392     str = str_alloc(klass);
00393     if (len > RSTRING_EMBED_LEN_MAX) {
00394         RSTRING(str)->as.heap.aux.capa = len;
00395         RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
00396         STR_SET_NOEMBED(str);
00397     }
00398     else if (len == 0) {
00399         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
00400     }
00401     if (ptr) {
00402         memcpy(RSTRING_PTR(str), ptr, len);
00403     }
00404     STR_SET_LEN(str, len);
00405     RSTRING_PTR(str)[len] = '\0';
00406     return str;
00407 }
00408 
00409 VALUE
00410 rb_str_new(const char *ptr, long len)
00411 {
00412     return str_new(rb_cString, ptr, len);
00413 }
00414 
00415 VALUE
00416 rb_usascii_str_new(const char *ptr, long len)
00417 {
00418     VALUE str = rb_str_new(ptr, len);
00419     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00420     return str;
00421 }
00422 
00423 VALUE
00424 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
00425 {
00426     VALUE str = rb_str_new(ptr, len);
00427     rb_enc_associate(str, enc);
00428     return str;
00429 }
00430 
00431 VALUE
00432 rb_str_new_cstr(const char *ptr)
00433 {
00434     if (!ptr) {
00435         rb_raise(rb_eArgError, "NULL pointer given");
00436     }
00437     return rb_str_new(ptr, strlen(ptr));
00438 }
00439 
00440 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
00441 #define rb_str_new2 rb_str_new_cstr
00442 
00443 VALUE
00444 rb_usascii_str_new_cstr(const char *ptr)
00445 {
00446     VALUE str = rb_str_new2(ptr);
00447     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00448     return str;
00449 }
00450 
00451 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
00452 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
00453 
00454 VALUE
00455 rb_tainted_str_new(const char *ptr, long len)
00456 {
00457     VALUE str = rb_str_new(ptr, len);
00458 
00459     OBJ_TAINT(str);
00460     return str;
00461 }
00462 
00463 VALUE
00464 rb_tainted_str_new_cstr(const char *ptr)
00465 {
00466     VALUE str = rb_str_new2(ptr);
00467 
00468     OBJ_TAINT(str);
00469     return str;
00470 }
00471 
00472 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
00473 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
00474 
00475 VALUE
00476 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
00477 {
00478     rb_econv_t *ec;
00479     rb_econv_result_t ret;
00480     long len;
00481     VALUE newstr;
00482     const unsigned char *sp;
00483     unsigned char *dp;
00484 
00485     if (!to) return str;
00486     if (from == to) return str;
00487     if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
00488         to == rb_ascii8bit_encoding()) {
00489         if (STR_ENC_GET(str) != to) {
00490             str = rb_str_dup(str);
00491             rb_enc_associate(str, to);
00492         }
00493         return str;
00494     }
00495 
00496     len = RSTRING_LEN(str);
00497     newstr = rb_str_new(0, len);
00498 
00499   retry:
00500     ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
00501     if (!ec) return str;
00502 
00503     sp = (unsigned char*)RSTRING_PTR(str);
00504     dp = (unsigned char*)RSTRING_PTR(newstr);
00505     ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
00506                            &dp, (unsigned char*)RSTRING_END(newstr), 0);
00507     rb_econv_close(ec);
00508     switch (ret) {
00509       case econv_destination_buffer_full:
00510         /* destination buffer short */
00511         len = len < 2 ? 2 : len * 2;
00512         rb_str_resize(newstr, len);
00513         goto retry;
00514 
00515       case econv_finished:
00516         len = dp - (unsigned char*)RSTRING_PTR(newstr);
00517         rb_str_set_len(newstr, len);
00518         rb_enc_associate(newstr, to);
00519         return newstr;
00520 
00521       default:
00522         /* some error, return original */
00523         return str;
00524     }
00525 }
00526 
00527 VALUE
00528 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
00529 {
00530     return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
00531 }
00532 
00533 VALUE
00534 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
00535 {
00536     VALUE str;
00537 
00538     str = rb_tainted_str_new(ptr, len);
00539     if (eenc == rb_usascii_encoding() &&
00540         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
00541         rb_enc_associate(str, rb_ascii8bit_encoding());
00542         return str;
00543     }
00544     rb_enc_associate(str, eenc);
00545     return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
00546 }
00547 
00548 VALUE
00549 rb_external_str_new(const char *ptr, long len)
00550 {
00551     return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
00552 }
00553 
00554 VALUE
00555 rb_external_str_new_cstr(const char *ptr)
00556 {
00557     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
00558 }
00559 
00560 VALUE
00561 rb_locale_str_new(const char *ptr, long len)
00562 {
00563     return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
00564 }
00565 
00566 VALUE
00567 rb_locale_str_new_cstr(const char *ptr)
00568 {
00569     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
00570 }
00571 
00572 VALUE
00573 rb_filesystem_str_new(const char *ptr, long len)
00574 {
00575     return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
00576 }
00577 
00578 VALUE
00579 rb_filesystem_str_new_cstr(const char *ptr)
00580 {
00581     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
00582 }
00583 
00584 VALUE
00585 rb_str_export(VALUE str)
00586 {
00587     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
00588 }
00589 
00590 VALUE
00591 rb_str_export_locale(VALUE str)
00592 {
00593     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
00594 }
00595 
00596 VALUE
00597 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
00598 {
00599     return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
00600 }
00601 
00602 static VALUE
00603 str_replace_shared(VALUE str2, VALUE str)
00604 {
00605     if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
00606         STR_SET_EMBED(str2);
00607         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
00608         STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
00609     }
00610     else {
00611         str = rb_str_new_frozen(str);
00612         FL_SET(str2, STR_NOEMBED);
00613         RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00614         RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00615         RSTRING(str2)->as.heap.aux.shared = str;
00616         FL_SET(str2, ELTS_SHARED);
00617     }
00618     rb_enc_cr_str_exact_copy(str2, str);
00619 
00620     return str2;
00621 }
00622 
00623 static VALUE
00624 str_new_shared(VALUE klass, VALUE str)
00625 {
00626     return str_replace_shared(str_alloc(klass), str);
00627 }
00628 
00629 static VALUE
00630 str_new3(VALUE klass, VALUE str)
00631 {
00632     return str_new_shared(klass, str);
00633 }
00634 
00635 VALUE
00636 rb_str_new_shared(VALUE str)
00637 {
00638     VALUE str2 = str_new3(rb_obj_class(str), str);
00639 
00640     OBJ_INFECT(str2, str);
00641     return str2;
00642 }
00643 
00644 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
00645 #define rb_str_new3 rb_str_new_shared
00646 
00647 static VALUE
00648 str_new4(VALUE klass, VALUE str)
00649 {
00650     VALUE str2;
00651 
00652     str2 = str_alloc(klass);
00653     STR_SET_NOEMBED(str2);
00654     RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00655     RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00656     if (STR_SHARED_P(str)) {
00657         VALUE shared = RSTRING(str)->as.heap.aux.shared;
00658         assert(OBJ_FROZEN(shared));
00659         FL_SET(str2, ELTS_SHARED);
00660         RSTRING(str2)->as.heap.aux.shared = shared;
00661     }
00662     else {
00663         FL_SET(str, ELTS_SHARED);
00664         RSTRING(str)->as.heap.aux.shared = str2;
00665     }
00666     rb_enc_cr_str_exact_copy(str2, str);
00667     OBJ_INFECT(str2, str);
00668     return str2;
00669 }
00670 
00671 VALUE
00672 rb_str_new_frozen(VALUE orig)
00673 {
00674     VALUE klass, str;
00675 
00676     if (OBJ_FROZEN(orig)) return orig;
00677     klass = rb_obj_class(orig);
00678     if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
00679         long ofs;
00680         assert(OBJ_FROZEN(str));
00681         ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
00682         if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
00683             (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) ||
00684             ENCODING_GET(str) != ENCODING_GET(orig)) {
00685             str = str_new3(klass, str);
00686             RSTRING(str)->as.heap.ptr += ofs;
00687             RSTRING(str)->as.heap.len -= ofs;
00688             rb_enc_cr_str_exact_copy(str, orig);
00689             OBJ_INFECT(str, orig);
00690         }
00691     }
00692     else if (STR_EMBED_P(orig)) {
00693         str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
00694         rb_enc_cr_str_exact_copy(str, orig);
00695         OBJ_INFECT(str, orig);
00696     }
00697     else if (STR_ASSOC_P(orig)) {
00698         VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
00699         FL_UNSET(orig, STR_ASSOC);
00700         str = str_new4(klass, orig);
00701         FL_SET(str, STR_ASSOC);
00702         RSTRING(str)->as.heap.aux.shared = assoc;
00703     }
00704     else {
00705         str = str_new4(klass, orig);
00706     }
00707     OBJ_FREEZE(str);
00708     return str;
00709 }
00710 
00711 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
00712 #define rb_str_new4 rb_str_new_frozen
00713 
00714 VALUE
00715 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
00716 {
00717     return str_new(rb_obj_class(obj), ptr, len);
00718 }
00719 
00720 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
00721            rb_str_new_with_class, (obj, ptr, len))
00722 #define rb_str_new5 rb_str_new_with_class
00723 
00724 static VALUE
00725 str_new_empty(VALUE str)
00726 {
00727     VALUE v = rb_str_new5(str, 0, 0);
00728     rb_enc_copy(v, str);
00729     OBJ_INFECT(v, str);
00730     return v;
00731 }
00732 
00733 #define STR_BUF_MIN_SIZE 128
00734 
00735 VALUE
00736 rb_str_buf_new(long capa)
00737 {
00738     VALUE str = str_alloc(rb_cString);
00739 
00740     if (capa < STR_BUF_MIN_SIZE) {
00741         capa = STR_BUF_MIN_SIZE;
00742     }
00743     FL_SET(str, STR_NOEMBED);
00744     RSTRING(str)->as.heap.aux.capa = capa;
00745     RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
00746     RSTRING(str)->as.heap.ptr[0] = '\0';
00747 
00748     return str;
00749 }
00750 
00751 VALUE
00752 rb_str_buf_new_cstr(const char *ptr)
00753 {
00754     VALUE str;
00755     long len = strlen(ptr);
00756 
00757     str = rb_str_buf_new(len);
00758     rb_str_buf_cat(str, ptr, len);
00759 
00760     return str;
00761 }
00762 
00763 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
00764 #define rb_str_buf_new2 rb_str_buf_new_cstr
00765 
00766 VALUE
00767 rb_str_tmp_new(long len)
00768 {
00769     return str_new(0, 0, len);
00770 }
00771 
00772 void *
00773 rb_alloc_tmp_buffer(volatile VALUE *store, long len)
00774 {
00775     VALUE s = rb_str_tmp_new(len);
00776     *store = s;
00777     return RSTRING_PTR(s);
00778 }
00779 
00780 void
00781 rb_free_tmp_buffer(volatile VALUE *store)
00782 {
00783     VALUE s = *store;
00784     *store = 0;
00785     if (s) rb_str_clear(s);
00786 }
00787 
00788 void
00789 rb_str_free(VALUE str)
00790 {
00791     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00792         xfree(RSTRING(str)->as.heap.ptr);
00793     }
00794 }
00795 
00796 RUBY_FUNC_EXPORTED size_t
00797 rb_str_memsize(VALUE str)
00798 {
00799     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00800         return RSTRING(str)->as.heap.aux.capa;
00801     }
00802     else {
00803         return 0;
00804     }
00805 }
00806 
00807 VALUE
00808 rb_str_to_str(VALUE str)
00809 {
00810     return rb_convert_type(str, T_STRING, "String", "to_str");
00811 }
00812 
00813 static inline void str_discard(VALUE str);
00814 
00815 void
00816 rb_str_shared_replace(VALUE str, VALUE str2)
00817 {
00818     rb_encoding *enc;
00819     int cr;
00820     if (str == str2) return;
00821     enc = STR_ENC_GET(str2);
00822     cr = ENC_CODERANGE(str2);
00823     str_discard(str);
00824     OBJ_INFECT(str, str2);
00825     if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
00826         STR_SET_EMBED(str);
00827         memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
00828         STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
00829         rb_enc_associate(str, enc);
00830         ENC_CODERANGE_SET(str, cr);
00831         return;
00832     }
00833     STR_SET_NOEMBED(str);
00834     STR_UNSET_NOCAPA(str);
00835     RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00836     RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
00837     if (STR_NOCAPA_P(str2)) {
00838         FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
00839         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
00840     }
00841     else {
00842         RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
00843     }
00844     STR_SET_EMBED(str2);        /* abandon str2 */
00845     RSTRING_PTR(str2)[0] = 0;
00846     STR_SET_EMBED_LEN(str2, 0);
00847     rb_enc_associate(str, enc);
00848     ENC_CODERANGE_SET(str, cr);
00849 }
00850 
00851 static ID id_to_s;
00852 
00853 VALUE
00854 rb_obj_as_string(VALUE obj)
00855 {
00856     VALUE str;
00857 
00858     if (TYPE(obj) == T_STRING) {
00859         return obj;
00860     }
00861     str = rb_funcall(obj, id_to_s, 0);
00862     if (TYPE(str) != T_STRING)
00863         return rb_any_to_s(obj);
00864     if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
00865     return str;
00866 }
00867 
00868 static VALUE
00869 str_replace(VALUE str, VALUE str2)
00870 {
00871     long len;
00872 
00873     len = RSTRING_LEN(str2);
00874     if (STR_ASSOC_P(str2)) {
00875         str2 = rb_str_new4(str2);
00876     }
00877     if (STR_SHARED_P(str2)) {
00878         VALUE shared = RSTRING(str2)->as.heap.aux.shared;
00879         assert(OBJ_FROZEN(shared));
00880         STR_SET_NOEMBED(str);
00881         RSTRING(str)->as.heap.len = len;
00882         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00883         FL_SET(str, ELTS_SHARED);
00884         FL_UNSET(str, STR_ASSOC);
00885         RSTRING(str)->as.heap.aux.shared = shared;
00886     }
00887     else {
00888         str_replace_shared(str, str2);
00889     }
00890 
00891     OBJ_INFECT(str, str2);
00892     rb_enc_cr_str_exact_copy(str, str2);
00893     return str;
00894 }
00895 
00896 static VALUE
00897 str_duplicate(VALUE klass, VALUE str)
00898 {
00899     VALUE dup = str_alloc(klass);
00900     str_replace(dup, str);
00901     return dup;
00902 }
00903 
00904 VALUE
00905 rb_str_dup(VALUE str)
00906 {
00907     return str_duplicate(rb_obj_class(str), str);
00908 }
00909 
00910 VALUE
00911 rb_str_resurrect(VALUE str)
00912 {
00913     return str_replace(str_alloc(rb_cString), str);
00914 }
00915 
00916 /*
00917  *  call-seq:
00918  *     String.new(str="")   -> new_str
00919  *
00920  *  Returns a new string object containing a copy of <i>str</i>.
00921  */
00922 
00923 static VALUE
00924 rb_str_init(int argc, VALUE *argv, VALUE str)
00925 {
00926     VALUE orig;
00927 
00928     if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
00929         rb_str_replace(str, orig);
00930     return str;
00931 }
00932 
00933 static inline long
00934 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
00935 {
00936     long c;
00937     const char *q;
00938 
00939     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00940         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00941     }
00942     else if (rb_enc_asciicompat(enc)) {
00943         c = 0;
00944         if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
00945             while (p < e) {
00946                 if (ISASCII(*p)) {
00947                     q = search_nonascii(p, e);
00948                     if (!q)
00949                         return c + (e - p);
00950                     c += q - p;
00951                     p = q;
00952                 }
00953                 p += rb_enc_fast_mbclen(p, e, enc);
00954                 c++;
00955             }
00956         }
00957         else {
00958             while (p < e) {
00959                 if (ISASCII(*p)) {
00960                     q = search_nonascii(p, e);
00961                     if (!q)
00962                         return c + (e - p);
00963                     c += q - p;
00964                     p = q;
00965                 }
00966                 p += rb_enc_mbclen(p, e, enc);
00967                 c++;
00968             }
00969         }
00970         return c;
00971     }
00972 
00973     for (c=0; p<e; c++) {
00974         p += rb_enc_mbclen(p, e, enc);
00975     }
00976     return c;
00977 }
00978 
00979 long
00980 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
00981 {
00982     return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
00983 }
00984 
/*
 * Counts the characters in the byte range [p, e) under `enc` and stores a
 * code range classification in *cr.
 *
 * *cr is reset to 0 on entry.  For fixed-width encodings (mbmaxlen ==
 * mbminlen) the result is the byte length divided by the character width,
 * rounded up, and *cr is left at 0.  Otherwise every non-ASCII character
 * is checked with rb_enc_precise_mbclen(): a well-formed character ORs
 * ENC_CODERANGE_VALID into *cr, a malformed one assigns
 * ENC_CODERANGE_BROKEN.  If *cr is still 0 when scanning ends, it is set
 * to ENC_CODERANGE_7BIT.
 */
00985 long
00986 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
00987 {
00988     long c;
00989     const char *q;
00990     int ret;
00991 
00992     *cr = 0;
00993     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
              /* Fixed width: ceil(bytes / width); no validation is done. */
00994         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00995     }
00996     else if (rb_enc_asciicompat(enc)) {
00997         c = 0;
00998         while (p < e) {
00999             if (ISASCII(*p)) {
                      /* Skip a whole ASCII run; each byte is one character.
                       * search_nonascii() presumably returns the first
                       * non-ASCII byte in [p, e) or NULL -- confirm. */
01000                 q = search_nonascii(p, e);
01001                 if (!q) {
                          /* The remainder of the input is ASCII. */
01002                     if (!*cr) *cr = ENC_CODERANGE_7BIT;
01003                     return c + (e - p);
01004                 }
01005                 c += q - p;
01006                 p = q;
01007             }
01008             ret = rb_enc_precise_mbclen(p, e, enc);
01009             if (MBCLEN_CHARFOUND_P(ret)) {
01010                 *cr |= ENC_CODERANGE_VALID;
01011                 p += MBCLEN_CHARFOUND_LEN(ret);
01012             }
01013             else {
                      /* Malformed: count the byte as one character and
                       * resume scanning one byte further on. */
01014                 *cr = ENC_CODERANGE_BROKEN;
01015                 p++;
01016             }
01017             c++;
01018         }
01019         if (!*cr) *cr = ENC_CODERANGE_7BIT;
01020         return c;
01021     }
01022 
          /* Variable-width encoding that is not ASCII compatible. */
01023     for (c=0; p<e; c++) {
01024         ret = rb_enc_precise_mbclen(p, e, enc);
01025         if (MBCLEN_CHARFOUND_P(ret)) {
01026             *cr |= ENC_CODERANGE_VALID;
01027             p += MBCLEN_CHARFOUND_LEN(ret);
01028         }
01029         else {
                  /* Malformed: skip one minimum-width unit, or stop at e
                   * when fewer bytes than that remain. */
01030             *cr = ENC_CODERANGE_BROKEN;
01031             if (p + rb_enc_mbminlen(enc) <= e)
01032                 p += rb_enc_mbminlen(enc);
01033             else
01034                 p = e;
01035         }
01036     }
01037     if (!*cr) *cr = ENC_CODERANGE_7BIT;
01038     return c;
01039 }
01040 
01041 #ifdef NONASCII_MASK
/* True when byte `c` is not a UTF-8 continuation byte (10xxxxxx). */
01042 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
01043 
01044 /*
01045  * UTF-8 leading bytes have either the 0xxxxxxx or the 11xxxxxx
01046  * bit representation (see http://en.wikipedia.org/wiki/UTF-8).
01047  * Therefore the following pseudo code detects a UTF-8 leading byte:
01048  *
01049  * if (!(byte & 0x80))
01050  *   byte |= 0x40;          // turn on bit6
01051  * return ((byte>>6) & 1);  // bit6 tells whether it is a leading byte.
01052  *
01053  * This function applies the above logic to every byte of the word `s'
01054  * at once and returns the sum of the per-byte results.
01055  */
01056 static inline VALUE
01057 count_utf8_lead_bytes_with_word(const VALUE *s)
01058 {
01059     VALUE d = *s;
01060 
01061     /* Reduce each byte to a flag in its bit0: set bit6 when bit7 is
           * clear, move bit6 down to bit0, then mask the other bits away.
           * NONASCII_MASK >> 7 is presumably 0x01 in every byte -- confirm. */
01062     d |= ~(d>>1);
01063     d >>= 6;
01064     d &= NONASCII_MASK >> 7;
01065 
01066     /* Fold the per-byte flags together so their sum lands in the
           * lowest byte. */
01067     d += (d>>8);
01068     d += (d>>16);
01069 #if SIZEOF_VALUE == 8
01070     d += (d>>32);
01071 #endif
          /* The sum is at most the number of bytes in a VALUE, so the low
           * four bits hold it. */
01072     return (d&0xF);
01073 }
01074 #endif
01075 
/*
 * Returns the length of `str` in characters.  When `enc` is NULL the
 * string's own encoding (STR_ENC_GET) is used.
 *
 * Strings for which single_byte_optimizable() holds return their byte
 * length directly.  When NONASCII_MASK is defined, UTF-8 strings whose
 * cached code range is ENC_CODERANGE_VALID are measured by counting lead
 * bytes.  Everything else goes through rb_enc_strlen_cr(), and a nonzero
 * code range reported by it is stored back on the string.
 */
01076 static long
01077 str_strlen(VALUE str, rb_encoding *enc)
01078 {
01079     const char *p, *e;
01080     long n;
01081     int cr;
01082 
01083     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
01084     if (!enc) enc = STR_ENC_GET(str);
01085     p = RSTRING_PTR(str);
01086     e = RSTRING_END(str);
01087     cr = ENC_CODERANGE(str);
01088 #ifdef NONASCII_MASK
01089     if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01090         enc == rb_utf8_encoding()) {
01091 
01092         VALUE len = 0;
              /* Only use the word-wise scan when the string is longer than
               * two VALUE words. */
01093         if ((int)sizeof(VALUE) * 2 < e - p) {
01094             const VALUE *s, *t;
01095             const VALUE lowbits = sizeof(VALUE) - 1;
                  /* s: p rounded up, t: e rounded down, both to a
                   * sizeof(VALUE) boundary. */
01096             s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01097             t = (const VALUE*)(~lowbits & (VALUE)e);
                  /* Unaligned head, one byte at a time. */
01098             while (p < (const char *)s) {
01099                 if (is_utf8_lead_byte(*p)) len++;
01100                 p++;
01101             }
                  /* Aligned middle, one word at a time. */
01102             while (s < t) {
01103                 len += count_utf8_lead_bytes_with_word(s);
01104                 s++;
01105             }
01106             p = (const char *)s;
01107         }
              /* Remaining tail (or the whole of a short string). */
01108         while (p < e) {
01109             if (is_utf8_lead_byte(*p)) len++;
01110             p++;
01111         }
01112         return (long)len;
01113     }
01114 #endif
01115     n = rb_enc_strlen_cr(p, e, enc, &cr);
01116     if (cr) {
              /* Cache the code range found while counting. */
01117         ENC_CODERANGE_SET(str, cr);
01118     }
01119     return n;
01120 }
01121 
01122 long
01123 rb_str_strlen(VALUE str)
01124 {
01125     return str_strlen(str, STR_ENC_GET(str));
01126 }
01127 
01128 /*
01129  *  call-seq:
01130  *     str.length   -> integer
01131  *     str.size     -> integer
01132  *
01133  *  Returns the character length of <i>str</i>.
01134  */
01135 
01136 VALUE
01137 rb_str_length(VALUE str)
01138 {
01139     long len;
01140 
01141     len = str_strlen(str, STR_ENC_GET(str));
01142     return LONG2NUM(len);
01143 }
01144 
01145 /*
01146  *  call-seq:
01147  *     str.bytesize  -> integer
01148  *
01149  *  Returns the length of <i>str</i> in bytes.
01150  */
01151 
01152 static VALUE
01153 rb_str_bytesize(VALUE str)
01154 {
01155     return LONG2NUM(RSTRING_LEN(str));
01156 }
01157 
01158 /*
01159  *  call-seq:
01160  *     str.empty?   -> true or false
01161  *
01162  *  Returns <code>true</code> if <i>str</i> has a length of zero.
01163  *
01164  *     "hello".empty?   #=> false
01165  *     "".empty?        #=> true
01166  */
01167 
01168 static VALUE
01169 rb_str_empty(VALUE str)
01170 {
01171     if (RSTRING_LEN(str) == 0)
01172         return Qtrue;
01173     return Qfalse;
01174 }
01175 
01176 /*
01177  *  call-seq:
01178  *     str + other_str   -> new_str
01179  *
01180  *  Concatenation---Returns a new <code>String</code> containing
01181  *  <i>other_str</i> concatenated to <i>str</i>.
01182  *
01183  *     "Hello from " + self.to_s   #=> "Hello from main"
01184  */
01185 
01186 VALUE
01187 rb_str_plus(VALUE str1, VALUE str2)
01188 {
01189     VALUE str3;
01190     rb_encoding *enc;
01191 
01192     StringValue(str2);
01193     enc = rb_enc_check(str1, str2);
01194     str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
01195     memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
01196     memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
01197            RSTRING_PTR(str2), RSTRING_LEN(str2));
01198     RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
01199 
01200     if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
01201         OBJ_TAINT(str3);
01202     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
01203                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
01204     return str3;
01205 }
01206 
01207 /*
01208  *  call-seq:
01209  *     str * integer   -> new_str
01210  *
01211  *  Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
01212  *  the receiver.
01213  *
01214  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
01215  */
01216 
01217 VALUE
01218 rb_str_times(VALUE str, VALUE times)
01219 {
01220     VALUE str2;
01221     long n, len;
01222     char *ptr2;
01223 
01224     len = NUM2LONG(times);
01225     if (len < 0) {
01226         rb_raise(rb_eArgError, "negative argument");
01227     }
01228     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
01229         rb_raise(rb_eArgError, "argument too big");
01230     }
01231 
01232     str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
01233     ptr2 = RSTRING_PTR(str2);
01234     if (len) {
01235         n = RSTRING_LEN(str);
01236         memcpy(ptr2, RSTRING_PTR(str), n);
01237         while (n <= len/2) {
01238             memcpy(ptr2 + n, ptr2, n);
01239             n *= 2;
01240         }
01241         memcpy(ptr2 + n, ptr2, len-n);
01242     }
01243     ptr2[RSTRING_LEN(str2)] = '\0';
01244     OBJ_INFECT(str2, str);
01245     rb_enc_cr_str_copy_for_substr(str2, str);
01246 
01247     return str2;
01248 }
01249 
01250 /*
01251  *  call-seq:
01252  *     str % arg   -> new_str
01253  *
01254  *  Format---Uses <i>str</i> as a format specification, and returns the result
01255  *  of applying it to <i>arg</i>. If the format specification contains more than
01256  *  one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
01257  *  containing the values to be substituted. See <code>Kernel::sprintf</code> for
01258  *  details of the format string.
01259  *
01260  *     "%05d" % 123                              #=> "00123"
01261  *     "%-5s: %08x" % [ "ID", self.object_id ]   #=> "ID   : 200e14d6"
01262  *     "foo = %{foo}" % { :foo => 'bar' }        #=> "foo = bar"
01263  */
01264 
01265 static VALUE
01266 rb_str_format_m(VALUE str, VALUE arg)
01267 {
01268     volatile VALUE tmp = rb_check_array_type(arg);
01269 
01270     if (!NIL_P(tmp)) {
01271         return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
01272     }
01273     return rb_str_format(1, &arg, str);
01274 }
01275 
01276 static inline void
01277 str_modifiable(VALUE str)
01278 {
01279     if (FL_TEST(str, STR_TMPLOCK)) {
01280         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
01281     }
01282     rb_check_frozen(str);
01283     if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
01284         rb_raise(rb_eSecurityError, "Insecure: can't modify string");
01285 }
01286 
01287 static inline int
01288 str_independent(VALUE str)
01289 {
01290     str_modifiable(str);
01291     if (!STR_SHARED_P(str)) return 1;
01292     if (STR_EMBED_P(str)) return 1;
01293     return 0;
01294 }
01295 
01296 static void
01297 str_make_independent_expand(VALUE str, long expand)
01298 {
01299     char *ptr;
01300     long len = RSTRING_LEN(str);
01301     long capa = len + expand;
01302 
01303     if (len > capa) len = capa;
01304     ptr = ALLOC_N(char, capa + 1);
01305     if (RSTRING_PTR(str)) {
01306         memcpy(ptr, RSTRING_PTR(str), len);
01307     }
01308     STR_SET_NOEMBED(str);
01309     STR_UNSET_NOCAPA(str);
01310     ptr[len] = 0;
01311     RSTRING(str)->as.heap.ptr = ptr;
01312     RSTRING(str)->as.heap.len = len;
01313     RSTRING(str)->as.heap.aux.capa = capa;
01314 }
01315 
/* Make `str` independent without adding any extra capacity. */
01316 #define str_make_independent(str) str_make_independent_expand((str), 0L)
01317 
01318 void
01319 rb_str_modify(VALUE str)
01320 {
01321     if (!str_independent(str))
01322         str_make_independent(str);
01323     ENC_CODERANGE_CLEAR(str);
01324 }
01325 
01326 void
01327 rb_str_modify_expand(VALUE str, long expand)
01328 {
01329     if (expand < 0) {
01330         rb_raise(rb_eArgError, "negative expanding string size");
01331     }
01332     if (!str_independent(str)) {
01333         str_make_independent_expand(str, expand);
01334     }
01335     else if (expand > 0) {
01336         long len = RSTRING_LEN(str);
01337         long capa = len + expand;
01338         if (!STR_EMBED_P(str)) {
01339             REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
01340             RSTRING(str)->as.heap.aux.capa = capa;
01341         }
01342         else if (capa > RSTRING_EMBED_LEN_MAX) {
01343             str_make_independent_expand(str, expand);
01344         }
01345     }
01346     ENC_CODERANGE_CLEAR(str);
01347 }
01348 
01349 /* As rb_str_modify(), but don't clear coderange */
01350 static void
01351 str_modify_keep_cr(VALUE str)
01352 {
01353     if (!str_independent(str))
01354         str_make_independent(str);
01355     if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
01356         /* Force re-scan later */
01357         ENC_CODERANGE_CLEAR(str);
01358 }
01359 
01360 static inline void
01361 str_discard(VALUE str)
01362 {
01363     str_modifiable(str);
01364     if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
01365         xfree(RSTRING_PTR(str));
01366         RSTRING(str)->as.heap.ptr = 0;
01367         RSTRING(str)->as.heap.len = 0;
01368     }
01369 }
01370 
/*
 * Associates `add` with `str`: the value is stored in the string's
 * as.heap.aux.shared slot and the string is flagged STR_ASSOC.  When the
 * string already carries an association, `add` is concatenated onto the
 * existing one with rb_ary_concat().  Raises if `str` is frozen.
 */
01371 void
01372 rb_str_associate(VALUE str, VALUE add)
01373 {
01374     /* sanity check */
01375     rb_check_frozen(str);
01376     if (STR_ASSOC_P(str)) {
01377         /* already associated */
01378         rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
01379     }
01380     else {
01381         if (STR_SHARED_P(str)) {
                  /* Remember the shared root before detaching from it, so
                   * that an association held by the root is carried over. */
01382             VALUE assoc = RSTRING(str)->as.heap.aux.shared;
01383             str_make_independent(str);
01384             if (STR_ASSOC_P(assoc)) {
01385                 assoc = RSTRING(assoc)->as.heap.aux.shared;
01386                 rb_ary_concat(assoc, add);
01387                 add = assoc;
01388             }
01389         }
01390         else if (STR_EMBED_P(str)) {
                  /* Move to heap storage so the as.heap.aux slot exists. */
01391             str_make_independent(str);
01392         }
01393         else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
                  /* NOTE(review): aux.capa is about to be replaced by
                   * aux.shared, so the buffer is presumably trimmed to
                   * exactly the length first -- confirm. */
01394             RESIZE_CAPA(str, RSTRING_LEN(str));
01395         }
01396         FL_SET(str, STR_ASSOC);
              /* NOTE(review): clearing klass presumably hides the
               * associated object from Ruby code -- confirm. */
01397         RBASIC(add)->klass = 0;
01398         RSTRING(str)->as.heap.aux.shared = add;
01399     }
01400 }
01401 
01402 VALUE
01403 rb_str_associated(VALUE str)
01404 {
01405     if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
01406     if (STR_ASSOC_P(str)) {
01407         return RSTRING(str)->as.heap.aux.shared;
01408     }
01409     return Qfalse;
01410 }
01411 
01412 VALUE
01413 rb_string_value(volatile VALUE *ptr)
01414 {
01415     VALUE s = *ptr;
01416     if (TYPE(s) != T_STRING) {
01417         s = rb_str_to_str(s);
01418         *ptr = s;
01419     }
01420     return s;
01421 }
01422 
01423 char *
01424 rb_string_value_ptr(volatile VALUE *ptr)
01425 {
01426     VALUE str = rb_string_value(ptr);
01427     return RSTRING_PTR(str);
01428 }
01429 
01430 char *
01431 rb_string_value_cstr(volatile VALUE *ptr)
01432 {
01433     VALUE str = rb_string_value(ptr);
01434     char *s = RSTRING_PTR(str);
01435     long len = RSTRING_LEN(str);
01436 
01437     if (!s || memchr(s, 0, len)) {
01438         rb_raise(rb_eArgError, "string contains null byte");
01439     }
01440     if (s[len]) {
01441         rb_str_modify(str);
01442         s = RSTRING_PTR(str);
01443         s[RSTRING_LEN(str)] = 0;
01444     }
01445     return s;
01446 }
01447 
01448 VALUE
01449 rb_check_string_type(VALUE str)
01450 {
01451     str = rb_check_convert_type(str, T_STRING, "String", "to_str");
01452     return str;
01453 }
01454 
01455 /*
01456  *  call-seq:
01457  *     String.try_convert(obj) -> string or nil
01458  *
01459  *  Try to convert <i>obj</i> into a String, using to_str method.
01460  *  Returns converted string or nil if <i>obj</i> cannot be converted
01461  *  for any reason.
01462  *
01463  *     String.try_convert("str")     #=> "str"
01464  *     String.try_convert(/re/)      #=> nil
01465  */
01466 static VALUE
01467 rb_str_s_try_convert(VALUE dummy, VALUE str)
01468 {
01469     return rb_check_string_type(str);
01470 }
01471 
01472 static char*
01473 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
01474 {
01475     long nth = *nthp;
01476     if (rb_enc_mbmaxlen(enc) == 1) {
01477         p += nth;
01478     }
01479     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01480         p += nth * rb_enc_mbmaxlen(enc);
01481     }
01482     else if (rb_enc_asciicompat(enc)) {
01483         const char *p2, *e2;
01484         int n;
01485 
01486         while (p < e && 0 < nth) {
01487             e2 = p + nth;
01488             if (e < e2) {
01489                 *nthp = nth;
01490                 return (char *)e;
01491             }
01492             if (ISASCII(*p)) {
01493                 p2 = search_nonascii(p, e2);
01494                 if (!p2) {
01495                     *nthp = nth;
01496                     return (char *)e2;
01497                 }
01498                 nth -= p2 - p;
01499                 p = p2;
01500             }
01501             n = rb_enc_mbclen(p, e, enc);
01502             p += n;
01503             nth--;
01504         }
01505         *nthp = nth;
01506         if (nth != 0) {
01507             return (char *)e;
01508         }
01509         return (char *)p;
01510     }
01511     else {
01512         while (p < e && nth--) {
01513             p += rb_enc_mbclen(p, e, enc);
01514         }
01515     }
01516     if (p > e) p = e;
01517     *nthp = nth;
01518     return (char*)p;
01519 }
01520 
01521 char*
01522 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
01523 {
01524     return str_nth_len(p, e, &nth, enc);
01525 }
01526 
01527 static char*
01528 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01529 {
01530     if (singlebyte)
01531         p += nth;
01532     else {
01533         p = str_nth_len(p, e, &nth, enc);
01534     }
01535     if (!p) return 0;
01536     if (p > e) p = e;
01537     return (char *)p;
01538 }
01539 
01540 /* char offset to byte offset */
01541 static long
01542 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01543 {
01544     const char *pp = str_nth(p, e, nth, enc, singlebyte);
01545     if (!pp) return e - p;
01546     return pp - p;
01547 }
01548 
01549 long
01550 rb_str_offset(VALUE str, long pos)
01551 {
01552     return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
01553                       STR_ENC_GET(str), single_byte_optimizable(str));
01554 }
01555 
01556 #ifdef NONASCII_MASK
/*
 * Advances through the UTF-8 bytes in [p, e) past *nthp characters by
 * counting lead bytes.  Returns the position where scanning stopped and
 * stores the number of characters still outstanding in *nthp (0 when the
 * requested character was reached).
 * NOTE(review): the caller visible in this file only uses this for
 * strings whose code range is ENC_CODERANGE_VALID -- confirm for others.
 */
01557 static char *
01558 str_utf8_nth(const char *p, const char *e, long *nthp)
01559 {
01560     long nth = *nthp;
          /* Use the word-wise scan only when both the input and the
           * requested count exceed two VALUE words. */
01561     if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
01562         const VALUE *s, *t;
01563         const VALUE lowbits = sizeof(VALUE) - 1;
              /* s: p rounded up, t: e rounded down, both to a
               * sizeof(VALUE) boundary. */
01564         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01565         t = (const VALUE*)(~lowbits & (VALUE)e);
              /* Unaligned head, one byte at a time. */
01566         while (p < (const char *)s) {
01567             if (is_utf8_lead_byte(*p)) nth--;
01568             p++;
01569         }
              /* A word holds at most sizeof(VALUE) lead bytes, so stopping
               * once nth drops below that keeps nth from going negative. */
01570         do {
01571             nth -= count_utf8_lead_bytes_with_word(s);
01572             s++;
01573         } while (s < t && (int)sizeof(VALUE) <= nth);
01574         p = (char *)s;
01575     }
          /* Finish byte by byte; stop on the lead byte of the wanted
           * character. */
01576     while (p < e) {
01577         if (is_utf8_lead_byte(*p)) {
01578             if (nth == 0) break;
01579             nth--;
01580         }
01581         p++;
01582     }
01583     *nthp = nth;
01584     return (char *)p;
01585 }
01586 
/* Byte offset within [p, e) of the position str_utf8_nth() reaches for
 * character index `nth`. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *target = str_utf8_nth(p, e, &remaining);

    return target - p;
}
01593 #endif
01594 
01595 /* byte offset to char offset */
01596 long
01597 rb_str_sublen(VALUE str, long pos)
01598 {
01599     if (single_byte_optimizable(str) || pos < 0)
01600         return pos;
01601     else {
01602         char *p = RSTRING_PTR(str);
01603         return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
01604     }
01605 }
01606 
01607 VALUE
01608 rb_str_subseq(VALUE str, long beg, long len)
01609 {
01610     VALUE str2;
01611 
01612     if (RSTRING_LEN(str) == beg + len &&
01613         RSTRING_EMBED_LEN_MAX < len) {
01614         str2 = rb_str_new_shared(rb_str_new_frozen(str));
01615         rb_str_drop_bytes(str2, beg);
01616     }
01617     else {
01618         str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
01619     }
01620 
01621     rb_enc_cr_str_copy_for_substr(str2, str);
01622     OBJ_INFECT(str2, str);
01623 
01624     return str2;
01625 }
01626 
/*
 * Returns the substring of `str` that starts at character index `beg`
 * and spans up to `len` characters.  A negative `beg` counts back from
 * the end of the string.  Returns Qnil when `len` is negative or `beg`
 * lies outside the string.
 */
01627 VALUE
01628 rb_str_substr(VALUE str, long beg, long len)
01629 {
01630     rb_encoding *enc = STR_ENC_GET(str);
01631     VALUE str2;
01632     char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
01633 
01634     if (len < 0) return Qnil;
01635     if (!RSTRING_LEN(str)) {
01636         len = 0;
01637     }
01638     if (single_byte_optimizable(str)) {
              /* Character offsets are used directly as byte offsets. */
01639         if (beg > RSTRING_LEN(str)) return Qnil;
01640         if (beg < 0) {
01641             beg += RSTRING_LEN(str);
01642             if (beg < 0) return Qnil;
01643         }
01644         if (beg + len > RSTRING_LEN(str))
01645             len = RSTRING_LEN(str) - beg;
01646         if (len <= 0) {
01647             len = 0;
01648             p = 0;
01649         }
01650         else
01651             p = s + beg;
01652         goto sub;
01653     }
01654     if (beg < 0) {
01655         if (len > -beg) len = -beg;
              /* When the start is near the end of a long string, walk
               * backwards from the end instead of measuring the whole
               * string. */
01656         if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
01657             beg = -beg;
                  /* First move e back to the end of the wanted slice... */
01658             while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
01659             p = e;
01660             if (!p) return Qnil;
                  /* ...then move p back `len` characters from there. */
01661             while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
01662             if (!p) return Qnil;
01663             len = e - p;
01664             goto sub;
01665         }
01666         else {
01667             beg += str_strlen(str, enc);
01668             if (beg < 0) return Qnil;
01669         }
01670     }
01671     else if (beg > 0 && beg > RSTRING_LEN(str)) {
              /* More characters requested than there are bytes. */
01672         return Qnil;
01673     }
01674     if (len == 0) {
              /* Empty slice: only the start index needs validating. */
01675         if (beg > str_strlen(str, enc)) return Qnil;
01676         p = 0;
01677     }
01678 #ifdef NONASCII_MASK
01679     else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01680         enc == rb_utf8_encoding()) {
              /* str_utf8_nth() leaves the unconsumed count in beg. */
01681         p = str_utf8_nth(s, e, &beg);
01682         if (beg > 0) return Qnil;
01683         len = str_utf8_offset(p, e, len);
01684     }
01685 #endif
01686     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
              /* Fixed-width encoding: convert both values to bytes. */
01687         int char_sz = rb_enc_mbmaxlen(enc);
01688 
01689         p = s + beg * char_sz;
01690         if (p > e) {
01691             return Qnil;
01692         }
01693         else if (len * char_sz > e - p)
01694             len = e - p;
01695         else
01696             len *= char_sz;
01697     }
01698     else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
01699         if (beg > 0) return Qnil;
01700         len = 0;
01701     }
01702     else {
01703         len = str_offset(p, e, len, enc, 0);
01704     }
          /* From here on len is a byte count and p the slice's first byte. */
01705   sub:
01706     if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
              /* Long slice: reuse the source buffer and point at its last
               * len bytes.  NOTE(review): str_new3() presumably shares the
               * buffer of the string from rb_str_new4() -- confirm. */
01707         str2 = rb_str_new4(str);
01708         str2 = str_new3(rb_obj_class(str2), str2);
01709         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
01710         RSTRING(str2)->as.heap.len = len;
01711     }
01712     else {
01713         str2 = rb_str_new5(str, p, len);
01714         rb_enc_cr_str_copy_for_substr(str2, str);
01715         OBJ_INFECT(str2, str);
01716     }
01717 
01718     return str2;
01719 }
01720 
01721 VALUE
01722 rb_str_freeze(VALUE str)
01723 {
01724     if (STR_ASSOC_P(str)) {
01725         VALUE ary = RSTRING(str)->as.heap.aux.shared;
01726         OBJ_FREEZE(ary);
01727     }
01728     return rb_obj_freeze(str);
01729 }
01730 
/* rb_str_dup_frozen is provided as an alias of rb_str_new_frozen; the
 * macro below redirects the name to rb_str_new_frozen for the rest of
 * this file. */
01731 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
01732 #define rb_str_dup_frozen rb_str_new_frozen
01733 
01734 VALUE
01735 rb_str_locktmp(VALUE str)
01736 {
01737     if (FL_TEST(str, STR_TMPLOCK)) {
01738         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
01739     }
01740     FL_SET(str, STR_TMPLOCK);
01741     return str;
01742 }
01743 
01744 VALUE
01745 rb_str_unlocktmp(VALUE str)
01746 {
01747     if (!FL_TEST(str, STR_TMPLOCK)) {
01748         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
01749     }
01750     FL_UNSET(str, STR_TMPLOCK);
01751     return str;
01752 }
01753 
01754 void
01755 rb_str_set_len(VALUE str, long len)
01756 {
01757     long capa;
01758 
01759     str_modifiable(str);
01760     if (STR_SHARED_P(str)) {
01761         rb_raise(rb_eRuntimeError, "can't set length of shared string");
01762     }
01763     if (len > (capa = (long)rb_str_capacity(str))) {
01764         rb_bug("probable buffer overflow: %ld for %ld", len, capa);
01765     }
01766     STR_SET_LEN(str, len);
01767     RSTRING_PTR(str)[len] = '\0';
01768 }
01769 
/*
 * Resizes `str` to `len` bytes and returns it.  Raises ArgumentError for
 * a negative `len`.  The cached code range is always cleared, and a
 * terminating NUL is written after the new length whenever the length
 * changes.
 */
01770 VALUE
01771 rb_str_resize(VALUE str, long len)
01772 {
01773     long slen;
01774     int independent;
01775 
01776     if (len < 0) {
01777         rb_raise(rb_eArgError, "negative string size (or size too big)");
01778     }
01779 
01780     independent = str_independent(str);
01781     ENC_CODERANGE_CLEAR(str);
01782     slen = RSTRING_LEN(str);
01783     if (len != slen) {
01784         if (STR_EMBED_P(str)) {
01785             if (len <= RSTRING_EMBED_LEN_MAX) {
                      /* Embedded before and after: adjust length in place. */
01786                 STR_SET_EMBED_LEN(str, len);
01787                 RSTRING(str)->as.ary[len] = '\0';
01788                 return str;
01789             }
                  /* Outgrew the embedded buffer: move to the heap. */
01790             str_make_independent_expand(str, len - slen);
01791             STR_SET_NOEMBED(str);
01792         }
01793         else if (len <= RSTRING_EMBED_LEN_MAX) {
                  /* Heap string that now fits in the embedded buffer.  The
                   * heap pointer is saved first, before the embedded array
                   * is written. */
01794             char *ptr = RSTRING(str)->as.heap.ptr;
01795             STR_SET_EMBED(str);
01796             if (slen > len) slen = len;
01797             if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
01798             RSTRING(str)->as.ary[len] = '\0';
01799             STR_SET_EMBED_LEN(str, len);
                  /* The old buffer is freed only when str_independent()
                   * reported that the string is independent. */
01800             if (independent) xfree(ptr);
01801             return str;
01802         }
01803         else if (!independent) {
01804             str_make_independent_expand(str, len - slen);
01805         }
01806         else if (slen < len || slen - len > 1024) {
                  /* Reallocate when growing, or when shrinking by more
                   * than 1024 bytes. */
01807             REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01808         }
01809         if (!STR_NOCAPA_P(str)) {
01810             RSTRING(str)->as.heap.aux.capa = len;
01811         }
01812         RSTRING(str)->as.heap.len = len;
01813         RSTRING(str)->as.heap.ptr[len] = '\0';  /* sentinel */
01814     }
01815     return str;
01816 }
01817 
01818 static VALUE
01819 str_buf_cat(VALUE str, const char *ptr, long len)
01820 {
01821     long capa, total, off = -1;
01822 
01823     if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
01824         off = ptr - RSTRING_PTR(str);
01825     }
01826     rb_str_modify(str);
01827     if (len == 0) return 0;
01828     if (STR_ASSOC_P(str)) {
01829         FL_UNSET(str, STR_ASSOC);
01830         capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
01831     }
01832     else if (STR_EMBED_P(str)) {
01833         capa = RSTRING_EMBED_LEN_MAX;
01834     }
01835     else {
01836         capa = RSTRING(str)->as.heap.aux.capa;
01837     }
01838     if (RSTRING_LEN(str) >= LONG_MAX - len) {
01839         rb_raise(rb_eArgError, "string sizes too big");
01840     }
01841     total = RSTRING_LEN(str)+len;
01842     if (capa <= total) {
01843         while (total > capa) {
01844             if (capa + 1 >= LONG_MAX / 2) {
01845                 capa = (total + 4095) / 4096;
01846                 break;
01847             }
01848             capa = (capa + 1) * 2;
01849         }
01850         RESIZE_CAPA(str, capa);
01851     }
01852     if (off != -1) {
01853         ptr = RSTRING_PTR(str) + off;
01854     }
01855     memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
01856     STR_SET_LEN(str, total);
01857     RSTRING_PTR(str)[total] = '\0'; /* sentinel */
01858 
01859     return str;
01860 }
01861 
/* Append the NUL-terminated C string `ptr` to `str`.  `ptr` is expanded
 * twice, so the argument must be free of side effects. */
01862 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
01863 
01864 VALUE
01865 rb_str_buf_cat(VALUE str, const char *ptr, long len)
01866 {
01867     if (len == 0) return str;
01868     if (len < 0) {
01869         rb_raise(rb_eArgError, "negative string size (or size too big)");
01870     }
01871     return str_buf_cat(str, ptr, len);
01872 }
01873 
01874 VALUE
01875 rb_str_buf_cat2(VALUE str, const char *ptr)
01876 {
01877     return rb_str_buf_cat(str, ptr, strlen(ptr));
01878 }
01879 
01880 VALUE
01881 rb_str_cat(VALUE str, const char *ptr, long len)
01882 {
01883     if (len < 0) {
01884         rb_raise(rb_eArgError, "negative string size (or size too big)");
01885     }
01886     if (STR_ASSOC_P(str)) {
01887         char *p;
01888         rb_str_modify_expand(str, len);
01889         p = RSTRING(str)->as.heap.ptr;
01890         memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
01891         len = RSTRING(str)->as.heap.len += len;
01892         p[len] = '\0'; /* sentinel */
01893         return str;
01894     }
01895 
01896     return rb_str_buf_cat(str, ptr, len);
01897 }
01898 
01899 VALUE
01900 rb_str_cat2(VALUE str, const char *ptr)
01901 {
01902     return rb_str_cat(str, ptr, strlen(ptr));
01903 }
01904 
/*
 * Appends the `len` bytes at `ptr`, whose encoding index is
 * `ptr_encindex` and whose code range is `ptr_cr`, to `str`, and sets the
 * encoding and code range of the result.  Returns `str`.
 *
 * When `ptr_cr_ret` is non-NULL it receives the code range used for
 * `ptr`, which may have been determined here by scanning.  Raises
 * rb_eEncCompatError when the encodings differ and cannot be combined,
 * and ArgumentError when `len` is negative.
 */
01905 static VALUE
01906 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
01907     int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
01908 {
01909     int str_encindex = ENCODING_GET(str);
01910     int res_encindex;
01911     int str_cr, res_cr;
01912 
01913     str_cr = ENC_CODERANGE(str);
01914 
01915     if (str_encindex == ptr_encindex) {
              /* Same encoding: if the receiver's range is unknown the
               * result will be unknown too, so scanning ptr is skipped. */
01916         if (str_cr == ENC_CODERANGE_UNKNOWN)
01917             ptr_cr = ENC_CODERANGE_UNKNOWN;
01918         else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01919             ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
01920         }
01921     }
01922     else {
01923         rb_encoding *str_enc = rb_enc_from_index(str_encindex);
01924         rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
01925         if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
                  /* With a non-ASCII-compatible encoding involved, the
                   * append only works when one side is empty. */
01926             if (len == 0)
01927                 return str;
01928             if (RSTRING_LEN(str) == 0) {
01929                 rb_str_buf_cat(str, ptr, len);
01930                 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
01931                 return str;
01932             }
01933             goto incompatible;
01934         }
01935         if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01936             ptr_cr = coderange_scan(ptr, len, ptr_enc);
01937         }
01938         if (str_cr == ENC_CODERANGE_UNKNOWN) {
                  /* The receiver is scanned only when its range can
                   * affect the outcome. */
01939             if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
01940                 str_cr = rb_enc_str_coderange(str);
01941             }
01942         }
01943     }
01944     if (ptr_cr_ret)
01945         *ptr_cr_ret = ptr_cr;
01946 
          /* Different encodings are acceptable only when at least one
           * side is 7bit. */
01947     if (str_encindex != ptr_encindex &&
01948         str_cr != ENC_CODERANGE_7BIT &&
01949         ptr_cr != ENC_CODERANGE_7BIT) {
01950       incompatible:
01951         rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
01952             rb_enc_name(rb_enc_from_index(str_encindex)),
01953             rb_enc_name(rb_enc_from_index(ptr_encindex)));
01954     }
01955 
          /* Choose the encoding and code range of the result. */
01956     if (str_cr == ENC_CODERANGE_UNKNOWN) {
01957         res_encindex = str_encindex;
01958         res_cr = ENC_CODERANGE_UNKNOWN;
01959     }
01960     else if (str_cr == ENC_CODERANGE_7BIT) {
01961         if (ptr_cr == ENC_CODERANGE_7BIT) {
01962             res_encindex = str_encindex;
01963             res_cr = ENC_CODERANGE_7BIT;
01964         }
01965         else {
                  /* A 7bit receiver takes the encoding of the appended
                   * bytes. */
01966             res_encindex = ptr_encindex;
01967             res_cr = ptr_cr;
01968         }
01969     }
01970     else if (str_cr == ENC_CODERANGE_VALID) {
01971         res_encindex = str_encindex;
01972         if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
01973             res_cr = str_cr;
01974         else
01975             res_cr = ptr_cr;
01976     }
01977     else { /* str_cr == ENC_CODERANGE_BROKEN */
01978         res_encindex = str_encindex;
01979         res_cr = str_cr;
              /* Appending bytes to a broken string makes the range
               * unknown again. */
01980         if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
01981     }
01982 
01983     if (len < 0) {
01984         rb_raise(rb_eArgError, "negative string size (or size too big)");
01985     }
01986     str_buf_cat(str, ptr, len);
01987     ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
01988     return str;
01989 }
01990 
01991 VALUE
01992 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
01993 {
01994     return rb_enc_cr_str_buf_cat(str, ptr, len,
01995         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
01996 }
01997 
01998 VALUE
01999 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
02000 {
02001     /* ptr must reference NUL terminated ASCII string. */
02002     int encindex = ENCODING_GET(str);
02003     rb_encoding *enc = rb_enc_from_index(encindex);
02004     if (rb_enc_asciicompat(enc)) {
02005         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
02006             encindex, ENC_CODERANGE_7BIT, 0);
02007     }
02008     else {
02009         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
02010         while (*ptr) {
02011             unsigned int c = (unsigned char)*ptr;
02012             int len = rb_enc_codelen(c, enc);
02013             rb_enc_mbcput(c, buf, enc);
02014             rb_enc_cr_str_buf_cat(str, buf, len,
02015                 encindex, ENC_CODERANGE_VALID, 0);
02016             ptr++;
02017         }
02018         return str;
02019     }
02020 }
02021 
02022 VALUE
02023 rb_str_buf_append(VALUE str, VALUE str2)
02024 {
02025     int str2_cr;
02026 
02027     str2_cr = ENC_CODERANGE(str2);
02028 
02029     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
02030         ENCODING_GET(str2), str2_cr, &str2_cr);
02031 
02032     OBJ_INFECT(str, str2);
02033     ENC_CODERANGE_SET(str2, str2_cr);
02034 
02035     return str;
02036 }
02037 
02038 VALUE
02039 rb_str_append(VALUE str, VALUE str2)
02040 {
02041     rb_encoding *enc;
02042     int cr, cr2;
02043     long len2;
02044 
02045     StringValue(str2);
02046     if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
02047         long len = RSTRING_LEN(str) + len2;
02048         enc = rb_enc_check(str, str2);
02049         cr = ENC_CODERANGE(str);
02050         if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
02051         rb_str_modify_expand(str, len2);
02052         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
02053                RSTRING_PTR(str2), len2+1);
02054         RSTRING(str)->as.heap.len = len;
02055         rb_enc_associate(str, enc);
02056         ENC_CODERANGE_SET(str, cr);
02057         OBJ_INFECT(str, str2);
02058         return str;
02059     }
02060     return rb_str_buf_append(str, str2);
02061 }
02062 
02063 /*
02064  *  call-seq:
02065  *     str << integer       -> str
02066  *     str.concat(integer)  -> str
02067  *     str << obj           -> str
02068  *     str.concat(obj)      -> str
02069  *
02070  *  Append---Concatenates the given object to <i>str</i>. If the object is a
02071  *  <code>Integer</code>, it is considered as a codepoint, and is converted
02072  *  to a character before concatenation.
02073  *
02074  *     a = "hello "
02075  *     a << "world"   #=> "hello world"
02076  *     a.concat(33)   #=> "hello world!"
02077  */
02078 
02079 VALUE
02080 rb_str_concat(VALUE str1, VALUE str2)
02081 {
02082     unsigned int code;
02083     rb_encoding *enc = STR_ENC_GET(str1);
02084 
02085     if (FIXNUM_P(str2) || TYPE(str2) == T_BIGNUM) {
02086         if (rb_num_to_uint(str2, &code) == 0) {
02087         }
02088         else if (FIXNUM_P(str2)) {
02089             rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
02090         }
02091         else {
02092             rb_raise(rb_eRangeError, "bignum out of char range");
02093         }
02094     }
02095     else {
02096         return rb_str_append(str1, str2);
02097     }
02098 
02099     if (enc == rb_usascii_encoding()) {
02100         /* US-ASCII automatically extended to ASCII-8BIT */
02101         char buf[1] = {(char)code};
02102         if (code > 0xFF) {
02103             rb_raise(rb_eRangeError, "%u out of char range", code);
02104         }
02105         rb_str_cat(str1, buf, 1);
02106         if (code > 127) {
02107             rb_enc_associate(str1, rb_ascii8bit_encoding());
02108             ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
02109         }
02110     }
02111     else {
02112         long pos = RSTRING_LEN(str1);
02113         int cr = ENC_CODERANGE(str1);
02114         int len;
02115         char *buf;
02116 
02117         switch (len = rb_enc_codelen(code, enc)) {
02118           case ONIGERR_INVALID_CODE_POINT_VALUE:
02119             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02120             break;
02121           case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
02122           case 0:
02123             rb_raise(rb_eRangeError, "%u out of char range", code);
02124             break;
02125         }
02126         buf = ALLOCA_N(char, len + 1);
02127         rb_enc_mbcput(code, buf, enc);
02128         if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
02129             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02130         }
02131         rb_str_resize(str1, pos+len);
02132         strncpy(RSTRING_PTR(str1) + pos, buf, len);
02133         if (cr == ENC_CODERANGE_7BIT && code > 127)
02134             cr = ENC_CODERANGE_VALID;
02135         ENC_CODERANGE_SET(str1, cr);
02136     }
02137     return str1;
02138 }
02139 
02140 /*
02141  *  call-seq:
02142  *     str.prepend(other_str)  -> str
02143  *
02144  *  Prepend---Prepend the given string to <i>str</i>.
02145  *
02146  *     a = "world"
02147  *     a.prepend("hello ") #=> "hello world"
02148  *     a                   #=> "hello world"
02149  */
02150 
02151 static VALUE
02152 rb_str_prepend(VALUE str, VALUE str2)
02153 {
02154     StringValue(str2);
02155     StringValue(str);
02156     rb_str_update(str, 0L, 0L, str2);
02157     return str;
02158 }
02159 
02160 st_index_t
02161 rb_str_hash(VALUE str)
02162 {
02163     int e = ENCODING_GET(str);
02164     if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02165         e = 0;
02166     }
02167     return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
02168 }
02169 
02170 int
02171 rb_str_hash_cmp(VALUE str1, VALUE str2)
02172 {
02173     long len;
02174 
02175     if (!rb_str_comparable(str1, str2)) return 1;
02176     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
02177         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
02178         return 0;
02179     }
02180     return 1;
02181 }
02182 
02183 /*
02184  * call-seq:
02185  *    str.hash   -> fixnum
02186  *
02187  * Return a hash based on the string's length and content.
02188  */
02189 
02190 static VALUE
02191 rb_str_hash_m(VALUE str)
02192 {
02193     st_index_t hval = rb_str_hash(str);
02194     return INT2FIX(hval);
02195 }
02196 
/* Yields the smaller of a and b.  Each argument may be evaluated twice, so pass only side-effect-free expressions. */
02197 #define lesser(a,b) (((a)>(b))?(b):(a))
02198 
02199 int
02200 rb_str_comparable(VALUE str1, VALUE str2)
02201 {
02202     int idx1, idx2;
02203     int rc1, rc2;
02204 
02205     if (RSTRING_LEN(str1) == 0) return TRUE;
02206     if (RSTRING_LEN(str2) == 0) return TRUE;
02207     idx1 = ENCODING_GET(str1);
02208     idx2 = ENCODING_GET(str2);
02209     if (idx1 == idx2) return TRUE;
02210     rc1 = rb_enc_str_coderange(str1);
02211     rc2 = rb_enc_str_coderange(str2);
02212     if (rc1 == ENC_CODERANGE_7BIT) {
02213         if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
02214         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
02215             return TRUE;
02216     }
02217     if (rc2 == ENC_CODERANGE_7BIT) {
02218         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
02219             return TRUE;
02220     }
02221     return FALSE;
02222 }
02223 
02224 int
02225 rb_str_cmp(VALUE str1, VALUE str2)
02226 {
02227     long len1, len2;
02228     const char *ptr1, *ptr2;
02229     int retval;
02230 
02231     if (str1 == str2) return 0;
02232     RSTRING_GETMEM(str1, ptr1, len1);
02233     RSTRING_GETMEM(str2, ptr2, len2);
02234     if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
02235         if (len1 == len2) {
02236             if (!rb_str_comparable(str1, str2)) {
02237                 if (ENCODING_GET(str1) > ENCODING_GET(str2))
02238                     return 1;
02239                 return -1;
02240             }
02241             return 0;
02242         }
02243         if (len1 > len2) return 1;
02244         return -1;
02245     }
02246     if (retval > 0) return 1;
02247     return -1;
02248 }
02249 
02250 /* expect tail call optimization */
02251 static VALUE
02252 str_eql(const VALUE str1, const VALUE str2)
02253 {
02254     const long len = RSTRING_LEN(str1);
02255     const char *ptr1, *ptr2;
02256 
02257     if (len != RSTRING_LEN(str2)) return Qfalse;
02258     if (!rb_str_comparable(str1, str2)) return Qfalse;
02259     if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
02260         return Qtrue;
02261     if (memcmp(ptr1, ptr2, len) == 0)
02262         return Qtrue;
02263     return Qfalse;
02264 }
02265 /*
02266  *  call-seq:
02267  *     str == obj   -> true or false
02268  *
02269  *  Equality---If <i>obj</i> is not a <code>String</code>, returns
02270  *  <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
02271  *  <code><=></code> <i>obj</i> returns zero.
02272  */
02273 
02274 VALUE
02275 rb_str_equal(VALUE str1, VALUE str2)
02276 {
02277     if (str1 == str2) return Qtrue;
02278     if (TYPE(str2) != T_STRING) {
02279         if (!rb_respond_to(str2, rb_intern("to_str"))) {
02280             return Qfalse;
02281         }
02282         return rb_equal(str2, str1);
02283     }
02284     return str_eql(str1, str2);
02285 }
02286 
02287 /*
02288  * call-seq:
02289  *   str.eql?(other)   -> true or false
02290  *
02291  * Two strings are equal if they have the same length and content.
02292  */
02293 
02294 static VALUE
02295 rb_str_eql(VALUE str1, VALUE str2)
02296 {
02297     if (str1 == str2) return Qtrue;
02298     if (TYPE(str2) != T_STRING) return Qfalse;
02299     return str_eql(str1, str2);
02300 }
02301 
02302 /*
02303  *  call-seq:
02304  *     str <=> other_str   -> -1, 0, +1 or nil
02305  *
02306  *  Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if
02307  *  <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than
02308  *  <i>str</i>. If the strings are of different lengths, and the strings are
02309  *  equal when compared up to the shortest length, then the longer string is
02310  *  considered greater than the shorter one. In older versions of Ruby, setting
02311  *  <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
02312  *  in favor of using <code>String#casecmp</code>.
02313  *
02314  *  <code><=></code> is the basis for the methods <code><</code>,
02315  *  <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
02316  *  included from module <code>Comparable</code>.  The method
02317  *  <code>String#==</code> does not use <code>Comparable#==</code>.
02318  *
02319  *     "abcdef" <=> "abcde"     #=> 1
02320  *     "abcdef" <=> "abcdef"    #=> 0
02321  *     "abcdef" <=> "abcdefg"   #=> -1
02322  *     "abcdef" <=> "ABCDEF"    #=> 1
02323  */
02324 
02325 static VALUE
02326 rb_str_cmp_m(VALUE str1, VALUE str2)
02327 {
02328     long result;
02329 
02330     if (TYPE(str2) != T_STRING) {
02331         if (!rb_respond_to(str2, rb_intern("to_str"))) {
02332             return Qnil;
02333         }
02334         else if (!rb_respond_to(str2, rb_intern("<=>"))) {
02335             return Qnil;
02336         }
02337         else {
02338             VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
02339 
02340             if (NIL_P(tmp)) return Qnil;
02341             if (!FIXNUM_P(tmp)) {
02342                 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
02343             }
02344             result = -FIX2LONG(tmp);
02345         }
02346     }
02347     else {
02348         result = rb_str_cmp(str1, str2);
02349     }
02350     return LONG2NUM(result);
02351 }
02352 
02353 /*
02354  *  call-seq:
02355  *     str.casecmp(other_str)   -> -1, 0, +1 or nil
02356  *
02357  *  Case-insensitive version of <code>String#<=></code>.
02358  *
02359  *     "abcdef".casecmp("abcde")     #=> 1
02360  *     "aBcDeF".casecmp("abcdef")    #=> 0
02361  *     "abcdef".casecmp("abcdefg")   #=> -1
02362  *     "abcdef".casecmp("ABCDEF")    #=> 0
02363  */
02364 
02365 static VALUE
02366 rb_str_casecmp(VALUE str1, VALUE str2)
02367 {
02368     long len;
02369     rb_encoding *enc;
02370     char *p1, *p1end, *p2, *p2end;
02371 
02372     StringValue(str2);
02373     enc = rb_enc_compatible(str1, str2);
02374     if (!enc) {
02375         return Qnil;
02376     }
02377 
02378     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
02379     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
02380     if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
02381         while (p1 < p1end && p2 < p2end) {
02382             if (*p1 != *p2) {
02383                 unsigned int c1 = TOUPPER(*p1 & 0xff);
02384                 unsigned int c2 = TOUPPER(*p2 & 0xff);
02385                 if (c1 != c2)
02386                     return INT2FIX(c1 < c2 ? -1 : 1);
02387             }
02388             p1++;
02389             p2++;
02390         }
02391     }
02392     else {
02393         while (p1 < p1end && p2 < p2end) {
02394             int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
02395             int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
02396 
02397             if (0 <= c1 && 0 <= c2) {
02398                 c1 = TOUPPER(c1);
02399                 c2 = TOUPPER(c2);
02400                 if (c1 != c2)
02401                     return INT2FIX(c1 < c2 ? -1 : 1);
02402             }
02403             else {
02404                 int r;
02405                 l1 = rb_enc_mbclen(p1, p1end, enc);
02406                 l2 = rb_enc_mbclen(p2, p2end, enc);
02407                 len = l1 < l2 ? l1 : l2;
02408                 r = memcmp(p1, p2, len);
02409                 if (r != 0)
02410                     return INT2FIX(r < 0 ? -1 : 1);
02411                 if (l1 != l2)
02412                     return INT2FIX(l1 < l2 ? -1 : 1);
02413             }
02414             p1 += l1;
02415             p2 += l2;
02416         }
02417     }
02418     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
02419     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
02420     return INT2FIX(-1);
02421 }
02422 
/*
 * Search +str+ for +sub+, starting +offset+ characters into +str+
 * (a negative +offset+ counts from the end).
 *
 * Returns -1 when nothing is found.  On success the value returned is an
 * offset in bytes (it is built from pointer differences and the result
 * of rb_memsearch()); rb_str_index_m() converts it with rb_str_sublen().
 */
02423 static long
02424 rb_str_index(VALUE str, VALUE sub, long offset)
02425 {
02426     long pos;
02427     char *s, *sptr, *e;
02428     long len, slen;
02429     rb_encoding *enc;
02430 
02431     enc = rb_enc_check(str, sub);
          /* a needle flagged as broken is never reported as found */
02432     if (is_broken_string(sub)) {
02433         return -1;
02434     }
          /* lengths from str_strlen(), the same unit as the incoming offset */
02435     len = str_strlen(str, enc);
02436     slen = str_strlen(sub, enc);
02437     if (offset < 0) {
02438         offset += len;
02439         if (offset < 0) return -1;
02440     }
          /* not enough room left after offset for sub to fit */
02441     if (len - offset < slen) return -1;
02442     s = RSTRING_PTR(str);
02443     e = s + RSTRING_LEN(str);
02444     if (offset) {
              /* from here on offset is a byte offset into str */
02445         offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
02446         s += offset;
02447     }
          /* an empty needle matches right at the starting position */
02448     if (slen == 0) return offset;
02449     /* need proceed one character at a time */
          /* switch len/slen to byte counts for the raw memory search */
02450     sptr = RSTRING_PTR(sub);
02451     slen = RSTRING_LEN(sub);
02452     len = RSTRING_LEN(str) - offset;
02453     for (;;) {
02454         char *t;
02455         pos = rb_memsearch(sptr, slen, s, len, enc);
02456         if (pos < 0) return pos;
              /* accept the hit only if rb_enc_right_char_head() leaves it
               * unmoved; presumably that means it starts on a character
               * boundary */
02457         t = rb_enc_right_char_head(s, s+pos, e, enc);
02458         if (t == s + pos) break;
              /* otherwise restart the search from t */
02459         if ((len -= t - s) <= 0) return -1;
02460         offset += t - s;
02461         s = t;
02462     }
02463     return pos + offset;
02464 }
02465 
02466 
02467 /*
02468  *  call-seq:
02469  *     str.index(substring [, offset])   -> fixnum or nil
02470  *     str.index(regexp [, offset])      -> fixnum or nil
02471  *
02472  *  Returns the index of the first occurrence of the given <i>substring</i> or
02473  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
02474  *  found. If the second parameter is present, it specifies the position in the
02475  *  string to begin the search.
02476  *
02477  *     "hello".index('e')             #=> 1
02478  *     "hello".index('lo')            #=> 3
02479  *     "hello".index('a')             #=> nil
02480  *     "hello".index(?e)              #=> 1
02481  *     "hello".index(/[aeiou]/, -3)   #=> 4
02482  */
02483 
02484 static VALUE
02485 rb_str_index_m(int argc, VALUE *argv, VALUE str)
02486 {
02487     VALUE sub;
02488     VALUE initpos;
02489     long pos;
02490 
02491     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
02492         pos = NUM2LONG(initpos);
02493     }
02494     else {
02495         pos = 0;
02496     }
02497     if (pos < 0) {
02498         pos += str_strlen(str, STR_ENC_GET(str));
02499         if (pos < 0) {
02500             if (TYPE(sub) == T_REGEXP) {
02501                 rb_backref_set(Qnil);
02502             }
02503             return Qnil;
02504         }
02505     }
02506 
02507     switch (TYPE(sub)) {
02508       case T_REGEXP:
02509         if (pos > str_strlen(str, STR_ENC_GET(str)))
02510             return Qnil;
02511         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02512                          rb_enc_check(str, sub), single_byte_optimizable(str));
02513 
02514         pos = rb_reg_search(sub, str, pos, 0);
02515         pos = rb_str_sublen(str, pos);
02516         break;
02517 
02518       default: {
02519         VALUE tmp;
02520 
02521         tmp = rb_check_string_type(sub);
02522         if (NIL_P(tmp)) {
02523             rb_raise(rb_eTypeError, "type mismatch: %s given",
02524                      rb_obj_classname(sub));
02525         }
02526         sub = tmp;
02527       }
02528         /* fall through */
02529       case T_STRING:
02530         pos = rb_str_index(str, sub, pos);
02531         pos = rb_str_sublen(str, pos);
02532         break;
02533     }
02534 
02535     if (pos == -1) return Qnil;
02536     return LONG2NUM(pos);
02537 }
02538 
02539 static long
02540 rb_str_rindex(VALUE str, VALUE sub, long pos)
02541 {
02542     long len, slen;
02543     char *s, *sbeg, *e, *t;
02544     rb_encoding *enc;
02545     int singlebyte = single_byte_optimizable(str);
02546 
02547     enc = rb_enc_check(str, sub);
02548     if (is_broken_string(sub)) {
02549         return -1;
02550     }
02551     len = str_strlen(str, enc);
02552     slen = str_strlen(sub, enc);
02553     /* substring longer than string */
02554     if (len < slen) return -1;
02555     if (len - pos < slen) {
02556         pos = len - slen;
02557     }
02558     if (len == 0) {
02559         return pos;
02560     }
02561     sbeg = RSTRING_PTR(str);
02562     e = RSTRING_END(str);
02563     t = RSTRING_PTR(sub);
02564     slen = RSTRING_LEN(sub);
02565     s = str_nth(sbeg, e, pos, enc, singlebyte);
02566     while (s) {
02567         if (memcmp(s, t, slen) == 0) {
02568             return pos;
02569         }
02570         if (pos == 0) break;
02571         pos--;
02572         s = rb_enc_prev_char(sbeg, s, e, enc);
02573     }
02574     return -1;
02575 }
02576 
02577 
02578 /*
02579  *  call-seq:
02580  *     str.rindex(substring [, fixnum])   -> fixnum or nil
02581  *     str.rindex(regexp [, fixnum])   -> fixnum or nil
02582  *
02583  *  Returns the index of the last occurrence of the given <i>substring</i> or
02584  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
02585  *  found. If the second parameter is present, it specifies the position in the
02586  *  string to end the search---characters beyond this point will not be
02587  *  considered.
02588  *
02589  *     "hello".rindex('e')             #=> 1
02590  *     "hello".rindex('l')             #=> 3
02591  *     "hello".rindex('a')             #=> nil
02592  *     "hello".rindex(?e)              #=> 1
02593  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
02594  */
02595 
02596 static VALUE
02597 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
02598 {
02599     VALUE sub;
02600     VALUE vpos;
02601     rb_encoding *enc = STR_ENC_GET(str);
02602     long pos, len = str_strlen(str, enc);
02603 
02604     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
02605         pos = NUM2LONG(vpos);
02606         if (pos < 0) {
02607             pos += len;
02608             if (pos < 0) {
02609                 if (TYPE(sub) == T_REGEXP) {
02610                     rb_backref_set(Qnil);
02611                 }
02612                 return Qnil;
02613             }
02614         }
02615         if (pos > len) pos = len;
02616     }
02617     else {
02618         pos = len;
02619     }
02620 
02621     switch (TYPE(sub)) {
02622       case T_REGEXP:
02623         /* enc = rb_get_check(str, sub); */
02624         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02625                          STR_ENC_GET(str), single_byte_optimizable(str));
02626 
02627         if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
02628             pos = rb_reg_search(sub, str, pos, 1);
02629             pos = rb_str_sublen(str, pos);
02630         }
02631         if (pos >= 0) return LONG2NUM(pos);
02632         break;
02633 
02634       default: {
02635         VALUE tmp;
02636 
02637         tmp = rb_check_string_type(sub);
02638         if (NIL_P(tmp)) {
02639             rb_raise(rb_eTypeError, "type mismatch: %s given",
02640                      rb_obj_classname(sub));
02641         }
02642         sub = tmp;
02643       }
02644         /* fall through */
02645       case T_STRING:
02646         pos = rb_str_rindex(str, sub, pos);
02647         if (pos >= 0) return LONG2NUM(pos);
02648         break;
02649     }
02650     return Qnil;
02651 }
02652 
02653 /*
02654  *  call-seq:
02655  *     str =~ obj   -> fixnum or nil
02656  *
02657  *  Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
02658  *  against <i>str</i>, and returns the position where the match starts, or
02659  *  <code>nil</code> if there is no match. Otherwise, invokes
02660  *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
02661  *  <code>=~</code> in <code>Object</code> returns <code>nil</code>.
02662  *
02663  *     "cat o' 9 tails" =~ /\d/   #=> 7
02664  *     "cat o' 9 tails" =~ 9      #=> nil
02665  */
02666 
02667 static VALUE
02668 rb_str_match(VALUE x, VALUE y)
02669 {
02670     switch (TYPE(y)) {
02671       case T_STRING:
02672         rb_raise(rb_eTypeError, "type mismatch: String given");
02673 
02674       case T_REGEXP:
02675         return rb_reg_match(y, x);
02676 
02677       default:
02678         return rb_funcall(y, rb_intern("=~"), 1, x);
02679     }
02680 }
02681 
02682 
/* Forward declaration: rb_str_match_m() below calls get_pat() before any definition of it appears. */
02683 static VALUE get_pat(VALUE, int);
02684 
02685 
02686 /*
02687  *  call-seq:
02688  *     str.match(pattern)        -> matchdata or nil
02689  *     str.match(pattern, pos)   -> matchdata or nil
02690  *
02691  *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
02692  *  then invokes its <code>match</code> method on <i>str</i>.  If the second
02693  *  parameter is present, it specifies the position in the string to begin the
02694  *  search.
02695  *
02696  *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
02697  *     'hello'.match('(.)\1')[0]   #=> "ll"
02698  *     'hello'.match(/(.)\1/)[0]   #=> "ll"
02699  *     'hello'.match('xx')         #=> nil
02700  *
02701  *  If a block is given, invokes the block with the MatchData if the match succeeds, so
02702  *  that you can write
02703  *
02704  *     str.match(pat) {|m| ...}
02705  *
02706  *  instead of
02707  *
02708  *     if m = str.match(pat)
02709  *       ...
02710  *     end
02711  *
02712  *  The return value is a value from block execution in this case.
02713  */
02714 
02715 static VALUE
02716 rb_str_match_m(int argc, VALUE *argv, VALUE str)
02717 {
02718     VALUE re, result;
02719     if (argc < 1)
02720        rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
02721     re = argv[0];
02722     argv[0] = str;
02723     result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
02724     if (!NIL_P(result) && rb_block_given_p()) {
02725         return rb_yield(result);
02726     }
02727     return result;
02728 }
02729 
/* Result codes of the enc_succ_char() / enc_pred_char() / enc_succ_alnum_char() helpers below. */
02730 enum neighbor_char {
02731     NEIGHBOR_NOT_CHAR,   /* no usable neighbor: returned by enc_succ_alnum_char() for a non-alnum character or a one-character range */
02732     NEIGHBOR_FOUND,      /* the buffer now holds the neighboring character */
02733     NEIGHBOR_WRAPPED     /* the search ran past the end of its range and wrapped around */
02734 };
02735 
/*
 * Overwrite the +len+ bytes at +p+ with the next byte sequence that
 * rb_enc_precise_mbclen() accepts as one complete character of exactly
 * +len+ bytes in +enc+.
 *
 * The bytes are stepped like a big-endian counter.  Returns
 * NEIGHBOR_FOUND on success, or NEIGHBOR_WRAPPED when every byte was 0xff
 * (the buffer is then all zero bytes).
 */
02736 static enum neighbor_char
02737 enc_succ_char(char *p, long len, rb_encoding *enc)
02738 {
02739     long i;
02740     int l;
02741     while (1) {
              /* carry: trailing 0xff bytes roll over to 0 */
02742         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
02743             p[i] = '\0';
              /* every byte rolled over: nothing left to increment */
02744         if (i < 0)
02745             return NEIGHBOR_WRAPPED;
02746         ++((unsigned char*)p)[i];
02747         l = rb_enc_precise_mbclen(p, p+len, enc);
02748         if (MBCLEN_CHARFOUND_P(l)) {
02749             l = MBCLEN_CHARFOUND_LEN(l);
02750             if (l == len) {
02751                 return NEIGHBOR_FOUND;
02752             }
02753             else {
                      /* a shorter character was found: fill the tail with
                       * 0xff so the next step carries into its l bytes */
02754                 memset(p+l, 0xff, len-l);
02755             }
02756         }
02757         if (MBCLEN_INVALID_P(l) && i < len-1) {
02758             long len2;
02759             int l2;
                  /* look for the longest prefix not reported as invalid ... */
02760             for (len2 = len-1; 0 < len2; len2--) {
02761                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02762                 if (!MBCLEN_INVALID_P(l2))
02763                     break;
02764             }
                  /* ... and set the bytes after index len2 to 0xff so the
                   * next step skips the remaining combinations there */
02765             memset(p+len2+1, 0xff, len-(len2+1));
02766         }
02767     }
02768 }
02769 
/*
 * Counterpart of enc_succ_char() that steps downward: overwrite the
 * +len+ bytes at +p+ with the previous byte sequence that
 * rb_enc_precise_mbclen() accepts as one complete character of exactly
 * +len+ bytes in +enc+.
 *
 * Returns NEIGHBOR_FOUND on success, or NEIGHBOR_WRAPPED when every byte
 * was 0 (the buffer is then all 0xff bytes).
 */
02770 static enum neighbor_char
02771 enc_pred_char(char *p, long len, rb_encoding *enc)
02772 {
02773     long i;
02774     int l;
02775     while (1) {
              /* borrow: trailing zero bytes roll under to 0xff */
02776         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
02777             p[i] = '\xff';
              /* every byte rolled under: nothing left to decrement */
02778         if (i < 0)
02779             return NEIGHBOR_WRAPPED;
02780         --((unsigned char*)p)[i];
02781         l = rb_enc_precise_mbclen(p, p+len, enc);
02782         if (MBCLEN_CHARFOUND_P(l)) {
02783             l = MBCLEN_CHARFOUND_LEN(l);
02784             if (l == len) {
02785                 return NEIGHBOR_FOUND;
02786             }
02787             else {
                      /* a shorter character was found: zero the tail so the
                       * next step borrows from its l bytes */
02788                 memset(p+l, 0, len-l);
02789             }
02790         }
02791         if (MBCLEN_INVALID_P(l) && i < len-1) {
02792             long len2;
02793             int l2;
                  /* look for the longest prefix not reported as invalid ... */
02794             for (len2 = len-1; 0 < len2; len2--) {
02795                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02796                 if (!MBCLEN_INVALID_P(l2))
02797                     break;
02798             }
                  /* ... and zero the bytes after index len2 so the next
                   * step skips the remaining combinations there */
02799             memset(p+len2+1, 0, len-(len2+1));
02800         }
02801     }
02802 }
02803 
02804 /*
02805   overwrite +p+ by succeeding letter in +enc+ and returns
02806   NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
02807   When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
02808   assuming each ranges are successive, and mbclen
02809   never change in each ranges.
02810   NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
02811   character.
02812  */
02813 static enum neighbor_char
02814 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
02815 {
02816     enum neighbor_char ret;
02817     unsigned int c;
02818     int ctype;
02819     int range;
02820     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
02821 
02822     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02823     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
02824         ctype = ONIGENC_CTYPE_DIGIT;
02825     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
02826         ctype = ONIGENC_CTYPE_ALPHA;
02827     else
02828         return NEIGHBOR_NOT_CHAR;
02829 
02830     MEMCPY(save, p, char, len);
02831     ret = enc_succ_char(p, len, enc);
02832     if (ret == NEIGHBOR_FOUND) {
02833         c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02834         if (rb_enc_isctype(c, ctype, enc))
02835             return NEIGHBOR_FOUND;
02836     }
02837     MEMCPY(p, save, char, len);
02838     range = 1;
02839     while (1) {
02840         MEMCPY(save, p, char, len);
02841         ret = enc_pred_char(p, len, enc);
02842         if (ret == NEIGHBOR_FOUND) {
02843             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02844             if (!rb_enc_isctype(c, ctype, enc)) {
02845                 MEMCPY(p, save, char, len);
02846                 break;
02847             }
02848         }
02849         else {
02850             MEMCPY(p, save, char, len);
02851             break;
02852         }
02853         range++;
02854     }
02855     if (range == 1) {
02856         return NEIGHBOR_NOT_CHAR;
02857     }
02858 
02859     if (ctype != ONIGENC_CTYPE_DIGIT) {
02860         MEMCPY(carry, p, char, len);
02861         return NEIGHBOR_WRAPPED;
02862     }
02863 
02864     MEMCPY(carry, p, char, len);
02865     enc_succ_char(carry, len, enc);
02866     return NEIGHBOR_WRAPPED;
02867 }
02868 
02869 
02870 /*
02871  *  call-seq:
02872  *     str.succ   -> new_str
02873  *     str.next   -> new_str
02874  *
02875  *  Returns the successor to <i>str</i>. The successor is calculated by
02876  *  incrementing characters starting from the rightmost alphanumeric (or
02877  *  the rightmost character if there are no alphanumerics) in the
02878  *  string. Incrementing a digit always results in another digit, and
02879  *  incrementing a letter results in another letter of the same case.
02880  *  Incrementing nonalphanumerics uses the underlying character set's
02881  *  collating sequence.
02882  *
02883  *  If the increment generates a ``carry,'' the character to the left of
02884  *  it is incremented. This process repeats until there is no carry,
02885  *  adding an additional character if necessary.
02886  *
02887  *     "abcd".succ        #=> "abce"
02888  *     "THX1138".succ     #=> "THX1139"
02889  *     "<<koala>>".succ   #=> "<<koalb>>"
02890  *     "1999zzz".succ     #=> "2000aaa"
02891  *     "ZZZ9999".succ     #=> "AAAA0000"
02892  *     "***".succ         #=> "**+"
02893  */
02894 
02895 VALUE
02896 rb_str_succ(VALUE orig)
02897 {
02898     rb_encoding *enc;
02899     VALUE str;
02900     char *sbeg, *s, *e, *last_alnum = 0;
          /* c stays -1 until an alphanumeric character has wrapped */
02901     int c = -1;
02902     long l;
          /* character to insert on overflow, plus where and how long;
           * the default is the single byte \1 at the front */
02903     char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
02904     long carry_pos = 0, carry_len = 1;
02905     enum neighbor_char neighbor = NEIGHBOR_FOUND;
02906 
          /* work on a copy; orig itself is never modified here */
02907     str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
02908     rb_enc_cr_str_copy_for_substr(str, orig);
02909     OBJ_INFECT(str, orig);
02910     if (RSTRING_LEN(str) == 0) return str;
02911 
02912     enc = STR_ENC_GET(orig);
02913     sbeg = RSTRING_PTR(str);
02914     s = e = sbeg + RSTRING_LEN(str);
02915 
          /* pass 1: step through the characters from right to left and
           * increment alphanumerics, continuing leftwards on each wrap */
02916     while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
              /* after skipping a non-alnum: stop when this character and
               * the last wrapped one are alpha vs. digit (either way) */
02917         if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
02918             if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
02919                 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
02920                 s = last_alnum;
02921                 break;
02922             }
02923         }
02924         if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02925         neighbor = enc_succ_alnum_char(s, l, enc, carry);
02926         switch (neighbor) {
02927           case NEIGHBOR_NOT_CHAR:
                  /* not an alphanumeric: leave it and look further left */
02928             continue;
02929           case NEIGHBOR_FOUND:
                  /* incremented without a carry: finished */
02930             return str;
02931           case NEIGHBOR_WRAPPED:
02932             last_alnum = s;
02933             break;
02934         }
              /* remember where the pending carry would be inserted */
02935         c = 1;
02936         carry_pos = s - sbeg;
02937         carry_len = l;
02938     }
02939     if (c == -1) {              /* str contains no alnum */
              /* pass 2: increment the rightmost character as plain bytes,
               * again moving left while characters wrap */
02940         s = e;
02941         while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
02942             enum neighbor_char neighbor;
02943             if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02944             neighbor = enc_succ_char(s, l, enc);
02945             if (neighbor == NEIGHBOR_FOUND)
02946                 return str;
02947             if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
02948                 /* wrapped to \0...\0.  search next valid char. */
02949                 enc_succ_char(s, l, enc);
02950             }
                  /* for non ASCII-compatible encodings the carry is this
                   * wrapped character instead of the default \1 */
02951             if (!rb_enc_asciicompat(enc)) {
02952                 MEMCPY(carry, s, char, l);
02953                 carry_len = l;
02954             }
02955             carry_pos = s - sbeg;
02956         }
02957     }
          /* every candidate wrapped: grow the string and insert the carry
           * at carry_pos, shifting the remainder to the right */
02958     RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
          /* recompute s from the offset after the resize */
02959     s = RSTRING_PTR(str) + carry_pos;
02960     memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
02961     memmove(s, carry, carry_len);
02962     STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
02963     RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
02964     rb_enc_str_coderange(str);
02965     return str;
02966 }
02967 
02968 
02969 /*
02970  *  call-seq:
02971  *     str.succ!   -> str
02972  *     str.next!   -> str
02973  *
02974  *  Equivalent to <code>String#succ</code>, but modifies the receiver in
02975  *  place.
02976  */
02977 
02978 static VALUE
02979 rb_str_succ_bang(VALUE str)
02980 {
02981     rb_str_shared_replace(str, rb_str_succ(str));
02982 
02983     return str;
02984 }
02985 
02986 
02987 /*
02988  *  call-seq:
02989  *     str.upto(other_str, exclusive=false) {|s| block }   -> str
02990  *     str.upto(other_str, exclusive=false)                -> an_enumerator
02991  *
02992  *  Iterates through successive values, starting at <i>str</i> and
02993  *  ending at <i>other_str</i> inclusive, passing each value in turn to
02994  *  the block. The <code>String#succ</code> method is used to generate
02995  *  each value.  If optional second argument exclusive is omitted or is false,
02996  *  the last value will be included; otherwise it will be excluded.
02997  *
02998  *  If no block is given, an enumerator is returned instead.
02999  *
03000  *     "a8".upto("b6") {|s| print s, ' ' }
03001  *     for s in "a8".."b6"
03002  *       print s, ' '
03003  *     end
03004  *
03005  *  <em>produces:</em>
03006  *
03007  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
03008  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
03009  *
03010  *  If <i>str</i> and <i>other_str</i> contain only ASCII numeric characters,
03011  *  both are recognized as decimal numbers. In addition, the width of
03012  *  string (e.g. leading zeros) is handled appropriately.
03013  *
03014  *     "9".upto("11").to_a   #=> ["9", "10", "11"]
03015  *     "25".upto("5").to_a   #=> []
03016  *     "07".upto("11").to_a  #=> ["07", "08", "09", "10", "11"]
03017  */
03018 
03019 static VALUE
03020 rb_str_upto(int argc, VALUE *argv, VALUE beg)
03021 {
03022     VALUE end, exclusive;
03023     VALUE current, after_end;
03024     ID succ;
03025     int n, excl, ascii;
03026     rb_encoding *enc;
03027 
03028     rb_scan_args(argc, argv, "11", &end, &exclusive);
03029     RETURN_ENUMERATOR(beg, argc, argv);
03030     excl = RTEST(exclusive);
03031     CONST_ID(succ, "succ");
03032     StringValue(end);
03033     enc = rb_enc_check(beg, end);
03034     ascii = (is_ascii_string(beg) && is_ascii_string(end));
03035     /* single character */
03036     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
03037         char c = RSTRING_PTR(beg)[0];
03038         char e = RSTRING_PTR(end)[0];
03039 
03040         if (c > e || (excl && c == e)) return beg;
03041         for (;;) {
03042             rb_yield(rb_enc_str_new(&c, 1, enc));
03043             if (!excl && c == e) break;
03044             c++;
03045             if (excl && c == e) break;
03046         }
03047         return beg;
03048     }
03049     /* both edges are all digits */
03050     if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
03051         char *s, *send;
03052         VALUE b, e;
03053         int width;
03054 
03055         s = RSTRING_PTR(beg); send = RSTRING_END(beg);
03056         width = rb_long2int(send - s);
03057         while (s < send) {
03058             if (!ISDIGIT(*s)) goto no_digits;
03059             s++;
03060         }
03061         s = RSTRING_PTR(end); send = RSTRING_END(end);
03062         while (s < send) {
03063             if (!ISDIGIT(*s)) goto no_digits;
03064             s++;
03065         }
03066         b = rb_str_to_inum(beg, 10, FALSE);
03067         e = rb_str_to_inum(end, 10, FALSE);
03068         if (FIXNUM_P(b) && FIXNUM_P(e)) {
03069             long bi = FIX2LONG(b);
03070             long ei = FIX2LONG(e);
03071             rb_encoding *usascii = rb_usascii_encoding();
03072 
03073             while (bi <= ei) {
03074                 if (excl && bi == ei) break;
03075                 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
03076                 bi++;
03077             }
03078         }
03079         else {
03080             ID op = excl ? '<' : rb_intern("<=");
03081             VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
03082 
03083             args[0] = INT2FIX(width);
03084             while (rb_funcall(b, op, 1, e)) {
03085                 args[1] = b;
03086                 rb_yield(rb_str_format(numberof(args), args, fmt));
03087                 b = rb_funcall(b, succ, 0, 0);
03088             }
03089         }
03090         return beg;
03091     }
03092     /* normal case */
03093   no_digits:
03094     n = rb_str_cmp(beg, end);
03095     if (n > 0 || (excl && n == 0)) return beg;
03096 
03097     after_end = rb_funcall(end, succ, 0, 0);
03098     current = rb_str_dup(beg);
03099     while (!rb_str_equal(current, after_end)) {
03100         VALUE next = Qnil;
03101         if (excl || !rb_str_equal(current, end))
03102             next = rb_funcall(current, succ, 0, 0);
03103         rb_yield(current);
03104         if (NIL_P(next)) break;
03105         current = next;
03106         StringValue(current);
03107         if (excl && rb_str_equal(current, end)) break;
03108         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
03109             break;
03110     }
03111 
03112     return beg;
03113 }
03114 
03115 static VALUE
03116 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
03117 {
03118     if (rb_reg_search(re, str, 0, 0) >= 0) {
03119         VALUE match = rb_backref_get();
03120         int nth = rb_reg_backref_number(match, backref);
03121         return rb_reg_nth_match(nth, match);
03122     }
03123     return Qnil;
03124 }
03125 
03126 static VALUE
03127 rb_str_aref(VALUE str, VALUE indx)
03128 {
03129     long idx;
03130 
03131     switch (TYPE(indx)) {
03132       case T_FIXNUM:
03133         idx = FIX2LONG(indx);
03134 
03135       num_index:
03136         str = rb_str_substr(str, idx, 1);
03137         if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
03138         return str;
03139 
03140       case T_REGEXP:
03141         return rb_str_subpat(str, indx, INT2FIX(0));
03142 
03143       case T_STRING:
03144         if (rb_str_index(str, indx, 0) != -1)
03145             return rb_str_dup(indx);
03146         return Qnil;
03147 
03148       default:
03149         /* check if indx is Range */
03150         {
03151             long beg, len;
03152             VALUE tmp;
03153 
03154             len = str_strlen(str, STR_ENC_GET(str));
03155             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
03156               case Qfalse:
03157                 break;
03158               case Qnil:
03159                 return Qnil;
03160               default:
03161                 tmp = rb_str_substr(str, beg, len);
03162                 return tmp;
03163             }
03164         }
03165         idx = NUM2LONG(indx);
03166         goto num_index;
03167     }
03168     return Qnil;                /* not reached */
03169 }
03170 
03171 
03172 /*
03173  *  call-seq:
03174  *     str[fixnum]                 -> new_str or nil
03175  *     str[fixnum, fixnum]         -> new_str or nil
03176  *     str[range]                  -> new_str or nil
03177  *     str[regexp]                 -> new_str or nil
03178  *     str[regexp, fixnum]         -> new_str or nil
03179  *     str[other_str]              -> new_str or nil
03180  *     str.slice(fixnum)           -> new_str or nil
03181  *     str.slice(fixnum, fixnum)   -> new_str or nil
03182  *     str.slice(range)            -> new_str or nil
03183  *     str.slice(regexp)           -> new_str or nil
03184  *     str.slice(regexp, fixnum)   -> new_str or nil
03185  *     str.slice(regexp, capname)  -> new_str or nil
03186  *     str.slice(other_str)        -> new_str or nil
03187  *
03188  *  Element Reference---If passed a single <code>Fixnum</code>, returns a
03189  *  substring of one character at that position. If passed two <code>Fixnum</code>
03190  *  objects, returns a substring starting at the offset given by the first, and
03191  *  with a length given by the second. If passed a range, its beginning and end
03192  *  are interpreted as offsets delimiting the substring to be returned. In all
03193  *  three cases, if an offset is negative, it is counted from the end of <i>str</i>.
03194  *  Returns <code>nil</code> if the initial offset falls outside the string or
03195  *  the length is negative.
03196  *
03197  *  If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
03198  *  returned. If a numeric or name parameter follows the regular expression, that
03199  *  component of the <code>MatchData</code> is returned instead. If a
03200  *  <code>String</code> is given, that string is returned if it occurs in
03201  *  <i>str</i>. In both cases, <code>nil</code> is returned if there is no
03202  *  match.
03203  *
03204  *     a = "hello there"
03205  *     a[1]                   #=> "e"
03206  *     a[2, 3]                #=> "llo"
03207  *     a[2..3]                #=> "ll"
03208  *     a[-3, 2]               #=> "er"
03209  *     a[7..-2]               #=> "her"
03210  *     a[-4..-2]              #=> "her"
03211  *     a[-2..-4]              #=> ""
03212  *     a[12..-1]              #=> nil
03213  *     a[/[aeiou](.)\1/]      #=> "ell"
03214  *     a[/[aeiou](.)\1/, 0]   #=> "ell"
03215  *     a[/[aeiou](.)\1/, 1]   #=> "l"
03216  *     a[/[aeiou](.)\1/, 2]   #=> nil
03217  *     a["lo"]                #=> "lo"
03218  *     a["bye"]               #=> nil
03219  */
03220 
03221 static VALUE
03222 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
03223 {
03224     if (argc == 2) {
03225         if (TYPE(argv[0]) == T_REGEXP) {
03226             return rb_str_subpat(str, argv[0], argv[1]);
03227         }
03228         return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
03229     }
03230     if (argc != 1) {
03231         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03232     }
03233     return rb_str_aref(str, argv[0]);
03234 }
03235 
03236 VALUE
03237 rb_str_drop_bytes(VALUE str, long len)
03238 {
03239     char *ptr = RSTRING_PTR(str);
03240     long olen = RSTRING_LEN(str), nlen;
03241 
03242     str_modifiable(str);
03243     if (len > olen) len = olen;
03244     nlen = olen - len;
03245     if (nlen <= RSTRING_EMBED_LEN_MAX) {
03246         char *oldptr = ptr;
03247         int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
03248         STR_SET_EMBED(str);
03249         STR_SET_EMBED_LEN(str, nlen);
03250         ptr = RSTRING(str)->as.ary;
03251         memmove(ptr, oldptr + len, nlen);
03252         if (fl == STR_NOEMBED) xfree(oldptr);
03253     }
03254     else {
03255         if (!STR_SHARED_P(str)) rb_str_new4(str);
03256         ptr = RSTRING(str)->as.heap.ptr += len;
03257         RSTRING(str)->as.heap.len = nlen;
03258     }
03259     ptr[nlen] = 0;
03260     ENC_CODERANGE_CLEAR(str);
03261     return str;
03262 }
03263 
03264 static void
03265 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
03266 {
03267     if (beg == 0 && RSTRING_LEN(val) == 0) {
03268         rb_str_drop_bytes(str, len);
03269         OBJ_INFECT(str, val);
03270         return;
03271     }
03272 
03273     rb_str_modify(str);
03274     if (len < RSTRING_LEN(val)) {
03275         /* expand string */
03276         RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
03277     }
03278 
03279     if (RSTRING_LEN(val) != len) {
03280         memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
03281                 RSTRING_PTR(str) + beg + len,
03282                 RSTRING_LEN(str) - (beg + len));
03283     }
03284     if (RSTRING_LEN(val) < beg && len < 0) {
03285         MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
03286     }
03287     if (RSTRING_LEN(val) > 0) {
03288         memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
03289     }
03290     STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
03291     if (RSTRING_PTR(str)) {
03292         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03293     }
03294     OBJ_INFECT(str, val);
03295 }
03296 
03297 static void
03298 rb_str_splice(VALUE str, long beg, long len, VALUE val)
03299 {
03300     long slen;
03301     char *p, *e;
03302     rb_encoding *enc;
03303     int singlebyte = single_byte_optimizable(str);
03304     int cr;
03305 
03306     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
03307 
03308     StringValue(val);
03309     enc = rb_enc_check(str, val);
03310     slen = str_strlen(str, enc);
03311 
03312     if (slen < beg) {
03313       out_of_range:
03314         rb_raise(rb_eIndexError, "index %ld out of string", beg);
03315     }
03316     if (beg < 0) {
03317         if (-beg > slen) {
03318             goto out_of_range;
03319         }
03320         beg += slen;
03321     }
03322     if (slen < len || slen < beg + len) {
03323         len = slen - beg;
03324     }
03325     str_modify_keep_cr(str);
03326     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
03327     if (!p) p = RSTRING_END(str);
03328     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
03329     if (!e) e = RSTRING_END(str);
03330     /* error check */
03331     beg = p - RSTRING_PTR(str); /* physical position */
03332     len = e - p;                /* physical length */
03333     rb_str_splice_0(str, beg, len, val);
03334     rb_enc_associate(str, enc);
03335     cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
03336     if (cr != ENC_CODERANGE_BROKEN)
03337         ENC_CODERANGE_SET(str, cr);
03338 }
03339 
03340 void
03341 rb_str_update(VALUE str, long beg, long len, VALUE val)
03342 {
          /* Non-static wrapper: forwards all arguments unchanged to the
           * file-local rb_str_splice(). */
03343     rb_str_splice(str, beg, len, val);
03344 }
03345 
03346 static void
03347 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
03348 {
03349     int nth;
03350     VALUE match;
03351     long start, end, len;
03352     rb_encoding *enc;
03353     struct re_registers *regs;
03354 
03355     if (rb_reg_search(re, str, 0, 0) < 0) {
03356         rb_raise(rb_eIndexError, "regexp not matched");
03357     }
03358     match = rb_backref_get();
03359     nth = rb_reg_backref_number(match, backref);
03360     regs = RMATCH_REGS(match);
03361     if (nth >= regs->num_regs) {
03362       out_of_range:
03363         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
03364     }
03365     if (nth < 0) {
03366         if (-nth >= regs->num_regs) {
03367             goto out_of_range;
03368         }
03369         nth += regs->num_regs;
03370     }
03371 
03372     start = BEG(nth);
03373     if (start == -1) {
03374         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
03375     }
03376     end = END(nth);
03377     len = end - start;
03378     StringValue(val);
03379     enc = rb_enc_check(str, val);
03380     rb_str_splice_0(str, start, len, val);
03381     rb_enc_associate(str, enc);
03382 }
03383 
03384 static VALUE
03385 rb_str_aset(VALUE str, VALUE indx, VALUE val)
03386 {
03387     long idx, beg;
03388 
03389     switch (TYPE(indx)) {
03390       case T_FIXNUM:
03391         idx = FIX2LONG(indx);
03392       num_index:
03393         rb_str_splice(str, idx, 1, val);
03394         return val;
03395 
03396       case T_REGEXP:
03397         rb_str_subpat_set(str, indx, INT2FIX(0), val);
03398         return val;
03399 
03400       case T_STRING:
03401         beg = rb_str_index(str, indx, 0);
03402         if (beg < 0) {
03403             rb_raise(rb_eIndexError, "string not matched");
03404         }
03405         beg = rb_str_sublen(str, beg);
03406         rb_str_splice(str, beg, str_strlen(indx, 0), val);
03407         return val;
03408 
03409       default:
03410         /* check if indx is Range */
03411         {
03412             long beg, len;
03413             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
03414                 rb_str_splice(str, beg, len, val);
03415                 return val;
03416             }
03417         }
03418         idx = NUM2LONG(indx);
03419         goto num_index;
03420     }
03421 }
03422 
03423 /*
03424  *  call-seq:
03425  *     str[fixnum] = new_str
03426  *     str[fixnum, fixnum] = new_str
03427  *     str[range] = aString
03428  *     str[regexp] = new_str
03429  *     str[regexp, fixnum] = new_str
03430  *     str[regexp, name] = new_str
03431  *     str[other_str] = new_str
03432  *
03433  *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
03434  *  portion of the string affected is determined using the same criteria as
03435  *  <code>String#[]</code>. If the replacement string is not the same length as
03436  *  the text it is replacing, the string will be adjusted accordingly. If the
03437  *  regular expression or string is used as the index doesn't match a position
03438  *  in the string, <code>IndexError</code> is raised. If the regular expression
03439  *  form is used, the optional second <code>Fixnum</code> allows you to specify
03440  *  which portion of the match to replace (effectively using the
03441  *  <code>MatchData</code> indexing rules). The forms that take a
03442  *  <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
03443  *  out of range; the <code>Range</code> form will raise a
03444  *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
03445  *  forms will raise an <code>IndexError</code> when there is no match.
03446  */
03447 
03448 static VALUE
03449 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
03450 {
03451     if (argc == 3) {
03452         if (TYPE(argv[0]) == T_REGEXP) {
03453             rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
03454         }
03455         else {
03456             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
03457         }
03458         return argv[2];
03459     }
03460     if (argc != 2) {
03461         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
03462     }
03463     return rb_str_aset(str, argv[0], argv[1]);
03464 }
03465 
03466 /*
03467  *  call-seq:
03468  *     str.insert(index, other_str)   -> str
03469  *
03470  *  Inserts <i>other_str</i> before the character at the given
03471  *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
03472  *  end of the string, and insert <em>after</em> the given character.
03473  *  The intent is to insert <i>other_str</i> so that it starts at the given
03474  *  <i>index</i>.
03475  *
03476  *     "abcd".insert(0, 'X')    #=> "Xabcd"
03477  *     "abcd".insert(3, 'X')    #=> "abcXd"
03478  *     "abcd".insert(4, 'X')    #=> "abcdX"
03479  *     "abcd".insert(-3, 'X')   #=> "abXcd"
03480  *     "abcd".insert(-1, 'X')   #=> "abcdX"
03481  */
03482 
03483 static VALUE
03484 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
03485 {
03486     long pos = NUM2LONG(idx);
03487 
03488     if (pos == -1) {
03489         return rb_str_append(str, str2);
03490     }
03491     else if (pos < 0) {
03492         pos++;
03493     }
03494     rb_str_splice(str, pos, 0, str2);
03495     return str;
03496 }
03497 
03498 
03499 /*
03500  *  call-seq:
03501  *     str.slice!(fixnum)           -> new_str or nil
03502  *     str.slice!(fixnum, fixnum)   -> new_str or nil
03503  *     str.slice!(range)            -> new_str or nil
03504  *     str.slice!(regexp)           -> new_str or nil
03505  *     str.slice!(other_str)        -> new_str or nil
03506  *
03507  *  Deletes the specified portion from <i>str</i>, and returns the portion
03508  *  deleted.
03509  *
03510  *     string = "this is a string"
03511  *     string.slice!(2)        #=> "i"
03512  *     string.slice!(3..6)     #=> " is "
03513  *     string.slice!(/s.*t/)   #=> "sa st"
03514  *     string.slice!("r")      #=> "r"
03515  *     string                  #=> "thing"
03516  */
03517 
03518 static VALUE
03519 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
03520 {
03521     VALUE result;
03522     VALUE buf[3];
03523     int i;
03524 
03525     if (argc < 1 || 2 < argc) {
03526         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03527     }
03528     for (i=0; i<argc; i++) {
03529         buf[i] = argv[i];
03530     }
03531     str_modify_keep_cr(str);
03532     result = rb_str_aref_m(argc, buf, str);
03533     if (!NIL_P(result)) {
03534         buf[i] = rb_str_new(0,0);
03535         rb_str_aset_m(argc+1, buf, str);
03536     }
03537     return result;
03538 }
03539 
03540 static VALUE
03541 get_pat(VALUE pat, int quote)
03542 {
03543     VALUE val;
03544 
03545     switch (TYPE(pat)) {
03546       case T_REGEXP:
03547         return pat;
03548 
03549       case T_STRING:
03550         break;
03551 
03552       default:
03553         val = rb_check_string_type(pat);
03554         if (NIL_P(val)) {
03555             Check_Type(pat, T_REGEXP);
03556         }
03557         pat = val;
03558     }
03559 
03560     if (quote) {
03561         pat = rb_reg_quote(pat);
03562     }
03563 
03564     return rb_reg_regcomp(pat);
03565 }
03566 
03567 
03568 /*
03569  *  call-seq:
03570  *     str.sub!(pattern, replacement)          -> str or nil
03571  *     str.sub!(pattern) {|match| block }      -> str or nil
03572  *
03573  *  Performs the substitutions of <code>String#sub</code> in place,
03574  *  returning <i>str</i>, or <code>nil</code> if no substitutions were
03575  *  performed.
03576  */
03577 
03578 static VALUE
03579 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
03580 {
          /*
           * Replaces the first match of argv[0] inside `str' in place.
           * Returns `str' when a substitution was made and Qnil when the
           * pattern does not match.  The replacement comes from the block
           * (one argument plus a block), from a Hash, or from a String
           * (two arguments).
           */
03581     VALUE pat, repl, hash = Qnil;
03582     int iter = 0;
03583     int tainted = 0;
03584     int untrusted = 0;
03585     long plen;
03586 
03587     if (argc == 1 && rb_block_given_p()) {
03588         iter = 1;
03589     }
03590     else if (argc == 2) {
03591         repl = argv[1];
              /* A Hash (or anything answering to_hash) selects the lookup
               * form; otherwise the argument must be a String. */
03592         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03593         if (NIL_P(hash)) {
03594             StringValue(repl);
03595         }
03596         if (OBJ_TAINTED(repl)) tainted = 1;
03597         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03598     }
03599     else {
03600         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03601     }
03602 
03603     pat = get_pat(argv[0], 1);
03604     str_modifiable(str);
03605     if (rb_reg_search(pat, str, 0, 0) >= 0) {
03606         rb_encoding *enc;
03607         int cr = ENC_CODERANGE(str);
03608         VALUE match = rb_backref_get();
03609         struct re_registers *regs = RMATCH_REGS(match);
              /* Byte range of the whole match, read before any user code runs. */
03610         long beg0 = BEG(0);
03611         long end0 = END(0);
03612         char *p, *rp;
03613         long len, rlen;
03614 
03615         if (iter || !NIL_P(hash)) {
                  /* Snapshot pointer and length for str_mod_check() below. */
03616             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03617 
03618             if (iter) {
03619                 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03620             }
03621             else {
03622                 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
03623                 repl = rb_obj_as_string(repl);
03624             }
                  /* The block or the hash lookup ran Ruby code: re-validate
                   * `str' against the snapshot and re-check frozenness.
                   * NOTE(review): str_mod_check() presumably raises when the
                   * buffer or length changed - confirm. */
03625             str_mod_check(str, p, len);
03626             rb_check_frozen(str);
03627         }
03628         else {
                  /* String replacement: processed by rb_reg_regsub() with the
                   * match registers. */
03629             repl = rb_reg_regsub(repl, str, regs, pat);
03630         }
03631         enc = rb_enc_compatible(str, repl);
03632         if (!enc) {
                  /* No compatible encoding: accepted only when the text before
                   * and after the match scans as 7BIT; the result then takes
                   * the encoding of the replacement. */
03633             rb_encoding *str_enc = STR_ENC_GET(str);
03634             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03635             if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
03636                 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
03637                 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
03638                          rb_enc_name(str_enc),
03639                          rb_enc_name(STR_ENC_GET(repl)));
03640             }
03641             enc = STR_ENC_GET(repl);
03642         }
03643         rb_str_modify(str);
03644         rb_enc_associate(str, enc);
03645         if (OBJ_TAINTED(repl)) tainted = 1;
03646         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
              /* Derive the new code range from the one `str' had before the
               * edit (cr) and the replacement's; fall back to UNKNOWN when
               * the combination is not conclusive. */
03647         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
03648             int cr2 = ENC_CODERANGE(repl);
03649             if (cr2 == ENC_CODERANGE_BROKEN ||
03650                 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
03651                 cr = ENC_CODERANGE_UNKNOWN;
03652             else
03653                 cr = cr2;
03654         }
              /* plen: bytes matched; rlen: bytes of the replacement. */
03655         plen = end0 - beg0;
03656         rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
03657         len = RSTRING_LEN(str);
03658         if (rlen > plen) {
03659             RESIZE_CAPA(str, len + rlen - plen);
03660         }
              /* Re-read the pointer: the resize may have moved the buffer. */
03661         p = RSTRING_PTR(str);
03662         if (rlen != plen) {
                  /* Shift the text after the match to its new position. */
03663             memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
03664         }
03665         memcpy(p + beg0, rp, rlen);
03666         len += rlen - plen;
03667         STR_SET_LEN(str, len);
03668         RSTRING_PTR(str)[len] = '\0';
03669         ENC_CODERANGE_SET(str, cr);
03670         if (tainted) OBJ_TAINT(str);
03671         if (untrusted) OBJ_UNTRUST(str);
03672 
03673         return str;
03674     }
03675     return Qnil;
03676 }
03677 
03678 
03679 /*
03680  *  call-seq:
03681  *     str.sub(pattern, replacement)         -> new_str
03682  *     str.sub(pattern, hash)                -> new_str
03683  *     str.sub(pattern) {|match| block }     -> new_str
03684  *
03685  *  Returns a copy of <i>str</i> with the <em>first</em> occurrence of
03686  *  <i>pattern</i> replaced by the second argument. The <i>pattern</i> is
03687  *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
03688  *  regular expression metacharacters it contains will be interpreted
03689  *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
03690  *  instead of a digit.
03691  *
03692  *  If <i>replacement</i> is a <code>String</code> it will be substituted for
03693  *  the matched text. It may contain back-references to the pattern's capture
03694  *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
03695  *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
03696  *  double-quoted string, both back-references must be preceded by an
03697  *  additional backslash. However, within <i>replacement</i> the special match
03698  *  variables, such as <code>$&</code>, will not refer to the current match.
03699  *
03700  *  If the second argument is a <code>Hash</code>, and the matched text is one
03701  *  of its keys, the corresponding value is the replacement string.
03702  *
03703  *  In the block form, the current match string is passed in as a parameter,
03704  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
03705  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
03706  *  returned by the block will be substituted for the match on each call.
03707  *
03708  *  The result inherits any tainting in the original string or any supplied
03709  *  replacement string.
03710  *
03711  *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
03712  *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
03713  *     "hello".sub(/./) {|s| s.ord.to_s + ' ' }     #=> "104 ello"
03714  *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
03715  *     'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
03716  *      #=> "Is /bin/bash your preferred shell?"
03717  */
03718 
03719 static VALUE
03720 rb_str_sub(int argc, VALUE *argv, VALUE str)
03721 {
03722     str = rb_str_dup(str);
03723     rb_str_sub_bang(argc, argv, str);
03724     return str;
03725 }
03726 
03727 static VALUE
03728 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
03729 {
03730     VALUE pat, val, repl, match, dest, hash = Qnil;
03731     struct re_registers *regs;
03732     long beg, n;
03733     long beg0, end0;
03734     long offset, blen, slen, len, last;
03735     int iter = 0;
03736     char *sp, *cp;
03737     int tainted = 0;
03738     rb_encoding *str_enc;
03739 
03740     switch (argc) {
03741       case 1:
03742         RETURN_ENUMERATOR(str, argc, argv);
03743         iter = 1;
03744         break;
03745       case 2:
03746         repl = argv[1];
03747         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03748         if (NIL_P(hash)) {
03749             StringValue(repl);
03750         }
03751         if (OBJ_TAINTED(repl)) tainted = 1;
03752         break;
03753       default:
03754         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03755     }
03756 
03757     pat = get_pat(argv[0], 1);
03758     beg = rb_reg_search(pat, str, 0, 0);
03759     if (beg < 0) {
03760         if (bang) return Qnil;  /* no match, no substitution */
03761         return rb_str_dup(str);
03762     }
03763 
03764     offset = 0;
03765     n = 0;
03766     blen = RSTRING_LEN(str) + 30; /* len + margin */
03767     dest = rb_str_buf_new(blen);
03768     sp = RSTRING_PTR(str);
03769     slen = RSTRING_LEN(str);
03770     cp = sp;
03771     str_enc = STR_ENC_GET(str);
03772     rb_enc_associate(dest, str_enc);
03773     ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
03774 
03775     do {
03776         n++;
03777         match = rb_backref_get();
03778         regs = RMATCH_REGS(match);
03779         beg0 = BEG(0);
03780         end0 = END(0);
03781         if (iter || !NIL_P(hash)) {
03782             if (iter) {
03783                 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03784             }
03785             else {
03786                 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
03787                 val = rb_obj_as_string(val);
03788             }
03789             str_mod_check(str, sp, slen);
03790             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
03791                 rb_raise(rb_eRuntimeError, "block should not cheat");
03792             }
03793         }
03794         else {
03795             val = rb_reg_regsub(repl, str, regs, pat);
03796         }
03797 
03798         if (OBJ_TAINTED(val)) tainted = 1;
03799 
03800         len = beg - offset;     /* copy pre-match substr */
03801         if (len) {
03802             rb_enc_str_buf_cat(dest, cp, len, str_enc);
03803         }
03804 
03805         rb_str_buf_append(dest, val);
03806 
03807         last = offset;
03808         offset = end0;
03809         if (beg0 == end0) {
03810             /*
03811              * Always consume at least one character of the input string
03812              * in order to prevent infinite loops.
03813              */
03814             if (RSTRING_LEN(str) <= end0) break;
03815             len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
03816             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
03817             offset = end0 + len;
03818         }
03819         cp = RSTRING_PTR(str) + offset;
03820         if (offset > RSTRING_LEN(str)) break;
03821         beg = rb_reg_search(pat, str, offset, 0);
03822     } while (beg >= 0);
03823     if (RSTRING_LEN(str) > offset) {
03824         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
03825     }
03826     rb_reg_search(pat, str, last, 0);
03827     if (bang) {
03828         rb_str_shared_replace(str, dest);
03829     }
03830     else {
03831         RBASIC(dest)->klass = rb_obj_class(str);
03832         OBJ_INFECT(dest, str);
03833         str = dest;
03834     }
03835 
03836     if (tainted) OBJ_TAINT(str);
03837     return str;
03838 }
03839 
03840 
03841 /*
03842  *  call-seq:
03843  *     str.gsub!(pattern, replacement)        -> str or nil
03844  *     str.gsub!(pattern) {|match| block }    -> str or nil
03845  *     str.gsub!(pattern)                     -> an_enumerator
03846  *
03847  *  Performs the substitutions of <code>String#gsub</code> in place, returning
03848  *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
03849  *  If no block and no <i>replacement</i> is given, an enumerator is returned instead.
03850  */
03851 
03852 static VALUE
03853 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
03854 {
03855     str_modify_keep_cr(str);
03856     return str_gsub(argc, argv, str, 1);
03857 }
03858 
03859 
03860 /*
03861  *  call-seq:
03862  *     str.gsub(pattern, replacement)       -> new_str
03863  *     str.gsub(pattern, hash)              -> new_str
03864  *     str.gsub(pattern) {|match| block }   -> new_str
03865  *     str.gsub(pattern)                    -> enumerator
03866  *
03867  *  Returns a copy of <i>str</i> with <em>all</em> occurrences of
03868  *  <i>pattern</i> replaced by the second argument. The <i>pattern</i> is
03869  *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
03870  *  regular expression metacharacters it contains will be interpreted
03871  *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
03872  *  instead of a digit.
03873  *
03874  *  If <i>replacement</i> is a <code>String</code> it will be substituted for
03875  *  the matched text. It may contain back-references to the pattern's capture
03876  *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
03877  *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
03878  *  double-quoted string, both back-references must be preceded by an
03879  *  additional backslash. However, within <i>replacement</i> the special match
03880  *  variables, such as <code>$&</code>, will not refer to the current match.
03881  *
03882  *  If the second argument is a <code>Hash</code>, and the matched text is one
03883  *  of its keys, the corresponding value is the replacement string.
03884  *
03885  *  In the block form, the current match string is passed in as a parameter,
03886  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
03887  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
03888  *  returned by the block will be substituted for the match on each call.
03889  *
03890  *  The result inherits any tainting in the original string or any supplied
03891  *  replacement string.
03892  *
03893  *  When neither a block nor a second argument is supplied, an
03894  *  <code>Enumerator</code> is returned.
03895  *
03896  *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
03897  *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
03898  *     "hello".gsub(/./) {|s| s.ord.to_s + ' '}      #=> "104 101 108 108 111 "
03899  *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
03900  *     'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*')    #=> "h3ll*"
03901  */
03902 
03903 static VALUE
03904 rb_str_gsub(int argc, VALUE *argv, VALUE str)
03905 {
03906     return str_gsub(argc, argv, str, 0);
03907 }
03908 
03909 
03910 /*
03911  *  call-seq:
03912  *     str.replace(other_str)   -> str
03913  *
03914  *  Replaces the contents and taintedness of <i>str</i> with the corresponding
03915  *  values in <i>other_str</i>.
03916  *
03917  *     s = "hello"         #=> "hello"
03918  *     s.replace "world"   #=> "world"
03919  */
03920 
03921 VALUE
03922 rb_str_replace(VALUE str, VALUE str2)
03923 {
03924     str_modifiable(str);
03925     if (str == str2) return str;
03926 
03927     StringValue(str2);
03928     str_discard(str);
03929     return str_replace(str, str2);
03930 }
03931 
03932 /*
03933  *  call-seq:
03934  *     string.clear    ->  string
03935  *
03936  *  Makes string empty.
03937  *
03938  *     a = "abcde"
03939  *     a.clear    #=> ""
03940  */
03941 
03942 static VALUE
03943 rb_str_clear(VALUE str)
03944 {
03945     str_discard(str);
03946     STR_SET_EMBED(str);
03947     STR_SET_EMBED_LEN(str, 0);
03948     RSTRING_PTR(str)[0] = 0;
03949     if (rb_enc_asciicompat(STR_ENC_GET(str)))
03950         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03951     else
03952         ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03953     return str;
03954 }
03955 
03956 /*
03957  *  call-seq:
03958  *     string.chr    ->  string
03959  *
03960  *  Returns a one-character string at the beginning of the string.
03961  *
03962  *     a = "abcde"
03963  *     a.chr    #=> "a"
03964  */
03965 
03966 static VALUE
03967 rb_str_chr(VALUE str)
03968 {
03969     return rb_str_substr(str, 0, 1);
03970 }
03971 
03972 /*
03973  *  call-seq:
03974  *     str.getbyte(index)          -> 0 .. 255
03975  *
03976  *  returns the <i>index</i>th byte as an integer.
03977  */
03978 static VALUE
03979 rb_str_getbyte(VALUE str, VALUE index)
03980 {
03981     long pos = NUM2LONG(index);
03982 
03983     if (pos < 0)
03984         pos += RSTRING_LEN(str);
03985     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
03986         return Qnil;
03987 
03988     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
03989 }
03990 
03991 /*
03992  *  call-seq:
03993  *     str.setbyte(index, int) -> int
03994  *
03995  *  modifies the <i>index</i>th byte as <i>int</i>.
03996  */
03997 static VALUE
03998 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
03999 {
04000     long pos = NUM2LONG(index);
04001     int byte = NUM2INT(value);
04002 
04003     rb_str_modify(str);
04004 
04005     if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
04006         rb_raise(rb_eIndexError, "index %ld out of string", pos);
04007     if (pos < 0)
04008         pos += RSTRING_LEN(str);
04009 
04010     RSTRING_PTR(str)[pos] = byte;
04011 
04012     return value;
04013 }
04014 
04015 static VALUE
04016 str_byte_substr(VALUE str, long beg, long len)
04017 {
04018     char *p, *s = RSTRING_PTR(str);
04019     long n = RSTRING_LEN(str);
04020     VALUE str2;
04021 
04022     if (beg > n || len < 0) return Qnil;
04023     if (beg < 0) {
04024         beg += n;
04025         if (beg < 0) return Qnil;
04026     }
04027     if (beg + len > n)
04028         len = n - beg;
04029     if (len <= 0) {
04030         len = 0;
04031         p = 0;
04032     }
04033     else
04034         p = s + beg;
04035 
04036     if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
04037         str2 = rb_str_new4(str);
04038         str2 = str_new3(rb_obj_class(str2), str2);
04039         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
04040         RSTRING(str2)->as.heap.len = len;
04041     }
04042     else {
04043         str2 = rb_str_new5(str, p, len);
04044         rb_enc_cr_str_copy_for_substr(str2, str);
04045         OBJ_INFECT(str2, str);
04046     }
04047 
04048     return str2;
04049 }
04050 
04051 static VALUE
04052 str_byte_aref(VALUE str, VALUE indx)
04053 {
04054     long idx;
04055     switch (TYPE(indx)) {
04056       case T_FIXNUM:
04057         idx = FIX2LONG(indx);
04058 
04059       num_index:
04060         str = str_byte_substr(str, idx, 1);
04061         if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
04062         return str;
04063 
04064       default:
04065         /* check if indx is Range */
04066         {
04067             long beg, len = RSTRING_LEN(str);
04068 
04069             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
04070               case Qfalse:
04071                 break;
04072               case Qnil:
04073                 return Qnil;
04074               default:
04075                 return str_byte_substr(str, beg, len);
04076             }
04077         }
04078         idx = NUM2LONG(indx);
04079         goto num_index;
04080     }
04081     return Qnil;                /* not reached */
04082 }
04083 
04084 /*
04085  *  call-seq:
04086  *     str.byteslice(fixnum)           -> new_str or nil
04087  *     str.byteslice(fixnum, fixnum)   -> new_str or nil
04088  *     str.byteslice(range)            -> new_str or nil
04089  *
04090  *  Byte Reference---If passed a single <code>Fixnum</code>, returns a
04091  *  substring of one byte at that position. If passed two <code>Fixnum</code>
04092  *  objects, returns a substring starting at the offset given by the first, and
04093  *  a length given by the second. If given a <code>Range</code>, a substring containing
04094  *  bytes at offsets given by the range is returned. In all three cases, if
04095  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
04096  *  <code>nil</code> if the initial offset falls outside the string, the length
04097  *  is negative, or the beginning of the range is greater than the end.
04098  *  The resulting string keeps the encoding of the original string.
04099  *
04100  *     "hello".byteslice(1)     #=> "e"
04101  *     "hello".byteslice(-1)    #=> "o"
04102  *     "hello".byteslice(1, 2)  #=> "el"
04103  *     "\x80\u3042".byteslice(1, 3) #=> "\u3042"
04104  *     "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
04105  */
04106 
04107 static VALUE
04108 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
04109 {
04110     if (argc == 2) {
04111         return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
04112     }
04113     if (argc != 1) {
04114         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
04115     }
04116     return str_byte_aref(str, argv[0]);
04117 }
04118 
04119 /*
04120  *  call-seq:
04121  *     str.reverse   -> new_str
04122  *
04123  *  Returns a new string with the characters from <i>str</i> in reverse order.
04124  *
04125  *     "stressed".reverse   #=> "desserts"
04126  */
04127 
04128 static VALUE
04129 rb_str_reverse(VALUE str)
04130 {
04131     rb_encoding *enc;
04132     VALUE rev;
04133     char *s, *e, *p;
04134     int single = 1;
04135 
04136     if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
04137     enc = STR_ENC_GET(str);
04138     rev = rb_str_new5(str, 0, RSTRING_LEN(str));
04139     s = RSTRING_PTR(str); e = RSTRING_END(str);
04140     p = RSTRING_END(rev);
04141 
04142     if (RSTRING_LEN(str) > 1) {
04143         if (single_byte_optimizable(str)) {
04144             while (s < e) {
04145                 *--p = *s++;
04146             }
04147         }
04148         else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
04149             while (s < e) {
04150                 int clen = rb_enc_fast_mbclen(s, e, enc);
04151 
04152                 if (clen > 1 || (*s & 0x80)) single = 0;
04153                 p -= clen;
04154                 memcpy(p, s, clen);
04155                 s += clen;
04156             }
04157         }
04158         else {
04159             while (s < e) {
04160                 int clen = rb_enc_mbclen(s, e, enc);
04161 
04162                 if (clen > 1 || (*s & 0x80)) single = 0;
04163                 p -= clen;
04164                 memcpy(p, s, clen);
04165                 s += clen;
04166             }
04167         }
04168     }
04169     STR_SET_LEN(rev, RSTRING_LEN(str));
04170     OBJ_INFECT(rev, str);
04171     if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
04172         if (single) {
04173             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
04174         }
04175         else {
04176             ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
04177         }
04178     }
04179     rb_enc_cr_str_copy_for_substr(rev, str);
04180 
04181     return rev;
04182 }
04183 
04184 
04185 /*
04186  *  call-seq:
04187  *     str.reverse!   -> str
04188  *
04189  *  Reverses <i>str</i> in place.
04190  */
04191 
04192 static VALUE
04193 rb_str_reverse_bang(VALUE str)
04194 {
04195     if (RSTRING_LEN(str) > 1) {
04196         if (single_byte_optimizable(str)) {
04197             char *s, *e, c;
04198 
04199             str_modify_keep_cr(str);
04200             s = RSTRING_PTR(str);
04201             e = RSTRING_END(str) - 1;
04202             while (s < e) {
04203                 c = *s;
04204                 *s++ = *e;
04205                 *e-- = c;
04206             }
04207         }
04208         else {
04209             rb_str_shared_replace(str, rb_str_reverse(str));
04210         }
04211     }
04212     else {
04213         str_modify_keep_cr(str);
04214     }
04215     return str;
04216 }
04217 
04218 
04219 /*
04220  *  call-seq:
04221  *     str.include? other_str   -> true or false
04222  *
04223  *  Returns <code>true</code> if <i>str</i> contains the given string or
04224  *  character.
04225  *
04226  *     "hello".include? "lo"   #=> true
04227  *     "hello".include? "ol"   #=> false
04228  *     "hello".include? ?h     #=> true
04229  */
04230 
04231 static VALUE
04232 rb_str_include(VALUE str, VALUE arg)
04233 {
04234     long i;
04235 
04236     StringValue(arg);
04237     i = rb_str_index(str, arg, 0);
04238 
04239     if (i == -1) return Qfalse;
04240     return Qtrue;
04241 }
04242 
04243 
04244 /*
04245  *  call-seq:
04246  *     str.to_i(base=10)   -> integer
04247  *
04248  *  Returns the result of interpreting leading characters in <i>str</i> as an
04249  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
04250  *  end of a valid number are ignored. If there is not a valid number at the
04251  *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
04252  *  exception when <i>base</i> is valid.
04253  *
04254  *     "12345".to_i             #=> 12345
04255  *     "99 red balloons".to_i   #=> 99
04256  *     "0a".to_i                #=> 0
04257  *     "0a".to_i(16)            #=> 10
04258  *     "hello".to_i             #=> 0
04259  *     "1100101".to_i(2)        #=> 101
04260  *     "1100101".to_i(8)        #=> 294977
04261  *     "1100101".to_i(10)       #=> 1100101
04262  *     "1100101".to_i(16)       #=> 17826049
04263  */
04264 
04265 static VALUE
04266 rb_str_to_i(int argc, VALUE *argv, VALUE str)
04267 {
04268     int base;
04269 
04270     if (argc == 0) base = 10;
04271     else {
04272         VALUE b;
04273 
04274         rb_scan_args(argc, argv, "01", &b);
04275         base = NUM2INT(b);
04276     }
04277     if (base < 0) {
04278         rb_raise(rb_eArgError, "invalid radix %d", base);
04279     }
04280     return rb_str_to_inum(str, base, FALSE);
04281 }
04282 
04283 
04284 /*
04285  *  call-seq:
04286  *     str.to_f   -> float
04287  *
04288  *  Returns the result of interpreting leading characters in <i>str</i> as a
04289  *  floating point number. Extraneous characters past the end of a valid number
04290  *  are ignored. If there is not a valid number at the start of <i>str</i>,
04291  *  <code>0.0</code> is returned. This method never raises an exception.
04292  *
04293  *     "123.45e1".to_f        #=> 1234.5
04294  *     "45.67 degrees".to_f   #=> 45.67
04295  *     "thx1138".to_f         #=> 0.0
04296  */
04297 
04298 static VALUE
04299 rb_str_to_f(VALUE str)
04300 {
04301     return DBL2NUM(rb_str_to_dbl(str, FALSE));
04302 }
04303 
04304 
04305 /*
04306  *  call-seq:
04307  *     str.to_s     -> str
04308  *     str.to_str   -> str
04309  *
04310  *  Returns the receiver.
04311  */
04312 
04313 static VALUE
04314 rb_str_to_s(VALUE str)
04315 {
04316     if (rb_obj_class(str) != rb_cString) {
04317         return str_duplicate(rb_cString, str);
04318     }
04319     return str;
04320 }
04321 
04322 #if 0
04323 static void
04324 str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
04325 {
04326     char s[RUBY_MAX_CHAR_LEN];
04327     int n = rb_enc_codelen(c, enc);
04328 
04329     rb_enc_mbcput(c, s, enc);
04330     rb_enc_str_buf_cat(str, s, n, enc);
04331 }
04332 #endif
04333 
04334 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
04335 
04336 int
04337 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
04338 {
04339     char buf[CHAR_ESC_LEN + 1];
04340     int l;
04341 
04342 #if SIZEOF_INT > 4
04343     c &= 0xffffffff;
04344 #endif
04345     if (unicode_p) {
04346         if (c < 0x7F && ISPRINT(c)) {
04347             snprintf(buf, CHAR_ESC_LEN, "%c", c);
04348         }
04349         else if (c < 0x10000) {
04350             snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
04351         }
04352         else {
04353             snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
04354         }
04355     }
04356     else {
04357         if (c < 0x100) {
04358             snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
04359         }
04360         else {
04361             snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
04362         }
04363     }
04364     l = (int)strlen(buf);       /* CHAR_ESC_LEN cannot exceed INT_MAX */
04365     rb_str_buf_cat(result, buf, l);
04366     return l;
04367 }
04368 
04369 /*
04370  * call-seq:
04371  *   str.inspect   -> string
04372  *
04373  * Returns a printable version of _str_, surrounded by quote marks,
04374  * with special characters escaped.
04375  *
04376  *    str = "hello"
04377  *    str[3] = "\b"
04378  *    str.inspect       #=> "\"hel\\bo\""
04379  */
04380 
04381 VALUE
04382 rb_str_inspect(VALUE str)
04383 {
04384     rb_encoding *enc = STR_ENC_GET(str);
04385     const char *p, *pend, *prev;
04386     char buf[CHAR_ESC_LEN + 1];
04387     VALUE result = rb_str_buf_new(0);
04388     rb_encoding *resenc = rb_default_internal_encoding();
04389     int unicode_p = rb_enc_unicode_p(enc);
04390     int asciicompat = rb_enc_asciicompat(enc);
04391     static rb_encoding *utf16, *utf32;
04392 
04393     if (!utf16) utf16 = rb_enc_find("UTF-16");
04394     if (!utf32) utf32 = rb_enc_find("UTF-32");
04395     if (resenc == NULL) resenc = rb_default_external_encoding();
04396     if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
04397     rb_enc_associate(result, resenc);
04398     str_buf_cat2(result, "\"");
04399 
04400     p = RSTRING_PTR(str); pend = RSTRING_END(str);
04401     prev = p;
04402     if (enc == utf16) {
04403         const unsigned char *q = (const unsigned char *)p;
04404         if (q[0] == 0xFE && q[1] == 0xFF)
04405             enc = rb_enc_find("UTF-16BE");
04406         else if (q[0] == 0xFF && q[1] == 0xFE)
04407             enc = rb_enc_find("UTF-16LE");
04408         else
04409             unicode_p = 0;
04410     }
04411     else if (enc == utf32) {
04412         const unsigned char *q = (const unsigned char *)p;
04413         if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
04414             enc = rb_enc_find("UTF-32BE");
04415         else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
04416             enc = rb_enc_find("UTF-32LE");
04417         else
04418             unicode_p = 0;
04419     }
04420     while (p < pend) {
04421         unsigned int c, cc;
04422         int n;
04423 
04424         n = rb_enc_precise_mbclen(p, pend, enc);
04425         if (!MBCLEN_CHARFOUND_P(n)) {
04426             if (p > prev) str_buf_cat(result, prev, p - prev);
04427             n = rb_enc_mbminlen(enc);
04428             if (pend < p + n)
04429                 n = (int)(pend - p);
04430             while (n--) {
04431                 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
04432                 str_buf_cat(result, buf, strlen(buf));
04433                 prev = ++p;
04434             }
04435             continue;
04436         }
04437         n = MBCLEN_CHARFOUND_LEN(n);
04438         c = rb_enc_mbc_to_codepoint(p, pend, enc);
04439         p += n;
04440         if ((asciicompat || unicode_p) &&
04441           (c == '"'|| c == '\\' ||
04442             (c == '#' &&
04443              p < pend &&
04444              MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
04445              (cc = rb_enc_codepoint(p,pend,enc),
04446               (cc == '$' || cc == '@' || cc == '{'))))) {
04447             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04448             str_buf_cat2(result, "\\");
04449             if (asciicompat || enc == resenc) {
04450                 prev = p - n;
04451                 continue;
04452             }
04453         }
04454         switch (c) {
04455           case '\n': cc = 'n'; break;
04456           case '\r': cc = 'r'; break;
04457           case '\t': cc = 't'; break;
04458           case '\f': cc = 'f'; break;
04459           case '\013': cc = 'v'; break;
04460           case '\010': cc = 'b'; break;
04461           case '\007': cc = 'a'; break;
04462           case 033: cc = 'e'; break;
04463           default: cc = 0; break;
04464         }
04465         if (cc) {
04466             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04467             buf[0] = '\\';
04468             buf[1] = (char)cc;
04469             str_buf_cat(result, buf, 2);
04470             prev = p;
04471             continue;
04472         }
04473         if ((enc == resenc && rb_enc_isprint(c, enc)) ||
04474             (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
04475             continue;
04476         }
04477         else {
04478             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04479             rb_str_buf_cat_escaped_char(result, c, unicode_p);
04480             prev = p;
04481             continue;
04482         }
04483     }
04484     if (p > prev) str_buf_cat(result, prev, p - prev);
04485     str_buf_cat2(result, "\"");
04486 
04487     OBJ_INFECT(result, str);
04488     return result;
04489 }
04490 
04491 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
04492 
04493 /*
04494  *  call-seq:
04495  *     str.dump   -> new_str
04496  *
04497  *  Produces a version of <i>str</i> with all nonprinting characters replaced by
04498  *  <code>\nnn</code> notation and all special characters escaped.
04499  */
04500 
04501 VALUE
04502 rb_str_dump(VALUE str)
04503 {
04504     rb_encoding *enc = rb_enc_get(str);
04505     long len;
04506     const char *p, *pend;
04507     char *q, *qend;
04508     VALUE result;
04509     int u8 = (enc == rb_utf8_encoding());
04510 
04511     len = 2;                    /* "" */
04512     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04513     while (p < pend) {
04514         unsigned char c = *p++;
04515         switch (c) {
04516           case '"':  case '\\':
04517           case '\n': case '\r':
04518           case '\t': case '\f':
04519           case '\013': case '\010': case '\007': case '\033':
04520             len += 2;
04521             break;
04522 
04523           case '#':
04524             len += IS_EVSTR(p, pend) ? 2 : 1;
04525             break;
04526 
04527           default:
04528             if (ISPRINT(c)) {
04529                 len++;
04530             }
04531             else {
04532                 if (u8) {       /* \u{NN} */
04533                     int n = rb_enc_precise_mbclen(p-1, pend, enc);
04534                     if (MBCLEN_CHARFOUND_P(n-1)) {
04535                         unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04536                         while (cc >>= 4) len++;
04537                         len += 5;
04538                         p += MBCLEN_CHARFOUND_LEN(n)-1;
04539                         break;
04540                     }
04541                 }
04542                 len += 4;       /* \xNN */
04543             }
04544             break;
04545         }
04546     }
04547     if (!rb_enc_asciicompat(enc)) {
04548         len += 19;              /* ".force_encoding('')" */
04549         len += strlen(enc->name);
04550     }
04551 
04552     result = rb_str_new5(str, 0, len);
04553     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04554     q = RSTRING_PTR(result); qend = q + len + 1;
04555 
04556     *q++ = '"';
04557     while (p < pend) {
04558         unsigned char c = *p++;
04559 
04560         if (c == '"' || c == '\\') {
04561             *q++ = '\\';
04562             *q++ = c;
04563         }
04564         else if (c == '#') {
04565             if (IS_EVSTR(p, pend)) *q++ = '\\';
04566             *q++ = '#';
04567         }
04568         else if (c == '\n') {
04569             *q++ = '\\';
04570             *q++ = 'n';
04571         }
04572         else if (c == '\r') {
04573             *q++ = '\\';
04574             *q++ = 'r';
04575         }
04576         else if (c == '\t') {
04577             *q++ = '\\';
04578             *q++ = 't';
04579         }
04580         else if (c == '\f') {
04581             *q++ = '\\';
04582             *q++ = 'f';
04583         }
04584         else if (c == '\013') {
04585             *q++ = '\\';
04586             *q++ = 'v';
04587         }
04588         else if (c == '\010') {
04589             *q++ = '\\';
04590             *q++ = 'b';
04591         }
04592         else if (c == '\007') {
04593             *q++ = '\\';
04594             *q++ = 'a';
04595         }
04596         else if (c == '\033') {
04597             *q++ = '\\';
04598             *q++ = 'e';
04599         }
04600         else if (ISPRINT(c)) {
04601             *q++ = c;
04602         }
04603         else {
04604             *q++ = '\\';
04605             if (u8) {
04606                 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
04607                 if (MBCLEN_CHARFOUND_P(n)) {
04608                     int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04609                     p += n;
04610                     snprintf(q, qend-q, "u{%x}", cc);
04611                     q += strlen(q);
04612                     continue;
04613                 }
04614             }
04615             snprintf(q, qend-q, "x%02X", c);
04616             q += 3;
04617         }
04618     }
04619     *q++ = '"';
04620     *q = '\0';
04621     if (!rb_enc_asciicompat(enc)) {
04622         snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
04623         enc = rb_ascii8bit_encoding();
04624     }
04625     OBJ_INFECT(result, str);
04626     /* result from dump is ASCII */
04627     rb_enc_associate(result, enc);
04628     ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
04629     return result;
04630 }
04631 
04632 
04633 static void
04634 rb_str_check_dummy_enc(rb_encoding *enc)
04635 {
04636     if (rb_enc_dummy_p(enc)) {
04637         rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
04638                  rb_enc_name(enc));
04639     }
04640 }
04641 
04642 /*
04643  *  call-seq:
04644  *     str.upcase!   -> str or nil
04645  *
04646  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
04647  *  were made.
04648  *  Note: case replacement is effective only in ASCII region.
04649  */
04650 
04651 static VALUE
04652 rb_str_upcase_bang(VALUE str)
04653 {
04654     rb_encoding *enc;
04655     char *s, *send;
04656     int modify = 0;
04657     int n;
04658 
04659     str_modify_keep_cr(str);
04660     enc = STR_ENC_GET(str);
04661     rb_str_check_dummy_enc(enc);
04662     s = RSTRING_PTR(str); send = RSTRING_END(str);
04663     if (single_byte_optimizable(str)) {
04664         while (s < send) {
04665             unsigned int c = *(unsigned char*)s;
04666 
04667             if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04668                 *s = 'A' + (c - 'a');
04669                 modify = 1;
04670             }
04671             s++;
04672         }
04673     }
04674     else {
04675         int ascompat = rb_enc_asciicompat(enc);
04676 
04677         while (s < send) {
04678             unsigned int c;
04679 
04680             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04681                 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04682                     *s = 'A' + (c - 'a');
04683                     modify = 1;
04684                 }
04685                 s++;
04686             }
04687             else {
04688                 c = rb_enc_codepoint_len(s, send, &n, enc);
04689                 if (rb_enc_islower(c, enc)) {
04690                     /* assuming toupper returns codepoint with same size */
04691                     rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04692                     modify = 1;
04693                 }
04694                 s += n;
04695             }
04696         }
04697     }
04698 
04699     if (modify) return str;
04700     return Qnil;
04701 }
04702 
04703 
04704 /*
04705  *  call-seq:
04706  *     str.upcase   -> new_str
04707  *
04708  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
04709  *  uppercase counterparts. The operation is locale insensitive---only
04710  *  characters ``a'' to ``z'' are affected.
04711  *  Note: case replacement is effective only in ASCII region.
04712  *
04713  *     "hEllO".upcase   #=> "HELLO"
04714  */
04715 
04716 static VALUE
04717 rb_str_upcase(VALUE str)
04718 {
04719     str = rb_str_dup(str);
04720     rb_str_upcase_bang(str);
04721     return str;
04722 }
04723 
04724 
04725 /*
04726  *  call-seq:
04727  *     str.downcase!   -> str or nil
04728  *
04729  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
04730  *  changes were made.
04731  *  Note: case replacement is effective only in ASCII region.
04732  */
04733 
04734 static VALUE
04735 rb_str_downcase_bang(VALUE str)
04736 {
04737     rb_encoding *enc;
04738     char *s, *send;
04739     int modify = 0;
04740 
04741     str_modify_keep_cr(str);
04742     enc = STR_ENC_GET(str);
04743     rb_str_check_dummy_enc(enc);
04744     s = RSTRING_PTR(str); send = RSTRING_END(str);
04745     if (single_byte_optimizable(str)) {
04746         while (s < send) {
04747             unsigned int c = *(unsigned char*)s;
04748 
04749             if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04750                 *s = 'a' + (c - 'A');
04751                 modify = 1;
04752             }
04753             s++;
04754         }
04755     }
04756     else {
04757         int ascompat = rb_enc_asciicompat(enc);
04758 
04759         while (s < send) {
04760             unsigned int c;
04761             int n;
04762 
04763             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04764                 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04765                     *s = 'a' + (c - 'A');
04766                     modify = 1;
04767                 }
04768                 s++;
04769             }
04770             else {
04771                 c = rb_enc_codepoint_len(s, send, &n, enc);
04772                 if (rb_enc_isupper(c, enc)) {
04773                     /* assuming toupper returns codepoint with same size */
04774                     rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04775                     modify = 1;
04776                 }
04777                 s += n;
04778             }
04779         }
04780     }
04781 
04782     if (modify) return str;
04783     return Qnil;
04784 }
04785 
04786 
04787 /*
04788  *  call-seq:
04789  *     str.downcase   -> new_str
04790  *
04791  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
04792  *  lowercase counterparts. The operation is locale insensitive---only
04793  *  characters ``A'' to ``Z'' are affected.
04794  *  Note: case replacement is effective only in ASCII region.
04795  *
04796  *     "hEllO".downcase   #=> "hello"
04797  */
04798 
04799 static VALUE
04800 rb_str_downcase(VALUE str)
04801 {
04802     str = rb_str_dup(str);
04803     rb_str_downcase_bang(str);
04804     return str;
04805 }
04806 
04807 
04808 /*
04809  *  call-seq:
04810  *     str.capitalize!   -> str or nil
04811  *
04812  *  Modifies <i>str</i> by converting the first character to uppercase and the
04813  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
04814  *  Note: case conversion is effective only in ASCII region.
04815  *
04816  *     a = "hello"
04817  *     a.capitalize!   #=> "Hello"
04818  *     a               #=> "Hello"
04819  *     a.capitalize!   #=> nil
04820  */
04821 
04822 static VALUE
04823 rb_str_capitalize_bang(VALUE str)
04824 {
04825     rb_encoding *enc;
04826     char *s, *send;
04827     int modify = 0;
04828     unsigned int c;
04829     int n;
04830 
04831     str_modify_keep_cr(str);
04832     enc = STR_ENC_GET(str);
04833     rb_str_check_dummy_enc(enc);
04834     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04835     s = RSTRING_PTR(str); send = RSTRING_END(str);
04836 
04837     c = rb_enc_codepoint_len(s, send, &n, enc);
04838     if (rb_enc_islower(c, enc)) {
04839         rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04840         modify = 1;
04841     }
04842     s += n;
04843     while (s < send) {
04844         c = rb_enc_codepoint_len(s, send, &n, enc);
04845         if (rb_enc_isupper(c, enc)) {
04846             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04847             modify = 1;
04848         }
04849         s += n;
04850     }
04851 
04852     if (modify) return str;
04853     return Qnil;
04854 }
04855 
04856 
04857 /*
04858  *  call-seq:
04859  *     str.capitalize   -> new_str
04860  *
04861  *  Returns a copy of <i>str</i> with the first character converted to uppercase
04862  *  and the remainder to lowercase.
04863  *  Note: case conversion is effective only in ASCII region.
04864  *
04865  *     "hello".capitalize    #=> "Hello"
04866  *     "HELLO".capitalize    #=> "Hello"
04867  *     "123ABC".capitalize   #=> "123abc"
04868  */
04869 
04870 static VALUE
04871 rb_str_capitalize(VALUE str)
04872 {
04873     str = rb_str_dup(str);
04874     rb_str_capitalize_bang(str);
04875     return str;
04876 }
04877 
04878 
04879 /*
04880  *  call-seq:
04881  *     str.swapcase!   -> str or nil
04882  *
04883  *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
04884  *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
04885  *  Note: case conversion is effective only in ASCII region.
04886  */
04887 
04888 static VALUE
04889 rb_str_swapcase_bang(VALUE str)
04890 {
04891     rb_encoding *enc;
04892     char *s, *send;
04893     int modify = 0;
04894     int n;
04895 
04896     str_modify_keep_cr(str);
04897     enc = STR_ENC_GET(str);
04898     rb_str_check_dummy_enc(enc);
04899     s = RSTRING_PTR(str); send = RSTRING_END(str);
04900     while (s < send) {
04901         unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
04902 
04903         if (rb_enc_isupper(c, enc)) {
04904             /* assuming toupper returns codepoint with same size */
04905             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04906             modify = 1;
04907         }
04908         else if (rb_enc_islower(c, enc)) {
04909             /* assuming tolower returns codepoint with same size */
04910             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04911             modify = 1;
04912         }
04913         s += n;
04914     }
04915 
04916     if (modify) return str;
04917     return Qnil;
04918 }
04919 
04920 
04921 /*
04922  *  call-seq:
04923  *     str.swapcase   -> new_str
04924  *
04925  *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
04926  *  to lowercase and lowercase characters converted to uppercase.
04927  *  Note: case conversion is effective only in ASCII region.
04928  *
04929  *     "Hello".swapcase          #=> "hELLO"
04930  *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
04931  */
04932 
04933 static VALUE
04934 rb_str_swapcase(VALUE str)
04935 {
04936     str = rb_str_dup(str);
04937     rb_str_swapcase_bang(str);
04938     return str;
04939 }
04940 
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification, advanced by trnext(). */
struct tr {
    int gen;            /* nonzero while a "c1-c2" range is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* upper bound of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* one past the last byte of the specification */
};
04948 
04949 static unsigned int
04950 trnext(struct tr *t, rb_encoding *enc)
04951 {
04952     int n;
04953 
04954     for (;;) {
04955         if (!t->gen) {
04956             if (t->p == t->pend) return -1;
04957             if (t->p < t->pend - 1 && *t->p == '\\') {
04958                 t->p++;
04959             }
04960             t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04961             t->p += n;
04962             if (t->p < t->pend - 1 && *t->p == '-') {
04963                 t->p++;
04964                 if (t->p < t->pend) {
04965                     unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04966                     t->p += n;
04967                     if (t->now > c) {
04968                         if (t->now < 0x80 && c < 0x80) {
04969                             rb_raise(rb_eArgError,
04970                                      "invalid range \"%c-%c\" in string transliteration",
04971                                      t->now, c);
04972                         }
04973                         else {
04974                             rb_raise(rb_eArgError, "invalid range in string transliteration");
04975                         }
04976                         continue; /* not reached */
04977                     }
04978                     t->gen = 1;
04979                     t->max = c;
04980                 }
04981             }
04982             return t->now;
04983         }
04984         else if (++t->now < t->max) {
04985             return t->now;
04986         }
04987         else {
04988             t->gen = 0;
04989             return t->max;
04990         }
04991     }
04992 }
04993 
04994 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
04995 
04996 static VALUE
04997 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
04998 {
04999     const unsigned int errc = -1;
05000     unsigned int trans[256];
05001     rb_encoding *enc, *e1, *e2;
05002     struct tr trsrc, trrepl;
05003     int cflag = 0;
05004     unsigned int c, c0, last = 0;
05005     int modify = 0, i, l;
05006     char *s, *send;
05007     VALUE hash = 0;
05008     int singlebyte = single_byte_optimizable(str);
05009     int cr;
05010 
05011 #define CHECK_IF_ASCII(c) \
05012     (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
05013            (cr = ENC_CODERANGE_VALID) : 0)
05014 
05015     StringValue(src);
05016     StringValue(repl);
05017     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05018     if (RSTRING_LEN(repl) == 0) {
05019         return rb_str_delete_bang(1, &src, str);
05020     }
05021 
05022     cr = ENC_CODERANGE(str);
05023     e1 = rb_enc_check(str, src);
05024     e2 = rb_enc_check(str, repl);
05025     if (e1 == e2) {
05026         enc = e1;
05027     }
05028     else {
05029         enc = rb_enc_check(src, repl);
05030     }
05031     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
05032     if (RSTRING_LEN(src) > 1 &&
05033         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
05034         trsrc.p + l < trsrc.pend) {
05035         cflag = 1;
05036         trsrc.p += l;
05037     }
05038     trrepl.p = RSTRING_PTR(repl);
05039     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
05040     trsrc.gen = trrepl.gen = 0;
05041     trsrc.now = trrepl.now = 0;
05042     trsrc.max = trrepl.max = 0;
05043 
05044     if (cflag) {
05045         for (i=0; i<256; i++) {
05046             trans[i] = 1;
05047         }
05048         while ((c = trnext(&trsrc, enc)) != errc) {
05049             if (c < 256) {
05050                 trans[c] = errc;
05051             }
05052             else {
05053                 if (!hash) hash = rb_hash_new();
05054                 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
05055             }
05056         }
05057         while ((c = trnext(&trrepl, enc)) != errc)
05058             /* retrieve last replacer */;
05059         last = trrepl.now;
05060         for (i=0; i<256; i++) {
05061             if (trans[i] != errc) {
05062                 trans[i] = last;
05063             }
05064         }
05065     }
05066     else {
05067         unsigned int r;
05068 
05069         for (i=0; i<256; i++) {
05070             trans[i] = errc;
05071         }
05072         while ((c = trnext(&trsrc, enc)) != errc) {
05073             r = trnext(&trrepl, enc);
05074             if (r == errc) r = trrepl.now;
05075             if (c < 256) {
05076                 trans[c] = r;
05077                 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
05078             }
05079             else {
05080                 if (!hash) hash = rb_hash_new();
05081                 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
05082             }
05083         }
05084     }
05085 
05086     if (cr == ENC_CODERANGE_VALID)
05087         cr = ENC_CODERANGE_7BIT;
05088     str_modify_keep_cr(str);
05089     s = RSTRING_PTR(str); send = RSTRING_END(str);
05090     if (sflag) {
05091         int clen, tlen;
05092         long offset, max = RSTRING_LEN(str);
05093         unsigned int save = -1;
05094         char *buf = ALLOC_N(char, max), *t = buf;
05095 
05096         while (s < send) {
05097             int may_modify = 0;
05098 
05099             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05100             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05101 
05102             s += clen;
05103             if (c < 256) {
05104                 c = trans[c];
05105             }
05106             else if (hash) {
05107                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05108                 if (NIL_P(tmp)) {
05109                     if (cflag) c = last;
05110                     else c = errc;
05111                 }
05112                 else if (cflag) c = errc;
05113                 else c = NUM2INT(tmp);
05114             }
05115             else {
05116                 c = errc;
05117             }
05118             if (c != (unsigned int)-1) {
05119                 if (save == c) {
05120                     CHECK_IF_ASCII(c);
05121                     continue;
05122                 }
05123                 save = c;
05124                 tlen = rb_enc_codelen(c, enc);
05125                 modify = 1;
05126             }
05127             else {
05128                 save = -1;
05129                 c = c0;
05130                 if (enc != e1) may_modify = 1;
05131             }
05132             while (t - buf + tlen >= max) {
05133                 offset = t - buf;
05134                 max *= 2;
05135                 REALLOC_N(buf, char, max);
05136                 t = buf + offset;
05137             }
05138             rb_enc_mbcput(c, t, enc);
05139             if (may_modify && memcmp(s, t, tlen) != 0) {
05140                 modify = 1;
05141             }
05142             CHECK_IF_ASCII(c);
05143             t += tlen;
05144         }
05145         if (!STR_EMBED_P(str)) {
05146             xfree(RSTRING(str)->as.heap.ptr);
05147         }
05148         *t = '\0';
05149         RSTRING(str)->as.heap.ptr = buf;
05150         RSTRING(str)->as.heap.len = t - buf;
05151         STR_SET_NOEMBED(str);
05152         RSTRING(str)->as.heap.aux.capa = max;
05153     }
05154     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
05155         while (s < send) {
05156             c = (unsigned char)*s;
05157             if (trans[c] != errc) {
05158                 if (!cflag) {
05159                     c = trans[c];
05160                     *s = c;
05161                     modify = 1;
05162                 }
05163                 else {
05164                     *s = last;
05165                     modify = 1;
05166                 }
05167             }
05168             CHECK_IF_ASCII(c);
05169             s++;
05170         }
05171     }
05172     else {
05173         int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
05174         long offset;
05175         char *buf = ALLOC_N(char, max), *t = buf;
05176 
05177         while (s < send) {
05178             int may_modify = 0;
05179             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05180             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05181 
05182             if (c < 256) {
05183                 c = trans[c];
05184             }
05185             else if (hash) {
05186                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05187                 if (NIL_P(tmp)) {
05188                     if (cflag) c = last;
05189                     else c = errc;
05190                 }
05191                 else if (cflag) c = errc;
05192                 else c = NUM2INT(tmp);
05193             }
05194             else {
05195                 c = cflag ? last : errc;
05196             }
05197             if (c != errc) {
05198                 tlen = rb_enc_codelen(c, enc);
05199                 modify = 1;
05200             }
05201             else {
05202                 c = c0;
05203                 if (enc != e1) may_modify = 1;
05204             }
05205             while (t - buf + tlen >= max) {
05206                 offset = t - buf;
05207                 max *= 2;
05208                 REALLOC_N(buf, char, max);
05209                 t = buf + offset;
05210             }
05211             if (s != t) {
05212                 rb_enc_mbcput(c, t, enc);
05213                 if (may_modify && memcmp(s, t, tlen) != 0) {
05214                     modify = 1;
05215                 }
05216             }
05217             CHECK_IF_ASCII(c);
05218             s += clen;
05219             t += tlen;
05220         }
05221         if (!STR_EMBED_P(str)) {
05222             xfree(RSTRING(str)->as.heap.ptr);
05223         }
05224         *t = '\0';
05225         RSTRING(str)->as.heap.ptr = buf;
05226         RSTRING(str)->as.heap.len = t - buf;
05227         STR_SET_NOEMBED(str);
05228         RSTRING(str)->as.heap.aux.capa = max;
05229     }
05230 
05231     if (modify) {
05232         if (cr != ENC_CODERANGE_BROKEN)
05233             ENC_CODERANGE_SET(str, cr);
05234         rb_enc_associate(str, enc);
05235         return str;
05236     }
05237     return Qnil;
05238 }
05239 
05240 
05241 /*
05242  *  call-seq:
05243  *     str.tr!(from_str, to_str)   -> str or nil
05244  *
05245  *  Translates <i>str</i> in place, using the same rules as
05246  *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
05247  *  changes were made.
05248  */
05249 
05250 static VALUE
05251 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
05252 {
05253     return tr_trans(str, src, repl, 0);
05254 }
05255 
05256 
05257 /*
05258  *  call-seq:
05259  *     str.tr(from_str, to_str)   -> new_str
05260  *
05261  *  Returns a copy of <i>str</i> with the characters in <i>from_str</i>
05262  *  replaced by the corresponding characters in <i>to_str</i>. If
05263  *  <i>to_str</i> is shorter than <i>from_str</i>, it is padded with its last
05264  *  character in order to maintain the correspondence.
05265  *
05266  *     "hello".tr('el', 'ip')      #=> "hippo"
05267  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
05268  *
05269  *  Both strings may use the c1-c2 notation to denote ranges of characters,
05270  *  and <i>from_str</i> may start with a <code>^</code>, which denotes all
05271  *  characters except those listed.
05272  *
05273  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
05274  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
05275  */
05276 
05277 static VALUE
05278 rb_str_tr(VALUE str, VALUE src, VALUE repl)
05279 {
05280     str = rb_str_dup(str);
05281     tr_trans(str, src, repl, 0);
05282     return str;
05283 }
05284 
05285 #define TR_TABLE_SIZE 257
05286 static void
05287 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
05288                VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
05289 {
05290     const unsigned int errc = -1;
05291     char buf[256];
05292     struct tr tr;
05293     unsigned int c;
05294     VALUE table = 0, ptable = 0;
05295     int i, l, cflag = 0;
05296 
05297     tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
05298     tr.gen = tr.now = tr.max = 0;
05299 
05300     if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
05301         cflag = 1;
05302         tr.p += l;
05303     }
05304     if (first) {
05305         for (i=0; i<256; i++) {
05306             stable[i] = 1;
05307         }
05308         stable[256] = cflag;
05309     }
05310     else if (stable[256] && !cflag) {
05311         stable[256] = 0;
05312     }
05313     for (i=0; i<256; i++) {
05314         buf[i] = cflag;
05315     }
05316 
05317     while ((c = trnext(&tr, enc)) != errc) {
05318         if (c < 256) {
05319             buf[c & 0xff] = !cflag;
05320         }
05321         else {
05322             VALUE key = UINT2NUM(c);
05323 
05324             if (!table) {
05325                 table = rb_hash_new();
05326                 if (cflag) {
05327                     ptable = *ctablep;
05328                     *ctablep = table;
05329                 }
05330                 else {
05331                     ptable = *tablep;
05332                     *tablep = table;
05333                 }
05334             }
05335             if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
05336                 rb_hash_aset(table, key, Qtrue);
05337             }
05338         }
05339     }
05340     for (i=0; i<256; i++) {
05341         stable[i] = stable[i] && buf[i];
05342     }
05343 }
05344 
05345 
05346 static int
05347 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
05348 {
05349     if (c < 256) {
05350         return table[c] != 0;
05351     }
05352     else {
05353         VALUE v = UINT2NUM(c);
05354 
05355         if (del) {
05356             if (!NIL_P(rb_hash_lookup(del, v)) &&
05357                     (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
05358                 return TRUE;
05359             }
05360         }
05361         else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
05362             return FALSE;
05363         }
05364         return table[256] ? TRUE : FALSE;
05365     }
05366 }
05367 
05368 /*
05369  *  call-seq:
05370  *     str.delete!([other_str]+)   -> str or nil
05371  *
05372  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
05373  *  <code>nil</code> if <i>str</i> was not modified.
05374  */
05375 
05376 static VALUE
05377 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
05378 {
05379     char squeez[TR_TABLE_SIZE];
05380     rb_encoding *enc = 0;
05381     char *s, *send, *t;
05382     VALUE del = 0, nodel = 0;
05383     int modify = 0;
05384     int i, ascompat, cr;
05385 
05386     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05387     if (argc < 1) {
05388         rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05389     }
05390     for (i=0; i<argc; i++) {
05391         VALUE s = argv[i];
05392 
05393         StringValue(s);
05394         enc = rb_enc_check(str, s);
05395         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05396     }
05397 
05398     str_modify_keep_cr(str);
05399     ascompat = rb_enc_asciicompat(enc);
05400     s = t = RSTRING_PTR(str);
05401     send = RSTRING_END(str);
05402     cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
05403     while (s < send) {
05404         unsigned int c;
05405         int clen;
05406 
05407         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05408             if (squeez[c]) {
05409                 modify = 1;
05410             }
05411             else {
05412                 if (t != s) *t = c;
05413                 t++;
05414             }
05415             s++;
05416         }
05417         else {
05418             c = rb_enc_codepoint_len(s, send, &clen, enc);
05419 
05420             if (tr_find(c, squeez, del, nodel)) {
05421                 modify = 1;
05422             }
05423             else {
05424                 if (t != s) rb_enc_mbcput(c, t, enc);
05425                 t += clen;
05426                 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
05427             }
05428             s += clen;
05429         }
05430     }
05431     *t = '\0';
05432     STR_SET_LEN(str, t - RSTRING_PTR(str));
05433     ENC_CODERANGE_SET(str, cr);
05434 
05435     if (modify) return str;
05436     return Qnil;
05437 }
05438 
05439 
05440 /*
05441  *  call-seq:
05442  *     str.delete([other_str]+)   -> new_str
05443  *
05444  *  Returns a copy of <i>str</i> with all characters in the intersection of its
05445  *  arguments deleted. Uses the same rules for building the set of characters as
05446  *  <code>String#count</code>.
05447  *
05448  *     "hello".delete "l","lo"        #=> "heo"
05449  *     "hello".delete "lo"            #=> "he"
05450  *     "hello".delete "aeiou", "^e"   #=> "hell"
05451  *     "hello".delete "ej-m"          #=> "ho"
05452  */
05453 
05454 static VALUE
05455 rb_str_delete(int argc, VALUE *argv, VALUE str)
05456 {
05457     str = rb_str_dup(str);
05458     rb_str_delete_bang(argc, argv, str);
05459     return str;
05460 }
05461 
05462 
05463 /*
05464  *  call-seq:
05465  *     str.squeeze!([other_str]*)   -> str or nil
05466  *
05467  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
05468  *  <code>nil</code> if no changes were made.
05469  */
05470 
05471 static VALUE
05472 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
05473 {
05474     char squeez[TR_TABLE_SIZE];
05475     rb_encoding *enc = 0;
05476     VALUE del = 0, nodel = 0;
05477     char *s, *send, *t;
05478     int i, modify = 0;
05479     int ascompat, singlebyte = single_byte_optimizable(str);
05480     unsigned int save;
05481 
05482     if (argc == 0) {
05483         enc = STR_ENC_GET(str);
05484     }
05485     else {
05486         for (i=0; i<argc; i++) {
05487             VALUE s = argv[i];
05488 
05489             StringValue(s);
05490             enc = rb_enc_check(str, s);
05491             if (singlebyte && !single_byte_optimizable(s))
05492                 singlebyte = 0;
05493             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05494         }
05495     }
05496 
05497     str_modify_keep_cr(str);
05498     s = t = RSTRING_PTR(str);
05499     if (!s || RSTRING_LEN(str) == 0) return Qnil;
05500     send = RSTRING_END(str);
05501     save = -1;
05502     ascompat = rb_enc_asciicompat(enc);
05503 
05504     if (singlebyte) {
05505         while (s < send) {
05506             unsigned int c = *(unsigned char*)s++;
05507             if (c != save || (argc > 0 && !squeez[c])) {
05508                 *t++ = save = c;
05509             }
05510         }
05511     } else {
05512         while (s < send) {
05513             unsigned int c;
05514             int clen;
05515 
05516             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05517                 if (c != save || (argc > 0 && !squeez[c])) {
05518                     *t++ = save = c;
05519                 }
05520                 s++;
05521             }
05522             else {
05523                 c = rb_enc_codepoint_len(s, send, &clen, enc);
05524 
05525                 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
05526                     if (t != s) rb_enc_mbcput(c, t, enc);
05527                     save = c;
05528                     t += clen;
05529                 }
05530                 s += clen;
05531             }
05532         }
05533     }
05534 
05535     *t = '\0';
05536     if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
05537         STR_SET_LEN(str, t - RSTRING_PTR(str));
05538         modify = 1;
05539     }
05540 
05541     if (modify) return str;
05542     return Qnil;
05543 }
05544 
05545 
05546 /*
05547  *  call-seq:
05548  *     str.squeeze([other_str]*)    -> new_str
05549  *
05550  *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
05551  *  procedure described for <code>String#count</code>. Returns a new string
05552  *  where runs of the same character that occur in this set are replaced by a
05553  *  single character. If no arguments are given, all runs of identical
05554  *  characters are replaced by a single character.
05555  *
05556  *     "yellow moon".squeeze                  #=> "yelow mon"
05557  *     "  now   is  the".squeeze(" ")         #=> " now is the"
05558  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
05559  */
05560 
05561 static VALUE
05562 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
05563 {
05564     str = rb_str_dup(str);
05565     rb_str_squeeze_bang(argc, argv, str);
05566     return str;
05567 }
05568 
05569 
05570 /*
05571  *  call-seq:
05572  *     str.tr_s!(from_str, to_str)   -> str or nil
05573  *
05574  *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
05575  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
05576  */
05577 
05578 static VALUE
05579 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
05580 {
05581     return tr_trans(str, src, repl, 1);
05582 }
05583 
05584 
05585 /*
05586  *  call-seq:
05587  *     str.tr_s(from_str, to_str)   -> new_str
05588  *
05589  *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
05590  *  then removes duplicate characters in regions that were affected by the
05591  *  translation.
05592  *
05593  *     "hello".tr_s('l', 'r')     #=> "hero"
05594  *     "hello".tr_s('el', '*')    #=> "h*o"
05595  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
05596  */
05597 
05598 static VALUE
05599 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
05600 {
05601     str = rb_str_dup(str);
05602     tr_trans(str, src, repl, 1);
05603     return str;
05604 }
05605 
05606 
05607 /*
05608  *  call-seq:
05609  *     str.count([other_str]+)   -> fixnum
05610  *
05611  *  Each <i>other_str</i> parameter defines a set of characters to count.  The
05612  *  intersection of these sets defines the characters to count in
05613  *  <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is
05614  *  negated. The sequence c1-c2 means all characters between c1 and c2.
05615  *
05616  *     a = "hello world"
05617  *     a.count "lo"            #=> 5
05618  *     a.count "lo", "o"       #=> 2
05619  *     a.count "hello", "^l"   #=> 4
05620  *     a.count "ej-m"          #=> 4
05621  */
05622 
05623 static VALUE
05624 rb_str_count(int argc, VALUE *argv, VALUE str)
05625 {
05626     char table[TR_TABLE_SIZE];
05627     rb_encoding *enc = 0;
05628     VALUE del = 0, nodel = 0;
05629     char *s, *send;
05630     int i;
05631     int ascompat;
05632 
05633     if (argc < 1) {
05634         rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05635     }
05636     for (i=0; i<argc; i++) {
05637         VALUE tstr = argv[i];
05638         unsigned char c;
05639 
05640         StringValue(tstr);
05641         enc = rb_enc_check(str, tstr);
05642         if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
05643             (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
05644             int n = 0;
05645 
05646             s = RSTRING_PTR(str);
05647             if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05648             send = RSTRING_END(str);
05649             while (s < send) {
05650                 if (*(unsigned char*)s++ == c) n++;
05651             }
05652             return INT2NUM(n);
05653         }
05654         tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
05655     }
05656 
05657     s = RSTRING_PTR(str);
05658     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05659     send = RSTRING_END(str);
05660     ascompat = rb_enc_asciicompat(enc);
05661     i = 0;
05662     while (s < send) {
05663         unsigned int c;
05664 
05665         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05666             if (table[c]) {
05667                 i++;
05668             }
05669             s++;
05670         }
05671         else {
05672             int clen;
05673             c = rb_enc_codepoint_len(s, send, &clen, enc);
05674             if (tr_find(c, table, del, nodel)) {
05675                 i++;
05676             }
05677             s += clen;
05678         }
05679     }
05680 
05681     return INT2NUM(i);
05682 }
05683 
/* Lookup table indexed by byte value: 1 for HT, LF, VT, FF, CR (0x09-0x0d)
 * and SPACE (0x20), 0 for every other byte. */
static const char isspacetable[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* The cast keeps a negative char from indexing before the table. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
05704 
05705 /*
05706  *  call-seq:
05707  *     str.split(pattern=$;, [limit])   -> anArray
05708  *
05709  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
05710  *  of these substrings.
05711  *
05712  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
05713  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
05714  *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
05715  *  of contiguous whitespace characters ignored.
05716  *
05717  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
05718  *  pattern matches. Whenever the pattern matches a zero-length string,
05719  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
05720  *  groups, the respective matches will be returned in the array as well.
05721  *
05722  *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
05723  *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
05724  *  split on whitespace as if ` ' were specified.
05725  *
05726  *  If the <i>limit</i> parameter is omitted, trailing null fields are
05727  *  suppressed. If <i>limit</i> is a positive number, at most that number of
05728  *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
05729  *  string is returned as the only entry in an array). If negative, there is no
05730  *  limit to the number of fields returned, and trailing null fields are not
05731  *  suppressed.
05732  *
05733  *     " now's  the time".split        #=> ["now's", "the", "time"]
05734  *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
05735  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
05736  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
05737  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
05738  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
05739  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
05740  *
05741  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
05742  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
05743  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
05744  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
05745  */
05746 
05747 static VALUE
05748 rb_str_split_m(int argc, VALUE *argv, VALUE str)
05749 {
          /*
           * Splits str into an array of substrings.  argv carries up to two
           * optional arguments: the separator pattern (spat) and the field
           * limit.  One of three strategies is selected below:
           *   awk    - fields are separated by runs of whitespace,
           *   string - fields are separated by a literal substring,
           *   regexp - fields are separated by regexp matches.
           * `beg` always holds the byte offset of the first field that has
           * not been pushed yet; the shared tail code after the three
           * branches pushes the remainder starting there.
           */
05750     rb_encoding *enc;
05751     VALUE spat;
05752     VALUE limit;
05753     enum {awk, string, regexp} split_type;
05754     long beg, end, i = 0;
05755     int lim = 0;
05756     VALUE result, tmp;
05757 
          /*
           * Both arguments supplied.  A non-positive limit resets `limit` to
           * nil (no cap on the number of fields) while `lim` keeps the
           * numeric value for the checks at the end.  A limit of exactly 1
           * returns immediately: the whole string as the only element, or an
           * empty array for an empty string.  `i` is the field counter that
           * is later compared against `lim`.
           */
05758     if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
05759         lim = NUM2INT(limit);
05760         if (lim <= 0) limit = Qnil;
05761         else if (lim == 1) {
05762             if (RSTRING_LEN(str) == 0)
05763                 return rb_ary_new2(0);
05764             return rb_ary_new3(1, str);
05765         }
05766         i = 1;
05767     }
05768 
05769     enc = STR_ENC_GET(str);
          /* No pattern: use rb_fs when it is set, otherwise awk-style. */
05770     if (NIL_P(spat)) {
05771         if (!NIL_P(rb_fs)) {
05772             spat = rb_fs;
05773             goto fs_set;
05774         }
05775         split_type = awk;
05776     }
05777     else {
05778       fs_set:
05779         if (TYPE(spat) == T_STRING) {
05780             rb_encoding *enc2 = STR_ENC_GET(spat);
05781 
05782             split_type = string;
05783             if (RSTRING_LEN(spat) == 0) {
05784                 /* Special case - split into chars */
05785                 spat = rb_reg_regcomp(spat);
05786                 split_type = regexp;
05787             }
                  /* A pattern consisting of exactly one space selects awk mode. */
05788             else if (rb_enc_asciicompat(enc2) == 1) {
05789                 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
05790                     split_type = awk;
05791                 }
05792             }
05793             else {
                      /* Same single-space test for encodings that are not
                       * ASCII compatible: the first character must be ' ' and
                       * its byte length l must cover the whole pattern. */
05794                 int l;
05795                 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
05796                     RSTRING_LEN(spat) == l) {
05797                     split_type = awk;
05798                 }
05799             }
05800         }
05801         else {
                  /* NOTE(review): get_pat() presumably converts the argument
                   * to a Regexp -- its definition is not in this section. */
05802             spat = get_pat(spat, 1);
05803             split_type = regexp;
05804         }
05805     }
05806 
05807     result = rb_ary_new();
05808     beg = 0;
05809     if (split_type == awk) {
05810         char *ptr = RSTRING_PTR(str);
05811         char *eptr = RSTRING_END(str);
05812         char *bptr = ptr;
              /* skip is nonzero while whitespace in front of a field is being
               * consumed; it starts at 1 so leading whitespace is dropped. */
05813         int skip = 1;
05814         unsigned int c;
05815 
05816         end = beg;
              /* Byte-at-a-time path, taken when is_ascii_string(str) holds. */
05817         if (is_ascii_string(str)) {
05818             while (ptr < eptr) {
05819                 c = (unsigned char)*ptr++;
05820                 if (skip) {
05821                     if (ascii_isspace(c)) {
05822                         beg = ptr - bptr;
05823                     }
05824                     else {
05825                         end = ptr - bptr;
05826                         skip = 0;
                              /* Field limit reached at the start of a new
                               * field: stop, the tail code pushes str[beg..]. */
05827                         if (!NIL_P(limit) && lim <= i) break;
05828                     }
05829                 }
05830                 else if (ascii_isspace(c)) {
                          /* Whitespace ends the current field [beg, end). */
05831                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05832                     skip = 1;
05833                     beg = ptr - bptr;
05834                     if (!NIL_P(limit)) ++i;
05835                 }
05836                 else {
05837                     end = ptr - bptr;
05838                 }
05839             }
05840         }
05841         else {
                  /* Same state machine, decoding one codepoint of n bytes at
                   * a time with the string's encoding. */
05842             while (ptr < eptr) {
05843                 int n;
05844 
05845                 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
05846                 ptr += n;
05847                 if (skip) {
05848                     if (rb_isspace(c)) {
05849                         beg = ptr - bptr;
05850                     }
05851                     else {
05852                         end = ptr - bptr;
05853                         skip = 0;
05854                         if (!NIL_P(limit) && lim <= i) break;
05855                     }
05856                 }
05857                 else if (rb_isspace(c)) {
05858                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05859                     skip = 1;
05860                     beg = ptr - bptr;
05861                     if (!NIL_P(limit)) ++i;
05862                 }
05863                 else {
05864                     end = ptr - bptr;
05865                 }
05866             }
05867         }
05868     }
05869     else if (split_type == string) {
05870         char *ptr = RSTRING_PTR(str);
05871         char *temp = ptr;
05872         char *eptr = RSTRING_END(str);
05873         char *sptr = RSTRING_PTR(spat);
05874         long slen = RSTRING_LEN(spat);
05875 
              /* Literal search is refused when either operand is reported
               * broken by is_broken_string(). */
05876         if (is_broken_string(str)) {
05877             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
05878         }
05879         if (is_broken_string(spat)) {
05880             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
05881         }
05882         enc = rb_enc_check(str, spat);
              /* `end` is the offset of the next occurrence relative to ptr;
               * a negative result ends the loop. */
05883         while (ptr < eptr &&
05884                (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
05885             /* Check we are at the start of a char */
05886             char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
05887             if (t != ptr + end) {
                      /* The hit starts inside a multibyte character: resume
                       * the search from the next character head. */
05888                 ptr = t;
05889                 continue;
05890             }
05891             rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
05892             ptr += end + slen;
05893             if (!NIL_P(limit) && lim <= ++i) break;
05894         }
05895         beg = ptr - temp;
05896     }
05897     else {
05898         char *ptr = RSTRING_PTR(str);
05899         long len = RSTRING_LEN(str);
05900         long start = beg;
05901         long idx;
              /* last_null is 1 when the previous iteration saw an empty match
               * at the search start and advanced `start` by one character. */
05902         int last_null = 0;
05903         struct re_registers *regs;
05904 
05905         while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
05906             regs = RMATCH_REGS(rb_backref_get());
                  /* Empty match located exactly at the search start. */
05907             if (start == end && BEG(0) == END(0)) {
05908                 if (!ptr) {
05909                     rb_ary_push(result, str_new_empty(str));
05910                     break;
05911                 }
05912                 else if (last_null == 1) {
                          /* Second empty match in a row: emit the single
                           * character at beg as a field and move on. */
05913                     rb_ary_push(result, rb_str_subseq(str, beg,
05914                                                       rb_enc_fast_mbclen(ptr+beg,
05915                                                                          ptr+len,
05916                                                                          enc)));
05917                     beg = start;
05918                 }
05919                 else {
                          /* First empty match: step over one character (one
                           * byte at the end of the string) and search again
                           * without pushing anything yet. */
05920                     if (ptr+start == ptr+len)
05921                         start++;
05922                     else
05923                         start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
05924                     last_null = 1;
05925                     continue;
05926                 }
05927             }
05928             else {
                      /* Regular match: the field is everything before it and
                       * scanning resumes right after the match. */
05929                 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05930                 beg = start = END(0);
05931             }
05932             last_null = 0;
05933 
                  /* Capture groups are appended to the result as well; groups
                   * whose start offset is -1 are skipped. */
05934             for (idx=1; idx < regs->num_regs; idx++) {
05935                 if (BEG(idx) == -1) continue;
05936                 if (BEG(idx) == END(idx))
05937                     tmp = str_new_empty(str);
05938                 else
05939                     tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
05940                 rb_ary_push(result, tmp);
05941             }
05942             if (!NIL_P(limit) && lim <= ++i) break;
05943         }
05944     }
          /*
           * Push the remainder str[beg..] for a non-empty str when a positive
           * limit is active, when unconsumed bytes remain, or when the limit
           * was negative (in which case a trailing empty field is kept).
           */
05945     if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
05946         if (RSTRING_LEN(str) == beg)
05947             tmp = str_new_empty(str);
05948         else
05949             tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
05950         rb_ary_push(result, tmp);
05951     }
          /* Limit omitted or given as 0: drop trailing empty strings. */
05952     if (NIL_P(limit) && lim == 0) {
05953         long len;
05954         while ((len = RARRAY_LEN(result)) > 0 &&
05955                (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
05956             rb_ary_pop(result);
05957     }
05958 
05959     return result;
05960 }
05961 
05962 VALUE
05963 rb_str_split(VALUE str, const char *sep0)
05964 {
05965     VALUE sep;
05966 
05967     StringValue(str);
05968     sep = rb_str_new2(sep0);
05969     return rb_str_split_m(1, &sep, str);
05970 }
05971 
05972 
05973 /*
05974  *  call-seq:
05975  *     str.each_line(separator=$/) {|substr| block }   -> str
05976  *     str.each_line(separator=$/)                     -> an_enumerator
05977  *
05978  *     str.lines(separator=$/) {|substr| block }       -> str
05979  *     str.lines(separator=$/)                         -> an_enumerator
05980  *
05981  *  Splits <i>str</i> using the supplied parameter as the record separator
05982  *  (<code>$/</code> by default), passing each substring in turn to the supplied
05983  *  block. If a zero-length record separator is supplied, the string is split
05984  *  into paragraphs delimited by multiple successive newlines.
05985  *
05986  *  If no block is given, an enumerator is returned instead.
05987  *
05988  *     print "Example one\n"
05989  *     "hello\nworld".each_line {|s| p s}
05990  *     print "Example two\n"
05991  *     "hello\nworld".each_line('l') {|s| p s}
05992  *     print "Example three\n"
05993  *     "hello\n\n\nworld".each_line('') {|s| p s}
05994  *
05995  *  <em>produces:</em>
05996  *
05997  *     Example one
05998  *     "hello\n"
05999  *     "world"
06000  *     Example two
06001  *     "hel"
06002  *     "l"
06003  *     "o\nworl"
06004  *     "d"
06005  *     Example three
06006  *     "hello\n\n\n"
06007  *     "world"
06008  */
06009 
06010 static VALUE
06011 rb_str_each_line(int argc, VALUE *argv, VALUE str)
06012 {
          /*
           * Yields each record of str, as delimited by the record separator
           * rs, and returns the original receiver.  Three scanning modes
           * exist: a fast path for the default separator, paragraph mode for
           * an empty separator, and a generic separator comparison.
           */
06013     rb_encoding *enc;
06014     VALUE rs;
06015     unsigned int newline;
06016     const char *p, *pend, *s, *ptr;
06017     long len, rslen;
06018     VALUE line;
06019     int n;
06020     VALUE orig = str;
06021 
          /* The separator defaults to the global rb_rs. */
06022     if (argc == 0) {
06023         rs = rb_rs;
06024     }
06025     else {
06026         rb_scan_args(argc, argv, "01", &rs);
06027     }
06028     RETURN_ENUMERATOR(str, argc, argv);
          /* A nil separator means the whole string is the single record. */
06029     if (NIL_P(rs)) {
06030         rb_yield(str);
06031         return orig;
06032     }
          /*
           * Iteration runs over the value returned by rb_str_new4() instead
           * of the receiver; ptr and len are remembered for the
           * str_mod_check() calls made after every yield.
           * NOTE(review): presumably rb_str_new4 gives a frozen snapshot so
           * that the block cannot invalidate the pointers -- confirm.
           * s marks the start of the record being assembled, p the scan
           * position.
           */
06033     str = rb_str_new4(str);
06034     ptr = p = s = RSTRING_PTR(str);
06035     pend = p + RSTRING_LEN(str);
06036     len = RSTRING_LEN(str);
06037     StringValue(rs);
06038     if (rs == rb_default_rs) {
              /*
               * Fast path for the default separator: locate '\n' bytes with
               * memchr(), then let the encoding confirm that the byte belongs
               * to a real newline character rather than to some other
               * multibyte character.
               */
06039         enc = rb_enc_get(str);
06040         while (p < pend) {
06041             char *p0;
06042 
06043             p = memchr(p, '\n', pend - p);
06044             if (!p) break;
06045             p0 = rb_enc_left_char_head(s, p, pend, enc);
06046             if (!rb_enc_is_newline(p0, pend, enc)) {
06047                 p++;
06048                 continue;
06049             }
                  /* The record ends right after the newline character. */
06050             p = p0 + rb_enc_mbclen(p0, pend, enc);
06051             line = rb_str_new5(str, s, p - s);
06052             OBJ_INFECT(line, str);
06053             rb_enc_cr_str_copy_for_substr(line, str);
06054             rb_yield(line);
06055             str_mod_check(str, ptr, len);
06056             s = p;
06057         }
06058         goto finish;
06059     }
06060 
06061     enc = rb_enc_check(str, rs);
06062     rslen = RSTRING_LEN(rs);
          /*
           * `newline` is the codepoint that can start a separator: '\n' in
           * paragraph mode (empty rs), otherwise the first codepoint of rs.
           */
06063     if (rslen == 0) {
06064         newline = '\n';
06065     }
06066     else {
06067         newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
06068     }
06069 
06070     while (p < pend) {
06071         unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
06072 
06073       again:
              /*
               * Paragraph mode: step past the newline and look at what
               * follows.  If it is not another newline, re-examine that
               * character (goto again) without ending the record; otherwise
               * swallow the whole run of newlines and leave p on the last
               * one so the generic code below emits the record.
               */
06074         if (rslen == 0 && c == newline) {
06075             p += n;
06076             if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
06077                 goto again;
06078             }
06079             while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
06080                 p += n;
06081             }
06082             p -= n;
06083         }
              /*
               * The first-codepoint comparison filters candidates; separators
               * longer than one byte are then verified with memcmp().  The
               * record includes the separator (rslen bytes, or the n bytes of
               * the current character in paragraph mode).
               */
06084         if (c == newline &&
06085             (rslen <= 1 ||
06086              (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
06087             line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
06088             OBJ_INFECT(line, str);
06089             rb_enc_cr_str_copy_for_substr(line, str);
06090             rb_yield(line);
06091             str_mod_check(str, ptr, len);
06092             s = p + (rslen ? rslen : n);
06093         }
06094         p += n;
06095     }
06096 
06097   finish:
          /* Yield whatever follows the last separator, if anything. */
06098     if (s != pend) {
06099         line = rb_str_new5(str, s, pend - s);
06100         OBJ_INFECT(line, str);
06101         rb_enc_cr_str_copy_for_substr(line, str);
06102         rb_yield(line);
06103     }
06104 
06105     return orig;
06106 }
06107 
06108 
06109 /*
06110  *  call-seq:
06111  *     str.bytes {|fixnum| block }        -> str
06112  *     str.bytes                          -> an_enumerator
06113  *
06114  *     str.each_byte {|fixnum| block }    -> str
06115  *     str.each_byte                      -> an_enumerator
06116  *
06117  *  Passes each byte in <i>str</i> to the given block, or returns
06118  *  an enumerator if no block is given.
06119  *
06120  *     "hello".each_byte {|c| print c, ' ' }
06121  *
06122  *  <em>produces:</em>
06123  *
06124  *     104 101 108 108 111
06125  */
06126 
06127 static VALUE
06128 rb_str_each_byte(VALUE str)
06129 {
06130     long i;
06131 
06132     RETURN_ENUMERATOR(str, 0, 0);
06133     for (i=0; i<RSTRING_LEN(str); i++) {
06134         rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
06135     }
06136     return str;
06137 }
06138 
06139 
06140 /*
06141  *  call-seq:
06142  *     str.chars {|cstr| block }        -> str
06143  *     str.chars                        -> an_enumerator
06144  *
06145  *     str.each_char {|cstr| block }    -> str
06146  *     str.each_char                    -> an_enumerator
06147  *
06148  *  Passes each character in <i>str</i> to the given block, or returns
06149  *  an enumerator if no block is given.
06150  *
06151  *     "hello".each_char {|c| print c, ' ' }
06152  *
06153  *  <em>produces:</em>
06154  *
06155  *     h e l l o
06156  */
06157 
06158 static VALUE
06159 rb_str_each_char(VALUE str)
06160 {
06161     VALUE orig = str;
06162     long i, len, n;
06163     const char *ptr;
06164     rb_encoding *enc;
06165 
06166     RETURN_ENUMERATOR(str, 0, 0);
06167     str = rb_str_new4(str);
06168     ptr = RSTRING_PTR(str);
06169     len = RSTRING_LEN(str);
06170     enc = rb_enc_get(str);
06171     switch (ENC_CODERANGE(str)) {
06172       case ENC_CODERANGE_VALID:
06173       case ENC_CODERANGE_7BIT:
06174         for (i = 0; i < len; i += n) {
06175             n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
06176             rb_yield(rb_str_subseq(str, i, n));
06177         }
06178         break;
06179       default:
06180         for (i = 0; i < len; i += n) {
06181             n = rb_enc_mbclen(ptr + i, ptr + len, enc);
06182             rb_yield(rb_str_subseq(str, i, n));
06183         }
06184     }
06185     return orig;
06186 }
06187 
06188 /*
06189  *  call-seq:
06190  *     str.codepoints {|integer| block }        -> str
06191  *     str.codepoints                           -> an_enumerator
06192  *
06193  *     str.each_codepoint {|integer| block }    -> str
06194  *     str.each_codepoint                       -> an_enumerator
06195  *
06196  *  Passes the <code>Integer</code> ordinal of each character in <i>str</i>,
06197  *  also known as a <i>codepoint</i> when applied to Unicode strings, to the
06198  *  given block.
06199  *
06200  *  If no block is given, an enumerator is returned instead.
06201  *
06202  *     "hello\u0639".each_codepoint {|c| print c, ' ' }
06203  *
06204  *  <em>produces:</em>
06205  *
06206  *     104 101 108 108 111 1593
06207  */
06208 
06209 static VALUE
06210 rb_str_each_codepoint(VALUE str)
06211 {
06212     VALUE orig = str;
06213     int n;
06214     unsigned int c;
06215     const char *ptr, *end;
06216     rb_encoding *enc;
06217 
06218     if (single_byte_optimizable(str)) return rb_str_each_byte(str);
06219     RETURN_ENUMERATOR(str, 0, 0);
06220     str = rb_str_new4(str);
06221     ptr = RSTRING_PTR(str);
06222     end = RSTRING_END(str);
06223     enc = STR_ENC_GET(str);
06224     while (ptr < end) {
06225         c = rb_enc_codepoint_len(ptr, end, &n, enc);
06226         rb_yield(UINT2NUM(c));
06227         ptr += n;
06228     }
06229     return orig;
06230 }
06231 
06232 static long
06233 chopped_length(VALUE str)
06234 {
06235     rb_encoding *enc = STR_ENC_GET(str);
06236     const char *p, *p2, *beg, *end;
06237 
06238     beg = RSTRING_PTR(str);
06239     end = beg + RSTRING_LEN(str);
06240     if (beg > end) return 0;
06241     p = rb_enc_prev_char(beg, end, end, enc);
06242     if (!p) return 0;
06243     if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
06244         p2 = rb_enc_prev_char(beg, p, end, enc);
06245         if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
06246     }
06247     return p - beg;
06248 }
06249 
06250 /*
06251  *  call-seq:
06252  *     str.chop!   -> str or nil
06253  *
06254  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
06255  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
06256  *  <code>String#chomp!</code>.
06257  */
06258 
06259 static VALUE
06260 rb_str_chop_bang(VALUE str)
06261 {
06262     str_modify_keep_cr(str);
06263     if (RSTRING_LEN(str) > 0) {
06264         long len;
06265         len = chopped_length(str);
06266         STR_SET_LEN(str, len);
06267         RSTRING_PTR(str)[len] = '\0';
06268         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06269             ENC_CODERANGE_CLEAR(str);
06270         }
06271         return str;
06272     }
06273     return Qnil;
06274 }
06275 
06276 
06277 /*
06278  *  call-seq:
06279  *     str.chop   -> new_str
06280  *
06281  *  Returns a new <code>String</code> with the last character removed.  If the
06282  *  string ends with <code>\r\n</code>, both characters are removed. Applying
06283  *  <code>chop</code> to an empty string returns an empty
06284  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
06285  *  the string unchanged if it doesn't end in a record separator.
06286  *
06287  *     "string\r\n".chop   #=> "string"
06288  *     "string\n\r".chop   #=> "string\n"
06289  *     "string\n".chop     #=> "string"
06290  *     "string".chop       #=> "strin"
06291  *     "x".chop.chop       #=> ""
06292  */
06293 
06294 static VALUE
06295 rb_str_chop(VALUE str)
06296 {
06297     VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
06298     rb_enc_cr_str_copy_for_substr(str2, str);
06299     OBJ_INFECT(str2, str);
06300     return str2;
06301 }
06302 
06303 
06304 /*
06305  *  call-seq:
06306  *     str.chomp!(separator=$/)   -> str or nil
06307  *
06308  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
06309  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
06310  */
06311 
06312 static VALUE
06313 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
06314 {
          /*
           * Removes a trailing record separator from str in place.  Returns
           * str when something was removed and nil otherwise.  The separator
           * is the optional argument, or the global rb_rs when no argument is
           * given.
           */
06315     rb_encoding *enc;
06316     VALUE rs;
06317     int newline;
06318     char *p, *pp, *e;
06319     long len, rslen;
06320 
          /* NOTE(review): str_modify_keep_cr() presumably makes str writable
           * while keeping its cached coderange -- defined outside this
           * section, confirm. */
06321     str_modify_keep_cr(str);
06322     len = RSTRING_LEN(str);
06323     if (len == 0) return Qnil;
06324     p = RSTRING_PTR(str);
06325     e = p + len;
06326     if (argc == 0) {
06327         rs = rb_rs;
06328         if (rs == rb_default_rs) {
                  /*
                   * "Smart" chomp, also entered by goto when the separator is
                   * exactly "\n": strips one trailing "\n", "\r\n" or "\r".
                   */
06329           smart_chomp:
06330             enc = rb_enc_get(str);
06331             if (rb_enc_mbminlen(enc) > 1) {
                      /* Encodings whose minimum character width exceeds one
                       * byte: test the last character for a newline, then the
                       * character before the (possibly moved) end for "\r". */
06332                 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
06333                 if (rb_enc_is_newline(pp, e, enc)) {
06334                     e = pp;
06335                 }
06336                 pp = e - rb_enc_mbminlen(enc);
06337                 if (pp >= p) {
06338                     pp = rb_enc_left_char_head(p, pp, e, enc);
06339                     if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
06340                         e = pp;
06341                     }
06342                 }
                      /* e did not move: nothing was stripped. */
06343                 if (e == RSTRING_END(str)) {
06344                     return Qnil;
06345                 }
06346                 len = e - RSTRING_PTR(str);
06347                 STR_SET_LEN(str, len);
06348             }
06349             else {
                      /* Byte-wise check: "\n" (optionally preceded by "\r"),
                       * or a lone trailing "\r". */
06350                 if (RSTRING_PTR(str)[len-1] == '\n') {
06351                     STR_DEC_LEN(str);
06352                     if (RSTRING_LEN(str) > 0 &&
06353                         RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
06354                         STR_DEC_LEN(str);
06355                     }
06356                 }
06357                 else if (RSTRING_PTR(str)[len-1] == '\r') {
06358                     STR_DEC_LEN(str);
06359                 }
06360                 else {
06361                     return Qnil;
06362                 }
06363             }
06364             RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06365             return str;
06366         }
              /* rb_rs differs from the default separator: fall through to the
               * generic handling below with rs = rb_rs. */
06367     }
06368     else {
06369         rb_scan_args(argc, argv, "01", &rs);
06370     }
          /* A nil separator removes nothing. */
06371     if (NIL_P(rs)) return Qnil;
06372     StringValue(rs);
06373     rslen = RSTRING_LEN(rs);
06374     if (rslen == 0) {
              /* Empty separator: strip every trailing "\n" and "\r\n". */
06375         while (len>0 && p[len-1] == '\n') {
06376             len--;
06377             if (len>0 && p[len-1] == '\r')
06378                 len--;
06379         }
06380         if (len < RSTRING_LEN(str)) {
06381             STR_SET_LEN(str, len);
06382             RSTRING_PTR(str)[len] = '\0';
06383             return str;
06384         }
06385         return Qnil;
06386     }
          /* A separator longer than the string cannot match. */
06387     if (rslen > len) return Qnil;
          /* `newline` holds the last byte of the separator; a separator that
           * is exactly "\n" is handled by the smart chomp above. */
06388     newline = RSTRING_PTR(rs)[rslen-1];
06389     if (rslen == 1 && newline == '\n')
06390         goto smart_chomp;
06391 
06392     enc = rb_enc_check(str, rs);
06393     if (is_broken_string(rs)) {
06394         return Qnil;
06395     }
          /* pp is where the separator would have to start. */
06396     pp = e - rslen;
          /* The last-byte comparison filters candidates; separators longer
           * than one byte are verified with memcmp(). */
06397     if (p[len-1] == newline &&
06398         (rslen <= 1 ||
06399          memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
              /* Reject a match that starts in the middle of a character. */
06400         if (rb_enc_left_char_head(p, pp, e, enc) != pp)
06401             return Qnil;
              /* Only a 7-bit coderange is kept across the truncation. */
06402         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06403             ENC_CODERANGE_CLEAR(str);
06404         }
06405         STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
06406         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06407         return str;
06408     }
06409     return Qnil;
06410 }
06411 
06412 
06413 /*
06414  *  call-seq:
06415  *     str.chomp(separator=$/)   -> new_str
06416  *
06417  *  Returns a new <code>String</code> with the given record separator removed
06418  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
06419  *  changed from the default Ruby record separator, then <code>chomp</code> also
06420  *  removes carriage return characters (that is it will remove <code>\n</code>,
06421  *  <code>\r</code>, and <code>\r\n</code>).
06422  *
06423  *     "hello".chomp            #=> "hello"
06424  *     "hello\n".chomp          #=> "hello"
06425  *     "hello\r\n".chomp        #=> "hello"
06426  *     "hello\n\r".chomp        #=> "hello\n"
06427  *     "hello\r".chomp          #=> "hello"
06428  *     "hello \n there".chomp   #=> "hello \n there"
06429  *     "hello".chomp("llo")     #=> "he"
06430  */
06431 
06432 static VALUE
06433 rb_str_chomp(int argc, VALUE *argv, VALUE str)
06434 {
06435     str = rb_str_dup(str);
06436     rb_str_chomp_bang(argc, argv, str);
06437     return str;
06438 }
06439 
06440 /*
06441  *  call-seq:
06442  *     str.lstrip!   -> self or nil
06443  *
06444  *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
06445  *  change was made. See also <code>String#rstrip!</code> and
06446  *  <code>String#strip!</code>.
06447  *
06448  *     "  hello  ".lstrip!  #=> "hello  "
06449  *     "hello".lstrip!      #=> nil
06450  */
06451 
06452 static VALUE
06453 rb_str_lstrip_bang(VALUE str)
06454 {
06455     rb_encoding *enc;
06456     char *s, *t, *e;
06457 
06458     str_modify_keep_cr(str);
06459     enc = STR_ENC_GET(str);
06460     s = RSTRING_PTR(str);
06461     if (!s || RSTRING_LEN(str) == 0) return Qnil;
06462     e = t = RSTRING_END(str);
06463     /* remove spaces at head */
06464     while (s < e) {
06465         int n;
06466         unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
06467 
06468         if (!rb_isspace(cc)) break;
06469         s += n;
06470     }
06471 
06472     if (s > RSTRING_PTR(str)) {
06473         STR_SET_LEN(str, t-s);
06474         memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
06475         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06476         return str;
06477     }
06478     return Qnil;
06479 }
06480 
06481 
06482 /*
06483  *  call-seq:
06484  *     str.lstrip   -> new_str
06485  *
06486  *  Returns a copy of <i>str</i> with leading whitespace removed. See also
06487  *  <code>String#rstrip</code> and <code>String#strip</code>.
06488  *
06489  *     "  hello  ".lstrip   #=> "hello  "
06490  *     "hello".lstrip       #=> "hello"
06491  */
06492 
06493 static VALUE
06494 rb_str_lstrip(VALUE str)
06495 {
06496     str = rb_str_dup(str);
06497     rb_str_lstrip_bang(str);
06498     return str;
06499 }
06500 
06501 
06502 /*
06503  *  call-seq:
06504  *     str.rstrip!   -> self or nil
06505  *
06506  *  Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
06507  *  no change was made. See also <code>String#lstrip!</code> and
06508  *  <code>String#strip!</code>.
06509  *
06510  *     "  hello  ".rstrip!  #=> "  hello"
06511  *     "hello".rstrip!      #=> nil
06512  */
06513 
06514 static VALUE
06515 rb_str_rstrip_bang(VALUE str)
06516 {
06517     rb_encoding *enc;
06518     char *s, *t, *e;
06519 
06520     str_modify_keep_cr(str);
06521     enc = STR_ENC_GET(str);
06522     rb_str_check_dummy_enc(enc);
06523     s = RSTRING_PTR(str);
06524     if (!s || RSTRING_LEN(str) == 0) return Qnil;
06525     t = e = RSTRING_END(str);
06526 
06527     /* remove trailing spaces or '\0's */
06528     if (single_byte_optimizable(str)) {
06529         unsigned char c;
06530         while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
06531     }
06532     else {
06533         char *tp;
06534 
06535         while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
06536             unsigned int c = rb_enc_codepoint(tp, e, enc);
06537             if (c && !rb_isspace(c)) break;
06538             t = tp;
06539         }
06540     }
06541     if (t < e) {
06542         long len = t-RSTRING_PTR(str);
06543 
06544         STR_SET_LEN(str, len);
06545         RSTRING_PTR(str)[len] = '\0';
06546         return str;
06547     }
06548     return Qnil;
06549 }
06550 
06551 
06552 /*
06553  *  call-seq:
06554  *     str.rstrip   -> new_str
06555  *
06556  *  Returns a copy of <i>str</i> with trailing whitespace removed. See also
06557  *  <code>String#lstrip</code> and <code>String#strip</code>.
06558  *
06559  *     "  hello  ".rstrip   #=> "  hello"
06560  *     "hello".rstrip       #=> "hello"
06561  */
06562 
06563 static VALUE
06564 rb_str_rstrip(VALUE str)
06565 {
06566     str = rb_str_dup(str);
06567     rb_str_rstrip_bang(str);
06568     return str;
06569 }
06570 
06571 
06572 /*
06573  *  call-seq:
06574  *     str.strip!   -> str or nil
06575  *
06576  *  Removes leading and trailing whitespace from <i>str</i>. Returns
06577  *  <code>nil</code> if <i>str</i> was not altered.
06578  */
06579 
06580 static VALUE
06581 rb_str_strip_bang(VALUE str)
06582 {
06583     VALUE l = rb_str_lstrip_bang(str);
06584     VALUE r = rb_str_rstrip_bang(str);
06585 
06586     if (NIL_P(l) && NIL_P(r)) return Qnil;
06587     return str;
06588 }
06589 
06590 
06591 /*
06592  *  call-seq:
06593  *     str.strip   -> new_str
06594  *
06595  *  Returns a copy of <i>str</i> with leading and trailing whitespace removed.
06596  *
06597  *     "    hello    ".strip   #=> "hello"
06598  *     "\tgoodbye\r\n".strip   #=> "goodbye"
06599  */
06600 
06601 static VALUE
06602 rb_str_strip(VALUE str)
06603 {
06604     str = rb_str_dup(str);
06605     rb_str_strip_bang(str);
06606     return str;
06607 }
06608 
06609 static VALUE
06610 scan_once(VALUE str, VALUE pat, long *start)
06611 {
06612     VALUE result, match;
06613     struct re_registers *regs;
06614     int i;
06615 
06616     if (rb_reg_search(pat, str, *start, 0) >= 0) {
06617         match = rb_backref_get();
06618         regs = RMATCH_REGS(match);
06619         if (BEG(0) == END(0)) {
06620             rb_encoding *enc = STR_ENC_GET(str);
06621             /*
06622              * Always consume at least one character of the input string
06623              */
06624             if (RSTRING_LEN(str) > END(0))
06625                 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
06626                                                    RSTRING_END(str), enc);
06627             else
06628                 *start = END(0)+1;
06629         }
06630         else {
06631             *start = END(0);
06632         }
06633         if (regs->num_regs == 1) {
06634             return rb_reg_nth_match(0, match);
06635         }
06636         result = rb_ary_new2(regs->num_regs);
06637         for (i=1; i < regs->num_regs; i++) {
06638             rb_ary_push(result, rb_reg_nth_match(i, match));
06639         }
06640 
06641         return result;
06642     }
06643     return Qnil;
06644 }
06645 
06646 
06647 /*
06648  *  call-seq:
06649  *     str.scan(pattern)                         -> array
06650  *     str.scan(pattern) {|match, ...| block }   -> str
06651  *
06652  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
06653  *  <code>Regexp</code> or a <code>String</code>). For each match, a result is
06654  *  generated and either added to the result array or passed to the block. If
06655  *  the pattern contains no groups, each individual result consists of the
06656  *  matched string, <code>$&</code>.  If the pattern contains groups, each
06657  *  individual result is itself an array containing one entry per group.
06658  *
06659  *     a = "cruel world"
06660  *     a.scan(/\w+/)        #=> ["cruel", "world"]
06661  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
06662  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
06663  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
06664  *
06665  *  And the block form:
06666  *
06667  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
06668  *     print "\n"
06669  *     a.scan(/(.)(.)/) {|x,y| print y, x }
06670  *     print "\n"
06671  *
06672  *  <em>produces:</em>
06673  *
06674  *     <<cruel>> <<world>>
06675  *     rceu lowlr
06676  */
06677 
06678 static VALUE
06679 rb_str_scan(VALUE str, VALUE pat)
06680 {
06681     VALUE result;
06682     long start = 0;
06683     long last = -1, prev = 0;
06684     char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
06685 
06686     pat = get_pat(pat, 1);
06687     if (!rb_block_given_p()) {
06688         VALUE ary = rb_ary_new();
06689 
06690         while (!NIL_P(result = scan_once(str, pat, &start))) {
06691             last = prev;
06692             prev = start;
06693             rb_ary_push(ary, result);
06694         }
06695         if (last >= 0) rb_reg_search(pat, str, last, 0);
06696         return ary;
06697     }
06698 
06699     while (!NIL_P(result = scan_once(str, pat, &start))) {
06700         last = prev;
06701         prev = start;
06702         rb_yield(result);
06703         str_mod_check(str, p, len);
06704     }
06705     if (last >= 0) rb_reg_search(pat, str, last, 0);
06706     return str;
06707 }
06708 
06709 
06710 /*
06711  *  call-seq:
06712  *     str.hex   -> integer
06713  *
06714  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
06715  *  (with an optional sign and an optional <code>0x</code>) and returns the
06716  *  corresponding number. Zero is returned on error.
06717  *
06718  *     "0x0a".hex     #=> 10
06719  *     "-1234".hex    #=> -4660
06720  *     "0".hex        #=> 0
06721  *     "wombat".hex   #=> 0
06722  */
06723 
06724 static VALUE
06725 rb_str_hex(VALUE str)
06726 {
06727     rb_encoding *enc = rb_enc_get(str);
06728 
06729     if (!rb_enc_asciicompat(enc)) {
06730         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06731     }
06732     return rb_str_to_inum(str, 16, FALSE);
06733 }
06734 
06735 
06736 /*
06737  *  call-seq:
06738  *     str.oct   -> integer
06739  *
06740  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
06741  *  optional sign) and returns the corresponding number.  Returns 0 if the
06742  *  conversion fails.
06743  *
06744  *     "123".oct       #=> 83
06745  *     "-377".oct      #=> -255
06746  *     "bad".oct       #=> 0
06747  *     "0377bad".oct   #=> 255
06748  */
06749 
06750 static VALUE
06751 rb_str_oct(VALUE str)
06752 {
06753     rb_encoding *enc = rb_enc_get(str);
06754 
06755     if (!rb_enc_asciicompat(enc)) {
06756         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06757     }
06758     return rb_str_to_inum(str, -8, FALSE);
06759 }
06760 
06761 
06762 /*
06763  *  call-seq:
06764  *     str.crypt(other_str)   -> new_str
06765  *
06766  *  Applies a one-way cryptographic hash to <i>str</i> by invoking the standard
06767  *  library function <code>crypt</code>. The argument is the salt string, which
06768  *  should be two characters long, each character drawn from
06769  *  <code>[a-zA-Z0-9./]</code>.
06770  */
06771 
06772 static VALUE
06773 rb_str_crypt(VALUE str, VALUE salt)
06774 {
06775     extern char *crypt(const char *, const char *);
06776     VALUE result;
06777     const char *s, *saltp;
06778 #ifdef BROKEN_CRYPT
06779     char salt_8bit_clean[3];
06780 #endif
06781 
06782     StringValue(salt);
06783     if (RSTRING_LEN(salt) < 2)
06784         rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
06785 
06786     s = RSTRING_PTR(str);
06787     if (!s) s = "";
06788     saltp = RSTRING_PTR(salt);
06789 #ifdef BROKEN_CRYPT
06790     if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
06791         salt_8bit_clean[0] = saltp[0] & 0x7f;
06792         salt_8bit_clean[1] = saltp[1] & 0x7f;
06793         salt_8bit_clean[2] = '\0';
06794         saltp = salt_8bit_clean;
06795     }
06796 #endif
06797     result = rb_str_new2(crypt(s, saltp));
06798     OBJ_INFECT(result, str);
06799     OBJ_INFECT(result, salt);
06800     return result;
06801 }
06802 
06803 
06804 /*
06805  *  call-seq:
06806  *     str.intern   -> symbol
06807  *     str.to_sym   -> symbol
06808  *
06809  *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
06810  *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
06811  *
06812  *     "Koala".intern         #=> :Koala
06813  *     s = 'cat'.to_sym       #=> :cat
06814  *     s == :cat              #=> true
06815  *     s = '@cat'.to_sym      #=> :@cat
06816  *     s == :@cat             #=> true
06817  *
06818  *  This can also be used to create symbols that cannot be represented using the
06819  *  <code>:xxx</code> notation.
06820  *
06821  *     'cat and dog'.to_sym   #=> :"cat and dog"
06822  */
06823 
06824 VALUE
06825 rb_str_intern(VALUE s)
06826 {
06827     VALUE str = RB_GC_GUARD(s);
06828     ID id;
06829 
06830     id = rb_intern_str(str);
06831     return ID2SYM(id);
06832 }
06833 
06834 
06835 /*
06836  *  call-seq:
06837  *     str.ord   -> integer
06838  *
06839  *  Return the <code>Integer</code> ordinal of a one-character string.
06840  *
06841  *     "a".ord         #=> 97
06842  */
06843 
06844 VALUE
06845 rb_str_ord(VALUE s)
06846 {
06847     unsigned int c;
06848 
06849     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
06850     return UINT2NUM(c);
06851 }
06852 /*
06853  *  call-seq:
06854  *     str.sum(n=16)   -> integer
06855  *
06856  *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
06857  *  where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
06858  *  to 16. The result is simply the sum of the binary value of each character in
06859  *  <i>str</i> modulo <code>2**n - 1</code>. This is not a particularly good
06860  *  checksum.
06861  */
06862 
06863 static VALUE
06864 rb_str_sum(int argc, VALUE *argv, VALUE str)
06865 {
06866     VALUE vbits;
06867     int bits;
06868     char *ptr, *p, *pend;
06869     long len;
06870     VALUE sum = INT2FIX(0);
06871     unsigned long sum0 = 0;
06872 
06873     if (argc == 0) {
06874         bits = 16;
06875     }
06876     else {
06877         rb_scan_args(argc, argv, "01", &vbits);
06878         bits = NUM2INT(vbits);
06879     }
06880     ptr = p = RSTRING_PTR(str);
06881     len = RSTRING_LEN(str);
06882     pend = p + len;
06883 
06884     while (p < pend) {
06885         if (FIXNUM_MAX - UCHAR_MAX < sum0) {
06886             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06887             str_mod_check(str, ptr, len);
06888             sum0 = 0;
06889         }
06890         sum0 += (unsigned char)*p;
06891         p++;
06892     }
06893 
06894     if (bits == 0) {
06895         if (sum0) {
06896             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06897         }
06898     }
06899     else {
06900         if (sum == INT2FIX(0)) {
06901             if (bits < (int)sizeof(long)*CHAR_BIT) {
06902                 sum0 &= (((unsigned long)1)<<bits)-1;
06903             }
06904             sum = LONG2FIX(sum0);
06905         }
06906         else {
06907             VALUE mod;
06908 
06909             if (sum0) {
06910                 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06911             }
06912 
06913             mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
06914             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
06915             sum = rb_funcall(sum, '&', 1, mod);
06916         }
06917     }
06918     return sum;
06919 }
06920 
06921 static VALUE
06922 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
06923 {
06924     rb_encoding *enc;
06925     VALUE w;
06926     long width, len, flen = 1, fclen = 1;
06927     VALUE res;
06928     char *p;
06929     const char *f = " ";
06930     long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
06931     volatile VALUE pad;
06932     int singlebyte = 1, cr;
06933 
06934     rb_scan_args(argc, argv, "11", &w, &pad);
06935     enc = STR_ENC_GET(str);
06936     width = NUM2LONG(w);
06937     if (argc == 2) {
06938         StringValue(pad);
06939         enc = rb_enc_check(str, pad);
06940         f = RSTRING_PTR(pad);
06941         flen = RSTRING_LEN(pad);
06942         fclen = str_strlen(pad, enc);
06943         singlebyte = single_byte_optimizable(pad);
06944         if (flen == 0 || fclen == 0) {
06945             rb_raise(rb_eArgError, "zero width padding");
06946         }
06947     }
06948     len = str_strlen(str, enc);
06949     if (width < 0 || len >= width) return rb_str_dup(str);
06950     n = width - len;
06951     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
06952     rlen = n - llen;
06953     cr = ENC_CODERANGE(str);
06954     if (flen > 1) {
06955        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
06956        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
06957     }
06958     size = RSTRING_LEN(str);
06959     if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
06960        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
06961        (len += llen2 + rlen2) >= LONG_MAX - size) {
06962        rb_raise(rb_eArgError, "argument too big");
06963     }
06964     len += size;
06965     res = rb_str_new5(str, 0, len);
06966     p = RSTRING_PTR(res);
06967     if (flen <= 1) {
06968        memset(p, *f, llen);
06969        p += llen;
06970     }
06971     else {
06972        while (llen >= fclen) {
06973             memcpy(p,f,flen);
06974             p += flen;
06975             llen -= fclen;
06976         }
06977        if (llen > 0) {
06978            memcpy(p, f, llen2);
06979            p += llen2;
06980         }
06981     }
06982     memcpy(p, RSTRING_PTR(str), size);
06983     p += size;
06984     if (flen <= 1) {
06985        memset(p, *f, rlen);
06986        p += rlen;
06987     }
06988     else {
06989        while (rlen >= fclen) {
06990             memcpy(p,f,flen);
06991             p += flen;
06992             rlen -= fclen;
06993         }
06994        if (rlen > 0) {
06995            memcpy(p, f, rlen2);
06996            p += rlen2;
06997         }
06998     }
06999     *p = '\0';
07000     STR_SET_LEN(res, p-RSTRING_PTR(res));
07001     OBJ_INFECT(res, str);
07002     if (!NIL_P(pad)) OBJ_INFECT(res, pad);
07003     rb_enc_associate(res, enc);
07004     if (argc == 2)
07005         cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
07006     if (cr != ENC_CODERANGE_BROKEN)
07007         ENC_CODERANGE_SET(res, cr);
07008     return res;
07009 }
07010 
07011 
07012 /*
07013  *  call-seq:
07014  *     str.ljust(integer, padstr=' ')   -> new_str
07015  *
07016  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
07017  *  <code>String</code> of length <i>integer</i> with <i>str</i> left justified
07018  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
07019  *
07020  *     "hello".ljust(4)            #=> "hello"
07021  *     "hello".ljust(20)           #=> "hello               "
07022  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
07023  */
07024 
07025 static VALUE
07026 rb_str_ljust(int argc, VALUE *argv, VALUE str)
07027 {
07028     return rb_str_justify(argc, argv, str, 'l');
07029 }
07030 
07031 
07032 /*
07033  *  call-seq:
07034  *     str.rjust(integer, padstr=' ')   -> new_str
07035  *
07036  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
07037  *  <code>String</code> of length <i>integer</i> with <i>str</i> right justified
07038  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
07039  *
07040  *     "hello".rjust(4)            #=> "hello"
07041  *     "hello".rjust(20)           #=> "               hello"
07042  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
07043  */
07044 
07045 static VALUE
07046 rb_str_rjust(int argc, VALUE *argv, VALUE str)
07047 {
07048     return rb_str_justify(argc, argv, str, 'r');
07049 }
07050 
07051 
07052 /*
07053  *  call-seq:
07054  *     str.center(integer, padstr)   -> new_str
07055  *
07056  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
07057  *  <code>String</code> of length <i>integer</i> with <i>str</i> centered and
07058  *  padded with <i>padstr</i>; otherwise, returns <i>str</i>.
07059  *
07060  *     "hello".center(4)         #=> "hello"
07061  *     "hello".center(20)        #=> "       hello        "
07062  *     "hello".center(20, '123') #=> "1231231hello12312312"
07063  */
07064 
07065 static VALUE
07066 rb_str_center(int argc, VALUE *argv, VALUE str)
07067 {
07068     return rb_str_justify(argc, argv, str, 'c');
07069 }
07070 
07071 /*
07072  *  call-seq:
07073  *     str.partition(sep)              -> [head, sep, tail]
07074  *     str.partition(regexp)           -> [head, match, tail]
07075  *
07076  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
07077  *  and returns the part before it, the match, and the part
07078  *  after it.
07079  *  If it is not found, returns two empty strings and <i>str</i>.
07080  *
07081  *     "hello".partition("l")         #=> ["he", "l", "lo"]
07082  *     "hello".partition("x")         #=> ["hello", "", ""]
07083  *     "hello".partition(/.l/)        #=> ["h", "el", "lo"]
07084  */
07085 
07086 static VALUE
07087 rb_str_partition(VALUE str, VALUE sep)
07088 {
07089     long pos;
07090     int regex = FALSE;
07091 
07092     if (TYPE(sep) == T_REGEXP) {
07093         pos = rb_reg_search(sep, str, 0, 0);
07094         regex = TRUE;
07095     }
07096     else {
07097         VALUE tmp;
07098 
07099         tmp = rb_check_string_type(sep);
07100         if (NIL_P(tmp)) {
07101             rb_raise(rb_eTypeError, "type mismatch: %s given",
07102                      rb_obj_classname(sep));
07103         }
07104         sep = tmp;
07105         pos = rb_str_index(str, sep, 0);
07106     }
07107     if (pos < 0) {
07108       failed:
07109         return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
07110     }
07111     if (regex) {
07112         sep = rb_str_subpat(str, sep, INT2FIX(0));
07113         if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
07114     }
07115     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
07116                           sep,
07117                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
07118                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
07119 }
07120 
07121 /*
07122  *  call-seq:
07123  *     str.rpartition(sep)             -> [head, sep, tail]
07124  *     str.rpartition(regexp)          -> [head, match, tail]
07125  *
07126  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
07127  *  of the string, and returns the part before it, the match, and the part
07128  *  after it.
07129  *  If it is not found, returns two empty strings and <i>str</i>.
07130  *
07131  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
07132  *     "hello".rpartition("x")         #=> ["", "", "hello"]
07133  *     "hello".rpartition(/.l/)        #=> ["he", "ll", "o"]
07134  */
07135 
07136 static VALUE
07137 rb_str_rpartition(VALUE str, VALUE sep)
07138 {
07139     long pos = RSTRING_LEN(str);
07140     int regex = FALSE;
07141 
07142     if (TYPE(sep) == T_REGEXP) {
07143         pos = rb_reg_search(sep, str, pos, 1);
07144         regex = TRUE;
07145     }
07146     else {
07147         VALUE tmp;
07148 
07149         tmp = rb_check_string_type(sep);
07150         if (NIL_P(tmp)) {
07151             rb_raise(rb_eTypeError, "type mismatch: %s given",
07152                      rb_obj_classname(sep));
07153         }
07154         sep = tmp;
07155         pos = rb_str_sublen(str, pos);
07156         pos = rb_str_rindex(str, sep, pos);
07157     }
07158     if (pos < 0) {
07159         return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
07160     }
07161     if (regex) {
07162         sep = rb_reg_nth_match(0, rb_backref_get());
07163     }
07164     return rb_ary_new3(3, rb_str_substr(str, 0, pos),
07165                           sep,
07166                           rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
07167 }
07168 
07169 /*
07170  *  call-seq:
07171  *     str.start_with?([prefix]+)   -> true or false
07172  *
07173  *  Returns true if <i>str</i> starts with one of the prefixes given.
07174  *
07175  *    p "hello".start_with?("hell")               #=> true
07176  *
07177  *    # returns true if one of the prefixes matches.
07178  *    p "hello".start_with?("heaven", "hell")     #=> true
07179  *    p "hello".start_with?("heaven", "paradise") #=> false
07180  *
07181  *
07182  *
07183  */
07184 
07185 static VALUE
07186 rb_str_start_with(int argc, VALUE *argv, VALUE str)
07187 {
07188     int i;
07189 
07190     for (i=0; i<argc; i++) {
07191         VALUE tmp = rb_check_string_type(argv[i]);
07192         if (NIL_P(tmp)) continue;
07193         rb_enc_check(str, tmp);
07194         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07195         if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07196             return Qtrue;
07197     }
07198     return Qfalse;
07199 }
07200 
07201 /*
07202  *  call-seq:
07203  *     str.end_with?([suffix]+)   -> true or false
07204  *
07205  *  Returns true if <i>str</i> ends with one of the suffixes given.
07206  */
07207 
07208 static VALUE
07209 rb_str_end_with(int argc, VALUE *argv, VALUE str)
07210 {
07211     int i;
07212     char *p, *s, *e;
07213     rb_encoding *enc;
07214 
07215     for (i=0; i<argc; i++) {
07216         VALUE tmp = rb_check_string_type(argv[i]);
07217         if (NIL_P(tmp)) continue;
07218         enc = rb_enc_check(str, tmp);
07219         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07220         p = RSTRING_PTR(str);
07221         e = p + RSTRING_LEN(str);
07222         s = e - RSTRING_LEN(tmp);
07223         if (rb_enc_left_char_head(p, s, e, enc) != s)
07224             continue;
07225         if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07226             return Qtrue;
07227     }
07228     return Qfalse;
07229 }
07230 
07231 void
07232 rb_str_setter(VALUE val, ID id, VALUE *var)
07233 {
07234     if (!NIL_P(val) && TYPE(val) != T_STRING) {
07235         rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
07236     }
07237     *var = val;
07238 }
07239 
07240 
07241 /*
07242  *  call-seq:
07243  *     str.force_encoding(encoding)   -> str
07244  *
07245  *  Changes the encoding to +encoding+ and returns self.
07246  */
07247 
07248 static VALUE
07249 rb_str_force_encoding(VALUE str, VALUE enc)
07250 {
07251     str_modifiable(str);
07252     rb_enc_associate(str, rb_to_encoding(enc));
07253     ENC_CODERANGE_CLEAR(str);
07254     return str;
07255 }
07256 
07257 /*
07258  *  call-seq:
07259  *     str.valid_encoding?  -> true or false
07260  *
07261  *  Returns true for a string which encoded correctly.
07262  *
07263  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding?  #=> true
07264  *    "\xc2".force_encoding("UTF-8").valid_encoding?      #=> false
07265  *    "\x80".force_encoding("UTF-8").valid_encoding?      #=> false
07266  */
07267 
07268 static VALUE
07269 rb_str_valid_encoding_p(VALUE str)
07270 {
07271     int cr = rb_enc_str_coderange(str);
07272 
07273     return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
07274 }
07275 
07276 /*
07277  *  call-seq:
07278  *     str.ascii_only?  -> true or false
07279  *
07280  *  Returns true for a string which has only ASCII characters.
07281  *
07282  *    "abc".force_encoding("UTF-8").ascii_only?          #=> true
07283  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only?  #=> false
07284  */
07285 
07286 static VALUE
07287 rb_str_is_ascii_only_p(VALUE str)
07288 {
07289     int cr = rb_enc_str_coderange(str);
07290 
07291     return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
07292 }
07293 
07308 VALUE
07309 rb_str_ellipsize(VALUE str, long len)
07310 {
07311     static const char ellipsis[] = "...";
07312     const long ellipsislen = sizeof(ellipsis) - 1;
07313     rb_encoding *const enc = rb_enc_get(str);
07314     const long blen = RSTRING_LEN(str);
07315     const char *const p = RSTRING_PTR(str), *e = p + blen;
07316     VALUE estr, ret = 0;
07317 
07318     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
07319     if (len * rb_enc_mbminlen(enc) >= blen ||
07320         (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
07321         ret = str;
07322     }
07323     else if (len <= ellipsislen ||
07324              !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
07325         if (rb_enc_asciicompat(enc)) {
07326             ret = rb_str_new_with_class(str, ellipsis, len);
07327             rb_enc_associate(ret, enc);
07328         }
07329         else {
07330             estr = rb_usascii_str_new(ellipsis, len);
07331             ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
07332         }
07333     }
07334     else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
07335         rb_str_cat(ret, ellipsis, ellipsislen);
07336     }
07337     else {
07338         estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
07339                              rb_enc_from_encoding(enc), 0, Qnil);
07340         rb_str_append(ret, estr);
07341     }
07342     return ret;
07343 }
07344 
07345 /**********************************************************************
07346  * Document-class: Symbol
07347  *
07348  *  <code>Symbol</code> objects represent names and some strings
07349  *  inside the Ruby
07350  *  interpreter. They are generated using the <code>:name</code> and
07351  *  <code>:"string"</code> literals
07352  *  syntax, and by the various <code>to_sym</code> methods. The same
07353  *  <code>Symbol</code> object will be created for a given name or string
07354  *  for the duration of a program's execution, regardless of the context
07355  *  or meaning of that name. Thus if <code>Fred</code> is a constant in
07356  *  one context, a method in another, and a class in a third, the
07357  *  <code>Symbol</code> <code>:Fred</code> will be the same object in
07358  *  all three contexts.
07359  *
07360  *     module One
07361  *       class Fred
07362  *       end
07363  *       $f1 = :Fred
07364  *     end
07365  *     module Two
07366  *       Fred = 1
07367  *       $f2 = :Fred
07368  *     end
07369  *     def Fred()
07370  *     end
07371  *     $f3 = :Fred
07372  *     $f1.object_id   #=> 2514190
07373  *     $f2.object_id   #=> 2514190
07374  *     $f3.object_id   #=> 2514190
07375  *
07376  */
07377 
07378 
07379 /*
07380  *  call-seq:
07381  *     sym == obj   -> true or false
07382  *
07383  *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
07384  *  symbol, returns <code>true</code>.
07385  */
07386 
07387 static VALUE
07388 sym_equal(VALUE sym1, VALUE sym2)
07389 {
07390     if (sym1 == sym2) return Qtrue;
07391     return Qfalse;
07392 }
07393 
07394 
07395 static int
07396 sym_printable(const char *s, const char *send, rb_encoding *enc)
07397 {
07398     while (s < send) {
07399         int n;
07400         int c = rb_enc_codepoint_len(s, send, &n, enc);
07401 
07402         if (!rb_enc_isprint(c, enc)) return FALSE;
07403         s += n;
07404     }
07405     return TRUE;
07406 }
07407 
07408 /*
07409  *  call-seq:
07410  *     sym.inspect    -> string
07411  *
07412  *  Returns the representation of <i>sym</i> as a symbol literal.
07413  *
07414  *     :fred.inspect   #=> ":fred"
07415  */
07416 
07417 static VALUE
07418 sym_inspect(VALUE sym)
07419 {
07420     VALUE str;
07421     ID id = SYM2ID(sym);
07422     rb_encoding *enc;
07423     const char *ptr;
07424     long len;
07425     char *dest;
07426     rb_encoding *resenc = rb_default_internal_encoding();
07427 
07428     if (resenc == NULL) resenc = rb_default_external_encoding();
07429     sym = rb_id2str(id);
07430     enc = STR_ENC_GET(sym);
07431     ptr = RSTRING_PTR(sym);
07432     len = RSTRING_LEN(sym);
07433     if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
07434         !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
07435         str = rb_str_inspect(sym);
07436         len = RSTRING_LEN(str);
07437         rb_str_resize(str, len + 1);
07438         dest = RSTRING_PTR(str);
07439         memmove(dest + 1, dest, len);
07440         dest[0] = ':';
07441     }
07442     else {
07443         char *dest;
07444         str = rb_enc_str_new(0, len + 1, enc);
07445         dest = RSTRING_PTR(str);
07446         dest[0] = ':';
07447         memcpy(dest + 1, ptr, len);
07448     }
07449     return str;
07450 }
07451 
07452 
07453 /*
07454  *  call-seq:
07455  *     sym.id2name   -> string
07456  *     sym.to_s      -> string
07457  *
07458  *  Returns the name or string corresponding to <i>sym</i>.
07459  *
07460  *     :fred.id2name   #=> "fred"
07461  */
07462 
07463 
07464 VALUE
07465 rb_sym_to_s(VALUE sym)
07466 {
07467     ID id = SYM2ID(sym);
07468 
07469     return str_new3(rb_cString, rb_id2str(id));
07470 }
07471 
07472 
07473 /*
07474  * call-seq:
07475  *   sym.to_sym   -> sym
07476  *   sym.intern   -> sym
07477  *
07478  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
07479  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
07480  * in this case.
07481  */
07482 
07483 static VALUE
07484 sym_to_sym(VALUE sym)
07485 {
07486     return sym;
07487 }
07488 
07489 static VALUE
07490 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv)
07491 {
07492     VALUE obj;
07493 
07494     if (argc < 1) {
07495         rb_raise(rb_eArgError, "no receiver given");
07496     }
07497     obj = argv[0];
07498     return rb_funcall_passing_block(obj, (ID)sym, argc - 1, argv + 1);
07499 }
07500 
07501 /*
07502  * call-seq:
07503  *   sym.to_proc
07504  *
07505  * Returns a _Proc_ object which respond to the given method by _sym_.
07506  *
07507  *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
07508  */
07509 
07510 static VALUE
07511 sym_to_proc(VALUE sym)
07512 {
07513     static VALUE sym_proc_cache = Qfalse;
07514     enum {SYM_PROC_CACHE_SIZE = 67};
07515     VALUE proc;
07516     long id, index;
07517     VALUE *aryp;
07518 
07519     if (!sym_proc_cache) {
07520         sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
07521         rb_gc_register_mark_object(sym_proc_cache);
07522         rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
07523     }
07524 
07525     id = SYM2ID(sym);
07526     index = (id % SYM_PROC_CACHE_SIZE) << 1;
07527 
07528     aryp = RARRAY_PTR(sym_proc_cache);
07529     if (aryp[index] == sym) {
07530         return aryp[index + 1];
07531     }
07532     else {
07533         proc = rb_proc_new(sym_call, (VALUE)id);
07534         aryp[index] = sym;
07535         aryp[index + 1] = proc;
07536         return proc;
07537     }
07538 }
07539 
07540 /*
07541  * call-seq:
07542  *
07543  *   sym.succ
07544  *
07545  * Same as <code>sym.to_s.succ.intern</code>.
07546  */
07547 
07548 static VALUE
07549 sym_succ(VALUE sym)
07550 {
07551     return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
07552 }
07553 
07554 /*
07555  * call-seq:
07556  *
07557  *   str <=> other       -> -1, 0, +1 or nil
07558  *
07559  * Compares _sym_ with _other_ in string form.
07560  */
07561 
07562 static VALUE
07563 sym_cmp(VALUE sym, VALUE other)
07564 {
07565     if (!SYMBOL_P(other)) {
07566         return Qnil;
07567     }
07568     return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
07569 }
07570 
07571 /*
07572  * call-seq:
07573  *
07574  *   sym.casecmp(other)  -> -1, 0, +1 or nil
07575  *
07576  * Case-insensitive version of <code>Symbol#<=></code>.
07577  */
07578 
07579 static VALUE
07580 sym_casecmp(VALUE sym, VALUE other)
07581 {
07582     if (!SYMBOL_P(other)) {
07583         return Qnil;
07584     }
07585     return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
07586 }
07587 
07588 /*
07589  * call-seq:
07590  *   sym =~ obj   -> fixnum or nil
07591  *
07592  * Returns <code>sym.to_s =~ obj</code>.
07593  */
07594 
07595 static VALUE
07596 sym_match(VALUE sym, VALUE other)
07597 {
07598     return rb_str_match(rb_sym_to_s(sym), other);
07599 }
07600 
07601 /*
07602  * call-seq:
07603  *   sym[idx]      -> char
07604  *   sym[b, n]     -> char
07605  *
07606  * Returns <code>sym.to_s[]</code>.
07607  */
07608 
07609 static VALUE
07610 sym_aref(int argc, VALUE *argv, VALUE sym)
07611 {
07612     return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
07613 }
07614 
07615 /*
07616  * call-seq:
07617  *   sym.length    -> integer
07618  *
07619  * Same as <code>sym.to_s.length</code>.
07620  */
07621 
07622 static VALUE
07623 sym_length(VALUE sym)
07624 {
07625     return rb_str_length(rb_id2str(SYM2ID(sym)));
07626 }
07627 
07628 /*
07629  * call-seq:
07630  *   sym.empty?   -> true or false
07631  *
07632  * Returns that _sym_ is :"" or not.
07633  */
07634 
07635 static VALUE
07636 sym_empty(VALUE sym)
07637 {
07638     return rb_str_empty(rb_id2str(SYM2ID(sym)));
07639 }
07640 
07641 /*
07642  * call-seq:
07643  *   sym.upcase    -> symbol
07644  *
07645  * Same as <code>sym.to_s.upcase.intern</code>.
07646  */
07647 
07648 static VALUE
07649 sym_upcase(VALUE sym)
07650 {
07651     return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
07652 }
07653 
07654 /*
07655  * call-seq:
07656  *   sym.downcase  -> symbol
07657  *
07658  * Same as <code>sym.to_s.downcase.intern</code>.
07659  */
07660 
07661 static VALUE
07662 sym_downcase(VALUE sym)
07663 {
07664     return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
07665 }
07666 
07667 /*
07668  * call-seq:
07669  *   sym.capitalize  -> symbol
07670  *
07671  * Same as <code>sym.to_s.capitalize.intern</code>.
07672  */
07673 
07674 static VALUE
07675 sym_capitalize(VALUE sym)
07676 {
07677     return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
07678 }
07679 
07680 /*
07681  * call-seq:
07682  *   sym.swapcase  -> symbol
07683  *
07684  * Same as <code>sym.to_s.swapcase.intern</code>.
07685  */
07686 
07687 static VALUE
07688 sym_swapcase(VALUE sym)
07689 {
07690     return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
07691 }
07692 
07693 /*
07694  * call-seq:
07695  *   sym.encoding   -> encoding
07696  *
07697  * Returns the Encoding object that represents the encoding of _sym_.
07698  */
07699 
07700 static VALUE
07701 sym_encoding(VALUE sym)
07702 {
07703     return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
07704 }
07705 
07706 ID
07707 rb_to_id(VALUE name)
07708 {
07709     VALUE tmp;
07710 
07711     switch (TYPE(name)) {
07712       default:
07713         tmp = rb_check_string_type(name);
07714         if (NIL_P(tmp)) {
07715             tmp = rb_inspect(name);
07716             rb_raise(rb_eTypeError, "%s is not a symbol",
07717                      RSTRING_PTR(tmp));
07718         }
07719         name = tmp;
07720         /* fall through */
07721       case T_STRING:
07722         name = rb_str_intern(name);
07723         /* fall through */
07724       case T_SYMBOL:
07725         return SYM2ID(name);
07726     }
07727     return Qnil; /* not reached */
07728 }
07729 
07730 /*
07731  *  A <code>String</code> object holds and manipulates an arbitrary sequence of
07732  *  bytes, typically representing characters. String objects may be created
07733  *  using <code>String::new</code> or as literals.
07734  *
07735  *  Because of aliasing issues, users of strings should be aware of the methods
07736  *  that modify the contents of a <code>String</code> object.  Typically,
07737  *  methods with names ending in ``!'' modify their receiver, while those
07738  *  without a ``!'' return a new <code>String</code>.  However, there are
07739  *  exceptions, such as <code>String#[]=</code>.
07740  *
07741  */
07742 
07743 void
07744 Init_String(void)
07745 {
07746 #undef rb_intern
07747 #define rb_intern(str) rb_intern_const(str)
07748 
07749     rb_cString  = rb_define_class("String", rb_cObject);
07750     rb_include_module(rb_cString, rb_mComparable);
07751     rb_define_alloc_func(rb_cString, str_alloc);
07752     rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
07753     rb_define_method(rb_cString, "initialize", rb_str_init, -1);
07754     rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
07755     rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
07756     rb_define_method(rb_cString, "==", rb_str_equal, 1);
07757     rb_define_method(rb_cString, "===", rb_str_equal, 1);
07758     rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
07759     rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
07760     rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
07761     rb_define_method(rb_cString, "+", rb_str_plus, 1);
07762     rb_define_method(rb_cString, "*", rb_str_times, 1);
07763     rb_define_method(rb_cString, "%", rb_str_format_m, 1);
07764     rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
07765     rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
07766     rb_define_method(rb_cString, "insert", rb_str_insert, 2);
07767     rb_define_method(rb_cString, "length", rb_str_length, 0);
07768     rb_define_method(rb_cString, "size", rb_str_length, 0);
07769     rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
07770     rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
07771     rb_define_method(rb_cString, "=~", rb_str_match, 1);
07772     rb_define_method(rb_cString, "match", rb_str_match_m, -1);
07773     rb_define_method(rb_cString, "succ", rb_str_succ, 0);
07774     rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
07775     rb_define_method(rb_cString, "next", rb_str_succ, 0);
07776     rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
07777     rb_define_method(rb_cString, "upto", rb_str_upto, -1);
07778     rb_define_method(rb_cString, "index", rb_str_index_m, -1);
07779     rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
07780     rb_define_method(rb_cString, "replace", rb_str_replace, 1);
07781     rb_define_method(rb_cString, "clear", rb_str_clear, 0);
07782     rb_define_method(rb_cString, "chr", rb_str_chr, 0);
07783     rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
07784     rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
07785     rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
07786 
07787     rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
07788     rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
07789     rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
07790     rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
07791     rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
07792     rb_define_method(rb_cString, "dump", rb_str_dump, 0);
07793 
07794     rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
07795     rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
07796     rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
07797     rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
07798 
07799     rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
07800     rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
07801     rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
07802     rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
07803 
07804     rb_define_method(rb_cString, "hex", rb_str_hex, 0);
07805     rb_define_method(rb_cString, "oct", rb_str_oct, 0);
07806     rb_define_method(rb_cString, "split", rb_str_split_m, -1);
07807     rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
07808     rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
07809     rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
07810     rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
07811     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
07812     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
07813     rb_define_method(rb_cString, "concat", rb_str_concat, 1);
07814     rb_define_method(rb_cString, "<<", rb_str_concat, 1);
07815     rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
07816     rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
07817     rb_define_method(rb_cString, "intern", rb_str_intern, 0);
07818     rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
07819     rb_define_method(rb_cString, "ord", rb_str_ord, 0);
07820 
07821     rb_define_method(rb_cString, "include?", rb_str_include, 1);
07822     rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
07823     rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
07824 
07825     rb_define_method(rb_cString, "scan", rb_str_scan, 1);
07826 
07827     rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
07828     rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
07829     rb_define_method(rb_cString, "center", rb_str_center, -1);
07830 
07831     rb_define_method(rb_cString, "sub", rb_str_sub, -1);
07832     rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
07833     rb_define_method(rb_cString, "chop", rb_str_chop, 0);
07834     rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
07835     rb_define_method(rb_cString, "strip", rb_str_strip, 0);
07836     rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
07837     rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
07838 
07839     rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
07840     rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
07841     rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
07842     rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
07843     rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
07844     rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
07845     rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
07846 
07847     rb_define_method(rb_cString, "tr", rb_str_tr, 2);
07848     rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
07849     rb_define_method(rb_cString, "delete", rb_str_delete, -1);
07850     rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
07851     rb_define_method(rb_cString, "count", rb_str_count, -1);
07852 
07853     rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
07854     rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
07855     rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
07856     rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
07857 
07858     rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
07859     rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
07860     rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
07861     rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
07862 
07863     rb_define_method(rb_cString, "sum", rb_str_sum, -1);
07864 
07865     rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
07866     rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
07867 
07868     rb_define_method(rb_cString, "partition", rb_str_partition, 1);
07869     rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
07870 
07871     rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
07872     rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
07873     rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
07874     rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
07875 
07876     id_to_s = rb_intern("to_s");
07877 
07878     rb_fs = Qnil;
07879     rb_define_variable("$;", &rb_fs);
07880     rb_define_variable("$-F", &rb_fs);
07881 
07882     rb_cSymbol = rb_define_class("Symbol", rb_cObject);
07883     rb_include_module(rb_cSymbol, rb_mComparable);
07884     rb_undef_alloc_func(rb_cSymbol);
07885     rb_undef_method(CLASS_OF(rb_cSymbol), "new");
07886     rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */
07887 
07888     rb_define_method(rb_cSymbol, "==", sym_equal, 1);
07889     rb_define_method(rb_cSymbol, "===", sym_equal, 1);
07890     rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
07891     rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
07892     rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
07893     rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
07894     rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
07895     rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
07896     rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
07897     rb_define_method(rb_cSymbol, "next", sym_succ, 0);
07898 
07899     rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
07900     rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
07901     rb_define_method(rb_cSymbol, "=~", sym_match, 1);
07902 
07903     rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
07904     rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
07905     rb_define_method(rb_cSymbol, "length", sym_length, 0);
07906     rb_define_method(rb_cSymbol, "size", sym_length, 0);
07907     rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
07908     rb_define_method(rb_cSymbol, "match", sym_match, 1);
07909 
07910     rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
07911     rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
07912     rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
07913     rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
07914 
07915     rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
07916 }
07917