Ruby 1.9.3p327 (2012-11-10 revision 37606)
|
00001 /********************************************************************** 00002 00003 string.c - 00004 00005 $Author: usa $ 00006 created at: Mon Aug 9 17:12:58 JST 1993 00007 00008 Copyright (C) 1993-2007 Yukihiro Matsumoto 00009 Copyright (C) 2000 Network Applied Communication Laboratory, Inc. 00010 Copyright (C) 2000 Information-technology Promotion Agency, Japan 00011 00012 **********************************************************************/ 00013 00014 #include "ruby/ruby.h" 00015 #include "ruby/re.h" 00016 #include "ruby/encoding.h" 00017 #include "internal.h" 00018 #include <assert.h> 00019 00020 #define BEG(no) (regs->beg[(no)]) 00021 #define END(no) (regs->end[(no)]) 00022 00023 #include <math.h> 00024 #include <ctype.h> 00025 00026 #ifdef HAVE_UNISTD_H 00027 #include <unistd.h> 00028 #endif 00029 00030 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0])) 00031 00032 #undef rb_str_new_cstr 00033 #undef rb_tainted_str_new_cstr 00034 #undef rb_usascii_str_new_cstr 00035 #undef rb_external_str_new_cstr 00036 #undef rb_locale_str_new_cstr 00037 #undef rb_str_new2 00038 #undef rb_str_new3 00039 #undef rb_str_new4 00040 #undef rb_str_new5 00041 #undef rb_tainted_str_new2 00042 #undef rb_usascii_str_new2 00043 #undef rb_str_dup_frozen 00044 #undef rb_str_buf_new_cstr 00045 #undef rb_str_buf_new2 00046 #undef rb_str_buf_cat2 00047 #undef rb_str_cat2 00048 00049 static VALUE rb_str_clear(VALUE str); 00050 00051 VALUE rb_cString; 00052 VALUE rb_cSymbol; 00053 00054 #define RUBY_MAX_CHAR_LEN 16 00055 #define STR_TMPLOCK FL_USER7 00056 #define STR_NOEMBED FL_USER1 00057 #define STR_SHARED FL_USER2 /* = ELTS_SHARED */ 00058 #define STR_ASSOC FL_USER3 00059 #define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED) 00060 #define STR_ASSOC_P(s) FL_ALL((s), STR_NOEMBED|STR_ASSOC) 00061 #define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC) 00062 #define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC)) 00063 #define 
STR_UNSET_NOCAPA(s) do {\ 00064 if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\ 00065 } while (0) 00066 00067 00068 #define STR_SET_NOEMBED(str) do {\ 00069 FL_SET((str), STR_NOEMBED);\ 00070 STR_SET_EMBED_LEN((str), 0);\ 00071 } while (0) 00072 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED) 00073 #define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED)) 00074 #define STR_SET_EMBED_LEN(str, n) do { \ 00075 long tmp_n = (n);\ 00076 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\ 00077 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\ 00078 } while (0) 00079 00080 #define STR_SET_LEN(str, n) do { \ 00081 if (STR_EMBED_P(str)) {\ 00082 STR_SET_EMBED_LEN((str), (n));\ 00083 }\ 00084 else {\ 00085 RSTRING(str)->as.heap.len = (n);\ 00086 }\ 00087 } while (0) 00088 00089 #define STR_DEC_LEN(str) do {\ 00090 if (STR_EMBED_P(str)) {\ 00091 long n = RSTRING_LEN(str);\ 00092 n--;\ 00093 STR_SET_EMBED_LEN((str), n);\ 00094 }\ 00095 else {\ 00096 RSTRING(str)->as.heap.len--;\ 00097 }\ 00098 } while (0) 00099 00100 #define RESIZE_CAPA(str,capacity) do {\ 00101 if (STR_EMBED_P(str)) {\ 00102 if ((capacity) > RSTRING_EMBED_LEN_MAX) {\ 00103 char *tmp = ALLOC_N(char, (capacity)+1);\ 00104 memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\ 00105 RSTRING(str)->as.heap.ptr = tmp;\ 00106 RSTRING(str)->as.heap.len = RSTRING_LEN(str);\ 00107 STR_SET_NOEMBED(str);\ 00108 RSTRING(str)->as.heap.aux.capa = (capacity);\ 00109 }\ 00110 }\ 00111 else {\ 00112 REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\ 00113 if (!STR_NOCAPA_P(str))\ 00114 RSTRING(str)->as.heap.aux.capa = (capacity);\ 00115 }\ 00116 } while (0) 00117 00118 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) 00119 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) 00120 00121 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str)) 00122 00123 static inline int 00124 single_byte_optimizable(VALUE str) 00125 { 00126 
rb_encoding *enc; 00127 00128 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */ 00129 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) 00130 return 1; 00131 00132 enc = STR_ENC_GET(str); 00133 if (rb_enc_mbmaxlen(enc) == 1) 00134 return 1; 00135 00136 /* Conservative. Possibly single byte. 00137 * "\xa1" in Shift_JIS for example. */ 00138 return 0; 00139 } 00140 00141 VALUE rb_fs; 00142 00143 static inline const char * 00144 search_nonascii(const char *p, const char *e) 00145 { 00146 #if SIZEOF_VALUE == 8 00147 # define NONASCII_MASK 0x8080808080808080ULL 00148 #elif SIZEOF_VALUE == 4 00149 # define NONASCII_MASK 0x80808080UL 00150 #endif 00151 #ifdef NONASCII_MASK 00152 if ((int)sizeof(VALUE) * 2 < e - p) { 00153 const VALUE *s, *t; 00154 const VALUE lowbits = sizeof(VALUE) - 1; 00155 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 00156 while (p < (const char *)s) { 00157 if (!ISASCII(*p)) 00158 return p; 00159 p++; 00160 } 00161 t = (const VALUE*)(~lowbits & (VALUE)e); 00162 while (s < t) { 00163 if (*s & NONASCII_MASK) { 00164 t = s; 00165 break; 00166 } 00167 s++; 00168 } 00169 p = (const char *)t; 00170 } 00171 #endif 00172 while (p < e) { 00173 if (!ISASCII(*p)) 00174 return p; 00175 p++; 00176 } 00177 return NULL; 00178 } 00179 00180 static int 00181 coderange_scan(const char *p, long len, rb_encoding *enc) 00182 { 00183 const char *e = p + len; 00184 00185 if (rb_enc_to_index(enc) == 0) { 00186 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */ 00187 p = search_nonascii(p, e); 00188 return p ? 
ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT; 00189 } 00190 00191 if (rb_enc_asciicompat(enc)) { 00192 p = search_nonascii(p, e); 00193 if (!p) { 00194 return ENC_CODERANGE_7BIT; 00195 } 00196 while (p < e) { 00197 int ret = rb_enc_precise_mbclen(p, e, enc); 00198 if (!MBCLEN_CHARFOUND_P(ret)) { 00199 return ENC_CODERANGE_BROKEN; 00200 } 00201 p += MBCLEN_CHARFOUND_LEN(ret); 00202 if (p < e) { 00203 p = search_nonascii(p, e); 00204 if (!p) { 00205 return ENC_CODERANGE_VALID; 00206 } 00207 } 00208 } 00209 if (e < p) { 00210 return ENC_CODERANGE_BROKEN; 00211 } 00212 return ENC_CODERANGE_VALID; 00213 } 00214 00215 while (p < e) { 00216 int ret = rb_enc_precise_mbclen(p, e, enc); 00217 00218 if (!MBCLEN_CHARFOUND_P(ret)) { 00219 return ENC_CODERANGE_BROKEN; 00220 } 00221 p += MBCLEN_CHARFOUND_LEN(ret); 00222 } 00223 if (e < p) { 00224 return ENC_CODERANGE_BROKEN; 00225 } 00226 return ENC_CODERANGE_VALID; 00227 } 00228 00229 long 00230 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr) 00231 { 00232 const char *p = s; 00233 00234 if (*cr == ENC_CODERANGE_BROKEN) 00235 return e - s; 00236 00237 if (rb_enc_to_index(enc) == 0) { 00238 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */ 00239 p = search_nonascii(p, e); 00240 *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; 00241 return e - s; 00242 } 00243 else if (rb_enc_asciicompat(enc)) { 00244 p = search_nonascii(p, e); 00245 if (!p) { 00246 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT; 00247 return e - s; 00248 } 00249 while (p < e) { 00250 int ret = rb_enc_precise_mbclen(p, e, enc); 00251 if (!MBCLEN_CHARFOUND_P(ret)) { 00252 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN; 00253 return p - s; 00254 } 00255 p += MBCLEN_CHARFOUND_LEN(ret); 00256 if (p < e) { 00257 p = search_nonascii(p, e); 00258 if (!p) { 00259 *cr = ENC_CODERANGE_VALID; 00260 return e - s; 00261 } 00262 } 00263 } 00264 *cr = e < p ? 
ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID; 00265 return p - s; 00266 } 00267 else { 00268 while (p < e) { 00269 int ret = rb_enc_precise_mbclen(p, e, enc); 00270 if (!MBCLEN_CHARFOUND_P(ret)) { 00271 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN; 00272 return p - s; 00273 } 00274 p += MBCLEN_CHARFOUND_LEN(ret); 00275 } 00276 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID; 00277 return p - s; 00278 } 00279 } 00280 00281 static inline void 00282 str_enc_copy(VALUE str1, VALUE str2) 00283 { 00284 rb_enc_set_index(str1, ENCODING_GET(str2)); 00285 } 00286 00287 static void 00288 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src) 00289 { 00290 /* this function is designed for copying encoding and coderange 00291 * from src to new string "dest" which is made from the part of src. 00292 */ 00293 str_enc_copy(dest, src); 00294 switch (ENC_CODERANGE(src)) { 00295 case ENC_CODERANGE_7BIT: 00296 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 00297 break; 00298 case ENC_CODERANGE_VALID: 00299 if (!rb_enc_asciicompat(STR_ENC_GET(src)) || 00300 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest))) 00301 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID); 00302 else 00303 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 00304 break; 00305 default: 00306 if (RSTRING_LEN(dest) == 0) { 00307 if (!rb_enc_asciicompat(STR_ENC_GET(src))) 00308 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID); 00309 else 00310 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 00311 } 00312 break; 00313 } 00314 } 00315 00316 static void 00317 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src) 00318 { 00319 str_enc_copy(dest, src); 00320 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src)); 00321 } 00322 00323 int 00324 rb_enc_str_coderange(VALUE str) 00325 { 00326 int cr = ENC_CODERANGE(str); 00327 00328 if (cr == ENC_CODERANGE_UNKNOWN) { 00329 rb_encoding *enc = STR_ENC_GET(str); 00330 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc); 00331 ENC_CODERANGE_SET(str, cr); 00332 } 00333 
return cr; 00334 } 00335 00336 int 00337 rb_enc_str_asciionly_p(VALUE str) 00338 { 00339 rb_encoding *enc = STR_ENC_GET(str); 00340 00341 if (!rb_enc_asciicompat(enc)) 00342 return FALSE; 00343 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) 00344 return TRUE; 00345 return FALSE; 00346 } 00347 00348 static inline void 00349 str_mod_check(VALUE s, const char *p, long len) 00350 { 00351 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){ 00352 rb_raise(rb_eRuntimeError, "string modified"); 00353 } 00354 } 00355 00356 size_t 00357 rb_str_capacity(VALUE str) 00358 { 00359 if (STR_EMBED_P(str)) { 00360 return RSTRING_EMBED_LEN_MAX; 00361 } 00362 else if (STR_NOCAPA_P(str)) { 00363 return RSTRING(str)->as.heap.len; 00364 } 00365 else { 00366 return RSTRING(str)->as.heap.aux.capa; 00367 } 00368 } 00369 00370 static inline VALUE 00371 str_alloc(VALUE klass) 00372 { 00373 NEWOBJ(str, struct RString); 00374 OBJSETUP(str, klass, T_STRING); 00375 00376 str->as.heap.ptr = 0; 00377 str->as.heap.len = 0; 00378 str->as.heap.aux.capa = 0; 00379 00380 return (VALUE)str; 00381 } 00382 00383 static VALUE 00384 str_new(VALUE klass, const char *ptr, long len) 00385 { 00386 VALUE str; 00387 00388 if (len < 0) { 00389 rb_raise(rb_eArgError, "negative string size (or size too big)"); 00390 } 00391 00392 str = str_alloc(klass); 00393 if (len > RSTRING_EMBED_LEN_MAX) { 00394 RSTRING(str)->as.heap.aux.capa = len; 00395 RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1); 00396 STR_SET_NOEMBED(str); 00397 } 00398 else if (len == 0) { 00399 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 00400 } 00401 if (ptr) { 00402 memcpy(RSTRING_PTR(str), ptr, len); 00403 } 00404 STR_SET_LEN(str, len); 00405 RSTRING_PTR(str)[len] = '\0'; 00406 return str; 00407 } 00408 00409 VALUE 00410 rb_str_new(const char *ptr, long len) 00411 { 00412 return str_new(rb_cString, ptr, len); 00413 } 00414 00415 VALUE 00416 rb_usascii_str_new(const char *ptr, long len) 00417 { 00418 VALUE str = rb_str_new(ptr, len); 00419 
ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT); 00420 return str; 00421 } 00422 00423 VALUE 00424 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc) 00425 { 00426 VALUE str = rb_str_new(ptr, len); 00427 rb_enc_associate(str, enc); 00428 return str; 00429 } 00430 00431 VALUE 00432 rb_str_new_cstr(const char *ptr) 00433 { 00434 if (!ptr) { 00435 rb_raise(rb_eArgError, "NULL pointer given"); 00436 } 00437 return rb_str_new(ptr, strlen(ptr)); 00438 } 00439 00440 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr)) 00441 #define rb_str_new2 rb_str_new_cstr 00442 00443 VALUE 00444 rb_usascii_str_new_cstr(const char *ptr) 00445 { 00446 VALUE str = rb_str_new2(ptr); 00447 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT); 00448 return str; 00449 } 00450 00451 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr)) 00452 #define rb_usascii_str_new2 rb_usascii_str_new_cstr 00453 00454 VALUE 00455 rb_tainted_str_new(const char *ptr, long len) 00456 { 00457 VALUE str = rb_str_new(ptr, len); 00458 00459 OBJ_TAINT(str); 00460 return str; 00461 } 00462 00463 VALUE 00464 rb_tainted_str_new_cstr(const char *ptr) 00465 { 00466 VALUE str = rb_str_new2(ptr); 00467 00468 OBJ_TAINT(str); 00469 return str; 00470 } 00471 00472 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr)) 00473 #define rb_tainted_str_new2 rb_tainted_str_new_cstr 00474 00475 VALUE 00476 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts) 00477 { 00478 rb_econv_t *ec; 00479 rb_econv_result_t ret; 00480 long len; 00481 VALUE newstr; 00482 const unsigned char *sp; 00483 unsigned char *dp; 00484 00485 if (!to) return str; 00486 if (from == to) return str; 00487 if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) || 00488 to == rb_ascii8bit_encoding()) { 00489 if (STR_ENC_GET(str) != to) { 00490 str = 
rb_str_dup(str); 00491 rb_enc_associate(str, to); 00492 } 00493 return str; 00494 } 00495 00496 len = RSTRING_LEN(str); 00497 newstr = rb_str_new(0, len); 00498 00499 retry: 00500 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts); 00501 if (!ec) return str; 00502 00503 sp = (unsigned char*)RSTRING_PTR(str); 00504 dp = (unsigned char*)RSTRING_PTR(newstr); 00505 ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str), 00506 &dp, (unsigned char*)RSTRING_END(newstr), 0); 00507 rb_econv_close(ec); 00508 switch (ret) { 00509 case econv_destination_buffer_full: 00510 /* destination buffer short */ 00511 len = len < 2 ? 2 : len * 2; 00512 rb_str_resize(newstr, len); 00513 goto retry; 00514 00515 case econv_finished: 00516 len = dp - (unsigned char*)RSTRING_PTR(newstr); 00517 rb_str_set_len(newstr, len); 00518 rb_enc_associate(newstr, to); 00519 return newstr; 00520 00521 default: 00522 /* some error, return original */ 00523 return str; 00524 } 00525 } 00526 00527 VALUE 00528 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to) 00529 { 00530 return rb_str_conv_enc_opts(str, from, to, 0, Qnil); 00531 } 00532 00533 VALUE 00534 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc) 00535 { 00536 VALUE str; 00537 00538 str = rb_tainted_str_new(ptr, len); 00539 if (eenc == rb_usascii_encoding() && 00540 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) { 00541 rb_enc_associate(str, rb_ascii8bit_encoding()); 00542 return str; 00543 } 00544 rb_enc_associate(str, eenc); 00545 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding()); 00546 } 00547 00548 VALUE 00549 rb_external_str_new(const char *ptr, long len) 00550 { 00551 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding()); 00552 } 00553 00554 VALUE 00555 rb_external_str_new_cstr(const char *ptr) 00556 { 00557 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding()); 00558 } 00559 00560 VALUE 00561 
rb_locale_str_new(const char *ptr, long len) 00562 { 00563 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding()); 00564 } 00565 00566 VALUE 00567 rb_locale_str_new_cstr(const char *ptr) 00568 { 00569 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding()); 00570 } 00571 00572 VALUE 00573 rb_filesystem_str_new(const char *ptr, long len) 00574 { 00575 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding()); 00576 } 00577 00578 VALUE 00579 rb_filesystem_str_new_cstr(const char *ptr) 00580 { 00581 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding()); 00582 } 00583 00584 VALUE 00585 rb_str_export(VALUE str) 00586 { 00587 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding()); 00588 } 00589 00590 VALUE 00591 rb_str_export_locale(VALUE str) 00592 { 00593 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding()); 00594 } 00595 00596 VALUE 00597 rb_str_export_to_enc(VALUE str, rb_encoding *enc) 00598 { 00599 return rb_str_conv_enc(str, STR_ENC_GET(str), enc); 00600 } 00601 00602 static VALUE 00603 str_replace_shared(VALUE str2, VALUE str) 00604 { 00605 if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) { 00606 STR_SET_EMBED(str2); 00607 memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1); 00608 STR_SET_EMBED_LEN(str2, RSTRING_LEN(str)); 00609 } 00610 else { 00611 str = rb_str_new_frozen(str); 00612 FL_SET(str2, STR_NOEMBED); 00613 RSTRING(str2)->as.heap.len = RSTRING_LEN(str); 00614 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str); 00615 RSTRING(str2)->as.heap.aux.shared = str; 00616 FL_SET(str2, ELTS_SHARED); 00617 } 00618 rb_enc_cr_str_exact_copy(str2, str); 00619 00620 return str2; 00621 } 00622 00623 static VALUE 00624 str_new_shared(VALUE klass, VALUE str) 00625 { 00626 return str_replace_shared(str_alloc(klass), str); 00627 } 00628 00629 static VALUE 00630 str_new3(VALUE klass, VALUE str) 00631 { 00632 return str_new_shared(klass, str); 00633 } 00634 
00635 VALUE 00636 rb_str_new_shared(VALUE str) 00637 { 00638 VALUE str2 = str_new3(rb_obj_class(str), str); 00639 00640 OBJ_INFECT(str2, str); 00641 return str2; 00642 } 00643 00644 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str)) 00645 #define rb_str_new3 rb_str_new_shared 00646 00647 static VALUE 00648 str_new4(VALUE klass, VALUE str) 00649 { 00650 VALUE str2; 00651 00652 str2 = str_alloc(klass); 00653 STR_SET_NOEMBED(str2); 00654 RSTRING(str2)->as.heap.len = RSTRING_LEN(str); 00655 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str); 00656 if (STR_SHARED_P(str)) { 00657 VALUE shared = RSTRING(str)->as.heap.aux.shared; 00658 assert(OBJ_FROZEN(shared)); 00659 FL_SET(str2, ELTS_SHARED); 00660 RSTRING(str2)->as.heap.aux.shared = shared; 00661 } 00662 else { 00663 FL_SET(str, ELTS_SHARED); 00664 RSTRING(str)->as.heap.aux.shared = str2; 00665 } 00666 rb_enc_cr_str_exact_copy(str2, str); 00667 OBJ_INFECT(str2, str); 00668 return str2; 00669 } 00670 00671 VALUE 00672 rb_str_new_frozen(VALUE orig) 00673 { 00674 VALUE klass, str; 00675 00676 if (OBJ_FROZEN(orig)) return orig; 00677 klass = rb_obj_class(orig); 00678 if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) { 00679 long ofs; 00680 assert(OBJ_FROZEN(str)); 00681 ofs = RSTRING_LEN(str) - RSTRING_LEN(orig); 00682 if ((ofs > 0) || (klass != RBASIC(str)->klass) || 00683 (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) || 00684 ENCODING_GET(str) != ENCODING_GET(orig)) { 00685 str = str_new3(klass, str); 00686 RSTRING(str)->as.heap.ptr += ofs; 00687 RSTRING(str)->as.heap.len -= ofs; 00688 rb_enc_cr_str_exact_copy(str, orig); 00689 OBJ_INFECT(str, orig); 00690 } 00691 } 00692 else if (STR_EMBED_P(orig)) { 00693 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig)); 00694 rb_enc_cr_str_exact_copy(str, orig); 00695 OBJ_INFECT(str, orig); 00696 } 00697 else if (STR_ASSOC_P(orig)) { 00698 VALUE assoc = RSTRING(orig)->as.heap.aux.shared; 00699 FL_UNSET(orig, STR_ASSOC); 00700 str = str_new4(klass, 
orig); 00701 FL_SET(str, STR_ASSOC); 00702 RSTRING(str)->as.heap.aux.shared = assoc; 00703 } 00704 else { 00705 str = str_new4(klass, orig); 00706 } 00707 OBJ_FREEZE(str); 00708 return str; 00709 } 00710 00711 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig)) 00712 #define rb_str_new4 rb_str_new_frozen 00713 00714 VALUE 00715 rb_str_new_with_class(VALUE obj, const char *ptr, long len) 00716 { 00717 return str_new(rb_obj_class(obj), ptr, len); 00718 } 00719 00720 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len), 00721 rb_str_new_with_class, (obj, ptr, len)) 00722 #define rb_str_new5 rb_str_new_with_class 00723 00724 static VALUE 00725 str_new_empty(VALUE str) 00726 { 00727 VALUE v = rb_str_new5(str, 0, 0); 00728 rb_enc_copy(v, str); 00729 OBJ_INFECT(v, str); 00730 return v; 00731 } 00732 00733 #define STR_BUF_MIN_SIZE 128 00734 00735 VALUE 00736 rb_str_buf_new(long capa) 00737 { 00738 VALUE str = str_alloc(rb_cString); 00739 00740 if (capa < STR_BUF_MIN_SIZE) { 00741 capa = STR_BUF_MIN_SIZE; 00742 } 00743 FL_SET(str, STR_NOEMBED); 00744 RSTRING(str)->as.heap.aux.capa = capa; 00745 RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1); 00746 RSTRING(str)->as.heap.ptr[0] = '\0'; 00747 00748 return str; 00749 } 00750 00751 VALUE 00752 rb_str_buf_new_cstr(const char *ptr) 00753 { 00754 VALUE str; 00755 long len = strlen(ptr); 00756 00757 str = rb_str_buf_new(len); 00758 rb_str_buf_cat(str, ptr, len); 00759 00760 return str; 00761 } 00762 00763 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr)) 00764 #define rb_str_buf_new2 rb_str_buf_new_cstr 00765 00766 VALUE 00767 rb_str_tmp_new(long len) 00768 { 00769 return str_new(0, 0, len); 00770 } 00771 00772 void * 00773 rb_alloc_tmp_buffer(volatile VALUE *store, long len) 00774 { 00775 VALUE s = rb_str_tmp_new(len); 00776 *store = s; 00777 return RSTRING_PTR(s); 00778 } 00779 00780 void 00781 rb_free_tmp_buffer(volatile VALUE *store) 00782 { 00783 VALUE s = 
*store; 00784 *store = 0; 00785 if (s) rb_str_clear(s); 00786 } 00787 00788 void 00789 rb_str_free(VALUE str) 00790 { 00791 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) { 00792 xfree(RSTRING(str)->as.heap.ptr); 00793 } 00794 } 00795 00796 RUBY_FUNC_EXPORTED size_t 00797 rb_str_memsize(VALUE str) 00798 { 00799 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) { 00800 return RSTRING(str)->as.heap.aux.capa; 00801 } 00802 else { 00803 return 0; 00804 } 00805 } 00806 00807 VALUE 00808 rb_str_to_str(VALUE str) 00809 { 00810 return rb_convert_type(str, T_STRING, "String", "to_str"); 00811 } 00812 00813 static inline void str_discard(VALUE str); 00814 00815 void 00816 rb_str_shared_replace(VALUE str, VALUE str2) 00817 { 00818 rb_encoding *enc; 00819 int cr; 00820 if (str == str2) return; 00821 enc = STR_ENC_GET(str2); 00822 cr = ENC_CODERANGE(str2); 00823 str_discard(str); 00824 OBJ_INFECT(str, str2); 00825 if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) { 00826 STR_SET_EMBED(str); 00827 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1); 00828 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2)); 00829 rb_enc_associate(str, enc); 00830 ENC_CODERANGE_SET(str, cr); 00831 return; 00832 } 00833 STR_SET_NOEMBED(str); 00834 STR_UNSET_NOCAPA(str); 00835 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2); 00836 RSTRING(str)->as.heap.len = RSTRING_LEN(str2); 00837 if (STR_NOCAPA_P(str2)) { 00838 FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA); 00839 RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared; 00840 } 00841 else { 00842 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa; 00843 } 00844 STR_SET_EMBED(str2); /* abandon str2 */ 00845 RSTRING_PTR(str2)[0] = 0; 00846 STR_SET_EMBED_LEN(str2, 0); 00847 rb_enc_associate(str, enc); 00848 ENC_CODERANGE_SET(str, cr); 00849 } 00850 00851 static ID id_to_s; 00852 00853 VALUE 00854 rb_obj_as_string(VALUE obj) 00855 { 00856 VALUE str; 00857 00858 if (TYPE(obj) == T_STRING) { 00859 return obj; 00860 } 00861 str = 
rb_funcall(obj, id_to_s, 0); 00862 if (TYPE(str) != T_STRING) 00863 return rb_any_to_s(obj); 00864 if (OBJ_TAINTED(obj)) OBJ_TAINT(str); 00865 return str; 00866 } 00867 00868 static VALUE 00869 str_replace(VALUE str, VALUE str2) 00870 { 00871 long len; 00872 00873 len = RSTRING_LEN(str2); 00874 if (STR_ASSOC_P(str2)) { 00875 str2 = rb_str_new4(str2); 00876 } 00877 if (STR_SHARED_P(str2)) { 00878 VALUE shared = RSTRING(str2)->as.heap.aux.shared; 00879 assert(OBJ_FROZEN(shared)); 00880 STR_SET_NOEMBED(str); 00881 RSTRING(str)->as.heap.len = len; 00882 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2); 00883 FL_SET(str, ELTS_SHARED); 00884 FL_UNSET(str, STR_ASSOC); 00885 RSTRING(str)->as.heap.aux.shared = shared; 00886 } 00887 else { 00888 str_replace_shared(str, str2); 00889 } 00890 00891 OBJ_INFECT(str, str2); 00892 rb_enc_cr_str_exact_copy(str, str2); 00893 return str; 00894 } 00895 00896 static VALUE 00897 str_duplicate(VALUE klass, VALUE str) 00898 { 00899 VALUE dup = str_alloc(klass); 00900 str_replace(dup, str); 00901 return dup; 00902 } 00903 00904 VALUE 00905 rb_str_dup(VALUE str) 00906 { 00907 return str_duplicate(rb_obj_class(str), str); 00908 } 00909 00910 VALUE 00911 rb_str_resurrect(VALUE str) 00912 { 00913 return str_replace(str_alloc(rb_cString), str); 00914 } 00915 00916 /* 00917 * call-seq: 00918 * String.new(str="") -> new_str 00919 * 00920 * Returns a new string object containing a copy of <i>str</i>. 
00921 */ 00922 00923 static VALUE 00924 rb_str_init(int argc, VALUE *argv, VALUE str) 00925 { 00926 VALUE orig; 00927 00928 if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1) 00929 rb_str_replace(str, orig); 00930 return str; 00931 } 00932 00933 static inline long 00934 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr) 00935 { 00936 long c; 00937 const char *q; 00938 00939 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 00940 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc); 00941 } 00942 else if (rb_enc_asciicompat(enc)) { 00943 c = 0; 00944 if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) { 00945 while (p < e) { 00946 if (ISASCII(*p)) { 00947 q = search_nonascii(p, e); 00948 if (!q) 00949 return c + (e - p); 00950 c += q - p; 00951 p = q; 00952 } 00953 p += rb_enc_fast_mbclen(p, e, enc); 00954 c++; 00955 } 00956 } 00957 else { 00958 while (p < e) { 00959 if (ISASCII(*p)) { 00960 q = search_nonascii(p, e); 00961 if (!q) 00962 return c + (e - p); 00963 c += q - p; 00964 p = q; 00965 } 00966 p += rb_enc_mbclen(p, e, enc); 00967 c++; 00968 } 00969 } 00970 return c; 00971 } 00972 00973 for (c=0; p<e; c++) { 00974 p += rb_enc_mbclen(p, e, enc); 00975 } 00976 return c; 00977 } 00978 00979 long 00980 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc) 00981 { 00982 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN); 00983 } 00984 00985 long 00986 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr) 00987 { 00988 long c; 00989 const char *q; 00990 int ret; 00991 00992 *cr = 0; 00993 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 00994 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc); 00995 } 00996 else if (rb_enc_asciicompat(enc)) { 00997 c = 0; 00998 while (p < e) { 00999 if (ISASCII(*p)) { 01000 q = search_nonascii(p, e); 01001 if (!q) { 01002 if (!*cr) *cr = ENC_CODERANGE_7BIT; 01003 return c + (e - p); 01004 } 01005 c += q - p; 01006 p = q; 01007 } 01008 ret = 
rb_enc_precise_mbclen(p, e, enc); 01009 if (MBCLEN_CHARFOUND_P(ret)) { 01010 *cr |= ENC_CODERANGE_VALID; 01011 p += MBCLEN_CHARFOUND_LEN(ret); 01012 } 01013 else { 01014 *cr = ENC_CODERANGE_BROKEN; 01015 p++; 01016 } 01017 c++; 01018 } 01019 if (!*cr) *cr = ENC_CODERANGE_7BIT; 01020 return c; 01021 } 01022 01023 for (c=0; p<e; c++) { 01024 ret = rb_enc_precise_mbclen(p, e, enc); 01025 if (MBCLEN_CHARFOUND_P(ret)) { 01026 *cr |= ENC_CODERANGE_VALID; 01027 p += MBCLEN_CHARFOUND_LEN(ret); 01028 } 01029 else { 01030 *cr = ENC_CODERANGE_BROKEN; 01031 if (p + rb_enc_mbminlen(enc) <= e) 01032 p += rb_enc_mbminlen(enc); 01033 else 01034 p = e; 01035 } 01036 } 01037 if (!*cr) *cr = ENC_CODERANGE_7BIT; 01038 return c; 01039 } 01040 01041 #ifdef NONASCII_MASK 01042 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80) 01043 01044 /* 01045 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx 01046 * bit represention. (see http://en.wikipedia.org/wiki/UTF-8) 01047 * Therefore, following pseudo code can detect UTF-8 leading byte. 01048 * 01049 * if (!(byte & 0x80)) 01050 * byte |= 0x40; // turn on bit6 01051 * return ((byte>>6) & 1); // bit6 represent it's leading byte or not. 01052 * 01053 * This function calculate every bytes in the argument word `s' 01054 * using the above logic concurrently. and gather every bytes result. 01055 */ 01056 static inline VALUE 01057 count_utf8_lead_bytes_with_word(const VALUE *s) 01058 { 01059 VALUE d = *s; 01060 01061 /* Transform into bit0 represent UTF-8 leading or not. */ 01062 d |= ~(d>>1); 01063 d >>= 6; 01064 d &= NONASCII_MASK >> 7; 01065 01066 /* Gather every bytes. 
*/ 01067 d += (d>>8); 01068 d += (d>>16); 01069 #if SIZEOF_VALUE == 8 01070 d += (d>>32); 01071 #endif 01072 return (d&0xF); 01073 } 01074 #endif 01075 01076 static long 01077 str_strlen(VALUE str, rb_encoding *enc) 01078 { 01079 const char *p, *e; 01080 long n; 01081 int cr; 01082 01083 if (single_byte_optimizable(str)) return RSTRING_LEN(str); 01084 if (!enc) enc = STR_ENC_GET(str); 01085 p = RSTRING_PTR(str); 01086 e = RSTRING_END(str); 01087 cr = ENC_CODERANGE(str); 01088 #ifdef NONASCII_MASK 01089 if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID && 01090 enc == rb_utf8_encoding()) { 01091 01092 VALUE len = 0; 01093 if ((int)sizeof(VALUE) * 2 < e - p) { 01094 const VALUE *s, *t; 01095 const VALUE lowbits = sizeof(VALUE) - 1; 01096 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 01097 t = (const VALUE*)(~lowbits & (VALUE)e); 01098 while (p < (const char *)s) { 01099 if (is_utf8_lead_byte(*p)) len++; 01100 p++; 01101 } 01102 while (s < t) { 01103 len += count_utf8_lead_bytes_with_word(s); 01104 s++; 01105 } 01106 p = (const char *)s; 01107 } 01108 while (p < e) { 01109 if (is_utf8_lead_byte(*p)) len++; 01110 p++; 01111 } 01112 return (long)len; 01113 } 01114 #endif 01115 n = rb_enc_strlen_cr(p, e, enc, &cr); 01116 if (cr) { 01117 ENC_CODERANGE_SET(str, cr); 01118 } 01119 return n; 01120 } 01121 01122 long 01123 rb_str_strlen(VALUE str) 01124 { 01125 return str_strlen(str, STR_ENC_GET(str)); 01126 } 01127 01128 /* 01129 * call-seq: 01130 * str.length -> integer 01131 * str.size -> integer 01132 * 01133 * Returns the character length of <i>str</i>. 01134 */ 01135 01136 VALUE 01137 rb_str_length(VALUE str) 01138 { 01139 long len; 01140 01141 len = str_strlen(str, STR_ENC_GET(str)); 01142 return LONG2NUM(len); 01143 } 01144 01145 /* 01146 * call-seq: 01147 * str.bytesize -> integer 01148 * 01149 * Returns the length of <i>str</i> in bytes. 
01150 */ 01151 01152 static VALUE 01153 rb_str_bytesize(VALUE str) 01154 { 01155 return LONG2NUM(RSTRING_LEN(str)); 01156 } 01157 01158 /* 01159 * call-seq: 01160 * str.empty? -> true or false 01161 * 01162 * Returns <code>true</code> if <i>str</i> has a length of zero. 01163 * 01164 * "hello".empty? #=> false 01165 * "".empty? #=> true 01166 */ 01167 01168 static VALUE 01169 rb_str_empty(VALUE str) 01170 { 01171 if (RSTRING_LEN(str) == 0) 01172 return Qtrue; 01173 return Qfalse; 01174 } 01175 01176 /* 01177 * call-seq: 01178 * str + other_str -> new_str 01179 * 01180 * Concatenation---Returns a new <code>String</code> containing 01181 * <i>other_str</i> concatenated to <i>str</i>. 01182 * 01183 * "Hello from " + self.to_s #=> "Hello from main" 01184 */ 01185 01186 VALUE 01187 rb_str_plus(VALUE str1, VALUE str2) 01188 { 01189 VALUE str3; 01190 rb_encoding *enc; 01191 01192 StringValue(str2); 01193 enc = rb_enc_check(str1, str2); 01194 str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2)); 01195 memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1)); 01196 memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1), 01197 RSTRING_PTR(str2), RSTRING_LEN(str2)); 01198 RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0'; 01199 01200 if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2)) 01201 OBJ_TAINT(str3); 01202 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc), 01203 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2))); 01204 return str3; 01205 } 01206 01207 /* 01208 * call-seq: 01209 * str * integer -> new_str 01210 * 01211 * Copy---Returns a new <code>String</code> containing <i>integer</i> copies of 01212 * the receiver. 01213 * 01214 * "Ho! " * 3 #=> "Ho! Ho! Ho! 
" 01215 */ 01216 01217 VALUE 01218 rb_str_times(VALUE str, VALUE times) 01219 { 01220 VALUE str2; 01221 long n, len; 01222 char *ptr2; 01223 01224 len = NUM2LONG(times); 01225 if (len < 0) { 01226 rb_raise(rb_eArgError, "negative argument"); 01227 } 01228 if (len && LONG_MAX/len < RSTRING_LEN(str)) { 01229 rb_raise(rb_eArgError, "argument too big"); 01230 } 01231 01232 str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str)); 01233 ptr2 = RSTRING_PTR(str2); 01234 if (len) { 01235 n = RSTRING_LEN(str); 01236 memcpy(ptr2, RSTRING_PTR(str), n); 01237 while (n <= len/2) { 01238 memcpy(ptr2 + n, ptr2, n); 01239 n *= 2; 01240 } 01241 memcpy(ptr2 + n, ptr2, len-n); 01242 } 01243 ptr2[RSTRING_LEN(str2)] = '\0'; 01244 OBJ_INFECT(str2, str); 01245 rb_enc_cr_str_copy_for_substr(str2, str); 01246 01247 return str2; 01248 } 01249 01250 /* 01251 * call-seq: 01252 * str % arg -> new_str 01253 * 01254 * Format---Uses <i>str</i> as a format specification, and returns the result 01255 * of applying it to <i>arg</i>. If the format specification contains more than 01256 * one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code> 01257 * containing the values to be substituted. See <code>Kernel::sprintf</code> for 01258 * details of the format string. 
01259 * 01260 * "%05d" % 123 #=> "00123" 01261 * "%-5s: %08x" % [ "ID", self.object_id ] #=> "ID : 200e14d6" 01262 * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar" 01263 */ 01264 01265 static VALUE 01266 rb_str_format_m(VALUE str, VALUE arg) 01267 { 01268 volatile VALUE tmp = rb_check_array_type(arg); 01269 01270 if (!NIL_P(tmp)) { 01271 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str); 01272 } 01273 return rb_str_format(1, &arg, str); 01274 } 01275 01276 static inline void 01277 str_modifiable(VALUE str) 01278 { 01279 if (FL_TEST(str, STR_TMPLOCK)) { 01280 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked"); 01281 } 01282 rb_check_frozen(str); 01283 if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4) 01284 rb_raise(rb_eSecurityError, "Insecure: can't modify string"); 01285 } 01286 01287 static inline int 01288 str_independent(VALUE str) 01289 { 01290 str_modifiable(str); 01291 if (!STR_SHARED_P(str)) return 1; 01292 if (STR_EMBED_P(str)) return 1; 01293 return 0; 01294 } 01295 01296 static void 01297 str_make_independent_expand(VALUE str, long expand) 01298 { 01299 char *ptr; 01300 long len = RSTRING_LEN(str); 01301 long capa = len + expand; 01302 01303 if (len > capa) len = capa; 01304 ptr = ALLOC_N(char, capa + 1); 01305 if (RSTRING_PTR(str)) { 01306 memcpy(ptr, RSTRING_PTR(str), len); 01307 } 01308 STR_SET_NOEMBED(str); 01309 STR_UNSET_NOCAPA(str); 01310 ptr[len] = 0; 01311 RSTRING(str)->as.heap.ptr = ptr; 01312 RSTRING(str)->as.heap.len = len; 01313 RSTRING(str)->as.heap.aux.capa = capa; 01314 } 01315 01316 #define str_make_independent(str) str_make_independent_expand((str), 0L) 01317 01318 void 01319 rb_str_modify(VALUE str) 01320 { 01321 if (!str_independent(str)) 01322 str_make_independent(str); 01323 ENC_CODERANGE_CLEAR(str); 01324 } 01325 01326 void 01327 rb_str_modify_expand(VALUE str, long expand) 01328 { 01329 if (expand < 0) { 01330 rb_raise(rb_eArgError, "negative expanding string size"); 01331 } 01332 if 
(!str_independent(str)) { 01333 str_make_independent_expand(str, expand); 01334 } 01335 else if (expand > 0) { 01336 long len = RSTRING_LEN(str); 01337 long capa = len + expand; 01338 if (!STR_EMBED_P(str)) { 01339 REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1); 01340 RSTRING(str)->as.heap.aux.capa = capa; 01341 } 01342 else if (capa > RSTRING_EMBED_LEN_MAX) { 01343 str_make_independent_expand(str, expand); 01344 } 01345 } 01346 ENC_CODERANGE_CLEAR(str); 01347 } 01348 01349 /* As rb_str_modify(), but don't clear coderange */ 01350 static void 01351 str_modify_keep_cr(VALUE str) 01352 { 01353 if (!str_independent(str)) 01354 str_make_independent(str); 01355 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN) 01356 /* Force re-scan later */ 01357 ENC_CODERANGE_CLEAR(str); 01358 } 01359 01360 static inline void 01361 str_discard(VALUE str) 01362 { 01363 str_modifiable(str); 01364 if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) { 01365 xfree(RSTRING_PTR(str)); 01366 RSTRING(str)->as.heap.ptr = 0; 01367 RSTRING(str)->as.heap.len = 0; 01368 } 01369 } 01370 01371 void 01372 rb_str_associate(VALUE str, VALUE add) 01373 { 01374 /* sanity check */ 01375 rb_check_frozen(str); 01376 if (STR_ASSOC_P(str)) { 01377 /* already associated */ 01378 rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add); 01379 } 01380 else { 01381 if (STR_SHARED_P(str)) { 01382 VALUE assoc = RSTRING(str)->as.heap.aux.shared; 01383 str_make_independent(str); 01384 if (STR_ASSOC_P(assoc)) { 01385 assoc = RSTRING(assoc)->as.heap.aux.shared; 01386 rb_ary_concat(assoc, add); 01387 add = assoc; 01388 } 01389 } 01390 else if (STR_EMBED_P(str)) { 01391 str_make_independent(str); 01392 } 01393 else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) { 01394 RESIZE_CAPA(str, RSTRING_LEN(str)); 01395 } 01396 FL_SET(str, STR_ASSOC); 01397 RBASIC(add)->klass = 0; 01398 RSTRING(str)->as.heap.aux.shared = add; 01399 } 01400 } 01401 01402 VALUE 01403 rb_str_associated(VALUE str) 01404 { 01405 if (STR_SHARED_P(str)) 
str = RSTRING(str)->as.heap.aux.shared; 01406 if (STR_ASSOC_P(str)) { 01407 return RSTRING(str)->as.heap.aux.shared; 01408 } 01409 return Qfalse; 01410 } 01411 01412 VALUE 01413 rb_string_value(volatile VALUE *ptr) 01414 { 01415 VALUE s = *ptr; 01416 if (TYPE(s) != T_STRING) { 01417 s = rb_str_to_str(s); 01418 *ptr = s; 01419 } 01420 return s; 01421 } 01422 01423 char * 01424 rb_string_value_ptr(volatile VALUE *ptr) 01425 { 01426 VALUE str = rb_string_value(ptr); 01427 return RSTRING_PTR(str); 01428 } 01429 01430 char * 01431 rb_string_value_cstr(volatile VALUE *ptr) 01432 { 01433 VALUE str = rb_string_value(ptr); 01434 char *s = RSTRING_PTR(str); 01435 long len = RSTRING_LEN(str); 01436 01437 if (!s || memchr(s, 0, len)) { 01438 rb_raise(rb_eArgError, "string contains null byte"); 01439 } 01440 if (s[len]) { 01441 rb_str_modify(str); 01442 s = RSTRING_PTR(str); 01443 s[RSTRING_LEN(str)] = 0; 01444 } 01445 return s; 01446 } 01447 01448 VALUE 01449 rb_check_string_type(VALUE str) 01450 { 01451 str = rb_check_convert_type(str, T_STRING, "String", "to_str"); 01452 return str; 01453 } 01454 01455 /* 01456 * call-seq: 01457 * String.try_convert(obj) -> string or nil 01458 * 01459 * Try to convert <i>obj</i> into a String, using to_str method. 01460 * Returns converted string or nil if <i>obj</i> cannot be converted 01461 * for any reason. 
01462 * 01463 * String.try_convert("str") #=> "str" 01464 * String.try_convert(/re/) #=> nil 01465 */ 01466 static VALUE 01467 rb_str_s_try_convert(VALUE dummy, VALUE str) 01468 { 01469 return rb_check_string_type(str); 01470 } 01471 01472 static char* 01473 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc) 01474 { 01475 long nth = *nthp; 01476 if (rb_enc_mbmaxlen(enc) == 1) { 01477 p += nth; 01478 } 01479 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 01480 p += nth * rb_enc_mbmaxlen(enc); 01481 } 01482 else if (rb_enc_asciicompat(enc)) { 01483 const char *p2, *e2; 01484 int n; 01485 01486 while (p < e && 0 < nth) { 01487 e2 = p + nth; 01488 if (e < e2) { 01489 *nthp = nth; 01490 return (char *)e; 01491 } 01492 if (ISASCII(*p)) { 01493 p2 = search_nonascii(p, e2); 01494 if (!p2) { 01495 *nthp = nth; 01496 return (char *)e2; 01497 } 01498 nth -= p2 - p; 01499 p = p2; 01500 } 01501 n = rb_enc_mbclen(p, e, enc); 01502 p += n; 01503 nth--; 01504 } 01505 *nthp = nth; 01506 if (nth != 0) { 01507 return (char *)e; 01508 } 01509 return (char *)p; 01510 } 01511 else { 01512 while (p < e && nth--) { 01513 p += rb_enc_mbclen(p, e, enc); 01514 } 01515 } 01516 if (p > e) p = e; 01517 *nthp = nth; 01518 return (char*)p; 01519 } 01520 01521 char* 01522 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc) 01523 { 01524 return str_nth_len(p, e, &nth, enc); 01525 } 01526 01527 static char* 01528 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte) 01529 { 01530 if (singlebyte) 01531 p += nth; 01532 else { 01533 p = str_nth_len(p, e, &nth, enc); 01534 } 01535 if (!p) return 0; 01536 if (p > e) p = e; 01537 return (char *)p; 01538 } 01539 01540 /* char offset to byte offset */ 01541 static long 01542 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte) 01543 { 01544 const char *pp = str_nth(p, e, nth, enc, singlebyte); 01545 if (!pp) return e - p; 01546 return pp - p; 01547 } 
01548 01549 long 01550 rb_str_offset(VALUE str, long pos) 01551 { 01552 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 01553 STR_ENC_GET(str), single_byte_optimizable(str)); 01554 } 01555 01556 #ifdef NONASCII_MASK 01557 static char * 01558 str_utf8_nth(const char *p, const char *e, long *nthp) 01559 { 01560 long nth = *nthp; 01561 if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) { 01562 const VALUE *s, *t; 01563 const VALUE lowbits = sizeof(VALUE) - 1; 01564 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 01565 t = (const VALUE*)(~lowbits & (VALUE)e); 01566 while (p < (const char *)s) { 01567 if (is_utf8_lead_byte(*p)) nth--; 01568 p++; 01569 } 01570 do { 01571 nth -= count_utf8_lead_bytes_with_word(s); 01572 s++; 01573 } while (s < t && (int)sizeof(VALUE) <= nth); 01574 p = (char *)s; 01575 } 01576 while (p < e) { 01577 if (is_utf8_lead_byte(*p)) { 01578 if (nth == 0) break; 01579 nth--; 01580 } 01581 p++; 01582 } 01583 *nthp = nth; 01584 return (char *)p; 01585 } 01586 01587 static long 01588 str_utf8_offset(const char *p, const char *e, long nth) 01589 { 01590 const char *pp = str_utf8_nth(p, e, &nth); 01591 return pp - p; 01592 } 01593 #endif 01594 01595 /* byte offset to char offset */ 01596 long 01597 rb_str_sublen(VALUE str, long pos) 01598 { 01599 if (single_byte_optimizable(str) || pos < 0) 01600 return pos; 01601 else { 01602 char *p = RSTRING_PTR(str); 01603 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str)); 01604 } 01605 } 01606 01607 VALUE 01608 rb_str_subseq(VALUE str, long beg, long len) 01609 { 01610 VALUE str2; 01611 01612 if (RSTRING_LEN(str) == beg + len && 01613 RSTRING_EMBED_LEN_MAX < len) { 01614 str2 = rb_str_new_shared(rb_str_new_frozen(str)); 01615 rb_str_drop_bytes(str2, beg); 01616 } 01617 else { 01618 str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len); 01619 } 01620 01621 rb_enc_cr_str_copy_for_substr(str2, str); 01622 OBJ_INFECT(str2, str); 01623 01624 return str2; 01625 } 01626 01627 
VALUE 01628 rb_str_substr(VALUE str, long beg, long len) 01629 { 01630 rb_encoding *enc = STR_ENC_GET(str); 01631 VALUE str2; 01632 char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str); 01633 01634 if (len < 0) return Qnil; 01635 if (!RSTRING_LEN(str)) { 01636 len = 0; 01637 } 01638 if (single_byte_optimizable(str)) { 01639 if (beg > RSTRING_LEN(str)) return Qnil; 01640 if (beg < 0) { 01641 beg += RSTRING_LEN(str); 01642 if (beg < 0) return Qnil; 01643 } 01644 if (beg + len > RSTRING_LEN(str)) 01645 len = RSTRING_LEN(str) - beg; 01646 if (len <= 0) { 01647 len = 0; 01648 p = 0; 01649 } 01650 else 01651 p = s + beg; 01652 goto sub; 01653 } 01654 if (beg < 0) { 01655 if (len > -beg) len = -beg; 01656 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) { 01657 beg = -beg; 01658 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0); 01659 p = e; 01660 if (!p) return Qnil; 01661 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0); 01662 if (!p) return Qnil; 01663 len = e - p; 01664 goto sub; 01665 } 01666 else { 01667 beg += str_strlen(str, enc); 01668 if (beg < 0) return Qnil; 01669 } 01670 } 01671 else if (beg > 0 && beg > RSTRING_LEN(str)) { 01672 return Qnil; 01673 } 01674 if (len == 0) { 01675 if (beg > str_strlen(str, enc)) return Qnil; 01676 p = 0; 01677 } 01678 #ifdef NONASCII_MASK 01679 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID && 01680 enc == rb_utf8_encoding()) { 01681 p = str_utf8_nth(s, e, &beg); 01682 if (beg > 0) return Qnil; 01683 len = str_utf8_offset(p, e, len); 01684 } 01685 #endif 01686 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 01687 int char_sz = rb_enc_mbmaxlen(enc); 01688 01689 p = s + beg * char_sz; 01690 if (p > e) { 01691 return Qnil; 01692 } 01693 else if (len * char_sz > e - p) 01694 len = e - p; 01695 else 01696 len *= char_sz; 01697 } 01698 else if ((p = str_nth_len(s, e, &beg, enc)) == e) { 01699 if (beg > 0) return Qnil; 01700 len = 0; 01701 } 01702 else { 01703 len = str_offset(p, e, 
len, enc, 0); 01704 } 01705 sub: 01706 if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) { 01707 str2 = rb_str_new4(str); 01708 str2 = str_new3(rb_obj_class(str2), str2); 01709 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len; 01710 RSTRING(str2)->as.heap.len = len; 01711 } 01712 else { 01713 str2 = rb_str_new5(str, p, len); 01714 rb_enc_cr_str_copy_for_substr(str2, str); 01715 OBJ_INFECT(str2, str); 01716 } 01717 01718 return str2; 01719 } 01720 01721 VALUE 01722 rb_str_freeze(VALUE str) 01723 { 01724 if (STR_ASSOC_P(str)) { 01725 VALUE ary = RSTRING(str)->as.heap.aux.shared; 01726 OBJ_FREEZE(ary); 01727 } 01728 return rb_obj_freeze(str); 01729 } 01730 01731 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str)) 01732 #define rb_str_dup_frozen rb_str_new_frozen 01733 01734 VALUE 01735 rb_str_locktmp(VALUE str) 01736 { 01737 if (FL_TEST(str, STR_TMPLOCK)) { 01738 rb_raise(rb_eRuntimeError, "temporal locking already locked string"); 01739 } 01740 FL_SET(str, STR_TMPLOCK); 01741 return str; 01742 } 01743 01744 VALUE 01745 rb_str_unlocktmp(VALUE str) 01746 { 01747 if (!FL_TEST(str, STR_TMPLOCK)) { 01748 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string"); 01749 } 01750 FL_UNSET(str, STR_TMPLOCK); 01751 return str; 01752 } 01753 01754 void 01755 rb_str_set_len(VALUE str, long len) 01756 { 01757 long capa; 01758 01759 str_modifiable(str); 01760 if (STR_SHARED_P(str)) { 01761 rb_raise(rb_eRuntimeError, "can't set length of shared string"); 01762 } 01763 if (len > (capa = (long)rb_str_capacity(str))) { 01764 rb_bug("probable buffer overflow: %ld for %ld", len, capa); 01765 } 01766 STR_SET_LEN(str, len); 01767 RSTRING_PTR(str)[len] = '\0'; 01768 } 01769 01770 VALUE 01771 rb_str_resize(VALUE str, long len) 01772 { 01773 long slen; 01774 int independent; 01775 01776 if (len < 0) { 01777 rb_raise(rb_eArgError, "negative string size (or size too big)"); 01778 } 01779 01780 independent = 
str_independent(str); 01781 ENC_CODERANGE_CLEAR(str); 01782 slen = RSTRING_LEN(str); 01783 if (len != slen) { 01784 if (STR_EMBED_P(str)) { 01785 if (len <= RSTRING_EMBED_LEN_MAX) { 01786 STR_SET_EMBED_LEN(str, len); 01787 RSTRING(str)->as.ary[len] = '\0'; 01788 return str; 01789 } 01790 str_make_independent_expand(str, len - slen); 01791 STR_SET_NOEMBED(str); 01792 } 01793 else if (len <= RSTRING_EMBED_LEN_MAX) { 01794 char *ptr = RSTRING(str)->as.heap.ptr; 01795 STR_SET_EMBED(str); 01796 if (slen > len) slen = len; 01797 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen); 01798 RSTRING(str)->as.ary[len] = '\0'; 01799 STR_SET_EMBED_LEN(str, len); 01800 if (independent) xfree(ptr); 01801 return str; 01802 } 01803 else if (!independent) { 01804 str_make_independent_expand(str, len - slen); 01805 } 01806 else if (slen < len || slen - len > 1024) { 01807 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1); 01808 } 01809 if (!STR_NOCAPA_P(str)) { 01810 RSTRING(str)->as.heap.aux.capa = len; 01811 } 01812 RSTRING(str)->as.heap.len = len; 01813 RSTRING(str)->as.heap.ptr[len] = '\0'; /* sentinel */ 01814 } 01815 return str; 01816 } 01817 01818 static VALUE 01819 str_buf_cat(VALUE str, const char *ptr, long len) 01820 { 01821 long capa, total, off = -1; 01822 01823 if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) { 01824 off = ptr - RSTRING_PTR(str); 01825 } 01826 rb_str_modify(str); 01827 if (len == 0) return 0; 01828 if (STR_ASSOC_P(str)) { 01829 FL_UNSET(str, STR_ASSOC); 01830 capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str); 01831 } 01832 else if (STR_EMBED_P(str)) { 01833 capa = RSTRING_EMBED_LEN_MAX; 01834 } 01835 else { 01836 capa = RSTRING(str)->as.heap.aux.capa; 01837 } 01838 if (RSTRING_LEN(str) >= LONG_MAX - len) { 01839 rb_raise(rb_eArgError, "string sizes too big"); 01840 } 01841 total = RSTRING_LEN(str)+len; 01842 if (capa <= total) { 01843 while (total > capa) { 01844 if (capa + 1 >= LONG_MAX / 2) { 01845 capa = (total + 4095) / 4096; 
01846 break; 01847 } 01848 capa = (capa + 1) * 2; 01849 } 01850 RESIZE_CAPA(str, capa); 01851 } 01852 if (off != -1) { 01853 ptr = RSTRING_PTR(str) + off; 01854 } 01855 memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len); 01856 STR_SET_LEN(str, total); 01857 RSTRING_PTR(str)[total] = '\0'; /* sentinel */ 01858 01859 return str; 01860 } 01861 01862 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr)) 01863 01864 VALUE 01865 rb_str_buf_cat(VALUE str, const char *ptr, long len) 01866 { 01867 if (len == 0) return str; 01868 if (len < 0) { 01869 rb_raise(rb_eArgError, "negative string size (or size too big)"); 01870 } 01871 return str_buf_cat(str, ptr, len); 01872 } 01873 01874 VALUE 01875 rb_str_buf_cat2(VALUE str, const char *ptr) 01876 { 01877 return rb_str_buf_cat(str, ptr, strlen(ptr)); 01878 } 01879 01880 VALUE 01881 rb_str_cat(VALUE str, const char *ptr, long len) 01882 { 01883 if (len < 0) { 01884 rb_raise(rb_eArgError, "negative string size (or size too big)"); 01885 } 01886 if (STR_ASSOC_P(str)) { 01887 char *p; 01888 rb_str_modify_expand(str, len); 01889 p = RSTRING(str)->as.heap.ptr; 01890 memcpy(p + RSTRING(str)->as.heap.len, ptr, len); 01891 len = RSTRING(str)->as.heap.len += len; 01892 p[len] = '\0'; /* sentinel */ 01893 return str; 01894 } 01895 01896 return rb_str_buf_cat(str, ptr, len); 01897 } 01898 01899 VALUE 01900 rb_str_cat2(VALUE str, const char *ptr) 01901 { 01902 return rb_str_cat(str, ptr, strlen(ptr)); 01903 } 01904 01905 static VALUE 01906 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len, 01907 int ptr_encindex, int ptr_cr, int *ptr_cr_ret) 01908 { 01909 int str_encindex = ENCODING_GET(str); 01910 int res_encindex; 01911 int str_cr, res_cr; 01912 01913 str_cr = ENC_CODERANGE(str); 01914 01915 if (str_encindex == ptr_encindex) { 01916 if (str_cr == ENC_CODERANGE_UNKNOWN) 01917 ptr_cr = ENC_CODERANGE_UNKNOWN; 01918 else if (ptr_cr == ENC_CODERANGE_UNKNOWN) { 01919 ptr_cr = coderange_scan(ptr, len, 
rb_enc_from_index(ptr_encindex)); 01920 } 01921 } 01922 else { 01923 rb_encoding *str_enc = rb_enc_from_index(str_encindex); 01924 rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex); 01925 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) { 01926 if (len == 0) 01927 return str; 01928 if (RSTRING_LEN(str) == 0) { 01929 rb_str_buf_cat(str, ptr, len); 01930 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr); 01931 return str; 01932 } 01933 goto incompatible; 01934 } 01935 if (ptr_cr == ENC_CODERANGE_UNKNOWN) { 01936 ptr_cr = coderange_scan(ptr, len, ptr_enc); 01937 } 01938 if (str_cr == ENC_CODERANGE_UNKNOWN) { 01939 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) { 01940 str_cr = rb_enc_str_coderange(str); 01941 } 01942 } 01943 } 01944 if (ptr_cr_ret) 01945 *ptr_cr_ret = ptr_cr; 01946 01947 if (str_encindex != ptr_encindex && 01948 str_cr != ENC_CODERANGE_7BIT && 01949 ptr_cr != ENC_CODERANGE_7BIT) { 01950 incompatible: 01951 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", 01952 rb_enc_name(rb_enc_from_index(str_encindex)), 01953 rb_enc_name(rb_enc_from_index(ptr_encindex))); 01954 } 01955 01956 if (str_cr == ENC_CODERANGE_UNKNOWN) { 01957 res_encindex = str_encindex; 01958 res_cr = ENC_CODERANGE_UNKNOWN; 01959 } 01960 else if (str_cr == ENC_CODERANGE_7BIT) { 01961 if (ptr_cr == ENC_CODERANGE_7BIT) { 01962 res_encindex = str_encindex; 01963 res_cr = ENC_CODERANGE_7BIT; 01964 } 01965 else { 01966 res_encindex = ptr_encindex; 01967 res_cr = ptr_cr; 01968 } 01969 } 01970 else if (str_cr == ENC_CODERANGE_VALID) { 01971 res_encindex = str_encindex; 01972 if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID) 01973 res_cr = str_cr; 01974 else 01975 res_cr = ptr_cr; 01976 } 01977 else { /* str_cr == ENC_CODERANGE_BROKEN */ 01978 res_encindex = str_encindex; 01979 res_cr = str_cr; 01980 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN; 01981 } 01982 01983 if (len < 0) { 01984 rb_raise(rb_eArgError, 
"negative string size (or size too big)"); 01985 } 01986 str_buf_cat(str, ptr, len); 01987 ENCODING_CODERANGE_SET(str, res_encindex, res_cr); 01988 return str; 01989 } 01990 01991 VALUE 01992 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc) 01993 { 01994 return rb_enc_cr_str_buf_cat(str, ptr, len, 01995 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL); 01996 } 01997 01998 VALUE 01999 rb_str_buf_cat_ascii(VALUE str, const char *ptr) 02000 { 02001 /* ptr must reference NUL terminated ASCII string. */ 02002 int encindex = ENCODING_GET(str); 02003 rb_encoding *enc = rb_enc_from_index(encindex); 02004 if (rb_enc_asciicompat(enc)) { 02005 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr), 02006 encindex, ENC_CODERANGE_7BIT, 0); 02007 } 02008 else { 02009 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc)); 02010 while (*ptr) { 02011 unsigned int c = (unsigned char)*ptr; 02012 int len = rb_enc_codelen(c, enc); 02013 rb_enc_mbcput(c, buf, enc); 02014 rb_enc_cr_str_buf_cat(str, buf, len, 02015 encindex, ENC_CODERANGE_VALID, 0); 02016 ptr++; 02017 } 02018 return str; 02019 } 02020 } 02021 02022 VALUE 02023 rb_str_buf_append(VALUE str, VALUE str2) 02024 { 02025 int str2_cr; 02026 02027 str2_cr = ENC_CODERANGE(str2); 02028 02029 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2), 02030 ENCODING_GET(str2), str2_cr, &str2_cr); 02031 02032 OBJ_INFECT(str, str2); 02033 ENC_CODERANGE_SET(str2, str2_cr); 02034 02035 return str; 02036 } 02037 02038 VALUE 02039 rb_str_append(VALUE str, VALUE str2) 02040 { 02041 rb_encoding *enc; 02042 int cr, cr2; 02043 long len2; 02044 02045 StringValue(str2); 02046 if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) { 02047 long len = RSTRING_LEN(str) + len2; 02048 enc = rb_enc_check(str, str2); 02049 cr = ENC_CODERANGE(str); 02050 if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2; 02051 rb_str_modify_expand(str, len2); 02052 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, 02053 
RSTRING_PTR(str2), len2+1); 02054 RSTRING(str)->as.heap.len = len; 02055 rb_enc_associate(str, enc); 02056 ENC_CODERANGE_SET(str, cr); 02057 OBJ_INFECT(str, str2); 02058 return str; 02059 } 02060 return rb_str_buf_append(str, str2); 02061 } 02062 02063 /* 02064 * call-seq: 02065 * str << integer -> str 02066 * str.concat(integer) -> str 02067 * str << obj -> str 02068 * str.concat(obj) -> str 02069 * 02070 * Append---Concatenates the given object to <i>str</i>. If the object is a 02071 * <code>Integer</code>, it is considered as a codepoint, and is converted 02072 * to a character before concatenation. 02073 * 02074 * a = "hello " 02075 * a << "world" #=> "hello world" 02076 * a.concat(33) #=> "hello world!" 02077 */ 02078 02079 VALUE 02080 rb_str_concat(VALUE str1, VALUE str2) 02081 { 02082 unsigned int code; 02083 rb_encoding *enc = STR_ENC_GET(str1); 02084 02085 if (FIXNUM_P(str2) || TYPE(str2) == T_BIGNUM) { 02086 if (rb_num_to_uint(str2, &code) == 0) { 02087 } 02088 else if (FIXNUM_P(str2)) { 02089 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2)); 02090 } 02091 else { 02092 rb_raise(rb_eRangeError, "bignum out of char range"); 02093 } 02094 } 02095 else { 02096 return rb_str_append(str1, str2); 02097 } 02098 02099 if (enc == rb_usascii_encoding()) { 02100 /* US-ASCII automatically extended to ASCII-8BIT */ 02101 char buf[1] = {(char)code}; 02102 if (code > 0xFF) { 02103 rb_raise(rb_eRangeError, "%u out of char range", code); 02104 } 02105 rb_str_cat(str1, buf, 1); 02106 if (code > 127) { 02107 rb_enc_associate(str1, rb_ascii8bit_encoding()); 02108 ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID); 02109 } 02110 } 02111 else { 02112 long pos = RSTRING_LEN(str1); 02113 int cr = ENC_CODERANGE(str1); 02114 int len; 02115 char *buf; 02116 02117 switch (len = rb_enc_codelen(code, enc)) { 02118 case ONIGERR_INVALID_CODE_POINT_VALUE: 02119 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc)); 02120 break; 02121 case 
ONIGERR_TOO_BIG_WIDE_CHAR_VALUE: 02122 case 0: 02123 rb_raise(rb_eRangeError, "%u out of char range", code); 02124 break; 02125 } 02126 buf = ALLOCA_N(char, len + 1); 02127 rb_enc_mbcput(code, buf, enc); 02128 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) { 02129 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc)); 02130 } 02131 rb_str_resize(str1, pos+len); 02132 strncpy(RSTRING_PTR(str1) + pos, buf, len); 02133 if (cr == ENC_CODERANGE_7BIT && code > 127) 02134 cr = ENC_CODERANGE_VALID; 02135 ENC_CODERANGE_SET(str1, cr); 02136 } 02137 return str1; 02138 } 02139 02140 /* 02141 * call-seq: 02142 * str.prepend(other_str) -> str 02143 * 02144 * Prepend---Prepend the given string to <i>str</i>. 02145 * 02146 * a = "world" 02147 * a.prepend("hello ") #=> "hello world" 02148 * a #=> "hello world" 02149 */ 02150 02151 static VALUE 02152 rb_str_prepend(VALUE str, VALUE str2) 02153 { 02154 StringValue(str2); 02155 StringValue(str); 02156 rb_str_update(str, 0L, 0L, str2); 02157 return str; 02158 } 02159 02160 st_index_t 02161 rb_str_hash(VALUE str) 02162 { 02163 int e = ENCODING_GET(str); 02164 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) { 02165 e = 0; 02166 } 02167 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e; 02168 } 02169 02170 int 02171 rb_str_hash_cmp(VALUE str1, VALUE str2) 02172 { 02173 long len; 02174 02175 if (!rb_str_comparable(str1, str2)) return 1; 02176 if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) && 02177 memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) { 02178 return 0; 02179 } 02180 return 1; 02181 } 02182 02183 /* 02184 * call-seq: 02185 * str.hash -> fixnum 02186 * 02187 * Return a hash based on the string's length and content. 
02188 */ 02189 02190 static VALUE 02191 rb_str_hash_m(VALUE str) 02192 { 02193 st_index_t hval = rb_str_hash(str); 02194 return INT2FIX(hval); 02195 } 02196 02197 #define lesser(a,b) (((a)>(b))?(b):(a)) 02198 02199 int 02200 rb_str_comparable(VALUE str1, VALUE str2) 02201 { 02202 int idx1, idx2; 02203 int rc1, rc2; 02204 02205 if (RSTRING_LEN(str1) == 0) return TRUE; 02206 if (RSTRING_LEN(str2) == 0) return TRUE; 02207 idx1 = ENCODING_GET(str1); 02208 idx2 = ENCODING_GET(str2); 02209 if (idx1 == idx2) return TRUE; 02210 rc1 = rb_enc_str_coderange(str1); 02211 rc2 = rb_enc_str_coderange(str2); 02212 if (rc1 == ENC_CODERANGE_7BIT) { 02213 if (rc2 == ENC_CODERANGE_7BIT) return TRUE; 02214 if (rb_enc_asciicompat(rb_enc_from_index(idx2))) 02215 return TRUE; 02216 } 02217 if (rc2 == ENC_CODERANGE_7BIT) { 02218 if (rb_enc_asciicompat(rb_enc_from_index(idx1))) 02219 return TRUE; 02220 } 02221 return FALSE; 02222 } 02223 02224 int 02225 rb_str_cmp(VALUE str1, VALUE str2) 02226 { 02227 long len1, len2; 02228 const char *ptr1, *ptr2; 02229 int retval; 02230 02231 if (str1 == str2) return 0; 02232 RSTRING_GETMEM(str1, ptr1, len1); 02233 RSTRING_GETMEM(str2, ptr2, len2); 02234 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) { 02235 if (len1 == len2) { 02236 if (!rb_str_comparable(str1, str2)) { 02237 if (ENCODING_GET(str1) > ENCODING_GET(str2)) 02238 return 1; 02239 return -1; 02240 } 02241 return 0; 02242 } 02243 if (len1 > len2) return 1; 02244 return -1; 02245 } 02246 if (retval > 0) return 1; 02247 return -1; 02248 } 02249 02250 /* expect tail call optimization */ 02251 static VALUE 02252 str_eql(const VALUE str1, const VALUE str2) 02253 { 02254 const long len = RSTRING_LEN(str1); 02255 const char *ptr1, *ptr2; 02256 02257 if (len != RSTRING_LEN(str2)) return Qfalse; 02258 if (!rb_str_comparable(str1, str2)) return Qfalse; 02259 if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2))) 02260 return Qtrue; 02261 if (memcmp(ptr1, ptr2, len) == 
0) 02262 return Qtrue; 02263 return Qfalse; 02264 } 02265 /* 02266 * call-seq: 02267 * str == obj -> true or false 02268 * 02269 * Equality---If <i>obj</i> is not a <code>String</code>, returns 02270 * <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i> 02271 * <code><=></code> <i>obj</i> returns zero. 02272 */ 02273 02274 VALUE 02275 rb_str_equal(VALUE str1, VALUE str2) 02276 { 02277 if (str1 == str2) return Qtrue; 02278 if (TYPE(str2) != T_STRING) { 02279 if (!rb_respond_to(str2, rb_intern("to_str"))) { 02280 return Qfalse; 02281 } 02282 return rb_equal(str2, str1); 02283 } 02284 return str_eql(str1, str2); 02285 } 02286 02287 /* 02288 * call-seq: 02289 * str.eql?(other) -> true or false 02290 * 02291 * Two strings are equal if they have the same length and content. 02292 */ 02293 02294 static VALUE 02295 rb_str_eql(VALUE str1, VALUE str2) 02296 { 02297 if (str1 == str2) return Qtrue; 02298 if (TYPE(str2) != T_STRING) return Qfalse; 02299 return str_eql(str1, str2); 02300 } 02301 02302 /* 02303 * call-seq: 02304 * str <=> other_str -> -1, 0, +1 or nil 02305 * 02306 * Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if 02307 * <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than 02308 * <i>str</i>. If the strings are of different lengths, and the strings are 02309 * equal when compared up to the shortest length, then the longer string is 02310 * considered greater than the shorter one. In older versions of Ruby, setting 02311 * <code>$=</code> allowed case-insensitive comparisons; this is now deprecated 02312 * in favor of using <code>String#casecmp</code>. 02313 * 02314 * <code><=></code> is the basis for the methods <code><</code>, 02315 * <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>, 02316 * included from module <code>Comparable</code>. The method 02317 * <code>String#==</code> does not use <code>Comparable#==</code>. 
02318 * 02319 * "abcdef" <=> "abcde" #=> 1 02320 * "abcdef" <=> "abcdef" #=> 0 02321 * "abcdef" <=> "abcdefg" #=> -1 02322 * "abcdef" <=> "ABCDEF" #=> 1 02323 */ 02324 02325 static VALUE 02326 rb_str_cmp_m(VALUE str1, VALUE str2) 02327 { 02328 long result; 02329 02330 if (TYPE(str2) != T_STRING) { 02331 if (!rb_respond_to(str2, rb_intern("to_str"))) { 02332 return Qnil; 02333 } 02334 else if (!rb_respond_to(str2, rb_intern("<=>"))) { 02335 return Qnil; 02336 } 02337 else { 02338 VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1); 02339 02340 if (NIL_P(tmp)) return Qnil; 02341 if (!FIXNUM_P(tmp)) { 02342 return rb_funcall(LONG2FIX(0), '-', 1, tmp); 02343 } 02344 result = -FIX2LONG(tmp); 02345 } 02346 } 02347 else { 02348 result = rb_str_cmp(str1, str2); 02349 } 02350 return LONG2NUM(result); 02351 } 02352 02353 /* 02354 * call-seq: 02355 * str.casecmp(other_str) -> -1, 0, +1 or nil 02356 * 02357 * Case-insensitive version of <code>String#<=></code>. 02358 * 02359 * "abcdef".casecmp("abcde") #=> 1 02360 * "aBcDeF".casecmp("abcdef") #=> 0 02361 * "abcdef".casecmp("abcdefg") #=> -1 02362 * "abcdef".casecmp("ABCDEF") #=> 0 02363 */ 02364 02365 static VALUE 02366 rb_str_casecmp(VALUE str1, VALUE str2) 02367 { 02368 long len; 02369 rb_encoding *enc; 02370 char *p1, *p1end, *p2, *p2end; 02371 02372 StringValue(str2); 02373 enc = rb_enc_compatible(str1, str2); 02374 if (!enc) { 02375 return Qnil; 02376 } 02377 02378 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1); 02379 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2); 02380 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) { 02381 while (p1 < p1end && p2 < p2end) { 02382 if (*p1 != *p2) { 02383 unsigned int c1 = TOUPPER(*p1 & 0xff); 02384 unsigned int c2 = TOUPPER(*p2 & 0xff); 02385 if (c1 != c2) 02386 return INT2FIX(c1 < c2 ? 
-1 : 1); 02387 } 02388 p1++; 02389 p2++; 02390 } 02391 } 02392 else { 02393 while (p1 < p1end && p2 < p2end) { 02394 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc); 02395 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc); 02396 02397 if (0 <= c1 && 0 <= c2) { 02398 c1 = TOUPPER(c1); 02399 c2 = TOUPPER(c2); 02400 if (c1 != c2) 02401 return INT2FIX(c1 < c2 ? -1 : 1); 02402 } 02403 else { 02404 int r; 02405 l1 = rb_enc_mbclen(p1, p1end, enc); 02406 l2 = rb_enc_mbclen(p2, p2end, enc); 02407 len = l1 < l2 ? l1 : l2; 02408 r = memcmp(p1, p2, len); 02409 if (r != 0) 02410 return INT2FIX(r < 0 ? -1 : 1); 02411 if (l1 != l2) 02412 return INT2FIX(l1 < l2 ? -1 : 1); 02413 } 02414 p1 += l1; 02415 p2 += l2; 02416 } 02417 } 02418 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0); 02419 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1); 02420 return INT2FIX(-1); 02421 } 02422 02423 static long 02424 rb_str_index(VALUE str, VALUE sub, long offset) 02425 { 02426 long pos; 02427 char *s, *sptr, *e; 02428 long len, slen; 02429 rb_encoding *enc; 02430 02431 enc = rb_enc_check(str, sub); 02432 if (is_broken_string(sub)) { 02433 return -1; 02434 } 02435 len = str_strlen(str, enc); 02436 slen = str_strlen(sub, enc); 02437 if (offset < 0) { 02438 offset += len; 02439 if (offset < 0) return -1; 02440 } 02441 if (len - offset < slen) return -1; 02442 s = RSTRING_PTR(str); 02443 e = s + RSTRING_LEN(str); 02444 if (offset) { 02445 offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str)); 02446 s += offset; 02447 } 02448 if (slen == 0) return offset; 02449 /* need proceed one character at a time */ 02450 sptr = RSTRING_PTR(sub); 02451 slen = RSTRING_LEN(sub); 02452 len = RSTRING_LEN(str) - offset; 02453 for (;;) { 02454 char *t; 02455 pos = rb_memsearch(sptr, slen, s, len, enc); 02456 if (pos < 0) return pos; 02457 t = rb_enc_right_char_head(s, s+pos, e, enc); 02458 if (t == s + pos) break; 02459 if ((len -= t - s) <= 0) return -1; 02460 offset 
+= t - s; 02461 s = t; 02462 } 02463 return pos + offset; 02464 } 02465 02466 02467 /* 02468 * call-seq: 02469 * str.index(substring [, offset]) -> fixnum or nil 02470 * str.index(regexp [, offset]) -> fixnum or nil 02471 * 02472 * Returns the index of the first occurrence of the given <i>substring</i> or 02473 * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not 02474 * found. If the second parameter is present, it specifies the position in the 02475 * string to begin the search. 02476 * 02477 * "hello".index('e') #=> 1 02478 * "hello".index('lo') #=> 3 02479 * "hello".index('a') #=> nil 02480 * "hello".index(?e) #=> 1 02481 * "hello".index(/[aeiou]/, -3) #=> 4 02482 */ 02483 02484 static VALUE 02485 rb_str_index_m(int argc, VALUE *argv, VALUE str) 02486 { 02487 VALUE sub; 02488 VALUE initpos; 02489 long pos; 02490 02491 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) { 02492 pos = NUM2LONG(initpos); 02493 } 02494 else { 02495 pos = 0; 02496 } 02497 if (pos < 0) { 02498 pos += str_strlen(str, STR_ENC_GET(str)); 02499 if (pos < 0) { 02500 if (TYPE(sub) == T_REGEXP) { 02501 rb_backref_set(Qnil); 02502 } 02503 return Qnil; 02504 } 02505 } 02506 02507 switch (TYPE(sub)) { 02508 case T_REGEXP: 02509 if (pos > str_strlen(str, STR_ENC_GET(str))) 02510 return Qnil; 02511 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 02512 rb_enc_check(str, sub), single_byte_optimizable(str)); 02513 02514 pos = rb_reg_search(sub, str, pos, 0); 02515 pos = rb_str_sublen(str, pos); 02516 break; 02517 02518 default: { 02519 VALUE tmp; 02520 02521 tmp = rb_check_string_type(sub); 02522 if (NIL_P(tmp)) { 02523 rb_raise(rb_eTypeError, "type mismatch: %s given", 02524 rb_obj_classname(sub)); 02525 } 02526 sub = tmp; 02527 } 02528 /* fall through */ 02529 case T_STRING: 02530 pos = rb_str_index(str, sub, pos); 02531 pos = rb_str_sublen(str, pos); 02532 break; 02533 } 02534 02535 if (pos == -1) return Qnil; 02536 return LONG2NUM(pos); 02537 } 02538 02539 
static long 02540 rb_str_rindex(VALUE str, VALUE sub, long pos) 02541 { 02542 long len, slen; 02543 char *s, *sbeg, *e, *t; 02544 rb_encoding *enc; 02545 int singlebyte = single_byte_optimizable(str); 02546 02547 enc = rb_enc_check(str, sub); 02548 if (is_broken_string(sub)) { 02549 return -1; 02550 } 02551 len = str_strlen(str, enc); 02552 slen = str_strlen(sub, enc); 02553 /* substring longer than string */ 02554 if (len < slen) return -1; 02555 if (len - pos < slen) { 02556 pos = len - slen; 02557 } 02558 if (len == 0) { 02559 return pos; 02560 } 02561 sbeg = RSTRING_PTR(str); 02562 e = RSTRING_END(str); 02563 t = RSTRING_PTR(sub); 02564 slen = RSTRING_LEN(sub); 02565 s = str_nth(sbeg, e, pos, enc, singlebyte); 02566 while (s) { 02567 if (memcmp(s, t, slen) == 0) { 02568 return pos; 02569 } 02570 if (pos == 0) break; 02571 pos--; 02572 s = rb_enc_prev_char(sbeg, s, e, enc); 02573 } 02574 return -1; 02575 } 02576 02577 02578 /* 02579 * call-seq: 02580 * str.rindex(substring [, fixnum]) -> fixnum or nil 02581 * str.rindex(regexp [, fixnum]) -> fixnum or nil 02582 * 02583 * Returns the index of the last occurrence of the given <i>substring</i> or 02584 * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not 02585 * found. If the second parameter is present, it specifies the position in the 02586 * string to end the search---characters beyond this point will not be 02587 * considered. 
02588 * 02589 * "hello".rindex('e') #=> 1 02590 * "hello".rindex('l') #=> 3 02591 * "hello".rindex('a') #=> nil 02592 * "hello".rindex(?e) #=> 1 02593 * "hello".rindex(/[aeiou]/, -2) #=> 1 02594 */ 02595 02596 static VALUE 02597 rb_str_rindex_m(int argc, VALUE *argv, VALUE str) 02598 { 02599 VALUE sub; 02600 VALUE vpos; 02601 rb_encoding *enc = STR_ENC_GET(str); 02602 long pos, len = str_strlen(str, enc); 02603 02604 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) { 02605 pos = NUM2LONG(vpos); 02606 if (pos < 0) { 02607 pos += len; 02608 if (pos < 0) { 02609 if (TYPE(sub) == T_REGEXP) { 02610 rb_backref_set(Qnil); 02611 } 02612 return Qnil; 02613 } 02614 } 02615 if (pos > len) pos = len; 02616 } 02617 else { 02618 pos = len; 02619 } 02620 02621 switch (TYPE(sub)) { 02622 case T_REGEXP: 02623 /* enc = rb_get_check(str, sub); */ 02624 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 02625 STR_ENC_GET(str), single_byte_optimizable(str)); 02626 02627 if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) { 02628 pos = rb_reg_search(sub, str, pos, 1); 02629 pos = rb_str_sublen(str, pos); 02630 } 02631 if (pos >= 0) return LONG2NUM(pos); 02632 break; 02633 02634 default: { 02635 VALUE tmp; 02636 02637 tmp = rb_check_string_type(sub); 02638 if (NIL_P(tmp)) { 02639 rb_raise(rb_eTypeError, "type mismatch: %s given", 02640 rb_obj_classname(sub)); 02641 } 02642 sub = tmp; 02643 } 02644 /* fall through */ 02645 case T_STRING: 02646 pos = rb_str_rindex(str, sub, pos); 02647 if (pos >= 0) return LONG2NUM(pos); 02648 break; 02649 } 02650 return Qnil; 02651 } 02652 02653 /* 02654 * call-seq: 02655 * str =~ obj -> fixnum or nil 02656 * 02657 * Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match 02658 * against <i>str</i>,and returns the position the match starts, or 02659 * <code>nil</code> if there is no match. Otherwise, invokes 02660 * <i>obj.=~</i>, passing <i>str</i> as an argument. 
The default 02661 * <code>=~</code> in <code>Object</code> returns <code>nil</code>. 02662 * 02663 * "cat o' 9 tails" =~ /\d/ #=> 7 02664 * "cat o' 9 tails" =~ 9 #=> nil 02665 */ 02666 02667 static VALUE 02668 rb_str_match(VALUE x, VALUE y) 02669 { 02670 switch (TYPE(y)) { 02671 case T_STRING: 02672 rb_raise(rb_eTypeError, "type mismatch: String given"); 02673 02674 case T_REGEXP: 02675 return rb_reg_match(y, x); 02676 02677 default: 02678 return rb_funcall(y, rb_intern("=~"), 1, x); 02679 } 02680 } 02681 02682 02683 static VALUE get_pat(VALUE, int); 02684 02685 02686 /* 02687 * call-seq: 02688 * str.match(pattern) -> matchdata or nil 02689 * str.match(pattern, pos) -> matchdata or nil 02690 * 02691 * Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one), 02692 * then invokes its <code>match</code> method on <i>str</i>. If the second 02693 * parameter is present, it specifies the position in the string to begin the 02694 * search. 02695 * 02696 * 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l"> 02697 * 'hello'.match('(.)\1')[0] #=> "ll" 02698 * 'hello'.match(/(.)\1/)[0] #=> "ll" 02699 * 'hello'.match('xx') #=> nil 02700 * 02701 * If a block is given, invoke the block with MatchData if match succeed, so 02702 * that you can write 02703 * 02704 * str.match(pat) {|m| ...} 02705 * 02706 * instead of 02707 * 02708 * if m = str.match(pat) 02709 * ... 02710 * end 02711 * 02712 * The return value is a value from block execution in this case. 
02713 */ 02714 02715 static VALUE 02716 rb_str_match_m(int argc, VALUE *argv, VALUE str) 02717 { 02718 VALUE re, result; 02719 if (argc < 1) 02720 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); 02721 re = argv[0]; 02722 argv[0] = str; 02723 result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv); 02724 if (!NIL_P(result) && rb_block_given_p()) { 02725 return rb_yield(result); 02726 } 02727 return result; 02728 } 02729 02730 enum neighbor_char { 02731 NEIGHBOR_NOT_CHAR, 02732 NEIGHBOR_FOUND, 02733 NEIGHBOR_WRAPPED 02734 }; 02735 02736 static enum neighbor_char 02737 enc_succ_char(char *p, long len, rb_encoding *enc) 02738 { 02739 long i; 02740 int l; 02741 while (1) { 02742 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--) 02743 p[i] = '\0'; 02744 if (i < 0) 02745 return NEIGHBOR_WRAPPED; 02746 ++((unsigned char*)p)[i]; 02747 l = rb_enc_precise_mbclen(p, p+len, enc); 02748 if (MBCLEN_CHARFOUND_P(l)) { 02749 l = MBCLEN_CHARFOUND_LEN(l); 02750 if (l == len) { 02751 return NEIGHBOR_FOUND; 02752 } 02753 else { 02754 memset(p+l, 0xff, len-l); 02755 } 02756 } 02757 if (MBCLEN_INVALID_P(l) && i < len-1) { 02758 long len2; 02759 int l2; 02760 for (len2 = len-1; 0 < len2; len2--) { 02761 l2 = rb_enc_precise_mbclen(p, p+len2, enc); 02762 if (!MBCLEN_INVALID_P(l2)) 02763 break; 02764 } 02765 memset(p+len2+1, 0xff, len-(len2+1)); 02766 } 02767 } 02768 } 02769 02770 static enum neighbor_char 02771 enc_pred_char(char *p, long len, rb_encoding *enc) 02772 { 02773 long i; 02774 int l; 02775 while (1) { 02776 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--) 02777 p[i] = '\xff'; 02778 if (i < 0) 02779 return NEIGHBOR_WRAPPED; 02780 --((unsigned char*)p)[i]; 02781 l = rb_enc_precise_mbclen(p, p+len, enc); 02782 if (MBCLEN_CHARFOUND_P(l)) { 02783 l = MBCLEN_CHARFOUND_LEN(l); 02784 if (l == len) { 02785 return NEIGHBOR_FOUND; 02786 } 02787 else { 02788 memset(p+l, 0, len-l); 02789 } 02790 } 02791 if (MBCLEN_INVALID_P(l) && i < 
len-1) { 02792 long len2; 02793 int l2; 02794 for (len2 = len-1; 0 < len2; len2--) { 02795 l2 = rb_enc_precise_mbclen(p, p+len2, enc); 02796 if (!MBCLEN_INVALID_P(l2)) 02797 break; 02798 } 02799 memset(p+len2+1, 0, len-(len2+1)); 02800 } 02801 } 02802 } 02803 02804 /* 02805 overwrite +p+ by succeeding letter in +enc+ and returns 02806 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED. 02807 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry. 02808 assuming each ranges are successive, and mbclen 02809 never change in each ranges. 02810 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one 02811 character. 02812 */ 02813 static enum neighbor_char 02814 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry) 02815 { 02816 enum neighbor_char ret; 02817 unsigned int c; 02818 int ctype; 02819 int range; 02820 char save[ONIGENC_CODE_TO_MBC_MAXLEN]; 02821 02822 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 02823 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc)) 02824 ctype = ONIGENC_CTYPE_DIGIT; 02825 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc)) 02826 ctype = ONIGENC_CTYPE_ALPHA; 02827 else 02828 return NEIGHBOR_NOT_CHAR; 02829 02830 MEMCPY(save, p, char, len); 02831 ret = enc_succ_char(p, len, enc); 02832 if (ret == NEIGHBOR_FOUND) { 02833 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 02834 if (rb_enc_isctype(c, ctype, enc)) 02835 return NEIGHBOR_FOUND; 02836 } 02837 MEMCPY(p, save, char, len); 02838 range = 1; 02839 while (1) { 02840 MEMCPY(save, p, char, len); 02841 ret = enc_pred_char(p, len, enc); 02842 if (ret == NEIGHBOR_FOUND) { 02843 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 02844 if (!rb_enc_isctype(c, ctype, enc)) { 02845 MEMCPY(p, save, char, len); 02846 break; 02847 } 02848 } 02849 else { 02850 MEMCPY(p, save, char, len); 02851 break; 02852 } 02853 range++; 02854 } 02855 if (range == 1) { 02856 return NEIGHBOR_NOT_CHAR; 02857 } 02858 02859 if (ctype != ONIGENC_CTYPE_DIGIT) { 02860 MEMCPY(carry, p, char, len); 02861 
return NEIGHBOR_WRAPPED; 02862 } 02863 02864 MEMCPY(carry, p, char, len); 02865 enc_succ_char(carry, len, enc); 02866 return NEIGHBOR_WRAPPED; 02867 } 02868 02869 02870 /* 02871 * call-seq: 02872 * str.succ -> new_str 02873 * str.next -> new_str 02874 * 02875 * Returns the successor to <i>str</i>. The successor is calculated by 02876 * incrementing characters starting from the rightmost alphanumeric (or 02877 * the rightmost character if there are no alphanumerics) in the 02878 * string. Incrementing a digit always results in another digit, and 02879 * incrementing a letter results in another letter of the same case. 02880 * Incrementing nonalphanumerics uses the underlying character set's 02881 * collating sequence. 02882 * 02883 * If the increment generates a ``carry,'' the character to the left of 02884 * it is incremented. This process repeats until there is no carry, 02885 * adding an additional character if necessary. 02886 * 02887 * "abcd".succ #=> "abce" 02888 * "THX1138".succ #=> "THX1139" 02889 * "<<koala>>".succ #=> "<<koalb>>" 02890 * "1999zzz".succ #=> "2000aaa" 02891 * "ZZZ9999".succ #=> "AAAA0000" 02892 * "***".succ #=> "**+" 02893 */ 02894 02895 VALUE 02896 rb_str_succ(VALUE orig) 02897 { 02898 rb_encoding *enc; 02899 VALUE str; 02900 char *sbeg, *s, *e, *last_alnum = 0; 02901 int c = -1; 02902 long l; 02903 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1"; 02904 long carry_pos = 0, carry_len = 1; 02905 enum neighbor_char neighbor = NEIGHBOR_FOUND; 02906 02907 str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig)); 02908 rb_enc_cr_str_copy_for_substr(str, orig); 02909 OBJ_INFECT(str, orig); 02910 if (RSTRING_LEN(str) == 0) return str; 02911 02912 enc = STR_ENC_GET(orig); 02913 sbeg = RSTRING_PTR(str); 02914 s = e = sbeg + RSTRING_LEN(str); 02915 02916 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) { 02917 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) { 02918 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) : 02919 ISDIGIT(*last_alnum) ? 
ISALPHA(*s) : 0) { 02920 s = last_alnum; 02921 break; 02922 } 02923 } 02924 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; 02925 neighbor = enc_succ_alnum_char(s, l, enc, carry); 02926 switch (neighbor) { 02927 case NEIGHBOR_NOT_CHAR: 02928 continue; 02929 case NEIGHBOR_FOUND: 02930 return str; 02931 case NEIGHBOR_WRAPPED: 02932 last_alnum = s; 02933 break; 02934 } 02935 c = 1; 02936 carry_pos = s - sbeg; 02937 carry_len = l; 02938 } 02939 if (c == -1) { /* str contains no alnum */ 02940 s = e; 02941 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) { 02942 enum neighbor_char neighbor; 02943 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; 02944 neighbor = enc_succ_char(s, l, enc); 02945 if (neighbor == NEIGHBOR_FOUND) 02946 return str; 02947 if (rb_enc_precise_mbclen(s, s+l, enc) != l) { 02948 /* wrapped to \0...\0. search next valid char. */ 02949 enc_succ_char(s, l, enc); 02950 } 02951 if (!rb_enc_asciicompat(enc)) { 02952 MEMCPY(carry, s, char, l); 02953 carry_len = l; 02954 } 02955 carry_pos = s - sbeg; 02956 } 02957 } 02958 RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len); 02959 s = RSTRING_PTR(str) + carry_pos; 02960 memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos); 02961 memmove(s, carry, carry_len); 02962 STR_SET_LEN(str, RSTRING_LEN(str) + carry_len); 02963 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 02964 rb_enc_str_coderange(str); 02965 return str; 02966 } 02967 02968 02969 /* 02970 * call-seq: 02971 * str.succ! -> str 02972 * str.next! -> str 02973 * 02974 * Equivalent to <code>String#succ</code>, but modifies the receiver in 02975 * place. 
02976 */ 02977 02978 static VALUE 02979 rb_str_succ_bang(VALUE str) 02980 { 02981 rb_str_shared_replace(str, rb_str_succ(str)); 02982 02983 return str; 02984 } 02985 02986 02987 /* 02988 * call-seq: 02989 * str.upto(other_str, exclusive=false) {|s| block } -> str 02990 * str.upto(other_str, exclusive=false) -> an_enumerator 02991 * 02992 * Iterates through successive values, starting at <i>str</i> and 02993 * ending at <i>other_str</i> inclusive, passing each value in turn to 02994 * the block. The <code>String#succ</code> method is used to generate 02995 * each value. If optional second argument exclusive is omitted or is false, 02996 * the last value will be included; otherwise it will be excluded. 02997 * 02998 * If no block is given, an enumerator is returned instead. 02999 * 03000 * "a8".upto("b6") {|s| print s, ' ' } 03001 * for s in "a8".."b6" 03002 * print s, ' ' 03003 * end 03004 * 03005 * <em>produces:</em> 03006 * 03007 * a8 a9 b0 b1 b2 b3 b4 b5 b6 03008 * a8 a9 b0 b1 b2 b3 b4 b5 b6 03009 * 03010 * If <i>str</i> and <i>other_str</i> contains only ascii numeric characters, 03011 * both are recognized as decimal numbers. In addition, the width of 03012 * string (e.g. leading zeros) is handled appropriately. 
03013 * 03014 * "9".upto("11").to_a #=> ["9", "10", "11"] 03015 * "25".upto("5").to_a #=> [] 03016 * "07".upto("11").to_a #=> ["07", "08", "09", "10", "11"] 03017 */ 03018 03019 static VALUE 03020 rb_str_upto(int argc, VALUE *argv, VALUE beg) 03021 { 03022 VALUE end, exclusive; 03023 VALUE current, after_end; 03024 ID succ; 03025 int n, excl, ascii; 03026 rb_encoding *enc; 03027 03028 rb_scan_args(argc, argv, "11", &end, &exclusive); 03029 RETURN_ENUMERATOR(beg, argc, argv); 03030 excl = RTEST(exclusive); 03031 CONST_ID(succ, "succ"); 03032 StringValue(end); 03033 enc = rb_enc_check(beg, end); 03034 ascii = (is_ascii_string(beg) && is_ascii_string(end)); 03035 /* single character */ 03036 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) { 03037 char c = RSTRING_PTR(beg)[0]; 03038 char e = RSTRING_PTR(end)[0]; 03039 03040 if (c > e || (excl && c == e)) return beg; 03041 for (;;) { 03042 rb_yield(rb_enc_str_new(&c, 1, enc)); 03043 if (!excl && c == e) break; 03044 c++; 03045 if (excl && c == e) break; 03046 } 03047 return beg; 03048 } 03049 /* both edges are all digits */ 03050 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) { 03051 char *s, *send; 03052 VALUE b, e; 03053 int width; 03054 03055 s = RSTRING_PTR(beg); send = RSTRING_END(beg); 03056 width = rb_long2int(send - s); 03057 while (s < send) { 03058 if (!ISDIGIT(*s)) goto no_digits; 03059 s++; 03060 } 03061 s = RSTRING_PTR(end); send = RSTRING_END(end); 03062 while (s < send) { 03063 if (!ISDIGIT(*s)) goto no_digits; 03064 s++; 03065 } 03066 b = rb_str_to_inum(beg, 10, FALSE); 03067 e = rb_str_to_inum(end, 10, FALSE); 03068 if (FIXNUM_P(b) && FIXNUM_P(e)) { 03069 long bi = FIX2LONG(b); 03070 long ei = FIX2LONG(e); 03071 rb_encoding *usascii = rb_usascii_encoding(); 03072 03073 while (bi <= ei) { 03074 if (excl && bi == ei) break; 03075 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi)); 03076 bi++; 03077 } 03078 } 03079 else { 03080 ID op = excl ? 
'<' : rb_intern("<="); 03081 VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d")); 03082 03083 args[0] = INT2FIX(width); 03084 while (rb_funcall(b, op, 1, e)) { 03085 args[1] = b; 03086 rb_yield(rb_str_format(numberof(args), args, fmt)); 03087 b = rb_funcall(b, succ, 0, 0); 03088 } 03089 } 03090 return beg; 03091 } 03092 /* normal case */ 03093 no_digits: 03094 n = rb_str_cmp(beg, end); 03095 if (n > 0 || (excl && n == 0)) return beg; 03096 03097 after_end = rb_funcall(end, succ, 0, 0); 03098 current = rb_str_dup(beg); 03099 while (!rb_str_equal(current, after_end)) { 03100 VALUE next = Qnil; 03101 if (excl || !rb_str_equal(current, end)) 03102 next = rb_funcall(current, succ, 0, 0); 03103 rb_yield(current); 03104 if (NIL_P(next)) break; 03105 current = next; 03106 StringValue(current); 03107 if (excl && rb_str_equal(current, end)) break; 03108 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0) 03109 break; 03110 } 03111 03112 return beg; 03113 } 03114 03115 static VALUE 03116 rb_str_subpat(VALUE str, VALUE re, VALUE backref) 03117 { 03118 if (rb_reg_search(re, str, 0, 0) >= 0) { 03119 VALUE match = rb_backref_get(); 03120 int nth = rb_reg_backref_number(match, backref); 03121 return rb_reg_nth_match(nth, match); 03122 } 03123 return Qnil; 03124 } 03125 03126 static VALUE 03127 rb_str_aref(VALUE str, VALUE indx) 03128 { 03129 long idx; 03130 03131 switch (TYPE(indx)) { 03132 case T_FIXNUM: 03133 idx = FIX2LONG(indx); 03134 03135 num_index: 03136 str = rb_str_substr(str, idx, 1); 03137 if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil; 03138 return str; 03139 03140 case T_REGEXP: 03141 return rb_str_subpat(str, indx, INT2FIX(0)); 03142 03143 case T_STRING: 03144 if (rb_str_index(str, indx, 0) != -1) 03145 return rb_str_dup(indx); 03146 return Qnil; 03147 03148 default: 03149 /* check if indx is Range */ 03150 { 03151 long beg, len; 03152 VALUE tmp; 03153 03154 len = str_strlen(str, STR_ENC_GET(str)); 03155 switch 
(rb_range_beg_len(indx, &beg, &len, len, 0)) { 03156 case Qfalse: 03157 break; 03158 case Qnil: 03159 return Qnil; 03160 default: 03161 tmp = rb_str_substr(str, beg, len); 03162 return tmp; 03163 } 03164 } 03165 idx = NUM2LONG(indx); 03166 goto num_index; 03167 } 03168 return Qnil; /* not reached */ 03169 } 03170 03171 03172 /* 03173 * call-seq: 03174 * str[fixnum] -> new_str or nil 03175 * str[fixnum, fixnum] -> new_str or nil 03176 * str[range] -> new_str or nil 03177 * str[regexp] -> new_str or nil 03178 * str[regexp, fixnum] -> new_str or nil 03179 * str[other_str] -> new_str or nil 03180 * str.slice(fixnum) -> new_str or nil 03181 * str.slice(fixnum, fixnum) -> new_str or nil 03182 * str.slice(range) -> new_str or nil 03183 * str.slice(regexp) -> new_str or nil 03184 * str.slice(regexp, fixnum) -> new_str or nil 03185 * str.slice(regexp, capname) -> new_str or nil 03186 * str.slice(other_str) -> new_str or nil 03187 * 03188 * Element Reference---If passed a single <code>Fixnum</code>, returns a 03189 * substring of one character at that position. If passed two <code>Fixnum</code> 03190 * objects, returns a substring starting at the offset given by the first, and 03191 * with a length given by the second. If passed a range, its beginning and end 03192 * are interpreted as offsets delimiting the substring to be returned. In all 03193 * three cases, if an offset is negative, it is counted from the end of <i>str</i>. 03194 * Returns <code>nil</code> if the initial offset falls outside the string or 03195 * the length is negative. 03196 * 03197 * If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is 03198 * returned. If a numeric or name parameter follows the regular expression, that 03199 * component of the <code>MatchData</code> is returned instead. If a 03200 * <code>String</code> is given, that string is returned if it occurs in 03201 * <i>str</i>. In both cases, <code>nil</code> is returned if there is no 03202 * match. 
03203 * 03204 * a = "hello there" 03205 * a[1] #=> "e" 03206 * a[2, 3] #=> "llo" 03207 * a[2..3] #=> "ll" 03208 * a[-3, 2] #=> "er" 03209 * a[7..-2] #=> "her" 03210 * a[-4..-2] #=> "her" 03211 * a[-2..-4] #=> "" 03212 * a[12..-1] #=> nil 03213 * a[/[aeiou](.)\1/] #=> "ell" 03214 * a[/[aeiou](.)\1/, 0] #=> "ell" 03215 * a[/[aeiou](.)\1/, 1] #=> "l" 03216 * a[/[aeiou](.)\1/, 2] #=> nil 03217 * a["lo"] #=> "lo" 03218 * a["bye"] #=> nil 03219 */ 03220 03221 static VALUE 03222 rb_str_aref_m(int argc, VALUE *argv, VALUE str) 03223 { 03224 if (argc == 2) { 03225 if (TYPE(argv[0]) == T_REGEXP) { 03226 return rb_str_subpat(str, argv[0], argv[1]); 03227 } 03228 return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1])); 03229 } 03230 if (argc != 1) { 03231 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); 03232 } 03233 return rb_str_aref(str, argv[0]); 03234 } 03235 03236 VALUE 03237 rb_str_drop_bytes(VALUE str, long len) 03238 { 03239 char *ptr = RSTRING_PTR(str); 03240 long olen = RSTRING_LEN(str), nlen; 03241 03242 str_modifiable(str); 03243 if (len > olen) len = olen; 03244 nlen = olen - len; 03245 if (nlen <= RSTRING_EMBED_LEN_MAX) { 03246 char *oldptr = ptr; 03247 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED)); 03248 STR_SET_EMBED(str); 03249 STR_SET_EMBED_LEN(str, nlen); 03250 ptr = RSTRING(str)->as.ary; 03251 memmove(ptr, oldptr + len, nlen); 03252 if (fl == STR_NOEMBED) xfree(oldptr); 03253 } 03254 else { 03255 if (!STR_SHARED_P(str)) rb_str_new4(str); 03256 ptr = RSTRING(str)->as.heap.ptr += len; 03257 RSTRING(str)->as.heap.len = nlen; 03258 } 03259 ptr[nlen] = 0; 03260 ENC_CODERANGE_CLEAR(str); 03261 return str; 03262 } 03263 03264 static void 03265 rb_str_splice_0(VALUE str, long beg, long len, VALUE val) 03266 { 03267 if (beg == 0 && RSTRING_LEN(val) == 0) { 03268 rb_str_drop_bytes(str, len); 03269 OBJ_INFECT(str, val); 03270 return; 03271 } 03272 03273 rb_str_modify(str); 03274 if (len < RSTRING_LEN(val)) { 03275 /* 
expand string */ 03276 RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1); 03277 } 03278 03279 if (RSTRING_LEN(val) != len) { 03280 memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val), 03281 RSTRING_PTR(str) + beg + len, 03282 RSTRING_LEN(str) - (beg + len)); 03283 } 03284 if (RSTRING_LEN(val) < beg && len < 0) { 03285 MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len); 03286 } 03287 if (RSTRING_LEN(val) > 0) { 03288 memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val)); 03289 } 03290 STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len); 03291 if (RSTRING_PTR(str)) { 03292 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 03293 } 03294 OBJ_INFECT(str, val); 03295 } 03296 03297 static void 03298 rb_str_splice(VALUE str, long beg, long len, VALUE val) 03299 { 03300 long slen; 03301 char *p, *e; 03302 rb_encoding *enc; 03303 int singlebyte = single_byte_optimizable(str); 03304 int cr; 03305 03306 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len); 03307 03308 StringValue(val); 03309 enc = rb_enc_check(str, val); 03310 slen = str_strlen(str, enc); 03311 03312 if (slen < beg) { 03313 out_of_range: 03314 rb_raise(rb_eIndexError, "index %ld out of string", beg); 03315 } 03316 if (beg < 0) { 03317 if (-beg > slen) { 03318 goto out_of_range; 03319 } 03320 beg += slen; 03321 } 03322 if (slen < len || slen < beg + len) { 03323 len = slen - beg; 03324 } 03325 str_modify_keep_cr(str); 03326 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte); 03327 if (!p) p = RSTRING_END(str); 03328 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte); 03329 if (!e) e = RSTRING_END(str); 03330 /* error check */ 03331 beg = p - RSTRING_PTR(str); /* physical position */ 03332 len = e - p; /* physical length */ 03333 rb_str_splice_0(str, beg, len, val); 03334 rb_enc_associate(str, enc); 03335 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val)); 03336 if (cr != ENC_CODERANGE_BROKEN) 03337 ENC_CODERANGE_SET(str, cr); 03338 } 
03339 03340 void 03341 rb_str_update(VALUE str, long beg, long len, VALUE val) 03342 { 03343 rb_str_splice(str, beg, len, val); 03344 } 03345 03346 static void 03347 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val) 03348 { 03349 int nth; 03350 VALUE match; 03351 long start, end, len; 03352 rb_encoding *enc; 03353 struct re_registers *regs; 03354 03355 if (rb_reg_search(re, str, 0, 0) < 0) { 03356 rb_raise(rb_eIndexError, "regexp not matched"); 03357 } 03358 match = rb_backref_get(); 03359 nth = rb_reg_backref_number(match, backref); 03360 regs = RMATCH_REGS(match); 03361 if (nth >= regs->num_regs) { 03362 out_of_range: 03363 rb_raise(rb_eIndexError, "index %d out of regexp", nth); 03364 } 03365 if (nth < 0) { 03366 if (-nth >= regs->num_regs) { 03367 goto out_of_range; 03368 } 03369 nth += regs->num_regs; 03370 } 03371 03372 start = BEG(nth); 03373 if (start == -1) { 03374 rb_raise(rb_eIndexError, "regexp group %d not matched", nth); 03375 } 03376 end = END(nth); 03377 len = end - start; 03378 StringValue(val); 03379 enc = rb_enc_check(str, val); 03380 rb_str_splice_0(str, start, len, val); 03381 rb_enc_associate(str, enc); 03382 } 03383 03384 static VALUE 03385 rb_str_aset(VALUE str, VALUE indx, VALUE val) 03386 { 03387 long idx, beg; 03388 03389 switch (TYPE(indx)) { 03390 case T_FIXNUM: 03391 idx = FIX2LONG(indx); 03392 num_index: 03393 rb_str_splice(str, idx, 1, val); 03394 return val; 03395 03396 case T_REGEXP: 03397 rb_str_subpat_set(str, indx, INT2FIX(0), val); 03398 return val; 03399 03400 case T_STRING: 03401 beg = rb_str_index(str, indx, 0); 03402 if (beg < 0) { 03403 rb_raise(rb_eIndexError, "string not matched"); 03404 } 03405 beg = rb_str_sublen(str, beg); 03406 rb_str_splice(str, beg, str_strlen(indx, 0), val); 03407 return val; 03408 03409 default: 03410 /* check if indx is Range */ 03411 { 03412 long beg, len; 03413 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) { 03414 rb_str_splice(str, beg, len, val); 03415 return 
val; 03416 } 03417 } 03418 idx = NUM2LONG(indx); 03419 goto num_index; 03420 } 03421 } 03422 03423 /* 03424 * call-seq: 03425 * str[fixnum] = new_str 03426 * str[fixnum, fixnum] = new_str 03427 * str[range] = aString 03428 * str[regexp] = new_str 03429 * str[regexp, fixnum] = new_str 03430 * str[regexp, name] = new_str 03431 * str[other_str] = new_str 03432 * 03433 * Element Assignment---Replaces some or all of the content of <i>str</i>. The 03434 * portion of the string affected is determined using the same criteria as 03435 * <code>String#[]</code>. If the replacement string is not the same length as 03436 * the text it is replacing, the string will be adjusted accordingly. If the 03437 * regular expression or string is used as the index doesn't match a position 03438 * in the string, <code>IndexError</code> is raised. If the regular expression 03439 * form is used, the optional second <code>Fixnum</code> allows you to specify 03440 * which portion of the match to replace (effectively using the 03441 * <code>MatchData</code> indexing rules. The forms that take a 03442 * <code>Fixnum</code> will raise an <code>IndexError</code> if the value is 03443 * out of range; the <code>Range</code> form will raise a 03444 * <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code> 03445 * forms will silently ignore the assignment. 
03446 */ 03447 03448 static VALUE 03449 rb_str_aset_m(int argc, VALUE *argv, VALUE str) 03450 { 03451 if (argc == 3) { 03452 if (TYPE(argv[0]) == T_REGEXP) { 03453 rb_str_subpat_set(str, argv[0], argv[1], argv[2]); 03454 } 03455 else { 03456 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]); 03457 } 03458 return argv[2]; 03459 } 03460 if (argc != 2) { 03461 rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc); 03462 } 03463 return rb_str_aset(str, argv[0], argv[1]); 03464 } 03465 03466 /* 03467 * call-seq: 03468 * str.insert(index, other_str) -> str 03469 * 03470 * Inserts <i>other_str</i> before the character at the given 03471 * <i>index</i>, modifying <i>str</i>. Negative indices count from the 03472 * end of the string, and insert <em>after</em> the given character. 03473 * The intent is insert <i>aString</i> so that it starts at the given 03474 * <i>index</i>. 03475 * 03476 * "abcd".insert(0, 'X') #=> "Xabcd" 03477 * "abcd".insert(3, 'X') #=> "abcXd" 03478 * "abcd".insert(4, 'X') #=> "abcdX" 03479 * "abcd".insert(-3, 'X') #=> "abXcd" 03480 * "abcd".insert(-1, 'X') #=> "abcdX" 03481 */ 03482 03483 static VALUE 03484 rb_str_insert(VALUE str, VALUE idx, VALUE str2) 03485 { 03486 long pos = NUM2LONG(idx); 03487 03488 if (pos == -1) { 03489 return rb_str_append(str, str2); 03490 } 03491 else if (pos < 0) { 03492 pos++; 03493 } 03494 rb_str_splice(str, pos, 0, str2); 03495 return str; 03496 } 03497 03498 03499 /* 03500 * call-seq: 03501 * str.slice!(fixnum) -> fixnum or nil 03502 * str.slice!(fixnum, fixnum) -> new_str or nil 03503 * str.slice!(range) -> new_str or nil 03504 * str.slice!(regexp) -> new_str or nil 03505 * str.slice!(other_str) -> new_str or nil 03506 * 03507 * Deletes the specified portion from <i>str</i>, and returns the portion 03508 * deleted. 
03509 * 03510 * string = "this is a string" 03511 * string.slice!(2) #=> "i" 03512 * string.slice!(3..6) #=> " is " 03513 * string.slice!(/s.*t/) #=> "sa st" 03514 * string.slice!("r") #=> "r" 03515 * string #=> "thing" 03516 */ 03517 03518 static VALUE 03519 rb_str_slice_bang(int argc, VALUE *argv, VALUE str) 03520 { 03521 VALUE result; 03522 VALUE buf[3]; 03523 int i; 03524 03525 if (argc < 1 || 2 < argc) { 03526 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); 03527 } 03528 for (i=0; i<argc; i++) { 03529 buf[i] = argv[i]; 03530 } 03531 str_modify_keep_cr(str); 03532 result = rb_str_aref_m(argc, buf, str); 03533 if (!NIL_P(result)) { 03534 buf[i] = rb_str_new(0,0); 03535 rb_str_aset_m(argc+1, buf, str); 03536 } 03537 return result; 03538 } 03539 03540 static VALUE 03541 get_pat(VALUE pat, int quote) 03542 { 03543 VALUE val; 03544 03545 switch (TYPE(pat)) { 03546 case T_REGEXP: 03547 return pat; 03548 03549 case T_STRING: 03550 break; 03551 03552 default: 03553 val = rb_check_string_type(pat); 03554 if (NIL_P(val)) { 03555 Check_Type(pat, T_REGEXP); 03556 } 03557 pat = val; 03558 } 03559 03560 if (quote) { 03561 pat = rb_reg_quote(pat); 03562 } 03563 03564 return rb_reg_regcomp(pat); 03565 } 03566 03567 03568 /* 03569 * call-seq: 03570 * str.sub!(pattern, replacement) -> str or nil 03571 * str.sub!(pattern) {|match| block } -> str or nil 03572 * 03573 * Performs the substitutions of <code>String#sub</code> in place, 03574 * returning <i>str</i>, or <code>nil</code> if no substitutions were 03575 * performed. 
03576 */ 03577 03578 static VALUE 03579 rb_str_sub_bang(int argc, VALUE *argv, VALUE str) 03580 { 03581 VALUE pat, repl, hash = Qnil; 03582 int iter = 0; 03583 int tainted = 0; 03584 int untrusted = 0; 03585 long plen; 03586 03587 if (argc == 1 && rb_block_given_p()) { 03588 iter = 1; 03589 } 03590 else if (argc == 2) { 03591 repl = argv[1]; 03592 hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash"); 03593 if (NIL_P(hash)) { 03594 StringValue(repl); 03595 } 03596 if (OBJ_TAINTED(repl)) tainted = 1; 03597 if (OBJ_UNTRUSTED(repl)) untrusted = 1; 03598 } 03599 else { 03600 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); 03601 } 03602 03603 pat = get_pat(argv[0], 1); 03604 str_modifiable(str); 03605 if (rb_reg_search(pat, str, 0, 0) >= 0) { 03606 rb_encoding *enc; 03607 int cr = ENC_CODERANGE(str); 03608 VALUE match = rb_backref_get(); 03609 struct re_registers *regs = RMATCH_REGS(match); 03610 long beg0 = BEG(0); 03611 long end0 = END(0); 03612 char *p, *rp; 03613 long len, rlen; 03614 03615 if (iter || !NIL_P(hash)) { 03616 p = RSTRING_PTR(str); len = RSTRING_LEN(str); 03617 03618 if (iter) { 03619 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match))); 03620 } 03621 else { 03622 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0)); 03623 repl = rb_obj_as_string(repl); 03624 } 03625 str_mod_check(str, p, len); 03626 rb_check_frozen(str); 03627 } 03628 else { 03629 repl = rb_reg_regsub(repl, str, regs, pat); 03630 } 03631 enc = rb_enc_compatible(str, repl); 03632 if (!enc) { 03633 rb_encoding *str_enc = STR_ENC_GET(str); 03634 p = RSTRING_PTR(str); len = RSTRING_LEN(str); 03635 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT || 03636 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) { 03637 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", 03638 rb_enc_name(str_enc), 03639 rb_enc_name(STR_ENC_GET(repl))); 03640 } 03641 enc = STR_ENC_GET(repl); 03642 } 03643 
rb_str_modify(str); 03644 rb_enc_associate(str, enc); 03645 if (OBJ_TAINTED(repl)) tainted = 1; 03646 if (OBJ_UNTRUSTED(repl)) untrusted = 1; 03647 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) { 03648 int cr2 = ENC_CODERANGE(repl); 03649 if (cr2 == ENC_CODERANGE_BROKEN || 03650 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT)) 03651 cr = ENC_CODERANGE_UNKNOWN; 03652 else 03653 cr = cr2; 03654 } 03655 plen = end0 - beg0; 03656 rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl); 03657 len = RSTRING_LEN(str); 03658 if (rlen > plen) { 03659 RESIZE_CAPA(str, len + rlen - plen); 03660 } 03661 p = RSTRING_PTR(str); 03662 if (rlen != plen) { 03663 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen); 03664 } 03665 memcpy(p + beg0, rp, rlen); 03666 len += rlen - plen; 03667 STR_SET_LEN(str, len); 03668 RSTRING_PTR(str)[len] = '\0'; 03669 ENC_CODERANGE_SET(str, cr); 03670 if (tainted) OBJ_TAINT(str); 03671 if (untrusted) OBJ_UNTRUST(str); 03672 03673 return str; 03674 } 03675 return Qnil; 03676 } 03677 03678 03679 /* 03680 * call-seq: 03681 * str.sub(pattern, replacement) -> new_str 03682 * str.sub(pattern, hash) -> new_str 03683 * str.sub(pattern) {|match| block } -> new_str 03684 * 03685 * Returns a copy of <i>str</i> with the <em>first</em> occurrence of 03686 * <i>pattern</i> substituted for the second argument. The <i>pattern</i> is 03687 * typically a <code>Regexp</code>; if given as a <code>String</code>, any 03688 * regular expression metacharacters it contains will be interpreted 03689 * literally, e.g. <code>'\\\d'</code> will match a backlash followed by 'd', 03690 * instead of a digit. 03691 * 03692 * If <i>replacement</i> is a <code>String</code> it will be substituted for 03693 * the matched text. It may contain back-references to the pattern's capture 03694 * groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or 03695 * <code>\\\k<n></code>, where <i>n</i> is a group name. 
If it is a 03696 * double-quoted string, both back-references must be preceded by an 03697 * additional backslash. However, within <i>replacement</i> the special match 03698 * variables, such as <code>$&</code>, will not refer to the current match. 03699 * 03700 * If the second argument is a <code>Hash</code>, and the matched text is one 03701 * of its keys, the corresponding value is the replacement string. 03702 * 03703 * In the block form, the current match string is passed in as a parameter, 03704 * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>, 03705 * <code>$&</code>, and <code>$'</code> will be set appropriately. The value 03706 * returned by the block will be substituted for the match on each call. 03707 * 03708 * The result inherits any tainting in the original string or any supplied 03709 * replacement string. 03710 * 03711 * "hello".sub(/[aeiou]/, '*') #=> "h*llo" 03712 * "hello".sub(/([aeiou])/, '<\1>') #=> "h<e>llo" 03713 * "hello".sub(/./) {|s| s.ord.to_s + ' ' } #=> "104 ello" 03714 * "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*') #=> "h*e*llo" 03715 * 'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV) 03716 * #=> "Is /bin/bash your preferred shell?" 
03717 */ 03718 03719 static VALUE 03720 rb_str_sub(int argc, VALUE *argv, VALUE str) 03721 { 03722 str = rb_str_dup(str); 03723 rb_str_sub_bang(argc, argv, str); 03724 return str; 03725 } 03726 03727 static VALUE 03728 str_gsub(int argc, VALUE *argv, VALUE str, int bang) 03729 { 03730 VALUE pat, val, repl, match, dest, hash = Qnil; 03731 struct re_registers *regs; 03732 long beg, n; 03733 long beg0, end0; 03734 long offset, blen, slen, len, last; 03735 int iter = 0; 03736 char *sp, *cp; 03737 int tainted = 0; 03738 rb_encoding *str_enc; 03739 03740 switch (argc) { 03741 case 1: 03742 RETURN_ENUMERATOR(str, argc, argv); 03743 iter = 1; 03744 break; 03745 case 2: 03746 repl = argv[1]; 03747 hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash"); 03748 if (NIL_P(hash)) { 03749 StringValue(repl); 03750 } 03751 if (OBJ_TAINTED(repl)) tainted = 1; 03752 break; 03753 default: 03754 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); 03755 } 03756 03757 pat = get_pat(argv[0], 1); 03758 beg = rb_reg_search(pat, str, 0, 0); 03759 if (beg < 0) { 03760 if (bang) return Qnil; /* no match, no substitution */ 03761 return rb_str_dup(str); 03762 } 03763 03764 offset = 0; 03765 n = 0; 03766 blen = RSTRING_LEN(str) + 30; /* len + margin */ 03767 dest = rb_str_buf_new(blen); 03768 sp = RSTRING_PTR(str); 03769 slen = RSTRING_LEN(str); 03770 cp = sp; 03771 str_enc = STR_ENC_GET(str); 03772 rb_enc_associate(dest, str_enc); 03773 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? 
ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID); 03774 03775 do { 03776 n++; 03777 match = rb_backref_get(); 03778 regs = RMATCH_REGS(match); 03779 beg0 = BEG(0); 03780 end0 = END(0); 03781 if (iter || !NIL_P(hash)) { 03782 if (iter) { 03783 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match))); 03784 } 03785 else { 03786 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0))); 03787 val = rb_obj_as_string(val); 03788 } 03789 str_mod_check(str, sp, slen); 03790 if (val == dest) { /* paranoid check [ruby-dev:24827] */ 03791 rb_raise(rb_eRuntimeError, "block should not cheat"); 03792 } 03793 } 03794 else { 03795 val = rb_reg_regsub(repl, str, regs, pat); 03796 } 03797 03798 if (OBJ_TAINTED(val)) tainted = 1; 03799 03800 len = beg - offset; /* copy pre-match substr */ 03801 if (len) { 03802 rb_enc_str_buf_cat(dest, cp, len, str_enc); 03803 } 03804 03805 rb_str_buf_append(dest, val); 03806 03807 last = offset; 03808 offset = end0; 03809 if (beg0 == end0) { 03810 /* 03811 * Always consume at least one character of the input string 03812 * in order to prevent infinite loops. 
03813 */ 03814 if (RSTRING_LEN(str) <= end0) break; 03815 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc); 03816 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc); 03817 offset = end0 + len; 03818 } 03819 cp = RSTRING_PTR(str) + offset; 03820 if (offset > RSTRING_LEN(str)) break; 03821 beg = rb_reg_search(pat, str, offset, 0); 03822 } while (beg >= 0); 03823 if (RSTRING_LEN(str) > offset) { 03824 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc); 03825 } 03826 rb_reg_search(pat, str, last, 0); 03827 if (bang) { 03828 rb_str_shared_replace(str, dest); 03829 } 03830 else { 03831 RBASIC(dest)->klass = rb_obj_class(str); 03832 OBJ_INFECT(dest, str); 03833 str = dest; 03834 } 03835 03836 if (tainted) OBJ_TAINT(str); 03837 return str; 03838 } 03839 03840 03841 /* 03842 * call-seq: 03843 * str.gsub!(pattern, replacement) -> str or nil 03844 * str.gsub!(pattern) {|match| block } -> str or nil 03845 * str.gsub!(pattern) -> an_enumerator 03846 * 03847 * Performs the substitutions of <code>String#gsub</code> in place, returning 03848 * <i>str</i>, or <code>nil</code> if no substitutions were performed. 03849 * If no block and no <i>replacement</i> is given, an enumerator is returned instead. 03850 */ 03851 03852 static VALUE 03853 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str) 03854 { 03855 str_modify_keep_cr(str); 03856 return str_gsub(argc, argv, str, 1); 03857 } 03858 03859 03860 /* 03861 * call-seq: 03862 * str.gsub(pattern, replacement) -> new_str 03863 * str.gsub(pattern, hash) -> new_str 03864 * str.gsub(pattern) {|match| block } -> new_str 03865 * str.gsub(pattern) -> enumerator 03866 * 03867 * Returns a copy of <i>str</i> with the <em>all</em> occurrences of 03868 * <i>pattern</i> substituted for the second argument. 
The <i>pattern</i> is 03869 * typically a <code>Regexp</code>; if given as a <code>String</code>, any 03870 * regular expression metacharacters it contains will be interpreted 03871 * literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd', 03872 * instead of a digit. 03873 * 03874 * If <i>replacement</i> is a <code>String</code> it will be substituted for 03875 * the matched text. It may contain back-references to the pattern's capture 03876 * groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or 03877 * <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a 03878 * double-quoted string, both back-references must be preceded by an 03879 * additional backslash. However, within <i>replacement</i> the special match 03880 * variables, such as <code>$&</code>, will not refer to the current match. 03881 * 03882 * If the second argument is a <code>Hash</code>, and the matched text is one 03883 * of its keys, the corresponding value is the replacement string. 03884 * 03885 * In the block form, the current match string is passed in as a parameter, 03886 * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>, 03887 * <code>$&</code>, and <code>$'</code> will be set appropriately. The value 03888 * returned by the block will be substituted for the match on each call. 03889 * 03890 * The result inherits any tainting in the original string or any supplied 03891 * replacement string. 03892 * 03893 * When neither a block nor a second argument is supplied, an 03894 * <code>Enumerator</code> is returned. 
03895 * 03896 * "hello".gsub(/[aeiou]/, '*') #=> "h*ll*" 03897 * "hello".gsub(/([aeiou])/, '<\1>') #=> "h<e>ll<o>" 03898 * "hello".gsub(/./) {|s| s.ord.to_s + ' '} #=> "104 101 108 108 111 " 03899 * "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}') #=> "h{e}ll{o}" 03900 * 'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*') #=> "h3ll*" 03901 */ 03902 03903 static VALUE 03904 rb_str_gsub(int argc, VALUE *argv, VALUE str) 03905 { 03906 return str_gsub(argc, argv, str, 0); 03907 } 03908 03909 03910 /* 03911 * call-seq: 03912 * str.replace(other_str) -> str 03913 * 03914 * Replaces the contents and taintedness of <i>str</i> with the corresponding 03915 * values in <i>other_str</i>. 03916 * 03917 * s = "hello" #=> "hello" 03918 * s.replace "world" #=> "world" 03919 */ 03920 03921 VALUE 03922 rb_str_replace(VALUE str, VALUE str2) 03923 { 03924 str_modifiable(str); 03925 if (str == str2) return str; 03926 03927 StringValue(str2); 03928 str_discard(str); 03929 return str_replace(str, str2); 03930 } 03931 03932 /* 03933 * call-seq: 03934 * string.clear -> string 03935 * 03936 * Makes string empty. 03937 * 03938 * a = "abcde" 03939 * a.clear #=> "" 03940 */ 03941 03942 static VALUE 03943 rb_str_clear(VALUE str) 03944 { 03945 str_discard(str); 03946 STR_SET_EMBED(str); 03947 STR_SET_EMBED_LEN(str, 0); 03948 RSTRING_PTR(str)[0] = 0; 03949 if (rb_enc_asciicompat(STR_ENC_GET(str))) 03950 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 03951 else 03952 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); 03953 return str; 03954 } 03955 03956 /* 03957 * call-seq: 03958 * string.chr -> string 03959 * 03960 * Returns a one-character string at the beginning of the string. 03961 * 03962 * a = "abcde" 03963 * a.chr #=> "a" 03964 */ 03965 03966 static VALUE 03967 rb_str_chr(VALUE str) 03968 { 03969 return rb_str_substr(str, 0, 1); 03970 } 03971 03972 /* 03973 * call-seq: 03974 * str.getbyte(index) -> 0 .. 255 03975 * 03976 * returns the <i>index</i>th byte as an integer. 
03977 */ 03978 static VALUE 03979 rb_str_getbyte(VALUE str, VALUE index) 03980 { 03981 long pos = NUM2LONG(index); 03982 03983 if (pos < 0) 03984 pos += RSTRING_LEN(str); 03985 if (pos < 0 || RSTRING_LEN(str) <= pos) 03986 return Qnil; 03987 03988 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]); 03989 } 03990 03991 /* 03992 * call-seq: 03993 * str.setbyte(index, int) -> int 03994 * 03995 * modifies the <i>index</i>th byte as <i>int</i>. 03996 */ 03997 static VALUE 03998 rb_str_setbyte(VALUE str, VALUE index, VALUE value) 03999 { 04000 long pos = NUM2LONG(index); 04001 int byte = NUM2INT(value); 04002 04003 rb_str_modify(str); 04004 04005 if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos) 04006 rb_raise(rb_eIndexError, "index %ld out of string", pos); 04007 if (pos < 0) 04008 pos += RSTRING_LEN(str); 04009 04010 RSTRING_PTR(str)[pos] = byte; 04011 04012 return value; 04013 } 04014 04015 static VALUE 04016 str_byte_substr(VALUE str, long beg, long len) 04017 { 04018 char *p, *s = RSTRING_PTR(str); 04019 long n = RSTRING_LEN(str); 04020 VALUE str2; 04021 04022 if (beg > n || len < 0) return Qnil; 04023 if (beg < 0) { 04024 beg += n; 04025 if (beg < 0) return Qnil; 04026 } 04027 if (beg + len > n) 04028 len = n - beg; 04029 if (len <= 0) { 04030 len = 0; 04031 p = 0; 04032 } 04033 else 04034 p = s + beg; 04035 04036 if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) { 04037 str2 = rb_str_new4(str); 04038 str2 = str_new3(rb_obj_class(str2), str2); 04039 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len; 04040 RSTRING(str2)->as.heap.len = len; 04041 } 04042 else { 04043 str2 = rb_str_new5(str, p, len); 04044 rb_enc_cr_str_copy_for_substr(str2, str); 04045 OBJ_INFECT(str2, str); 04046 } 04047 04048 return str2; 04049 } 04050 04051 static VALUE 04052 str_byte_aref(VALUE str, VALUE indx) 04053 { 04054 long idx; 04055 switch (TYPE(indx)) { 04056 case T_FIXNUM: 04057 idx = FIX2LONG(indx); 04058 04059 num_index: 04060 str = str_byte_substr(str, idx, 1); 
04061 if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil; 04062 return str; 04063 04064 default: 04065 /* check if indx is Range */ 04066 { 04067 long beg, len = RSTRING_LEN(str); 04068 04069 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) { 04070 case Qfalse: 04071 break; 04072 case Qnil: 04073 return Qnil; 04074 default: 04075 return str_byte_substr(str, beg, len); 04076 } 04077 } 04078 idx = NUM2LONG(indx); 04079 goto num_index; 04080 } 04081 return Qnil; /* not reached */ 04082 } 04083 04084 /* 04085 * call-seq: 04086 * str.byteslice(fixnum) -> new_str or nil 04087 * str.byteslice(fixnum, fixnum) -> new_str or nil 04088 * str.byteslice(range) -> new_str or nil 04089 * 04090 * Byte Reference---If passed a single <code>Fixnum</code>, returns a 04091 * substring of one byte at that position. If passed two <code>Fixnum</code> 04092 * objects, returns a substring starting at the offset given by the first, and 04093 * a length given by the second. If given a <code>Range</code>, a substring containing 04094 * bytes at offsets given by the range is returned. In all three cases, if 04095 * an offset is negative, it is counted from the end of <i>str</i>. Returns 04096 * <code>nil</code> if the initial offset falls outside the string, the length 04097 * is negative, or the beginning of the range is greater than the end. 04098 * The encoding of the resulted string keeps original encoding. 
04099 * 04100 * "hello".byteslice(1) #=> "e" 04101 * "hello".byteslice(-1) #=> "o" 04102 * "hello".byteslice(1, 2) #=> "el" 04103 * "\x80\u3042".byteslice(1, 3) #=> "\u3042" 04104 * "\x03\u3042\xff".byteslice(1..3) #=> "\u3942" 04105 */ 04106 04107 static VALUE 04108 rb_str_byteslice(int argc, VALUE *argv, VALUE str) 04109 { 04110 if (argc == 2) { 04111 return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1])); 04112 } 04113 if (argc != 1) { 04114 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); 04115 } 04116 return str_byte_aref(str, argv[0]); 04117 } 04118 04119 /* 04120 * call-seq: 04121 * str.reverse -> new_str 04122 * 04123 * Returns a new string with the characters from <i>str</i> in reverse order. 04124 * 04125 * "stressed".reverse #=> "desserts" 04126 */ 04127 04128 static VALUE 04129 rb_str_reverse(VALUE str) 04130 { 04131 rb_encoding *enc; 04132 VALUE rev; 04133 char *s, *e, *p; 04134 int single = 1; 04135 04136 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str); 04137 enc = STR_ENC_GET(str); 04138 rev = rb_str_new5(str, 0, RSTRING_LEN(str)); 04139 s = RSTRING_PTR(str); e = RSTRING_END(str); 04140 p = RSTRING_END(rev); 04141 04142 if (RSTRING_LEN(str) > 1) { 04143 if (single_byte_optimizable(str)) { 04144 while (s < e) { 04145 *--p = *s++; 04146 } 04147 } 04148 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) { 04149 while (s < e) { 04150 int clen = rb_enc_fast_mbclen(s, e, enc); 04151 04152 if (clen > 1 || (*s & 0x80)) single = 0; 04153 p -= clen; 04154 memcpy(p, s, clen); 04155 s += clen; 04156 } 04157 } 04158 else { 04159 while (s < e) { 04160 int clen = rb_enc_mbclen(s, e, enc); 04161 04162 if (clen > 1 || (*s & 0x80)) single = 0; 04163 p -= clen; 04164 memcpy(p, s, clen); 04165 s += clen; 04166 } 04167 } 04168 } 04169 STR_SET_LEN(rev, RSTRING_LEN(str)); 04170 OBJ_INFECT(rev, str); 04171 if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) { 04172 if (single) { 04173 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 04174 } 
04175 else { 04176 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); 04177 } 04178 } 04179 rb_enc_cr_str_copy_for_substr(rev, str); 04180 04181 return rev; 04182 } 04183 04184 04185 /* 04186 * call-seq: 04187 * str.reverse! -> str 04188 * 04189 * Reverses <i>str</i> in place. 04190 */ 04191 04192 static VALUE 04193 rb_str_reverse_bang(VALUE str) 04194 { 04195 if (RSTRING_LEN(str) > 1) { 04196 if (single_byte_optimizable(str)) { 04197 char *s, *e, c; 04198 04199 str_modify_keep_cr(str); 04200 s = RSTRING_PTR(str); 04201 e = RSTRING_END(str) - 1; 04202 while (s < e) { 04203 c = *s; 04204 *s++ = *e; 04205 *e-- = c; 04206 } 04207 } 04208 else { 04209 rb_str_shared_replace(str, rb_str_reverse(str)); 04210 } 04211 } 04212 else { 04213 str_modify_keep_cr(str); 04214 } 04215 return str; 04216 } 04217 04218 04219 /* 04220 * call-seq: 04221 * str.include? other_str -> true or false 04222 * 04223 * Returns <code>true</code> if <i>str</i> contains the given string or 04224 * character. 04225 * 04226 * "hello".include? "lo" #=> true 04227 * "hello".include? "ol" #=> false 04228 * "hello".include? ?h #=> true 04229 */ 04230 04231 static VALUE 04232 rb_str_include(VALUE str, VALUE arg) 04233 { 04234 long i; 04235 04236 StringValue(arg); 04237 i = rb_str_index(str, arg, 0); 04238 04239 if (i == -1) return Qfalse; 04240 return Qtrue; 04241 } 04242 04243 04244 /* 04245 * call-seq: 04246 * str.to_i(base=10) -> integer 04247 * 04248 * Returns the result of interpreting leading characters in <i>str</i> as an 04249 * integer base <i>base</i> (between 2 and 36). Extraneous characters past the 04250 * end of a valid number are ignored. If there is not a valid number at the 04251 * start of <i>str</i>, <code>0</code> is returned. This method never raises an 04252 * exception when <i>base</i> is valid. 
04253 * 04254 * "12345".to_i #=> 12345 04255 * "99 red balloons".to_i #=> 99 04256 * "0a".to_i #=> 0 04257 * "0a".to_i(16) #=> 10 04258 * "hello".to_i #=> 0 04259 * "1100101".to_i(2) #=> 101 04260 * "1100101".to_i(8) #=> 294977 04261 * "1100101".to_i(10) #=> 1100101 04262 * "1100101".to_i(16) #=> 17826049 04263 */ 04264 04265 static VALUE 04266 rb_str_to_i(int argc, VALUE *argv, VALUE str) 04267 { 04268 int base; 04269 04270 if (argc == 0) base = 10; 04271 else { 04272 VALUE b; 04273 04274 rb_scan_args(argc, argv, "01", &b); 04275 base = NUM2INT(b); 04276 } 04277 if (base < 0) { 04278 rb_raise(rb_eArgError, "invalid radix %d", base); 04279 } 04280 return rb_str_to_inum(str, base, FALSE); 04281 } 04282 04283 04284 /* 04285 * call-seq: 04286 * str.to_f -> float 04287 * 04288 * Returns the result of interpreting leading characters in <i>str</i> as a 04289 * floating point number. Extraneous characters past the end of a valid number 04290 * are ignored. If there is not a valid number at the start of <i>str</i>, 04291 * <code>0.0</code> is returned. This method never raises an exception. 04292 * 04293 * "123.45e1".to_f #=> 1234.5 04294 * "45.67 degrees".to_f #=> 45.67 04295 * "thx1138".to_f #=> 0.0 04296 */ 04297 04298 static VALUE 04299 rb_str_to_f(VALUE str) 04300 { 04301 return DBL2NUM(rb_str_to_dbl(str, FALSE)); 04302 } 04303 04304 04305 /* 04306 * call-seq: 04307 * str.to_s -> str 04308 * str.to_str -> str 04309 * 04310 * Returns the receiver. 
04311 */ 04312 04313 static VALUE 04314 rb_str_to_s(VALUE str) 04315 { 04316 if (rb_obj_class(str) != rb_cString) { 04317 return str_duplicate(rb_cString, str); 04318 } 04319 return str; 04320 } 04321 04322 #if 0 04323 static void 04324 str_cat_char(VALUE str, unsigned int c, rb_encoding *enc) 04325 { 04326 char s[RUBY_MAX_CHAR_LEN]; 04327 int n = rb_enc_codelen(c, enc); 04328 04329 rb_enc_mbcput(c, s, enc); 04330 rb_enc_str_buf_cat(str, s, n, enc); 04331 } 04332 #endif 04333 04334 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */ 04335 04336 int 04337 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p) 04338 { 04339 char buf[CHAR_ESC_LEN + 1]; 04340 int l; 04341 04342 #if SIZEOF_INT > 4 04343 c &= 0xffffffff; 04344 #endif 04345 if (unicode_p) { 04346 if (c < 0x7F && ISPRINT(c)) { 04347 snprintf(buf, CHAR_ESC_LEN, "%c", c); 04348 } 04349 else if (c < 0x10000) { 04350 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c); 04351 } 04352 else { 04353 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c); 04354 } 04355 } 04356 else { 04357 if (c < 0x100) { 04358 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c); 04359 } 04360 else { 04361 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c); 04362 } 04363 } 04364 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */ 04365 rb_str_buf_cat(result, buf, l); 04366 return l; 04367 } 04368 04369 /* 04370 * call-seq: 04371 * str.inspect -> string 04372 * 04373 * Returns a printable version of _str_, surrounded by quote marks, 04374 * with special characters escaped. 
04375 * 04376 * str = "hello" 04377 * str[3] = "\b" 04378 * str.inspect #=> "\"hel\\bo\"" 04379 */ 04380 04381 VALUE 04382 rb_str_inspect(VALUE str) 04383 { 04384 rb_encoding *enc = STR_ENC_GET(str); 04385 const char *p, *pend, *prev; 04386 char buf[CHAR_ESC_LEN + 1]; 04387 VALUE result = rb_str_buf_new(0); 04388 rb_encoding *resenc = rb_default_internal_encoding(); 04389 int unicode_p = rb_enc_unicode_p(enc); 04390 int asciicompat = rb_enc_asciicompat(enc); 04391 static rb_encoding *utf16, *utf32; 04392 04393 if (!utf16) utf16 = rb_enc_find("UTF-16"); 04394 if (!utf32) utf32 = rb_enc_find("UTF-32"); 04395 if (resenc == NULL) resenc = rb_default_external_encoding(); 04396 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding(); 04397 rb_enc_associate(result, resenc); 04398 str_buf_cat2(result, "\""); 04399 04400 p = RSTRING_PTR(str); pend = RSTRING_END(str); 04401 prev = p; 04402 if (enc == utf16) { 04403 const unsigned char *q = (const unsigned char *)p; 04404 if (q[0] == 0xFE && q[1] == 0xFF) 04405 enc = rb_enc_find("UTF-16BE"); 04406 else if (q[0] == 0xFF && q[1] == 0xFE) 04407 enc = rb_enc_find("UTF-16LE"); 04408 else 04409 unicode_p = 0; 04410 } 04411 else if (enc == utf32) { 04412 const unsigned char *q = (const unsigned char *)p; 04413 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) 04414 enc = rb_enc_find("UTF-32BE"); 04415 else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) 04416 enc = rb_enc_find("UTF-32LE"); 04417 else 04418 unicode_p = 0; 04419 } 04420 while (p < pend) { 04421 unsigned int c, cc; 04422 int n; 04423 04424 n = rb_enc_precise_mbclen(p, pend, enc); 04425 if (!MBCLEN_CHARFOUND_P(n)) { 04426 if (p > prev) str_buf_cat(result, prev, p - prev); 04427 n = rb_enc_mbminlen(enc); 04428 if (pend < p + n) 04429 n = (int)(pend - p); 04430 while (n--) { 04431 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377); 04432 str_buf_cat(result, buf, strlen(buf)); 04433 prev = ++p; 04434 } 04435 continue; 04436 } 04437 n = 
MBCLEN_CHARFOUND_LEN(n); 04438 c = rb_enc_mbc_to_codepoint(p, pend, enc); 04439 p += n; 04440 if ((asciicompat || unicode_p) && 04441 (c == '"'|| c == '\\' || 04442 (c == '#' && 04443 p < pend && 04444 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) && 04445 (cc = rb_enc_codepoint(p,pend,enc), 04446 (cc == '$' || cc == '@' || cc == '{'))))) { 04447 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 04448 str_buf_cat2(result, "\\"); 04449 if (asciicompat || enc == resenc) { 04450 prev = p - n; 04451 continue; 04452 } 04453 } 04454 switch (c) { 04455 case '\n': cc = 'n'; break; 04456 case '\r': cc = 'r'; break; 04457 case '\t': cc = 't'; break; 04458 case '\f': cc = 'f'; break; 04459 case '\013': cc = 'v'; break; 04460 case '\010': cc = 'b'; break; 04461 case '\007': cc = 'a'; break; 04462 case 033: cc = 'e'; break; 04463 default: cc = 0; break; 04464 } 04465 if (cc) { 04466 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 04467 buf[0] = '\\'; 04468 buf[1] = (char)cc; 04469 str_buf_cat(result, buf, 2); 04470 prev = p; 04471 continue; 04472 } 04473 if ((enc == resenc && rb_enc_isprint(c, enc)) || 04474 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) { 04475 continue; 04476 } 04477 else { 04478 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 04479 rb_str_buf_cat_escaped_char(result, c, unicode_p); 04480 prev = p; 04481 continue; 04482 } 04483 } 04484 if (p > prev) str_buf_cat(result, prev, p - prev); 04485 str_buf_cat2(result, "\""); 04486 04487 OBJ_INFECT(result, str); 04488 return result; 04489 } 04490 04491 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{')) 04492 04493 /* 04494 * call-seq: 04495 * str.dump -> new_str 04496 * 04497 * Produces a version of <i>str</i> with all nonprinting characters replaced by 04498 * <code>\nnn</code> notation and all special characters escaped. 
04499 */ 04500 04501 VALUE 04502 rb_str_dump(VALUE str) 04503 { 04504 rb_encoding *enc = rb_enc_get(str); 04505 long len; 04506 const char *p, *pend; 04507 char *q, *qend; 04508 VALUE result; 04509 int u8 = (enc == rb_utf8_encoding()); 04510 04511 len = 2; /* "" */ 04512 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str); 04513 while (p < pend) { 04514 unsigned char c = *p++; 04515 switch (c) { 04516 case '"': case '\\': 04517 case '\n': case '\r': 04518 case '\t': case '\f': 04519 case '\013': case '\010': case '\007': case '\033': 04520 len += 2; 04521 break; 04522 04523 case '#': 04524 len += IS_EVSTR(p, pend) ? 2 : 1; 04525 break; 04526 04527 default: 04528 if (ISPRINT(c)) { 04529 len++; 04530 } 04531 else { 04532 if (u8) { /* \u{NN} */ 04533 int n = rb_enc_precise_mbclen(p-1, pend, enc); 04534 if (MBCLEN_CHARFOUND_P(n-1)) { 04535 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc); 04536 while (cc >>= 4) len++; 04537 len += 5; 04538 p += MBCLEN_CHARFOUND_LEN(n)-1; 04539 break; 04540 } 04541 } 04542 len += 4; /* \xNN */ 04543 } 04544 break; 04545 } 04546 } 04547 if (!rb_enc_asciicompat(enc)) { 04548 len += 19; /* ".force_encoding('')" */ 04549 len += strlen(enc->name); 04550 } 04551 04552 result = rb_str_new5(str, 0, len); 04553 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str); 04554 q = RSTRING_PTR(result); qend = q + len + 1; 04555 04556 *q++ = '"'; 04557 while (p < pend) { 04558 unsigned char c = *p++; 04559 04560 if (c == '"' || c == '\\') { 04561 *q++ = '\\'; 04562 *q++ = c; 04563 } 04564 else if (c == '#') { 04565 if (IS_EVSTR(p, pend)) *q++ = '\\'; 04566 *q++ = '#'; 04567 } 04568 else if (c == '\n') { 04569 *q++ = '\\'; 04570 *q++ = 'n'; 04571 } 04572 else if (c == '\r') { 04573 *q++ = '\\'; 04574 *q++ = 'r'; 04575 } 04576 else if (c == '\t') { 04577 *q++ = '\\'; 04578 *q++ = 't'; 04579 } 04580 else if (c == '\f') { 04581 *q++ = '\\'; 04582 *q++ = 'f'; 04583 } 04584 else if (c == '\013') { 04585 *q++ = '\\'; 04586 *q++ = 'v'; 04587 } 04588 else if 
(c == '\010') { 04589 *q++ = '\\'; 04590 *q++ = 'b'; 04591 } 04592 else if (c == '\007') { 04593 *q++ = '\\'; 04594 *q++ = 'a'; 04595 } 04596 else if (c == '\033') { 04597 *q++ = '\\'; 04598 *q++ = 'e'; 04599 } 04600 else if (ISPRINT(c)) { 04601 *q++ = c; 04602 } 04603 else { 04604 *q++ = '\\'; 04605 if (u8) { 04606 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1; 04607 if (MBCLEN_CHARFOUND_P(n)) { 04608 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc); 04609 p += n; 04610 snprintf(q, qend-q, "u{%x}", cc); 04611 q += strlen(q); 04612 continue; 04613 } 04614 } 04615 snprintf(q, qend-q, "x%02X", c); 04616 q += 3; 04617 } 04618 } 04619 *q++ = '"'; 04620 *q = '\0'; 04621 if (!rb_enc_asciicompat(enc)) { 04622 snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name); 04623 enc = rb_ascii8bit_encoding(); 04624 } 04625 OBJ_INFECT(result, str); 04626 /* result from dump is ASCII */ 04627 rb_enc_associate(result, enc); 04628 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT); 04629 return result; 04630 } 04631 04632 04633 static void 04634 rb_str_check_dummy_enc(rb_encoding *enc) 04635 { 04636 if (rb_enc_dummy_p(enc)) { 04637 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s", 04638 rb_enc_name(enc)); 04639 } 04640 } 04641 04642 /* 04643 * call-seq: 04644 * str.upcase! -> str or nil 04645 * 04646 * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes 04647 * were made. 04648 * Note: case replacement is effective only in ASCII region. 
04649 */ 04650 04651 static VALUE 04652 rb_str_upcase_bang(VALUE str) 04653 { 04654 rb_encoding *enc; 04655 char *s, *send; 04656 int modify = 0; 04657 int n; 04658 04659 str_modify_keep_cr(str); 04660 enc = STR_ENC_GET(str); 04661 rb_str_check_dummy_enc(enc); 04662 s = RSTRING_PTR(str); send = RSTRING_END(str); 04663 if (single_byte_optimizable(str)) { 04664 while (s < send) { 04665 unsigned int c = *(unsigned char*)s; 04666 04667 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { 04668 *s = 'A' + (c - 'a'); 04669 modify = 1; 04670 } 04671 s++; 04672 } 04673 } 04674 else { 04675 int ascompat = rb_enc_asciicompat(enc); 04676 04677 while (s < send) { 04678 unsigned int c; 04679 04680 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 04681 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { 04682 *s = 'A' + (c - 'a'); 04683 modify = 1; 04684 } 04685 s++; 04686 } 04687 else { 04688 c = rb_enc_codepoint_len(s, send, &n, enc); 04689 if (rb_enc_islower(c, enc)) { 04690 /* assuming toupper returns codepoint with same size */ 04691 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 04692 modify = 1; 04693 } 04694 s += n; 04695 } 04696 } 04697 } 04698 04699 if (modify) return str; 04700 return Qnil; 04701 } 04702 04703 04704 /* 04705 * call-seq: 04706 * str.upcase -> new_str 04707 * 04708 * Returns a copy of <i>str</i> with all lowercase letters replaced with their 04709 * uppercase counterparts. The operation is locale insensitive---only 04710 * characters ``a'' to ``z'' are affected. 04711 * Note: case replacement is effective only in ASCII region. 04712 * 04713 * "hEllO".upcase #=> "HELLO" 04714 */ 04715 04716 static VALUE 04717 rb_str_upcase(VALUE str) 04718 { 04719 str = rb_str_dup(str); 04720 rb_str_upcase_bang(str); 04721 return str; 04722 } 04723 04724 04725 /* 04726 * call-seq: 04727 * str.downcase! -> str or nil 04728 * 04729 * Downcases the contents of <i>str</i>, returning <code>nil</code> if no 04730 * changes were made. 
04731 * Note: case replacement is effective only in ASCII region. 04732 */ 04733 04734 static VALUE 04735 rb_str_downcase_bang(VALUE str) 04736 { 04737 rb_encoding *enc; 04738 char *s, *send; 04739 int modify = 0; 04740 04741 str_modify_keep_cr(str); 04742 enc = STR_ENC_GET(str); 04743 rb_str_check_dummy_enc(enc); 04744 s = RSTRING_PTR(str); send = RSTRING_END(str); 04745 if (single_byte_optimizable(str)) { 04746 while (s < send) { 04747 unsigned int c = *(unsigned char*)s; 04748 04749 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { 04750 *s = 'a' + (c - 'A'); 04751 modify = 1; 04752 } 04753 s++; 04754 } 04755 } 04756 else { 04757 int ascompat = rb_enc_asciicompat(enc); 04758 04759 while (s < send) { 04760 unsigned int c; 04761 int n; 04762 04763 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 04764 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { 04765 *s = 'a' + (c - 'A'); 04766 modify = 1; 04767 } 04768 s++; 04769 } 04770 else { 04771 c = rb_enc_codepoint_len(s, send, &n, enc); 04772 if (rb_enc_isupper(c, enc)) { 04773 /* assuming toupper returns codepoint with same size */ 04774 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 04775 modify = 1; 04776 } 04777 s += n; 04778 } 04779 } 04780 } 04781 04782 if (modify) return str; 04783 return Qnil; 04784 } 04785 04786 04787 /* 04788 * call-seq: 04789 * str.downcase -> new_str 04790 * 04791 * Returns a copy of <i>str</i> with all uppercase letters replaced with their 04792 * lowercase counterparts. The operation is locale insensitive---only 04793 * characters ``A'' to ``Z'' are affected. 04794 * Note: case replacement is effective only in ASCII region. 04795 * 04796 * "hEllO".downcase #=> "hello" 04797 */ 04798 04799 static VALUE 04800 rb_str_downcase(VALUE str) 04801 { 04802 str = rb_str_dup(str); 04803 rb_str_downcase_bang(str); 04804 return str; 04805 } 04806 04807 04808 /* 04809 * call-seq: 04810 * str.capitalize! 
-> str or nil 04811 * 04812 * Modifies <i>str</i> by converting the first character to uppercase and the 04813 * remainder to lowercase. Returns <code>nil</code> if no changes are made. 04814 * Note: case conversion is effective only in ASCII region. 04815 * 04816 * a = "hello" 04817 * a.capitalize! #=> "Hello" 04818 * a #=> "Hello" 04819 * a.capitalize! #=> nil 04820 */ 04821 04822 static VALUE 04823 rb_str_capitalize_bang(VALUE str) 04824 { 04825 rb_encoding *enc; 04826 char *s, *send; 04827 int modify = 0; 04828 unsigned int c; 04829 int n; 04830 04831 str_modify_keep_cr(str); 04832 enc = STR_ENC_GET(str); 04833 rb_str_check_dummy_enc(enc); 04834 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 04835 s = RSTRING_PTR(str); send = RSTRING_END(str); 04836 04837 c = rb_enc_codepoint_len(s, send, &n, enc); 04838 if (rb_enc_islower(c, enc)) { 04839 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 04840 modify = 1; 04841 } 04842 s += n; 04843 while (s < send) { 04844 c = rb_enc_codepoint_len(s, send, &n, enc); 04845 if (rb_enc_isupper(c, enc)) { 04846 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 04847 modify = 1; 04848 } 04849 s += n; 04850 } 04851 04852 if (modify) return str; 04853 return Qnil; 04854 } 04855 04856 04857 /* 04858 * call-seq: 04859 * str.capitalize -> new_str 04860 * 04861 * Returns a copy of <i>str</i> with the first character converted to uppercase 04862 * and the remainder to lowercase. 04863 * Note: case conversion is effective only in ASCII region. 04864 * 04865 * "hello".capitalize #=> "Hello" 04866 * "HELLO".capitalize #=> "Hello" 04867 * "123ABC".capitalize #=> "123abc" 04868 */ 04869 04870 static VALUE 04871 rb_str_capitalize(VALUE str) 04872 { 04873 str = rb_str_dup(str); 04874 rb_str_capitalize_bang(str); 04875 return str; 04876 } 04877 04878 04879 /* 04880 * call-seq: 04881 * str.swapcase! 
-> str or nil 04882 * 04883 * Equivalent to <code>String#swapcase</code>, but modifies the receiver in 04884 * place, returning <i>str</i>, or <code>nil</code> if no changes were made. 04885 * Note: case conversion is effective only in ASCII region. 04886 */ 04887 04888 static VALUE 04889 rb_str_swapcase_bang(VALUE str) 04890 { 04891 rb_encoding *enc; 04892 char *s, *send; 04893 int modify = 0; 04894 int n; 04895 04896 str_modify_keep_cr(str); 04897 enc = STR_ENC_GET(str); 04898 rb_str_check_dummy_enc(enc); 04899 s = RSTRING_PTR(str); send = RSTRING_END(str); 04900 while (s < send) { 04901 unsigned int c = rb_enc_codepoint_len(s, send, &n, enc); 04902 04903 if (rb_enc_isupper(c, enc)) { 04904 /* assuming toupper returns codepoint with same size */ 04905 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 04906 modify = 1; 04907 } 04908 else if (rb_enc_islower(c, enc)) { 04909 /* assuming tolower returns codepoint with same size */ 04910 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 04911 modify = 1; 04912 } 04913 s += n; 04914 } 04915 04916 if (modify) return str; 04917 return Qnil; 04918 } 04919 04920 04921 /* 04922 * call-seq: 04923 * str.swapcase -> new_str 04924 * 04925 * Returns a copy of <i>str</i> with uppercase alphabetic characters converted 04926 * to lowercase and lowercase characters converted to uppercase. 04927 * Note: case conversion is effective only in ASCII region. 
04928 * 04929 * "Hello".swapcase #=> "hELLO" 04930 * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11" 04931 */ 04932 04933 static VALUE 04934 rb_str_swapcase(VALUE str) 04935 { 04936 str = rb_str_dup(str); 04937 rb_str_swapcase_bang(str); 04938 return str; 04939 } 04940 04941 typedef unsigned char *USTR; 04942 04943 struct tr { 04944 int gen; 04945 unsigned int now, max; 04946 char *p, *pend; 04947 }; 04948 04949 static unsigned int 04950 trnext(struct tr *t, rb_encoding *enc) 04951 { 04952 int n; 04953 04954 for (;;) { 04955 if (!t->gen) { 04956 if (t->p == t->pend) return -1; 04957 if (t->p < t->pend - 1 && *t->p == '\\') { 04958 t->p++; 04959 } 04960 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc); 04961 t->p += n; 04962 if (t->p < t->pend - 1 && *t->p == '-') { 04963 t->p++; 04964 if (t->p < t->pend) { 04965 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc); 04966 t->p += n; 04967 if (t->now > c) { 04968 if (t->now < 0x80 && c < 0x80) { 04969 rb_raise(rb_eArgError, 04970 "invalid range \"%c-%c\" in string transliteration", 04971 t->now, c); 04972 } 04973 else { 04974 rb_raise(rb_eArgError, "invalid range in string transliteration"); 04975 } 04976 continue; /* not reached */ 04977 } 04978 t->gen = 1; 04979 t->max = c; 04980 } 04981 } 04982 return t->now; 04983 } 04984 else if (++t->now < t->max) { 04985 return t->now; 04986 } 04987 else { 04988 t->gen = 0; 04989 return t->max; 04990 } 04991 } 04992 } 04993 04994 static VALUE rb_str_delete_bang(int,VALUE*,VALUE); 04995 04996 static VALUE 04997 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) 04998 { 04999 const unsigned int errc = -1; 05000 unsigned int trans[256]; 05001 rb_encoding *enc, *e1, *e2; 05002 struct tr trsrc, trrepl; 05003 int cflag = 0; 05004 unsigned int c, c0, last = 0; 05005 int modify = 0, i, l; 05006 char *s, *send; 05007 VALUE hash = 0; 05008 int singlebyte = single_byte_optimizable(str); 05009 int cr; 05010 05011 #define CHECK_IF_ASCII(c) \ 05012 (void)((cr == 
ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \ 05013 (cr = ENC_CODERANGE_VALID) : 0) 05014 05015 StringValue(src); 05016 StringValue(repl); 05017 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 05018 if (RSTRING_LEN(repl) == 0) { 05019 return rb_str_delete_bang(1, &src, str); 05020 } 05021 05022 cr = ENC_CODERANGE(str); 05023 e1 = rb_enc_check(str, src); 05024 e2 = rb_enc_check(str, repl); 05025 if (e1 == e2) { 05026 enc = e1; 05027 } 05028 else { 05029 enc = rb_enc_check(src, repl); 05030 } 05031 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src); 05032 if (RSTRING_LEN(src) > 1 && 05033 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' && 05034 trsrc.p + l < trsrc.pend) { 05035 cflag = 1; 05036 trsrc.p += l; 05037 } 05038 trrepl.p = RSTRING_PTR(repl); 05039 trrepl.pend = trrepl.p + RSTRING_LEN(repl); 05040 trsrc.gen = trrepl.gen = 0; 05041 trsrc.now = trrepl.now = 0; 05042 trsrc.max = trrepl.max = 0; 05043 05044 if (cflag) { 05045 for (i=0; i<256; i++) { 05046 trans[i] = 1; 05047 } 05048 while ((c = trnext(&trsrc, enc)) != errc) { 05049 if (c < 256) { 05050 trans[c] = errc; 05051 } 05052 else { 05053 if (!hash) hash = rb_hash_new(); 05054 rb_hash_aset(hash, UINT2NUM(c), Qtrue); 05055 } 05056 } 05057 while ((c = trnext(&trrepl, enc)) != errc) 05058 /* retrieve last replacer */; 05059 last = trrepl.now; 05060 for (i=0; i<256; i++) { 05061 if (trans[i] != errc) { 05062 trans[i] = last; 05063 } 05064 } 05065 } 05066 else { 05067 unsigned int r; 05068 05069 for (i=0; i<256; i++) { 05070 trans[i] = errc; 05071 } 05072 while ((c = trnext(&trsrc, enc)) != errc) { 05073 r = trnext(&trrepl, enc); 05074 if (r == errc) r = trrepl.now; 05075 if (c < 256) { 05076 trans[c] = r; 05077 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0; 05078 } 05079 else { 05080 if (!hash) hash = rb_hash_new(); 05081 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r)); 05082 } 05083 } 05084 } 05085 05086 if (cr == ENC_CODERANGE_VALID) 05087 cr = ENC_CODERANGE_7BIT; 05088 
str_modify_keep_cr(str); 05089 s = RSTRING_PTR(str); send = RSTRING_END(str); 05090 if (sflag) { 05091 int clen, tlen; 05092 long offset, max = RSTRING_LEN(str); 05093 unsigned int save = -1; 05094 char *buf = ALLOC_N(char, max), *t = buf; 05095 05096 while (s < send) { 05097 int may_modify = 0; 05098 05099 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1); 05100 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc); 05101 05102 s += clen; 05103 if (c < 256) { 05104 c = trans[c]; 05105 } 05106 else if (hash) { 05107 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c)); 05108 if (NIL_P(tmp)) { 05109 if (cflag) c = last; 05110 else c = errc; 05111 } 05112 else if (cflag) c = errc; 05113 else c = NUM2INT(tmp); 05114 } 05115 else { 05116 c = errc; 05117 } 05118 if (c != (unsigned int)-1) { 05119 if (save == c) { 05120 CHECK_IF_ASCII(c); 05121 continue; 05122 } 05123 save = c; 05124 tlen = rb_enc_codelen(c, enc); 05125 modify = 1; 05126 } 05127 else { 05128 save = -1; 05129 c = c0; 05130 if (enc != e1) may_modify = 1; 05131 } 05132 while (t - buf + tlen >= max) { 05133 offset = t - buf; 05134 max *= 2; 05135 REALLOC_N(buf, char, max); 05136 t = buf + offset; 05137 } 05138 rb_enc_mbcput(c, t, enc); 05139 if (may_modify && memcmp(s, t, tlen) != 0) { 05140 modify = 1; 05141 } 05142 CHECK_IF_ASCII(c); 05143 t += tlen; 05144 } 05145 if (!STR_EMBED_P(str)) { 05146 xfree(RSTRING(str)->as.heap.ptr); 05147 } 05148 *t = '\0'; 05149 RSTRING(str)->as.heap.ptr = buf; 05150 RSTRING(str)->as.heap.len = t - buf; 05151 STR_SET_NOEMBED(str); 05152 RSTRING(str)->as.heap.aux.capa = max; 05153 } 05154 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) { 05155 while (s < send) { 05156 c = (unsigned char)*s; 05157 if (trans[c] != errc) { 05158 if (!cflag) { 05159 c = trans[c]; 05160 *s = c; 05161 modify = 1; 05162 } 05163 else { 05164 *s = last; 05165 modify = 1; 05166 } 05167 } 05168 CHECK_IF_ASCII(c); 05169 s++; 05170 } 05171 } 05172 else { 05173 int clen, tlen, max = (int)(RSTRING_LEN(str) * 
1.2); 05174 long offset; 05175 char *buf = ALLOC_N(char, max), *t = buf; 05176 05177 while (s < send) { 05178 int may_modify = 0; 05179 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1); 05180 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc); 05181 05182 if (c < 256) { 05183 c = trans[c]; 05184 } 05185 else if (hash) { 05186 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c)); 05187 if (NIL_P(tmp)) { 05188 if (cflag) c = last; 05189 else c = errc; 05190 } 05191 else if (cflag) c = errc; 05192 else c = NUM2INT(tmp); 05193 } 05194 else { 05195 c = cflag ? last : errc; 05196 } 05197 if (c != errc) { 05198 tlen = rb_enc_codelen(c, enc); 05199 modify = 1; 05200 } 05201 else { 05202 c = c0; 05203 if (enc != e1) may_modify = 1; 05204 } 05205 while (t - buf + tlen >= max) { 05206 offset = t - buf; 05207 max *= 2; 05208 REALLOC_N(buf, char, max); 05209 t = buf + offset; 05210 } 05211 if (s != t) { 05212 rb_enc_mbcput(c, t, enc); 05213 if (may_modify && memcmp(s, t, tlen) != 0) { 05214 modify = 1; 05215 } 05216 } 05217 CHECK_IF_ASCII(c); 05218 s += clen; 05219 t += tlen; 05220 } 05221 if (!STR_EMBED_P(str)) { 05222 xfree(RSTRING(str)->as.heap.ptr); 05223 } 05224 *t = '\0'; 05225 RSTRING(str)->as.heap.ptr = buf; 05226 RSTRING(str)->as.heap.len = t - buf; 05227 STR_SET_NOEMBED(str); 05228 RSTRING(str)->as.heap.aux.capa = max; 05229 } 05230 05231 if (modify) { 05232 if (cr != ENC_CODERANGE_BROKEN) 05233 ENC_CODERANGE_SET(str, cr); 05234 rb_enc_associate(str, enc); 05235 return str; 05236 } 05237 return Qnil; 05238 } 05239 05240 05241 /* 05242 * call-seq: 05243 * str.tr!(from_str, to_str) -> str or nil 05244 * 05245 * Translates <i>str</i> in place, using the same rules as 05246 * <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no 05247 * changes were made. 
05248 */ 05249 05250 static VALUE 05251 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl) 05252 { 05253 return tr_trans(str, src, repl, 0); 05254 } 05255 05256 05257 /* 05258 * call-seq: 05259 * str.tr(from_str, to_str) => new_str 05260 * 05261 * Returns a copy of <i>str</i> with the characters in <i>from_str</i> 05262 * replaced by the corresponding characters in <i>to_str</i>. If 05263 * <i>to_str</i> is shorter than <i>from_str</i>, it is padded with its last 05264 * character in order to maintain the correspondence. 05265 * 05266 * "hello".tr('el', 'ip') #=> "hippo" 05267 * "hello".tr('aeiou', '*') #=> "h*ll*" 05268 * 05269 * Both strings may use the c1-c2 notation to denote ranges of characters, 05270 * and <i>from_str</i> may start with a <code>^</code>, which denotes all 05271 * characters except those listed. 05272 * 05273 * "hello".tr('a-y', 'b-z') #=> "ifmmp" 05274 * "hello".tr('^aeiou', '*') #=> "*e**o" 05275 */ 05276 05277 static VALUE 05278 rb_str_tr(VALUE str, VALUE src, VALUE repl) 05279 { 05280 str = rb_str_dup(str); 05281 tr_trans(str, src, repl, 0); 05282 return str; 05283 } 05284 05285 #define TR_TABLE_SIZE 257 05286 static void 05287 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first, 05288 VALUE *tablep, VALUE *ctablep, rb_encoding *enc) 05289 { 05290 const unsigned int errc = -1; 05291 char buf[256]; 05292 struct tr tr; 05293 unsigned int c; 05294 VALUE table = 0, ptable = 0; 05295 int i, l, cflag = 0; 05296 05297 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str); 05298 tr.gen = tr.now = tr.max = 0; 05299 05300 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') { 05301 cflag = 1; 05302 tr.p += l; 05303 } 05304 if (first) { 05305 for (i=0; i<256; i++) { 05306 stable[i] = 1; 05307 } 05308 stable[256] = cflag; 05309 } 05310 else if (stable[256] && !cflag) { 05311 stable[256] = 0; 05312 } 05313 for (i=0; i<256; i++) { 05314 buf[i] = cflag; 05315 } 05316 05317 while ((c = trnext(&tr, enc)) != errc) { 
05318 if (c < 256) { 05319 buf[c & 0xff] = !cflag; 05320 } 05321 else { 05322 VALUE key = UINT2NUM(c); 05323 05324 if (!table) { 05325 table = rb_hash_new(); 05326 if (cflag) { 05327 ptable = *ctablep; 05328 *ctablep = table; 05329 } 05330 else { 05331 ptable = *tablep; 05332 *tablep = table; 05333 } 05334 } 05335 if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) { 05336 rb_hash_aset(table, key, Qtrue); 05337 } 05338 } 05339 } 05340 for (i=0; i<256; i++) { 05341 stable[i] = stable[i] && buf[i]; 05342 } 05343 } 05344 05345 05346 static int 05347 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel) 05348 { 05349 if (c < 256) { 05350 return table[c] != 0; 05351 } 05352 else { 05353 VALUE v = UINT2NUM(c); 05354 05355 if (del) { 05356 if (!NIL_P(rb_hash_lookup(del, v)) && 05357 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) { 05358 return TRUE; 05359 } 05360 } 05361 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) { 05362 return FALSE; 05363 } 05364 return table[256] ? TRUE : FALSE; 05365 } 05366 } 05367 05368 /* 05369 * call-seq: 05370 * str.delete!([other_str]+) -> str or nil 05371 * 05372 * Performs a <code>delete</code> operation in place, returning <i>str</i>, or 05373 * <code>nil</code> if <i>str</i> was not modified. 
05374 */ 05375 05376 static VALUE 05377 rb_str_delete_bang(int argc, VALUE *argv, VALUE str) 05378 { 05379 char squeez[TR_TABLE_SIZE]; 05380 rb_encoding *enc = 0; 05381 char *s, *send, *t; 05382 VALUE del = 0, nodel = 0; 05383 int modify = 0; 05384 int i, ascompat, cr; 05385 05386 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 05387 if (argc < 1) { 05388 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)"); 05389 } 05390 for (i=0; i<argc; i++) { 05391 VALUE s = argv[i]; 05392 05393 StringValue(s); 05394 enc = rb_enc_check(str, s); 05395 tr_setup_table(s, squeez, i==0, &del, &nodel, enc); 05396 } 05397 05398 str_modify_keep_cr(str); 05399 ascompat = rb_enc_asciicompat(enc); 05400 s = t = RSTRING_PTR(str); 05401 send = RSTRING_END(str); 05402 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; 05403 while (s < send) { 05404 unsigned int c; 05405 int clen; 05406 05407 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 05408 if (squeez[c]) { 05409 modify = 1; 05410 } 05411 else { 05412 if (t != s) *t = c; 05413 t++; 05414 } 05415 s++; 05416 } 05417 else { 05418 c = rb_enc_codepoint_len(s, send, &clen, enc); 05419 05420 if (tr_find(c, squeez, del, nodel)) { 05421 modify = 1; 05422 } 05423 else { 05424 if (t != s) rb_enc_mbcput(c, t, enc); 05425 t += clen; 05426 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID; 05427 } 05428 s += clen; 05429 } 05430 } 05431 *t = '\0'; 05432 STR_SET_LEN(str, t - RSTRING_PTR(str)); 05433 ENC_CODERANGE_SET(str, cr); 05434 05435 if (modify) return str; 05436 return Qnil; 05437 } 05438 05439 05440 /* 05441 * call-seq: 05442 * str.delete([other_str]+) -> new_str 05443 * 05444 * Returns a copy of <i>str</i> with all characters in the intersection of its 05445 * arguments deleted. Uses the same rules for building the set of characters as 05446 * <code>String#count</code>. 
05447 * 05448 * "hello".delete "l","lo" #=> "heo" 05449 * "hello".delete "lo" #=> "he" 05450 * "hello".delete "aeiou", "^e" #=> "hell" 05451 * "hello".delete "ej-m" #=> "ho" 05452 */ 05453 05454 static VALUE 05455 rb_str_delete(int argc, VALUE *argv, VALUE str) 05456 { 05457 str = rb_str_dup(str); 05458 rb_str_delete_bang(argc, argv, str); 05459 return str; 05460 } 05461 05462 05463 /* 05464 * call-seq: 05465 * str.squeeze!([other_str]*) -> str or nil 05466 * 05467 * Squeezes <i>str</i> in place, returning either <i>str</i>, or 05468 * <code>nil</code> if no changes were made. 05469 */ 05470 05471 static VALUE 05472 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str) 05473 { 05474 char squeez[TR_TABLE_SIZE]; 05475 rb_encoding *enc = 0; 05476 VALUE del = 0, nodel = 0; 05477 char *s, *send, *t; 05478 int i, modify = 0; 05479 int ascompat, singlebyte = single_byte_optimizable(str); 05480 unsigned int save; 05481 05482 if (argc == 0) { 05483 enc = STR_ENC_GET(str); 05484 } 05485 else { 05486 for (i=0; i<argc; i++) { 05487 VALUE s = argv[i]; 05488 05489 StringValue(s); 05490 enc = rb_enc_check(str, s); 05491 if (singlebyte && !single_byte_optimizable(s)) 05492 singlebyte = 0; 05493 tr_setup_table(s, squeez, i==0, &del, &nodel, enc); 05494 } 05495 } 05496 05497 str_modify_keep_cr(str); 05498 s = t = RSTRING_PTR(str); 05499 if (!s || RSTRING_LEN(str) == 0) return Qnil; 05500 send = RSTRING_END(str); 05501 save = -1; 05502 ascompat = rb_enc_asciicompat(enc); 05503 05504 if (singlebyte) { 05505 while (s < send) { 05506 unsigned int c = *(unsigned char*)s++; 05507 if (c != save || (argc > 0 && !squeez[c])) { 05508 *t++ = save = c; 05509 } 05510 } 05511 } else { 05512 while (s < send) { 05513 unsigned int c; 05514 int clen; 05515 05516 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 05517 if (c != save || (argc > 0 && !squeez[c])) { 05518 *t++ = save = c; 05519 } 05520 s++; 05521 } 05522 else { 05523 c = rb_enc_codepoint_len(s, send, &clen, enc); 05524 05525 if (c != 
save || (argc > 0 && !tr_find(c, squeez, del, nodel))) { 05526 if (t != s) rb_enc_mbcput(c, t, enc); 05527 save = c; 05528 t += clen; 05529 } 05530 s += clen; 05531 } 05532 } 05533 } 05534 05535 *t = '\0'; 05536 if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) { 05537 STR_SET_LEN(str, t - RSTRING_PTR(str)); 05538 modify = 1; 05539 } 05540 05541 if (modify) return str; 05542 return Qnil; 05543 } 05544 05545 05546 /* 05547 * call-seq: 05548 * str.squeeze([other_str]*) -> new_str 05549 * 05550 * Builds a set of characters from the <i>other_str</i> parameter(s) using the 05551 * procedure described for <code>String#count</code>. Returns a new string 05552 * where runs of the same character that occur in this set are replaced by a 05553 * single character. If no arguments are given, all runs of identical 05554 * characters are replaced by a single character. 05555 * 05556 * "yellow moon".squeeze #=> "yelow mon" 05557 * " now is the".squeeze(" ") #=> " now is the" 05558 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls" 05559 */ 05560 05561 static VALUE 05562 rb_str_squeeze(int argc, VALUE *argv, VALUE str) 05563 { 05564 str = rb_str_dup(str); 05565 rb_str_squeeze_bang(argc, argv, str); 05566 return str; 05567 } 05568 05569 05570 /* 05571 * call-seq: 05572 * str.tr_s!(from_str, to_str) -> str or nil 05573 * 05574 * Performs <code>String#tr_s</code> processing on <i>str</i> in place, 05575 * returning <i>str</i>, or <code>nil</code> if no changes were made. 05576 */ 05577 05578 static VALUE 05579 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl) 05580 { 05581 return tr_trans(str, src, repl, 1); 05582 } 05583 05584 05585 /* 05586 * call-seq: 05587 * str.tr_s(from_str, to_str) -> new_str 05588 * 05589 * Processes a copy of <i>str</i> as described under <code>String#tr</code>, 05590 * then removes duplicate characters in regions that were affected by the 05591 * translation. 
05592 * 05593 * "hello".tr_s('l', 'r') #=> "hero" 05594 * "hello".tr_s('el', '*') #=> "h*o" 05595 * "hello".tr_s('el', 'hx') #=> "hhxo" 05596 */ 05597 05598 static VALUE 05599 rb_str_tr_s(VALUE str, VALUE src, VALUE repl) 05600 { 05601 str = rb_str_dup(str); 05602 tr_trans(str, src, repl, 1); 05603 return str; 05604 } 05605 05606 05607 /* 05608 * call-seq: 05609 * str.count([other_str]+) -> fixnum 05610 * 05611 * Each <i>other_str</i> parameter defines a set of characters to count. The 05612 * intersection of these sets defines the characters to count in 05613 * <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is 05614 * negated. The sequence c1--c2 means all characters between c1 and c2. 05615 * 05616 * a = "hello world" 05617 * a.count "lo" #=> 5 05618 * a.count "lo", "o" #=> 2 05619 * a.count "hello", "^l" #=> 4 05620 * a.count "ej-m" #=> 4 05621 */ 05622 05623 static VALUE 05624 rb_str_count(int argc, VALUE *argv, VALUE str) 05625 { 05626 char table[TR_TABLE_SIZE]; 05627 rb_encoding *enc = 0; 05628 VALUE del = 0, nodel = 0; 05629 char *s, *send; 05630 int i; 05631 int ascompat; 05632 05633 if (argc < 1) { 05634 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)"); 05635 } 05636 for (i=0; i<argc; i++) { 05637 VALUE tstr = argv[i]; 05638 unsigned char c; 05639 05640 StringValue(tstr); 05641 enc = rb_enc_check(str, tstr); 05642 if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) && 05643 (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) { 05644 int n = 0; 05645 05646 s = RSTRING_PTR(str); 05647 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0); 05648 send = RSTRING_END(str); 05649 while (s < send) { 05650 if (*(unsigned char*)s++ == c) n++; 05651 } 05652 return INT2NUM(n); 05653 } 05654 tr_setup_table(tstr, table, i==0, &del, &nodel, enc); 05655 } 05656 05657 s = RSTRING_PTR(str); 05658 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0); 05659 send = RSTRING_END(str); 05660 ascompat = rb_enc_asciicompat(enc); 
05661 i = 0; 05662 while (s < send) { 05663 unsigned int c; 05664 05665 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 05666 if (table[c]) { 05667 i++; 05668 } 05669 s++; 05670 } 05671 else { 05672 int clen; 05673 c = rb_enc_codepoint_len(s, send, &clen, enc); 05674 if (tr_find(c, table, del, nodel)) { 05675 i++; 05676 } 05677 s += clen; 05678 } 05679 } 05680 05681 return INT2NUM(i); 05682 } 05683 05684 static const char isspacetable[256] = { 05685 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 05686 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05687 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05688 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05689 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05690 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05691 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05692 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05693 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05694 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05695 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05696 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05697 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05698 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05699 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05700 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 05701 }; 05702 05703 #define ascii_isspace(c) isspacetable[(unsigned char)(c)] 05704 05705 /* 05706 * call-seq: 05707 * str.split(pattern=$;, [limit]) -> anArray 05708 * 05709 * Divides <i>str</i> into substrings based on a delimiter, returning an array 05710 * of these substrings. 05711 * 05712 * If <i>pattern</i> is a <code>String</code>, then its contents are used as 05713 * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single 05714 * space, <i>str</i> is split on whitespace, with leading whitespace and runs 05715 * of contiguous whitespace characters ignored. 
05716 * 05717 * If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the 05718 * pattern matches. Whenever the pattern matches a zero-length string, 05719 * <i>str</i> is split into individual characters. If <i>pattern</i> contains 05720 * groups, the respective matches will be returned in the array as well. 05721 * 05722 * If <i>pattern</i> is omitted, the value of <code>$;</code> is used. If 05723 * <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is 05724 * split on whitespace as if ` ' were specified. 05725 * 05726 * If the <i>limit</i> parameter is omitted, trailing null fields are 05727 * suppressed. If <i>limit</i> is a positive number, at most that number of 05728 * fields will be returned (if <i>limit</i> is <code>1</code>, the entire 05729 * string is returned as the only entry in an array). If negative, there is no 05730 * limit to the number of fields returned, and trailing null fields are not 05731 * suppressed. 05732 * 05733 * " now's the time".split #=> ["now's", "the", "time"] 05734 * " now's the time".split(' ') #=> ["now's", "the", "time"] 05735 * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"] 05736 * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"] 05737 * "hello".split(//) #=> ["h", "e", "l", "l", "o"] 05738 * "hello".split(//, 3) #=> ["h", "e", "llo"] 05739 * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"] 05740 * 05741 * "mellow yellow".split("ello") #=> ["m", "w y", "w"] 05742 * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"] 05743 * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"] 05744 * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""] 05745 */ 05746 05747 static VALUE 05748 rb_str_split_m(int argc, VALUE *argv, VALUE str) 05749 { 05750 rb_encoding *enc; 05751 VALUE spat; 05752 VALUE limit; 05753 enum {awk, string, regexp} split_type; 05754 long beg, end, i = 0; 05755 int lim = 0; 05756 VALUE result, tmp; 05757 05758 if 
(rb_scan_args(argc, argv, "02", &spat, &limit) == 2) { 05759 lim = NUM2INT(limit); 05760 if (lim <= 0) limit = Qnil; 05761 else if (lim == 1) { 05762 if (RSTRING_LEN(str) == 0) 05763 return rb_ary_new2(0); 05764 return rb_ary_new3(1, str); 05765 } 05766 i = 1; 05767 } 05768 05769 enc = STR_ENC_GET(str); 05770 if (NIL_P(spat)) { 05771 if (!NIL_P(rb_fs)) { 05772 spat = rb_fs; 05773 goto fs_set; 05774 } 05775 split_type = awk; 05776 } 05777 else { 05778 fs_set: 05779 if (TYPE(spat) == T_STRING) { 05780 rb_encoding *enc2 = STR_ENC_GET(spat); 05781 05782 split_type = string; 05783 if (RSTRING_LEN(spat) == 0) { 05784 /* Special case - split into chars */ 05785 spat = rb_reg_regcomp(spat); 05786 split_type = regexp; 05787 } 05788 else if (rb_enc_asciicompat(enc2) == 1) { 05789 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){ 05790 split_type = awk; 05791 } 05792 } 05793 else { 05794 int l; 05795 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' && 05796 RSTRING_LEN(spat) == l) { 05797 split_type = awk; 05798 } 05799 } 05800 } 05801 else { 05802 spat = get_pat(spat, 1); 05803 split_type = regexp; 05804 } 05805 } 05806 05807 result = rb_ary_new(); 05808 beg = 0; 05809 if (split_type == awk) { 05810 char *ptr = RSTRING_PTR(str); 05811 char *eptr = RSTRING_END(str); 05812 char *bptr = ptr; 05813 int skip = 1; 05814 unsigned int c; 05815 05816 end = beg; 05817 if (is_ascii_string(str)) { 05818 while (ptr < eptr) { 05819 c = (unsigned char)*ptr++; 05820 if (skip) { 05821 if (ascii_isspace(c)) { 05822 beg = ptr - bptr; 05823 } 05824 else { 05825 end = ptr - bptr; 05826 skip = 0; 05827 if (!NIL_P(limit) && lim <= i) break; 05828 } 05829 } 05830 else if (ascii_isspace(c)) { 05831 rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); 05832 skip = 1; 05833 beg = ptr - bptr; 05834 if (!NIL_P(limit)) ++i; 05835 } 05836 else { 05837 end = ptr - bptr; 05838 } 05839 } 05840 } 05841 else { 05842 while (ptr < eptr) { 05843 int n; 05844 05845 c = 
rb_enc_codepoint_len(ptr, eptr, &n, enc); 05846 ptr += n; 05847 if (skip) { 05848 if (rb_isspace(c)) { 05849 beg = ptr - bptr; 05850 } 05851 else { 05852 end = ptr - bptr; 05853 skip = 0; 05854 if (!NIL_P(limit) && lim <= i) break; 05855 } 05856 } 05857 else if (rb_isspace(c)) { 05858 rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); 05859 skip = 1; 05860 beg = ptr - bptr; 05861 if (!NIL_P(limit)) ++i; 05862 } 05863 else { 05864 end = ptr - bptr; 05865 } 05866 } 05867 } 05868 } 05869 else if (split_type == string) { 05870 char *ptr = RSTRING_PTR(str); 05871 char *temp = ptr; 05872 char *eptr = RSTRING_END(str); 05873 char *sptr = RSTRING_PTR(spat); 05874 long slen = RSTRING_LEN(spat); 05875 05876 if (is_broken_string(str)) { 05877 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str))); 05878 } 05879 if (is_broken_string(spat)) { 05880 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat))); 05881 } 05882 enc = rb_enc_check(str, spat); 05883 while (ptr < eptr && 05884 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) { 05885 /* Check we are at the start of a char */ 05886 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc); 05887 if (t != ptr + end) { 05888 ptr = t; 05889 continue; 05890 } 05891 rb_ary_push(result, rb_str_subseq(str, ptr - temp, end)); 05892 ptr += end + slen; 05893 if (!NIL_P(limit) && lim <= ++i) break; 05894 } 05895 beg = ptr - temp; 05896 } 05897 else { 05898 char *ptr = RSTRING_PTR(str); 05899 long len = RSTRING_LEN(str); 05900 long start = beg; 05901 long idx; 05902 int last_null = 0; 05903 struct re_registers *regs; 05904 05905 while ((end = rb_reg_search(spat, str, start, 0)) >= 0) { 05906 regs = RMATCH_REGS(rb_backref_get()); 05907 if (start == end && BEG(0) == END(0)) { 05908 if (!ptr) { 05909 rb_ary_push(result, str_new_empty(str)); 05910 break; 05911 } 05912 else if (last_null == 1) { 05913 rb_ary_push(result, rb_str_subseq(str, beg, 05914 
rb_enc_fast_mbclen(ptr+beg, 05915 ptr+len, 05916 enc))); 05917 beg = start; 05918 } 05919 else { 05920 if (ptr+start == ptr+len) 05921 start++; 05922 else 05923 start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc); 05924 last_null = 1; 05925 continue; 05926 } 05927 } 05928 else { 05929 rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); 05930 beg = start = END(0); 05931 } 05932 last_null = 0; 05933 05934 for (idx=1; idx < regs->num_regs; idx++) { 05935 if (BEG(idx) == -1) continue; 05936 if (BEG(idx) == END(idx)) 05937 tmp = str_new_empty(str); 05938 else 05939 tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx)); 05940 rb_ary_push(result, tmp); 05941 } 05942 if (!NIL_P(limit) && lim <= ++i) break; 05943 } 05944 } 05945 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) { 05946 if (RSTRING_LEN(str) == beg) 05947 tmp = str_new_empty(str); 05948 else 05949 tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg); 05950 rb_ary_push(result, tmp); 05951 } 05952 if (NIL_P(limit) && lim == 0) { 05953 long len; 05954 while ((len = RARRAY_LEN(result)) > 0 && 05955 (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0)) 05956 rb_ary_pop(result); 05957 } 05958 05959 return result; 05960 } 05961 05962 VALUE 05963 rb_str_split(VALUE str, const char *sep0) 05964 { 05965 VALUE sep; 05966 05967 StringValue(str); 05968 sep = rb_str_new2(sep0); 05969 return rb_str_split_m(1, &sep, str); 05970 } 05971 05972 05973 /* 05974 * call-seq: 05975 * str.each_line(separator=$/) {|substr| block } -> str 05976 * str.each_line(separator=$/) -> an_enumerator 05977 * 05978 * str.lines(separator=$/) {|substr| block } -> str 05979 * str.lines(separator=$/) -> an_enumerator 05980 * 05981 * Splits <i>str</i> using the supplied parameter as the record separator 05982 * (<code>$/</code> by default), passing each substring in turn to the supplied 05983 * block. 
If a zero-length record separator is supplied, the string is split 05984 * into paragraphs delimited by multiple successive newlines. 05985 * 05986 * If no block is given, an enumerator is returned instead. 05987 * 05988 * print "Example one\n" 05989 * "hello\nworld".each_line {|s| p s} 05990 * print "Example two\n" 05991 * "hello\nworld".each_line('l') {|s| p s} 05992 * print "Example three\n" 05993 * "hello\n\n\nworld".each_line('') {|s| p s} 05994 * 05995 * <em>produces:</em> 05996 * 05997 * Example one 05998 * "hello\n" 05999 * "world" 06000 * Example two 06001 * "hel" 06002 * "l" 06003 * "o\nworl" 06004 * "d" 06005 * Example three 06006 * "hello\n\n\n" 06007 * "world" 06008 */ 06009 06010 static VALUE 06011 rb_str_each_line(int argc, VALUE *argv, VALUE str) 06012 { 06013 rb_encoding *enc; 06014 VALUE rs; 06015 unsigned int newline; 06016 const char *p, *pend, *s, *ptr; 06017 long len, rslen; 06018 VALUE line; 06019 int n; 06020 VALUE orig = str; 06021 06022 if (argc == 0) { 06023 rs = rb_rs; 06024 } 06025 else { 06026 rb_scan_args(argc, argv, "01", &rs); 06027 } 06028 RETURN_ENUMERATOR(str, argc, argv); 06029 if (NIL_P(rs)) { 06030 rb_yield(str); 06031 return orig; 06032 } 06033 str = rb_str_new4(str); 06034 ptr = p = s = RSTRING_PTR(str); 06035 pend = p + RSTRING_LEN(str); 06036 len = RSTRING_LEN(str); 06037 StringValue(rs); 06038 if (rs == rb_default_rs) { 06039 enc = rb_enc_get(str); 06040 while (p < pend) { 06041 char *p0; 06042 06043 p = memchr(p, '\n', pend - p); 06044 if (!p) break; 06045 p0 = rb_enc_left_char_head(s, p, pend, enc); 06046 if (!rb_enc_is_newline(p0, pend, enc)) { 06047 p++; 06048 continue; 06049 } 06050 p = p0 + rb_enc_mbclen(p0, pend, enc); 06051 line = rb_str_new5(str, s, p - s); 06052 OBJ_INFECT(line, str); 06053 rb_enc_cr_str_copy_for_substr(line, str); 06054 rb_yield(line); 06055 str_mod_check(str, ptr, len); 06056 s = p; 06057 } 06058 goto finish; 06059 } 06060 06061 enc = rb_enc_check(str, rs); 06062 rslen = RSTRING_LEN(rs); 06063 
if (rslen == 0) { 06064 newline = '\n'; 06065 } 06066 else { 06067 newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc); 06068 } 06069 06070 while (p < pend) { 06071 unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc); 06072 06073 again: 06074 if (rslen == 0 && c == newline) { 06075 p += n; 06076 if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) { 06077 goto again; 06078 } 06079 while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) { 06080 p += n; 06081 } 06082 p -= n; 06083 } 06084 if (c == newline && 06085 (rslen <= 1 || 06086 (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) { 06087 line = rb_str_new5(str, s, p - s + (rslen ? rslen : n)); 06088 OBJ_INFECT(line, str); 06089 rb_enc_cr_str_copy_for_substr(line, str); 06090 rb_yield(line); 06091 str_mod_check(str, ptr, len); 06092 s = p + (rslen ? rslen : n); 06093 } 06094 p += n; 06095 } 06096 06097 finish: 06098 if (s != pend) { 06099 line = rb_str_new5(str, s, pend - s); 06100 OBJ_INFECT(line, str); 06101 rb_enc_cr_str_copy_for_substr(line, str); 06102 rb_yield(line); 06103 } 06104 06105 return orig; 06106 } 06107 06108 06109 /* 06110 * call-seq: 06111 * str.bytes {|fixnum| block } -> str 06112 * str.bytes -> an_enumerator 06113 * 06114 * str.each_byte {|fixnum| block } -> str 06115 * str.each_byte -> an_enumerator 06116 * 06117 * Passes each byte in <i>str</i> to the given block, or returns 06118 * an enumerator if no block is given. 
06119 * 06120 * "hello".each_byte {|c| print c, ' ' } 06121 * 06122 * <em>produces:</em> 06123 * 06124 * 104 101 108 108 111 06125 */ 06126 06127 static VALUE 06128 rb_str_each_byte(VALUE str) 06129 { 06130 long i; 06131 06132 RETURN_ENUMERATOR(str, 0, 0); 06133 for (i=0; i<RSTRING_LEN(str); i++) { 06134 rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff)); 06135 } 06136 return str; 06137 } 06138 06139 06140 /* 06141 * call-seq: 06142 * str.chars {|cstr| block } -> str 06143 * str.chars -> an_enumerator 06144 * 06145 * str.each_char {|cstr| block } -> str 06146 * str.each_char -> an_enumerator 06147 * 06148 * Passes each character in <i>str</i> to the given block, or returns 06149 * an enumerator if no block is given. 06150 * 06151 * "hello".each_char {|c| print c, ' ' } 06152 * 06153 * <em>produces:</em> 06154 * 06155 * h e l l o 06156 */ 06157 06158 static VALUE 06159 rb_str_each_char(VALUE str) 06160 { 06161 VALUE orig = str; 06162 long i, len, n; 06163 const char *ptr; 06164 rb_encoding *enc; 06165 06166 RETURN_ENUMERATOR(str, 0, 0); 06167 str = rb_str_new4(str); 06168 ptr = RSTRING_PTR(str); 06169 len = RSTRING_LEN(str); 06170 enc = rb_enc_get(str); 06171 switch (ENC_CODERANGE(str)) { 06172 case ENC_CODERANGE_VALID: 06173 case ENC_CODERANGE_7BIT: 06174 for (i = 0; i < len; i += n) { 06175 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc); 06176 rb_yield(rb_str_subseq(str, i, n)); 06177 } 06178 break; 06179 default: 06180 for (i = 0; i < len; i += n) { 06181 n = rb_enc_mbclen(ptr + i, ptr + len, enc); 06182 rb_yield(rb_str_subseq(str, i, n)); 06183 } 06184 } 06185 return orig; 06186 } 06187 06188 /* 06189 * call-seq: 06190 * str.codepoints {|integer| block } -> str 06191 * str.codepoints -> an_enumerator 06192 * 06193 * str.each_codepoint {|integer| block } -> str 06194 * str.each_codepoint -> an_enumerator 06195 * 06196 * Passes the <code>Integer</code> ordinal of each character in <i>str</i>, 06197 * also known as a <i>codepoint</i> when applied to Unicode strings to 
the 06198 * given block. 06199 * 06200 * If no block is given, an enumerator is returned instead. 06201 * 06202 * "hello\u0639".each_codepoint {|c| print c, ' ' } 06203 * 06204 * <em>produces:</em> 06205 * 06206 * 104 101 108 108 111 1593 06207 */ 06208 06209 static VALUE 06210 rb_str_each_codepoint(VALUE str) 06211 { 06212 VALUE orig = str; 06213 int n; 06214 unsigned int c; 06215 const char *ptr, *end; 06216 rb_encoding *enc; 06217 06218 if (single_byte_optimizable(str)) return rb_str_each_byte(str); 06219 RETURN_ENUMERATOR(str, 0, 0); 06220 str = rb_str_new4(str); 06221 ptr = RSTRING_PTR(str); 06222 end = RSTRING_END(str); 06223 enc = STR_ENC_GET(str); 06224 while (ptr < end) { 06225 c = rb_enc_codepoint_len(ptr, end, &n, enc); 06226 rb_yield(UINT2NUM(c)); 06227 ptr += n; 06228 } 06229 return orig; 06230 } 06231 06232 static long 06233 chopped_length(VALUE str) 06234 { 06235 rb_encoding *enc = STR_ENC_GET(str); 06236 const char *p, *p2, *beg, *end; 06237 06238 beg = RSTRING_PTR(str); 06239 end = beg + RSTRING_LEN(str); 06240 if (beg > end) return 0; 06241 p = rb_enc_prev_char(beg, end, end, enc); 06242 if (!p) return 0; 06243 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') { 06244 p2 = rb_enc_prev_char(beg, p, end, enc); 06245 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2; 06246 } 06247 return p - beg; 06248 } 06249 06250 /* 06251 * call-seq: 06252 * str.chop! -> str or nil 06253 * 06254 * Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>, 06255 * or <code>nil</code> if <i>str</i> is the empty string. See also 06256 * <code>String#chomp!</code>. 
06257 */ 06258 06259 static VALUE 06260 rb_str_chop_bang(VALUE str) 06261 { 06262 str_modify_keep_cr(str); 06263 if (RSTRING_LEN(str) > 0) { 06264 long len; 06265 len = chopped_length(str); 06266 STR_SET_LEN(str, len); 06267 RSTRING_PTR(str)[len] = '\0'; 06268 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) { 06269 ENC_CODERANGE_CLEAR(str); 06270 } 06271 return str; 06272 } 06273 return Qnil; 06274 } 06275 06276 06277 /* 06278 * call-seq: 06279 * str.chop -> new_str 06280 * 06281 * Returns a new <code>String</code> with the last character removed. If the 06282 * string ends with <code>\r\n</code>, both characters are removed. Applying 06283 * <code>chop</code> to an empty string returns an empty 06284 * string. <code>String#chomp</code> is often a safer alternative, as it leaves 06285 * the string unchanged if it doesn't end in a record separator. 06286 * 06287 * "string\r\n".chop #=> "string" 06288 * "string\n\r".chop #=> "string\n" 06289 * "string\n".chop #=> "string" 06290 * "string".chop #=> "strin" 06291 * "x".chop.chop #=> "" 06292 */ 06293 06294 static VALUE 06295 rb_str_chop(VALUE str) 06296 { 06297 VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str)); 06298 rb_enc_cr_str_copy_for_substr(str2, str); 06299 OBJ_INFECT(str2, str); 06300 return str2; 06301 } 06302 06303 06304 /* 06305 * call-seq: 06306 * str.chomp!(separator=$/) -> str or nil 06307 * 06308 * Modifies <i>str</i> in place as described for <code>String#chomp</code>, 06309 * returning <i>str</i>, or <code>nil</code> if no modifications were made. 
06310 */ 06311 06312 static VALUE 06313 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str) 06314 { 06315 rb_encoding *enc; 06316 VALUE rs; 06317 int newline; 06318 char *p, *pp, *e; 06319 long len, rslen; 06320 06321 str_modify_keep_cr(str); 06322 len = RSTRING_LEN(str); 06323 if (len == 0) return Qnil; 06324 p = RSTRING_PTR(str); 06325 e = p + len; 06326 if (argc == 0) { 06327 rs = rb_rs; 06328 if (rs == rb_default_rs) { 06329 smart_chomp: 06330 enc = rb_enc_get(str); 06331 if (rb_enc_mbminlen(enc) > 1) { 06332 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc); 06333 if (rb_enc_is_newline(pp, e, enc)) { 06334 e = pp; 06335 } 06336 pp = e - rb_enc_mbminlen(enc); 06337 if (pp >= p) { 06338 pp = rb_enc_left_char_head(p, pp, e, enc); 06339 if (rb_enc_ascget(pp, e, 0, enc) == '\r') { 06340 e = pp; 06341 } 06342 } 06343 if (e == RSTRING_END(str)) { 06344 return Qnil; 06345 } 06346 len = e - RSTRING_PTR(str); 06347 STR_SET_LEN(str, len); 06348 } 06349 else { 06350 if (RSTRING_PTR(str)[len-1] == '\n') { 06351 STR_DEC_LEN(str); 06352 if (RSTRING_LEN(str) > 0 && 06353 RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') { 06354 STR_DEC_LEN(str); 06355 } 06356 } 06357 else if (RSTRING_PTR(str)[len-1] == '\r') { 06358 STR_DEC_LEN(str); 06359 } 06360 else { 06361 return Qnil; 06362 } 06363 } 06364 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 06365 return str; 06366 } 06367 } 06368 else { 06369 rb_scan_args(argc, argv, "01", &rs); 06370 } 06371 if (NIL_P(rs)) return Qnil; 06372 StringValue(rs); 06373 rslen = RSTRING_LEN(rs); 06374 if (rslen == 0) { 06375 while (len>0 && p[len-1] == '\n') { 06376 len--; 06377 if (len>0 && p[len-1] == '\r') 06378 len--; 06379 } 06380 if (len < RSTRING_LEN(str)) { 06381 STR_SET_LEN(str, len); 06382 RSTRING_PTR(str)[len] = '\0'; 06383 return str; 06384 } 06385 return Qnil; 06386 } 06387 if (rslen > len) return Qnil; 06388 newline = RSTRING_PTR(rs)[rslen-1]; 06389 if (rslen == 1 && newline == '\n') 06390 goto smart_chomp; 06391 06392 enc = 
rb_enc_check(str, rs); 06393 if (is_broken_string(rs)) { 06394 return Qnil; 06395 } 06396 pp = e - rslen; 06397 if (p[len-1] == newline && 06398 (rslen <= 1 || 06399 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) { 06400 if (rb_enc_left_char_head(p, pp, e, enc) != pp) 06401 return Qnil; 06402 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) { 06403 ENC_CODERANGE_CLEAR(str); 06404 } 06405 STR_SET_LEN(str, RSTRING_LEN(str) - rslen); 06406 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 06407 return str; 06408 } 06409 return Qnil; 06410 } 06411 06412 06413 /* 06414 * call-seq: 06415 * str.chomp(separator=$/) -> new_str 06416 * 06417 * Returns a new <code>String</code> with the given record separator removed 06418 * from the end of <i>str</i> (if present). If <code>$/</code> has not been 06419 * changed from the default Ruby record separator, then <code>chomp</code> also 06420 * removes carriage return characters (that is it will remove <code>\n</code>, 06421 * <code>\r</code>, and <code>\r\n</code>). 06422 * 06423 * "hello".chomp #=> "hello" 06424 * "hello\n".chomp #=> "hello" 06425 * "hello\r\n".chomp #=> "hello" 06426 * "hello\n\r".chomp #=> "hello\n" 06427 * "hello\r".chomp #=> "hello" 06428 * "hello \n there".chomp #=> "hello \n there" 06429 * "hello".chomp("llo") #=> "he" 06430 */ 06431 06432 static VALUE 06433 rb_str_chomp(int argc, VALUE *argv, VALUE str) 06434 { 06435 str = rb_str_dup(str); 06436 rb_str_chomp_bang(argc, argv, str); 06437 return str; 06438 } 06439 06440 /* 06441 * call-seq: 06442 * str.lstrip! -> self or nil 06443 * 06444 * Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no 06445 * change was made. See also <code>String#rstrip!</code> and 06446 * <code>String#strip!</code>. 06447 * 06448 * " hello ".lstrip #=> "hello " 06449 * "hello".lstrip! 
#=> nil 06450 */ 06451 06452 static VALUE 06453 rb_str_lstrip_bang(VALUE str) 06454 { 06455 rb_encoding *enc; 06456 char *s, *t, *e; 06457 06458 str_modify_keep_cr(str); 06459 enc = STR_ENC_GET(str); 06460 s = RSTRING_PTR(str); 06461 if (!s || RSTRING_LEN(str) == 0) return Qnil; 06462 e = t = RSTRING_END(str); 06463 /* remove spaces at head */ 06464 while (s < e) { 06465 int n; 06466 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc); 06467 06468 if (!rb_isspace(cc)) break; 06469 s += n; 06470 } 06471 06472 if (s > RSTRING_PTR(str)) { 06473 STR_SET_LEN(str, t-s); 06474 memmove(RSTRING_PTR(str), s, RSTRING_LEN(str)); 06475 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 06476 return str; 06477 } 06478 return Qnil; 06479 } 06480 06481 06482 /* 06483 * call-seq: 06484 * str.lstrip -> new_str 06485 * 06486 * Returns a copy of <i>str</i> with leading whitespace removed. See also 06487 * <code>String#rstrip</code> and <code>String#strip</code>. 06488 * 06489 * " hello ".lstrip #=> "hello " 06490 * "hello".lstrip #=> "hello" 06491 */ 06492 06493 static VALUE 06494 rb_str_lstrip(VALUE str) 06495 { 06496 str = rb_str_dup(str); 06497 rb_str_lstrip_bang(str); 06498 return str; 06499 } 06500 06501 06502 /* 06503 * call-seq: 06504 * str.rstrip! -> self or nil 06505 * 06506 * Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if 06507 * no change was made. See also <code>String#lstrip!</code> and 06508 * <code>String#strip!</code>. 06509 * 06510 * " hello ".rstrip #=> " hello" 06511 * "hello".rstrip! 
#=> nil 06512 */ 06513 06514 static VALUE 06515 rb_str_rstrip_bang(VALUE str) 06516 { 06517 rb_encoding *enc; 06518 char *s, *t, *e; 06519 06520 str_modify_keep_cr(str); 06521 enc = STR_ENC_GET(str); 06522 rb_str_check_dummy_enc(enc); 06523 s = RSTRING_PTR(str); 06524 if (!s || RSTRING_LEN(str) == 0) return Qnil; 06525 t = e = RSTRING_END(str); 06526 06527 /* remove trailing spaces or '\0's */ 06528 if (single_byte_optimizable(str)) { 06529 unsigned char c; 06530 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--; 06531 } 06532 else { 06533 char *tp; 06534 06535 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) { 06536 unsigned int c = rb_enc_codepoint(tp, e, enc); 06537 if (c && !rb_isspace(c)) break; 06538 t = tp; 06539 } 06540 } 06541 if (t < e) { 06542 long len = t-RSTRING_PTR(str); 06543 06544 STR_SET_LEN(str, len); 06545 RSTRING_PTR(str)[len] = '\0'; 06546 return str; 06547 } 06548 return Qnil; 06549 } 06550 06551 06552 /* 06553 * call-seq: 06554 * str.rstrip -> new_str 06555 * 06556 * Returns a copy of <i>str</i> with trailing whitespace removed. See also 06557 * <code>String#lstrip</code> and <code>String#strip</code>. 06558 * 06559 * " hello ".rstrip #=> " hello" 06560 * "hello".rstrip #=> "hello" 06561 */ 06562 06563 static VALUE 06564 rb_str_rstrip(VALUE str) 06565 { 06566 str = rb_str_dup(str); 06567 rb_str_rstrip_bang(str); 06568 return str; 06569 } 06570 06571 06572 /* 06573 * call-seq: 06574 * str.strip! -> str or nil 06575 * 06576 * Removes leading and trailing whitespace from <i>str</i>. Returns 06577 * <code>nil</code> if <i>str</i> was not altered. 
06578 */ 06579 06580 static VALUE 06581 rb_str_strip_bang(VALUE str) 06582 { 06583 VALUE l = rb_str_lstrip_bang(str); 06584 VALUE r = rb_str_rstrip_bang(str); 06585 06586 if (NIL_P(l) && NIL_P(r)) return Qnil; 06587 return str; 06588 } 06589 06590 06591 /* 06592 * call-seq: 06593 * str.strip -> new_str 06594 * 06595 * Returns a copy of <i>str</i> with leading and trailing whitespace removed. 06596 * 06597 * " hello ".strip #=> "hello" 06598 * "\tgoodbye\r\n".strip #=> "goodbye" 06599 */ 06600 06601 static VALUE 06602 rb_str_strip(VALUE str) 06603 { 06604 str = rb_str_dup(str); 06605 rb_str_strip_bang(str); 06606 return str; 06607 } 06608 06609 static VALUE 06610 scan_once(VALUE str, VALUE pat, long *start) 06611 { 06612 VALUE result, match; 06613 struct re_registers *regs; 06614 int i; 06615 06616 if (rb_reg_search(pat, str, *start, 0) >= 0) { 06617 match = rb_backref_get(); 06618 regs = RMATCH_REGS(match); 06619 if (BEG(0) == END(0)) { 06620 rb_encoding *enc = STR_ENC_GET(str); 06621 /* 06622 * Always consume at least one character of the input string 06623 */ 06624 if (RSTRING_LEN(str) > END(0)) 06625 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0), 06626 RSTRING_END(str), enc); 06627 else 06628 *start = END(0)+1; 06629 } 06630 else { 06631 *start = END(0); 06632 } 06633 if (regs->num_regs == 1) { 06634 return rb_reg_nth_match(0, match); 06635 } 06636 result = rb_ary_new2(regs->num_regs); 06637 for (i=1; i < regs->num_regs; i++) { 06638 rb_ary_push(result, rb_reg_nth_match(i, match)); 06639 } 06640 06641 return result; 06642 } 06643 return Qnil; 06644 } 06645 06646 06647 /* 06648 * call-seq: 06649 * str.scan(pattern) -> array 06650 * str.scan(pattern) {|match, ...| block } -> str 06651 * 06652 * Both forms iterate through <i>str</i>, matching the pattern (which may be a 06653 * <code>Regexp</code> or a <code>String</code>). For each match, a result is 06654 * generated and either added to the result array or passed to the block. 
If 06655 * the pattern contains no groups, each individual result consists of the 06656 * matched string, <code>$&</code>. If the pattern contains groups, each 06657 * individual result is itself an array containing one entry per group. 06658 * 06659 * a = "cruel world" 06660 * a.scan(/\w+/) #=> ["cruel", "world"] 06661 * a.scan(/.../) #=> ["cru", "el ", "wor"] 06662 * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]] 06663 * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]] 06664 * 06665 * And the block form: 06666 * 06667 * a.scan(/\w+/) {|w| print "<<#{w}>> " } 06668 * print "\n" 06669 * a.scan(/(.)(.)/) {|x,y| print y, x } 06670 * print "\n" 06671 * 06672 * <em>produces:</em> 06673 * 06674 * <<cruel>> <<world>> 06675 * rceu lowlr 06676 */ 06677 06678 static VALUE 06679 rb_str_scan(VALUE str, VALUE pat) 06680 { 06681 VALUE result; 06682 long start = 0; 06683 long last = -1, prev = 0; 06684 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str); 06685 06686 pat = get_pat(pat, 1); 06687 if (!rb_block_given_p()) { 06688 VALUE ary = rb_ary_new(); 06689 06690 while (!NIL_P(result = scan_once(str, pat, &start))) { 06691 last = prev; 06692 prev = start; 06693 rb_ary_push(ary, result); 06694 } 06695 if (last >= 0) rb_reg_search(pat, str, last, 0); 06696 return ary; 06697 } 06698 06699 while (!NIL_P(result = scan_once(str, pat, &start))) { 06700 last = prev; 06701 prev = start; 06702 rb_yield(result); 06703 str_mod_check(str, p, len); 06704 } 06705 if (last >= 0) rb_reg_search(pat, str, last, 0); 06706 return str; 06707 } 06708 06709 06710 /* 06711 * call-seq: 06712 * str.hex -> integer 06713 * 06714 * Treats leading characters from <i>str</i> as a string of hexadecimal digits 06715 * (with an optional sign and an optional <code>0x</code>) and returns the 06716 * corresponding number. Zero is returned on error. 
06717 * 06718 * "0x0a".hex #=> 10 06719 * "-1234".hex #=> -4660 06720 * "0".hex #=> 0 06721 * "wombat".hex #=> 0 06722 */ 06723 06724 static VALUE 06725 rb_str_hex(VALUE str) 06726 { 06727 rb_encoding *enc = rb_enc_get(str); 06728 06729 if (!rb_enc_asciicompat(enc)) { 06730 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc)); 06731 } 06732 return rb_str_to_inum(str, 16, FALSE); 06733 } 06734 06735 06736 /* 06737 * call-seq: 06738 * str.oct -> integer 06739 * 06740 * Treats leading characters of <i>str</i> as a string of octal digits (with an 06741 * optional sign) and returns the corresponding number. Returns 0 if the 06742 * conversion fails. 06743 * 06744 * "123".oct #=> 83 06745 * "-377".oct #=> -255 06746 * "bad".oct #=> 0 06747 * "0377bad".oct #=> 255 06748 */ 06749 06750 static VALUE 06751 rb_str_oct(VALUE str) 06752 { 06753 rb_encoding *enc = rb_enc_get(str); 06754 06755 if (!rb_enc_asciicompat(enc)) { 06756 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc)); 06757 } 06758 return rb_str_to_inum(str, -8, FALSE); 06759 } 06760 06761 06762 /* 06763 * call-seq: 06764 * str.crypt(other_str) -> new_str 06765 * 06766 * Applies a one-way cryptographic hash to <i>str</i> by invoking the standard 06767 * library function <code>crypt</code>. The argument is the salt string, which 06768 * should be two characters long, each character drawn from 06769 * <code>[a-zA-Z0-9./]</code>. 
06770 */ 06771 06772 static VALUE 06773 rb_str_crypt(VALUE str, VALUE salt) 06774 { 06775 extern char *crypt(const char *, const char *); 06776 VALUE result; 06777 const char *s, *saltp; 06778 #ifdef BROKEN_CRYPT 06779 char salt_8bit_clean[3]; 06780 #endif 06781 06782 StringValue(salt); 06783 if (RSTRING_LEN(salt) < 2) 06784 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)"); 06785 06786 s = RSTRING_PTR(str); 06787 if (!s) s = ""; 06788 saltp = RSTRING_PTR(salt); 06789 #ifdef BROKEN_CRYPT 06790 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) { 06791 salt_8bit_clean[0] = saltp[0] & 0x7f; 06792 salt_8bit_clean[1] = saltp[1] & 0x7f; 06793 salt_8bit_clean[2] = '\0'; 06794 saltp = salt_8bit_clean; 06795 } 06796 #endif 06797 result = rb_str_new2(crypt(s, saltp)); 06798 OBJ_INFECT(result, str); 06799 OBJ_INFECT(result, salt); 06800 return result; 06801 } 06802 06803 06804 /* 06805 * call-seq: 06806 * str.intern -> symbol 06807 * str.to_sym -> symbol 06808 * 06809 * Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the 06810 * symbol if it did not previously exist. See <code>Symbol#id2name</code>. 06811 * 06812 * "Koala".intern #=> :Koala 06813 * s = 'cat'.to_sym #=> :cat 06814 * s == :cat #=> true 06815 * s = '@cat'.to_sym #=> :@cat 06816 * s == :@cat #=> true 06817 * 06818 * This can also be used to create symbols that cannot be represented using the 06819 * <code>:xxx</code> notation. 06820 * 06821 * 'cat and dog'.to_sym #=> :"cat and dog" 06822 */ 06823 06824 VALUE 06825 rb_str_intern(VALUE s) 06826 { 06827 VALUE str = RB_GC_GUARD(s); 06828 ID id; 06829 06830 id = rb_intern_str(str); 06831 return ID2SYM(id); 06832 } 06833 06834 06835 /* 06836 * call-seq: 06837 * str.ord -> integer 06838 * 06839 * Return the <code>Integer</code> ordinal of a one-character string. 
06840 * 06841 * "a".ord #=> 97 06842 */ 06843 06844 VALUE 06845 rb_str_ord(VALUE s) 06846 { 06847 unsigned int c; 06848 06849 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s)); 06850 return UINT2NUM(c); 06851 } 06852 /* 06853 * call-seq: 06854 * str.sum(n=16) -> integer 06855 * 06856 * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>, 06857 * where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting 06858 * to 16. The result is simply the sum of the binary value of each character in 06859 * <i>str</i> modulo <code>2**n - 1</code>. This is not a particularly good 06860 * checksum. 06861 */ 06862 06863 static VALUE 06864 rb_str_sum(int argc, VALUE *argv, VALUE str) 06865 { 06866 VALUE vbits; 06867 int bits; 06868 char *ptr, *p, *pend; 06869 long len; 06870 VALUE sum = INT2FIX(0); 06871 unsigned long sum0 = 0; 06872 06873 if (argc == 0) { 06874 bits = 16; 06875 } 06876 else { 06877 rb_scan_args(argc, argv, "01", &vbits); 06878 bits = NUM2INT(vbits); 06879 } 06880 ptr = p = RSTRING_PTR(str); 06881 len = RSTRING_LEN(str); 06882 pend = p + len; 06883 06884 while (p < pend) { 06885 if (FIXNUM_MAX - UCHAR_MAX < sum0) { 06886 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 06887 str_mod_check(str, ptr, len); 06888 sum0 = 0; 06889 } 06890 sum0 += (unsigned char)*p; 06891 p++; 06892 } 06893 06894 if (bits == 0) { 06895 if (sum0) { 06896 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 06897 } 06898 } 06899 else { 06900 if (sum == INT2FIX(0)) { 06901 if (bits < (int)sizeof(long)*CHAR_BIT) { 06902 sum0 &= (((unsigned long)1)<<bits)-1; 06903 } 06904 sum = LONG2FIX(sum0); 06905 } 06906 else { 06907 VALUE mod; 06908 06909 if (sum0) { 06910 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 06911 } 06912 06913 mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits)); 06914 mod = rb_funcall(mod, '-', 1, INT2FIX(1)); 06915 sum = rb_funcall(sum, '&', 1, mod); 06916 } 06917 } 06918 return sum; 06919 } 06920 06921 static VALUE 
06922 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag) 06923 { 06924 rb_encoding *enc; 06925 VALUE w; 06926 long width, len, flen = 1, fclen = 1; 06927 VALUE res; 06928 char *p; 06929 const char *f = " "; 06930 long n, size, llen, rlen, llen2 = 0, rlen2 = 0; 06931 volatile VALUE pad; 06932 int singlebyte = 1, cr; 06933 06934 rb_scan_args(argc, argv, "11", &w, &pad); 06935 enc = STR_ENC_GET(str); 06936 width = NUM2LONG(w); 06937 if (argc == 2) { 06938 StringValue(pad); 06939 enc = rb_enc_check(str, pad); 06940 f = RSTRING_PTR(pad); 06941 flen = RSTRING_LEN(pad); 06942 fclen = str_strlen(pad, enc); 06943 singlebyte = single_byte_optimizable(pad); 06944 if (flen == 0 || fclen == 0) { 06945 rb_raise(rb_eArgError, "zero width padding"); 06946 } 06947 } 06948 len = str_strlen(str, enc); 06949 if (width < 0 || len >= width) return rb_str_dup(str); 06950 n = width - len; 06951 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2); 06952 rlen = n - llen; 06953 cr = ENC_CODERANGE(str); 06954 if (flen > 1) { 06955 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte); 06956 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte); 06957 } 06958 size = RSTRING_LEN(str); 06959 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen || 06960 (len *= flen) >= LONG_MAX - llen2 - rlen2 || 06961 (len += llen2 + rlen2) >= LONG_MAX - size) { 06962 rb_raise(rb_eArgError, "argument too big"); 06963 } 06964 len += size; 06965 res = rb_str_new5(str, 0, len); 06966 p = RSTRING_PTR(res); 06967 if (flen <= 1) { 06968 memset(p, *f, llen); 06969 p += llen; 06970 } 06971 else { 06972 while (llen >= fclen) { 06973 memcpy(p,f,flen); 06974 p += flen; 06975 llen -= fclen; 06976 } 06977 if (llen > 0) { 06978 memcpy(p, f, llen2); 06979 p += llen2; 06980 } 06981 } 06982 memcpy(p, RSTRING_PTR(str), size); 06983 p += size; 06984 if (flen <= 1) { 06985 memset(p, *f, rlen); 06986 p += rlen; 06987 } 06988 else { 06989 while (rlen >= fclen) { 06990 memcpy(p,f,flen); 06991 p += 
flen; 06992 rlen -= fclen; 06993 } 06994 if (rlen > 0) { 06995 memcpy(p, f, rlen2); 06996 p += rlen2; 06997 } 06998 } 06999 *p = '\0'; 07000 STR_SET_LEN(res, p-RSTRING_PTR(res)); 07001 OBJ_INFECT(res, str); 07002 if (!NIL_P(pad)) OBJ_INFECT(res, pad); 07003 rb_enc_associate(res, enc); 07004 if (argc == 2) 07005 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad)); 07006 if (cr != ENC_CODERANGE_BROKEN) 07007 ENC_CODERANGE_SET(res, cr); 07008 return res; 07009 } 07010 07011 07012 /* 07013 * call-seq: 07014 * str.ljust(integer, padstr=' ') -> new_str 07015 * 07016 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new 07017 * <code>String</code> of length <i>integer</i> with <i>str</i> left justified 07018 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>. 07019 * 07020 * "hello".ljust(4) #=> "hello" 07021 * "hello".ljust(20) #=> "hello " 07022 * "hello".ljust(20, '1234') #=> "hello123412341234123" 07023 */ 07024 07025 static VALUE 07026 rb_str_ljust(int argc, VALUE *argv, VALUE str) 07027 { 07028 return rb_str_justify(argc, argv, str, 'l'); 07029 } 07030 07031 07032 /* 07033 * call-seq: 07034 * str.rjust(integer, padstr=' ') -> new_str 07035 * 07036 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new 07037 * <code>String</code> of length <i>integer</i> with <i>str</i> right justified 07038 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>. 
07039 * 07040 * "hello".rjust(4) #=> "hello" 07041 * "hello".rjust(20) #=> " hello" 07042 * "hello".rjust(20, '1234') #=> "123412341234123hello" 07043 */ 07044 07045 static VALUE 07046 rb_str_rjust(int argc, VALUE *argv, VALUE str) 07047 { 07048 return rb_str_justify(argc, argv, str, 'r'); 07049 } 07050 07051 07052 /* 07053 * call-seq: 07054 * str.center(integer, padstr) -> new_str 07055 * 07056 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new 07057 * <code>String</code> of length <i>integer</i> with <i>str</i> centered and 07058 * padded with <i>padstr</i>; otherwise, returns <i>str</i>. 07059 * 07060 * "hello".center(4) #=> "hello" 07061 * "hello".center(20) #=> " hello " 07062 * "hello".center(20, '123') #=> "1231231hello12312312" 07063 */ 07064 07065 static VALUE 07066 rb_str_center(int argc, VALUE *argv, VALUE str) 07067 { 07068 return rb_str_justify(argc, argv, str, 'c'); 07069 } 07070 07071 /* 07072 * call-seq: 07073 * str.partition(sep) -> [head, sep, tail] 07074 * str.partition(regexp) -> [head, match, tail] 07075 * 07076 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string 07077 * and returns the part before it, the match, and the part 07078 * after it. 07079 * If it is not found, returns two empty strings and <i>str</i>. 
07080 * 07081 * "hello".partition("l") #=> ["he", "l", "lo"] 07082 * "hello".partition("x") #=> ["hello", "", ""] 07083 * "hello".partition(/.l/) #=> ["h", "el", "lo"] 07084 */ 07085 07086 static VALUE 07087 rb_str_partition(VALUE str, VALUE sep) 07088 { 07089 long pos; 07090 int regex = FALSE; 07091 07092 if (TYPE(sep) == T_REGEXP) { 07093 pos = rb_reg_search(sep, str, 0, 0); 07094 regex = TRUE; 07095 } 07096 else { 07097 VALUE tmp; 07098 07099 tmp = rb_check_string_type(sep); 07100 if (NIL_P(tmp)) { 07101 rb_raise(rb_eTypeError, "type mismatch: %s given", 07102 rb_obj_classname(sep)); 07103 } 07104 sep = tmp; 07105 pos = rb_str_index(str, sep, 0); 07106 } 07107 if (pos < 0) { 07108 failed: 07109 return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str)); 07110 } 07111 if (regex) { 07112 sep = rb_str_subpat(str, sep, INT2FIX(0)); 07113 if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed; 07114 } 07115 return rb_ary_new3(3, rb_str_subseq(str, 0, pos), 07116 sep, 07117 rb_str_subseq(str, pos+RSTRING_LEN(sep), 07118 RSTRING_LEN(str)-pos-RSTRING_LEN(sep))); 07119 } 07120 07121 /* 07122 * call-seq: 07123 * str.rpartition(sep) -> [head, sep, tail] 07124 * str.rpartition(regexp) -> [head, match, tail] 07125 * 07126 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end 07127 * of the string, and returns the part before it, the match, and the part 07128 * after it. 07129 * If it is not found, returns two empty strings and <i>str</i>. 
07130 * 07131 * "hello".rpartition("l") #=> ["hel", "l", "o"] 07132 * "hello".rpartition("x") #=> ["", "", "hello"] 07133 * "hello".rpartition(/.l/) #=> ["he", "ll", "o"] 07134 */ 07135 07136 static VALUE 07137 rb_str_rpartition(VALUE str, VALUE sep) 07138 { 07139 long pos = RSTRING_LEN(str); 07140 int regex = FALSE; 07141 07142 if (TYPE(sep) == T_REGEXP) { 07143 pos = rb_reg_search(sep, str, pos, 1); 07144 regex = TRUE; 07145 } 07146 else { 07147 VALUE tmp; 07148 07149 tmp = rb_check_string_type(sep); 07150 if (NIL_P(tmp)) { 07151 rb_raise(rb_eTypeError, "type mismatch: %s given", 07152 rb_obj_classname(sep)); 07153 } 07154 sep = tmp; 07155 pos = rb_str_sublen(str, pos); 07156 pos = rb_str_rindex(str, sep, pos); 07157 } 07158 if (pos < 0) { 07159 return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str); 07160 } 07161 if (regex) { 07162 sep = rb_reg_nth_match(0, rb_backref_get()); 07163 } 07164 return rb_ary_new3(3, rb_str_substr(str, 0, pos), 07165 sep, 07166 rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str))); 07167 } 07168 07169 /* 07170 * call-seq: 07171 * str.start_with?([prefix]+) -> true or false 07172 * 07173 * Returns true if <i>str</i> starts with one of the prefixes given. 07174 * 07175 * p "hello".start_with?("hell") #=> true 07176 * 07177 * # returns true if one of the prefixes matches. 
07178 * p "hello".start_with?("heaven", "hell") #=> true 07179 * p "hello".start_with?("heaven", "paradise") #=> false 07180 * 07181 * 07182 * 07183 */ 07184 07185 static VALUE 07186 rb_str_start_with(int argc, VALUE *argv, VALUE str) 07187 { 07188 int i; 07189 07190 for (i=0; i<argc; i++) { 07191 VALUE tmp = rb_check_string_type(argv[i]); 07192 if (NIL_P(tmp)) continue; 07193 rb_enc_check(str, tmp); 07194 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue; 07195 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0) 07196 return Qtrue; 07197 } 07198 return Qfalse; 07199 } 07200 07201 /* 07202 * call-seq: 07203 * str.end_with?([suffix]+) -> true or false 07204 * 07205 * Returns true if <i>str</i> ends with one of the suffixes given. 07206 */ 07207 07208 static VALUE 07209 rb_str_end_with(int argc, VALUE *argv, VALUE str) 07210 { 07211 int i; 07212 char *p, *s, *e; 07213 rb_encoding *enc; 07214 07215 for (i=0; i<argc; i++) { 07216 VALUE tmp = rb_check_string_type(argv[i]); 07217 if (NIL_P(tmp)) continue; 07218 enc = rb_enc_check(str, tmp); 07219 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue; 07220 p = RSTRING_PTR(str); 07221 e = p + RSTRING_LEN(str); 07222 s = e - RSTRING_LEN(tmp); 07223 if (rb_enc_left_char_head(p, s, e, enc) != s) 07224 continue; 07225 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0) 07226 return Qtrue; 07227 } 07228 return Qfalse; 07229 } 07230 07231 void 07232 rb_str_setter(VALUE val, ID id, VALUE *var) 07233 { 07234 if (!NIL_P(val) && TYPE(val) != T_STRING) { 07235 rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id)); 07236 } 07237 *var = val; 07238 } 07239 07240 07241 /* 07242 * call-seq: 07243 * str.force_encoding(encoding) -> str 07244 * 07245 * Changes the encoding to +encoding+ and returns self. 
07246 */ 07247 07248 static VALUE 07249 rb_str_force_encoding(VALUE str, VALUE enc) 07250 { 07251 str_modifiable(str); 07252 rb_enc_associate(str, rb_to_encoding(enc)); 07253 ENC_CODERANGE_CLEAR(str); 07254 return str; 07255 } 07256 07257 /* 07258 * call-seq: 07259 * str.valid_encoding? -> true or false 07260 * 07261 * Returns true for a string which encoded correctly. 07262 * 07263 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true 07264 * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false 07265 * "\x80".force_encoding("UTF-8").valid_encoding? #=> false 07266 */ 07267 07268 static VALUE 07269 rb_str_valid_encoding_p(VALUE str) 07270 { 07271 int cr = rb_enc_str_coderange(str); 07272 07273 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue; 07274 } 07275 07276 /* 07277 * call-seq: 07278 * str.ascii_only? -> true or false 07279 * 07280 * Returns true for a string which has only ASCII characters. 07281 * 07282 * "abc".force_encoding("UTF-8").ascii_only? #=> true 07283 * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false 07284 */ 07285 07286 static VALUE 07287 rb_str_is_ascii_only_p(VALUE str) 07288 { 07289 int cr = rb_enc_str_coderange(str); 07290 07291 return cr == ENC_CODERANGE_7BIT ? 
Qtrue : Qfalse; 07292 } 07293 07308 VALUE 07309 rb_str_ellipsize(VALUE str, long len) 07310 { 07311 static const char ellipsis[] = "..."; 07312 const long ellipsislen = sizeof(ellipsis) - 1; 07313 rb_encoding *const enc = rb_enc_get(str); 07314 const long blen = RSTRING_LEN(str); 07315 const char *const p = RSTRING_PTR(str), *e = p + blen; 07316 VALUE estr, ret = 0; 07317 07318 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len); 07319 if (len * rb_enc_mbminlen(enc) >= blen || 07320 (e = rb_enc_nth(p, e, len, enc)) - p == blen) { 07321 ret = str; 07322 } 07323 else if (len <= ellipsislen || 07324 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) { 07325 if (rb_enc_asciicompat(enc)) { 07326 ret = rb_str_new_with_class(str, ellipsis, len); 07327 rb_enc_associate(ret, enc); 07328 } 07329 else { 07330 estr = rb_usascii_str_new(ellipsis, len); 07331 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil); 07332 } 07333 } 07334 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) { 07335 rb_str_cat(ret, ellipsis, ellipsislen); 07336 } 07337 else { 07338 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen), 07339 rb_enc_from_encoding(enc), 0, Qnil); 07340 rb_str_append(ret, estr); 07341 } 07342 return ret; 07343 } 07344 07345 /********************************************************************** 07346 * Document-class: Symbol 07347 * 07348 * <code>Symbol</code> objects represent names and some strings 07349 * inside the Ruby 07350 * interpreter. They are generated using the <code>:name</code> and 07351 * <code>:"string"</code> literals 07352 * syntax, and by the various <code>to_sym</code> methods. The same 07353 * <code>Symbol</code> object will be created for a given name or string 07354 * for the duration of a program's execution, regardless of the context 07355 * or meaning of that name. 
Thus if <code>Fred</code> is a constant in 07356 * one context, a method in another, and a class in a third, the 07357 * <code>Symbol</code> <code>:Fred</code> will be the same object in 07358 * all three contexts. 07359 * 07360 * module One 07361 * class Fred 07362 * end 07363 * $f1 = :Fred 07364 * end 07365 * module Two 07366 * Fred = 1 07367 * $f2 = :Fred 07368 * end 07369 * def Fred() 07370 * end 07371 * $f3 = :Fred 07372 * $f1.object_id #=> 2514190 07373 * $f2.object_id #=> 2514190 07374 * $f3.object_id #=> 2514190 07375 * 07376 */ 07377 07378 07379 /* 07380 * call-seq: 07381 * sym == obj -> true or false 07382 * 07383 * Equality---If <i>sym</i> and <i>obj</i> are exactly the same 07384 * symbol, returns <code>true</code>. 07385 */ 07386 07387 static VALUE 07388 sym_equal(VALUE sym1, VALUE sym2) 07389 { 07390 if (sym1 == sym2) return Qtrue; 07391 return Qfalse; 07392 } 07393 07394 07395 static int 07396 sym_printable(const char *s, const char *send, rb_encoding *enc) 07397 { 07398 while (s < send) { 07399 int n; 07400 int c = rb_enc_codepoint_len(s, send, &n, enc); 07401 07402 if (!rb_enc_isprint(c, enc)) return FALSE; 07403 s += n; 07404 } 07405 return TRUE; 07406 } 07407 07408 /* 07409 * call-seq: 07410 * sym.inspect -> string 07411 * 07412 * Returns the representation of <i>sym</i> as a symbol literal. 
07413 * 07414 * :fred.inspect #=> ":fred" 07415 */ 07416 07417 static VALUE 07418 sym_inspect(VALUE sym) 07419 { 07420 VALUE str; 07421 ID id = SYM2ID(sym); 07422 rb_encoding *enc; 07423 const char *ptr; 07424 long len; 07425 char *dest; 07426 rb_encoding *resenc = rb_default_internal_encoding(); 07427 07428 if (resenc == NULL) resenc = rb_default_external_encoding(); 07429 sym = rb_id2str(id); 07430 enc = STR_ENC_GET(sym); 07431 ptr = RSTRING_PTR(sym); 07432 len = RSTRING_LEN(sym); 07433 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) || 07434 !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) { 07435 str = rb_str_inspect(sym); 07436 len = RSTRING_LEN(str); 07437 rb_str_resize(str, len + 1); 07438 dest = RSTRING_PTR(str); 07439 memmove(dest + 1, dest, len); 07440 dest[0] = ':'; 07441 } 07442 else { 07443 char *dest; 07444 str = rb_enc_str_new(0, len + 1, enc); 07445 dest = RSTRING_PTR(str); 07446 dest[0] = ':'; 07447 memcpy(dest + 1, ptr, len); 07448 } 07449 return str; 07450 } 07451 07452 07453 /* 07454 * call-seq: 07455 * sym.id2name -> string 07456 * sym.to_s -> string 07457 * 07458 * Returns the name or string corresponding to <i>sym</i>. 07459 * 07460 * :fred.id2name #=> "fred" 07461 */ 07462 07463 07464 VALUE 07465 rb_sym_to_s(VALUE sym) 07466 { 07467 ID id = SYM2ID(sym); 07468 07469 return str_new3(rb_cString, rb_id2str(id)); 07470 } 07471 07472 07473 /* 07474 * call-seq: 07475 * sym.to_sym -> sym 07476 * sym.intern -> sym 07477 * 07478 * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding 07479 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned 07480 * in this case. 
07481 */ 07482 07483 static VALUE 07484 sym_to_sym(VALUE sym) 07485 { 07486 return sym; 07487 } 07488 07489 static VALUE 07490 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv) 07491 { 07492 VALUE obj; 07493 07494 if (argc < 1) { 07495 rb_raise(rb_eArgError, "no receiver given"); 07496 } 07497 obj = argv[0]; 07498 return rb_funcall_passing_block(obj, (ID)sym, argc - 1, argv + 1); 07499 } 07500 07501 /* 07502 * call-seq: 07503 * sym.to_proc 07504 * 07505 * Returns a _Proc_ object which respond to the given method by _sym_. 07506 * 07507 * (1..3).collect(&:to_s) #=> ["1", "2", "3"] 07508 */ 07509 07510 static VALUE 07511 sym_to_proc(VALUE sym) 07512 { 07513 static VALUE sym_proc_cache = Qfalse; 07514 enum {SYM_PROC_CACHE_SIZE = 67}; 07515 VALUE proc; 07516 long id, index; 07517 VALUE *aryp; 07518 07519 if (!sym_proc_cache) { 07520 sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2); 07521 rb_gc_register_mark_object(sym_proc_cache); 07522 rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil); 07523 } 07524 07525 id = SYM2ID(sym); 07526 index = (id % SYM_PROC_CACHE_SIZE) << 1; 07527 07528 aryp = RARRAY_PTR(sym_proc_cache); 07529 if (aryp[index] == sym) { 07530 return aryp[index + 1]; 07531 } 07532 else { 07533 proc = rb_proc_new(sym_call, (VALUE)id); 07534 aryp[index] = sym; 07535 aryp[index + 1] = proc; 07536 return proc; 07537 } 07538 } 07539 07540 /* 07541 * call-seq: 07542 * 07543 * sym.succ 07544 * 07545 * Same as <code>sym.to_s.succ.intern</code>. 07546 */ 07547 07548 static VALUE 07549 sym_succ(VALUE sym) 07550 { 07551 return rb_str_intern(rb_str_succ(rb_sym_to_s(sym))); 07552 } 07553 07554 /* 07555 * call-seq: 07556 * 07557 * str <=> other -> -1, 0, +1 or nil 07558 * 07559 * Compares _sym_ with _other_ in string form. 
07560 */ 07561 07562 static VALUE 07563 sym_cmp(VALUE sym, VALUE other) 07564 { 07565 if (!SYMBOL_P(other)) { 07566 return Qnil; 07567 } 07568 return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other)); 07569 } 07570 07571 /* 07572 * call-seq: 07573 * 07574 * sym.casecmp(other) -> -1, 0, +1 or nil 07575 * 07576 * Case-insensitive version of <code>Symbol#<=></code>. 07577 */ 07578 07579 static VALUE 07580 sym_casecmp(VALUE sym, VALUE other) 07581 { 07582 if (!SYMBOL_P(other)) { 07583 return Qnil; 07584 } 07585 return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other)); 07586 } 07587 07588 /* 07589 * call-seq: 07590 * sym =~ obj -> fixnum or nil 07591 * 07592 * Returns <code>sym.to_s =~ obj</code>. 07593 */ 07594 07595 static VALUE 07596 sym_match(VALUE sym, VALUE other) 07597 { 07598 return rb_str_match(rb_sym_to_s(sym), other); 07599 } 07600 07601 /* 07602 * call-seq: 07603 * sym[idx] -> char 07604 * sym[b, n] -> char 07605 * 07606 * Returns <code>sym.to_s[]</code>. 07607 */ 07608 07609 static VALUE 07610 sym_aref(int argc, VALUE *argv, VALUE sym) 07611 { 07612 return rb_str_aref_m(argc, argv, rb_sym_to_s(sym)); 07613 } 07614 07615 /* 07616 * call-seq: 07617 * sym.length -> integer 07618 * 07619 * Same as <code>sym.to_s.length</code>. 07620 */ 07621 07622 static VALUE 07623 sym_length(VALUE sym) 07624 { 07625 return rb_str_length(rb_id2str(SYM2ID(sym))); 07626 } 07627 07628 /* 07629 * call-seq: 07630 * sym.empty? -> true or false 07631 * 07632 * Returns that _sym_ is :"" or not. 07633 */ 07634 07635 static VALUE 07636 sym_empty(VALUE sym) 07637 { 07638 return rb_str_empty(rb_id2str(SYM2ID(sym))); 07639 } 07640 07641 /* 07642 * call-seq: 07643 * sym.upcase -> symbol 07644 * 07645 * Same as <code>sym.to_s.upcase.intern</code>. 
07646 */ 07647 07648 static VALUE 07649 sym_upcase(VALUE sym) 07650 { 07651 return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym)))); 07652 } 07653 07654 /* 07655 * call-seq: 07656 * sym.downcase -> symbol 07657 * 07658 * Same as <code>sym.to_s.downcase.intern</code>. 07659 */ 07660 07661 static VALUE 07662 sym_downcase(VALUE sym) 07663 { 07664 return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym)))); 07665 } 07666 07667 /* 07668 * call-seq: 07669 * sym.capitalize -> symbol 07670 * 07671 * Same as <code>sym.to_s.capitalize.intern</code>. 07672 */ 07673 07674 static VALUE 07675 sym_capitalize(VALUE sym) 07676 { 07677 return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym)))); 07678 } 07679 07680 /* 07681 * call-seq: 07682 * sym.swapcase -> symbol 07683 * 07684 * Same as <code>sym.to_s.swapcase.intern</code>. 07685 */ 07686 07687 static VALUE 07688 sym_swapcase(VALUE sym) 07689 { 07690 return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym)))); 07691 } 07692 07693 /* 07694 * call-seq: 07695 * sym.encoding -> encoding 07696 * 07697 * Returns the Encoding object that represents the encoding of _sym_. 07698 */ 07699 07700 static VALUE 07701 sym_encoding(VALUE sym) 07702 { 07703 return rb_obj_encoding(rb_id2str(SYM2ID(sym))); 07704 } 07705 07706 ID 07707 rb_to_id(VALUE name) 07708 { 07709 VALUE tmp; 07710 07711 switch (TYPE(name)) { 07712 default: 07713 tmp = rb_check_string_type(name); 07714 if (NIL_P(tmp)) { 07715 tmp = rb_inspect(name); 07716 rb_raise(rb_eTypeError, "%s is not a symbol", 07717 RSTRING_PTR(tmp)); 07718 } 07719 name = tmp; 07720 /* fall through */ 07721 case T_STRING: 07722 name = rb_str_intern(name); 07723 /* fall through */ 07724 case T_SYMBOL: 07725 return SYM2ID(name); 07726 } 07727 return Qnil; /* not reached */ 07728 } 07729 07730 /* 07731 * A <code>String</code> object holds and manipulates an arbitrary sequence of 07732 * bytes, typically representing characters. 
 *  String objects may be created
 *  using <code>String::new</code> or as literals.
 *
 *  Because of aliasing issues, users of strings should be aware of the methods
 *  that modify the contents of a <code>String</code> object.  Typically,
 *  methods with names ending in ``!'' modify their receiver, while those
 *  without a ``!'' return a new <code>String</code>.  However, there are
 *  exceptions, such as <code>String#[]=</code>.
 *
 */

/*
 * Defines the String and Symbol classes and registers their methods.
 *
 * The last argument of each rb_define_method call is the arity; the
 * functions registered with -1 in this file take (int argc, VALUE *argv,
 * VALUE self), e.g. rb_str_rjust and sym_aref.
 */
void
Init_String(void)
{
    /* Within this function rb_intern expands to rb_intern_const. */
#undef rb_intern
#define rb_intern(str) rb_intern_const(str)

    /* --- class String ------------------------------------------------ */
    rb_cString  = rb_define_class("String", rb_cObject);
    rb_include_module(rb_cString, rb_mComparable);
    rb_define_alloc_func(rb_cString, str_alloc);
    rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
    rb_define_method(rb_cString, "initialize", rb_str_init, -1);
    rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
    /* comparison, hashing and operators */
    rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
    rb_define_method(rb_cString, "==", rb_str_equal, 1);
    rb_define_method(rb_cString, "===", rb_str_equal, 1);
    rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
    rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
    rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
    rb_define_method(rb_cString, "+", rb_str_plus, 1);
    rb_define_method(rb_cString, "*", rb_str_times, 1);
    rb_define_method(rb_cString, "%", rb_str_format_m, 1);
    /* element access, size and searching */
    rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
    rb_define_method(rb_cString, "insert", rb_str_insert, 2);
    rb_define_method(rb_cString, "length", rb_str_length, 0);
    rb_define_method(rb_cString, "size", rb_str_length, 0);
    rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
    rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
    rb_define_method(rb_cString, "=~", rb_str_match, 1);
    rb_define_method(rb_cString, "match", rb_str_match_m, -1);
    rb_define_method(rb_cString, "succ", rb_str_succ, 0);
    rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "next", rb_str_succ, 0);
    rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "upto", rb_str_upto, -1);
    rb_define_method(rb_cString, "index", rb_str_index_m, -1);
    rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
    rb_define_method(rb_cString, "replace", rb_str_replace, 1);
    rb_define_method(rb_cString, "clear", rb_str_clear, 0);
    rb_define_method(rb_cString, "chr", rb_str_chr, 0);
    rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
    rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
    rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);

    /* conversions */
    rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
    rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
    rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
    rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
    rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
    rb_define_method(rb_cString, "dump", rb_str_dump, 0);

    /* case conversion */
    rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
    rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
    rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
    rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);

    /* case conversion, "!" variants */
    rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
    rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
    rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
    rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);

    /* "lines", "bytes", "chars" and "codepoints" share the each_* functions */
    rb_define_method(rb_cString, "hex", rb_str_hex, 0);
    rb_define_method(rb_cString, "oct", rb_str_oct, 0);
    rb_define_method(rb_cString, "split", rb_str_split_m, -1);
    rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
    rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
    rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
    rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
    rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
    rb_define_method(rb_cString, "concat", rb_str_concat, 1);
    rb_define_method(rb_cString, "<<", rb_str_concat, 1);
    rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
    rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
    rb_define_method(rb_cString, "intern", rb_str_intern, 0);
    rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
    rb_define_method(rb_cString, "ord", rb_str_ord, 0);

    /* predicates on content */
    rb_define_method(rb_cString, "include?", rb_str_include, 1);
    rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
    rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);

    rb_define_method(rb_cString, "scan", rb_str_scan, 1);

    /* padding */
    rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
    rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
    rb_define_method(rb_cString, "center", rb_str_center, -1);

    /* substitution and trimming */
    rb_define_method(rb_cString, "sub", rb_str_sub, -1);
    rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
    rb_define_method(rb_cString, "chop", rb_str_chop, 0);
    rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
    rb_define_method(rb_cString, "strip", rb_str_strip, 0);
    rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
    rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);

    /* substitution and trimming, "!" variants */
    rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
    rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
    rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
    rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
    rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
    rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
    rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);

    /* character-set operations */
    rb_define_method(rb_cString, "tr", rb_str_tr, 2);
    rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
    rb_define_method(rb_cString, "delete", rb_str_delete, -1);
    rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
    rb_define_method(rb_cString, "count", rb_str_count, -1);

    /* character-set operations, "!" variants */
    rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
    rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
    rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
    rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);

    /* iteration */
    rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
    rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
    rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);

    rb_define_method(rb_cString, "sum", rb_str_sum, -1);

    /* "slice" shares its implementation with "[]" */
    rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);

    rb_define_method(rb_cString, "partition", rb_str_partition, 1);
    rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);

    /* encoding queries and relabelling */
    rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
    rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
    rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
    rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);

    id_to_s = rb_intern("to_s");

    /* "$;" and "$-F" are both bound to the same C variable, rb_fs. */
    rb_fs = Qnil;
    rb_define_variable("$;", &rb_fs);
    rb_define_variable("$-F", &rb_fs);

    /* --- class Symbol: no allocator and no "new" ---------------------- */
    rb_cSymbol = rb_define_class("Symbol", rb_cObject);
    rb_include_module(rb_cSymbol, rb_mComparable);
    rb_undef_alloc_func(rb_cSymbol);
    rb_undef_method(CLASS_OF(rb_cSymbol), "new");
    rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */

    rb_define_method(rb_cSymbol, "==", sym_equal, 1);
    rb_define_method(rb_cSymbol, "===", sym_equal, 1);
    rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
    rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
    rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
    rb_define_method(rb_cSymbol, "next", sym_succ, 0);

    rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
    rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
    rb_define_method(rb_cSymbol, "=~", sym_match, 1);

    rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
    rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
    rb_define_method(rb_cSymbol, "length", sym_length, 0);
    rb_define_method(rb_cSymbol, "size", sym_length, 0);
    rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
    /* "match" is bound to the same function as "=~". */
    rb_define_method(rb_cSymbol, "match", sym_match, 1);

    rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
    rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
    rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
    rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);

    rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
}