Ruby 1.9.3p327 (2012-11-10 revision 37606)
|
00001 /********************************************************************** 00002 00003 encoding.c - 00004 00005 $Author: naruse $ 00006 created at: Thu May 24 17:23:27 JST 2007 00007 00008 Copyright (C) 2007 Yukihiro Matsumoto 00009 00010 **********************************************************************/ 00011 00012 #include "ruby/ruby.h" 00013 #include "ruby/encoding.h" 00014 #include "internal.h" 00015 #include "regenc.h" 00016 #include <ctype.h> 00017 #ifndef NO_LOCALE_CHARMAP 00018 #ifdef __CYGWIN__ 00019 #include <windows.h> 00020 #endif 00021 #ifdef HAVE_LANGINFO_H 00022 #include <langinfo.h> 00023 #endif 00024 #endif 00025 #include "ruby/util.h" 00026 00027 #if defined __GNUC__ && __GNUC__ >= 4 00028 #pragma GCC visibility push(default) 00029 int rb_enc_register(const char *name, rb_encoding *encoding); 00030 void rb_enc_set_base(const char *name, const char *orig); 00031 void rb_encdb_declare(const char *name); 00032 int rb_encdb_replicate(const char *name, const char *orig); 00033 int rb_encdb_dummy(const char *name); 00034 int rb_encdb_alias(const char *alias, const char *orig); 00035 #pragma GCC visibility pop 00036 #endif 00037 00038 static ID id_encoding; 00039 VALUE rb_cEncoding; 00040 static VALUE rb_encoding_list; 00041 00042 struct rb_encoding_entry { 00043 const char *name; 00044 rb_encoding *enc; 00045 rb_encoding *base; 00046 }; 00047 00048 static struct { 00049 struct rb_encoding_entry *list; 00050 int count; 00051 int size; 00052 st_table *names; 00053 } enc_table; 00054 00055 void rb_enc_init(void); 00056 00057 #define ENCODING_COUNT ENCINDEX_BUILTIN_MAX 00058 #define UNSPECIFIED_ENCODING INT_MAX 00059 00060 #define ENCODING_NAMELEN_MAX 63 00061 #define valid_encoding_name_p(name) ((name) && strlen(name) <= ENCODING_NAMELEN_MAX) 00062 00063 #define enc_autoload_p(enc) (!rb_enc_mbmaxlen(enc)) 00064 00065 static int load_encoding(const char *name); 00066 00067 static size_t 00068 enc_memsize(const void *p) 00069 { 00070 return 0; 00071 
} 00072 00073 static const rb_data_type_t encoding_data_type = { 00074 "encoding", 00075 {0, 0, enc_memsize,}, 00076 }; 00077 00078 #define is_data_encoding(obj) (RTYPEDDATA_P(obj) && RTYPEDDATA_TYPE(obj) == &encoding_data_type) 00079 00080 static VALUE 00081 enc_new(rb_encoding *encoding) 00082 { 00083 return TypedData_Wrap_Struct(rb_cEncoding, &encoding_data_type, encoding); 00084 } 00085 00086 static VALUE 00087 rb_enc_from_encoding_index(int idx) 00088 { 00089 VALUE list, enc; 00090 00091 if (!(list = rb_encoding_list)) { 00092 rb_bug("rb_enc_from_encoding_index(%d): no rb_encoding_list", idx); 00093 } 00094 enc = rb_ary_entry(list, idx); 00095 if (NIL_P(enc)) { 00096 rb_bug("rb_enc_from_encoding_index(%d): not created yet", idx); 00097 } 00098 return enc; 00099 } 00100 00101 VALUE 00102 rb_enc_from_encoding(rb_encoding *encoding) 00103 { 00104 int idx; 00105 if (!encoding) return Qnil; 00106 idx = ENC_TO_ENCINDEX(encoding); 00107 return rb_enc_from_encoding_index(idx); 00108 } 00109 00110 static int enc_autoload(rb_encoding *); 00111 00112 static int 00113 check_encoding(rb_encoding *enc) 00114 { 00115 int index = rb_enc_to_index(enc); 00116 if (rb_enc_from_index(index) != enc) 00117 return -1; 00118 if (enc_autoload_p(enc)) { 00119 index = enc_autoload(enc); 00120 } 00121 return index; 00122 } 00123 00124 static int 00125 enc_check_encoding(VALUE obj) 00126 { 00127 if (SPECIAL_CONST_P(obj) || !rb_typeddata_is_kind_of(obj, &encoding_data_type)) { 00128 return -1; 00129 } 00130 return check_encoding(RDATA(obj)->data); 00131 } 00132 00133 static int 00134 must_encoding(VALUE enc) 00135 { 00136 int index = enc_check_encoding(enc); 00137 if (index < 0) { 00138 rb_raise(rb_eTypeError, "wrong argument type %s (expected Encoding)", 00139 rb_obj_classname(enc)); 00140 } 00141 return index; 00142 } 00143 00144 int 00145 rb_to_encoding_index(VALUE enc) 00146 { 00147 int idx; 00148 00149 idx = enc_check_encoding(enc); 00150 if (idx >= 0) { 00151 return idx; 00152 } 00153 
else if (NIL_P(enc = rb_check_string_type(enc))) { 00154 return -1; 00155 } 00156 if (!rb_enc_asciicompat(rb_enc_get(enc))) { 00157 return -1; 00158 } 00159 return rb_enc_find_index(StringValueCStr(enc)); 00160 } 00161 00162 static rb_encoding * 00163 to_encoding(VALUE enc) 00164 { 00165 int idx; 00166 00167 StringValue(enc); 00168 if (!rb_enc_asciicompat(rb_enc_get(enc))) { 00169 rb_raise(rb_eArgError, "invalid name encoding (non ASCII)"); 00170 } 00171 idx = rb_enc_find_index(StringValueCStr(enc)); 00172 if (idx < 0) { 00173 rb_raise(rb_eArgError, "unknown encoding name - %s", RSTRING_PTR(enc)); 00174 } 00175 return rb_enc_from_index(idx); 00176 } 00177 00178 rb_encoding * 00179 rb_to_encoding(VALUE enc) 00180 { 00181 if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data; 00182 return to_encoding(enc); 00183 } 00184 00185 void 00186 rb_gc_mark_encodings(void) 00187 { 00188 } 00189 00190 static int 00191 enc_table_expand(int newsize) 00192 { 00193 struct rb_encoding_entry *ent; 00194 int count = newsize; 00195 00196 if (enc_table.size >= newsize) return newsize; 00197 newsize = (newsize + 7) / 8 * 8; 00198 ent = realloc(enc_table.list, sizeof(*enc_table.list) * newsize); 00199 if (!ent) return -1; 00200 memset(ent + enc_table.size, 0, sizeof(*ent)*(newsize - enc_table.size)); 00201 enc_table.list = ent; 00202 enc_table.size = newsize; 00203 return count; 00204 } 00205 00206 static int 00207 enc_register_at(int index, const char *name, rb_encoding *encoding) 00208 { 00209 struct rb_encoding_entry *ent = &enc_table.list[index]; 00210 VALUE list; 00211 00212 if (!valid_encoding_name_p(name)) return -1; 00213 if (!ent->name) { 00214 ent->name = name = strdup(name); 00215 } 00216 else if (STRCASECMP(name, ent->name)) { 00217 return -1; 00218 } 00219 if (!ent->enc) { 00220 ent->enc = xmalloc(sizeof(rb_encoding)); 00221 } 00222 if (encoding) { 00223 *ent->enc = *encoding; 00224 } 00225 else { 00226 memset(ent->enc, 0, sizeof(*ent->enc)); 00227 } 00228 encoding = 
ent->enc; 00229 encoding->name = name; 00230 encoding->ruby_encoding_index = index; 00231 st_insert(enc_table.names, (st_data_t)name, (st_data_t)index); 00232 list = rb_encoding_list; 00233 if (list && NIL_P(rb_ary_entry(list, index))) { 00234 /* initialize encoding data */ 00235 rb_ary_store(list, index, enc_new(encoding)); 00236 } 00237 return index; 00238 } 00239 00240 static int 00241 enc_register(const char *name, rb_encoding *encoding) 00242 { 00243 int index = enc_table.count; 00244 00245 if ((index = enc_table_expand(index + 1)) < 0) return -1; 00246 enc_table.count = index; 00247 return enc_register_at(index - 1, name, encoding); 00248 } 00249 00250 static void set_encoding_const(const char *, rb_encoding *); 00251 int rb_enc_registered(const char *name); 00252 00253 int 00254 rb_enc_register(const char *name, rb_encoding *encoding) 00255 { 00256 int index = rb_enc_registered(name); 00257 00258 if (index >= 0) { 00259 rb_encoding *oldenc = rb_enc_from_index(index); 00260 if (STRCASECMP(name, rb_enc_name(oldenc))) { 00261 index = enc_register(name, encoding); 00262 } 00263 else if (enc_autoload_p(oldenc) || !ENC_DUMMY_P(oldenc)) { 00264 enc_register_at(index, name, encoding); 00265 } 00266 else { 00267 rb_raise(rb_eArgError, "encoding %s is already registered", name); 00268 } 00269 } 00270 else { 00271 index = enc_register(name, encoding); 00272 set_encoding_const(name, rb_enc_from_index(index)); 00273 } 00274 return index; 00275 } 00276 00277 void 00278 rb_encdb_declare(const char *name) 00279 { 00280 int idx = rb_enc_registered(name); 00281 if (idx < 0) { 00282 idx = enc_register(name, 0); 00283 } 00284 set_encoding_const(name, rb_enc_from_index(idx)); 00285 } 00286 00287 static void 00288 enc_check_duplication(const char *name) 00289 { 00290 if (rb_enc_registered(name) >= 0) { 00291 rb_raise(rb_eArgError, "encoding %s is already registered", name); 00292 } 00293 } 00294 00295 static rb_encoding* 00296 set_base_encoding(int index, rb_encoding *base) 00297 
{ 00298 rb_encoding *enc = enc_table.list[index].enc; 00299 00300 enc_table.list[index].base = base; 00301 if (rb_enc_dummy_p(base)) ENC_SET_DUMMY(enc); 00302 return enc; 00303 } 00304 00305 /* for encdb.h 00306 * Set base encoding for encodings which are not replicas 00307 * but not in their own files. 00308 */ 00309 void 00310 rb_enc_set_base(const char *name, const char *orig) 00311 { 00312 int idx = rb_enc_registered(name); 00313 int origidx = rb_enc_registered(orig); 00314 set_base_encoding(idx, rb_enc_from_index(origidx)); 00315 } 00316 00317 int 00318 rb_enc_replicate(const char *name, rb_encoding *encoding) 00319 { 00320 int idx; 00321 00322 enc_check_duplication(name); 00323 idx = enc_register(name, encoding); 00324 set_base_encoding(idx, encoding); 00325 set_encoding_const(name, rb_enc_from_index(idx)); 00326 return idx; 00327 } 00328 00329 /* 00330 * call-seq: 00331 * enc.replicate(name) -> encoding 00332 * 00333 * Returns a replicated encoding of _enc_ whose name is _name_. 00334 * The new encoding should have the same byte structure of _enc_. 00335 * If _name_ is used by another encoding, raise ArgumentError. 
00336 * 00337 */ 00338 static VALUE 00339 enc_replicate(VALUE encoding, VALUE name) 00340 { 00341 return rb_enc_from_encoding_index( 00342 rb_enc_replicate(StringValueCStr(name), 00343 rb_to_encoding(encoding))); 00344 } 00345 00346 static int 00347 enc_replicate_with_index(const char *name, rb_encoding *origenc, int idx) 00348 { 00349 if (idx < 0) { 00350 idx = enc_register(name, origenc); 00351 } 00352 else { 00353 idx = enc_register_at(idx, name, origenc); 00354 } 00355 if (idx >= 0) { 00356 set_base_encoding(idx, origenc); 00357 set_encoding_const(name, rb_enc_from_index(idx)); 00358 } 00359 return idx; 00360 } 00361 00362 int 00363 rb_encdb_replicate(const char *name, const char *orig) 00364 { 00365 int origidx = rb_enc_registered(orig); 00366 int idx = rb_enc_registered(name); 00367 00368 if (origidx < 0) { 00369 origidx = enc_register(orig, 0); 00370 } 00371 return enc_replicate_with_index(name, rb_enc_from_index(origidx), idx); 00372 } 00373 00374 int 00375 rb_define_dummy_encoding(const char *name) 00376 { 00377 int index = rb_enc_replicate(name, rb_ascii8bit_encoding()); 00378 rb_encoding *enc = enc_table.list[index].enc; 00379 00380 ENC_SET_DUMMY(enc); 00381 return index; 00382 } 00383 00384 int 00385 rb_encdb_dummy(const char *name) 00386 { 00387 int index = enc_replicate_with_index(name, rb_ascii8bit_encoding(), 00388 rb_enc_registered(name)); 00389 rb_encoding *enc = enc_table.list[index].enc; 00390 00391 ENC_SET_DUMMY(enc); 00392 return index; 00393 } 00394 00395 /* 00396 * call-seq: 00397 * enc.dummy? -> true or false 00398 * 00399 * Returns true for dummy encodings. 00400 * A dummy encoding is an encoding for which character handling is not properly 00401 * implemented. 00402 * It is used for stateful encodings. 00403 * 00404 * Encoding::ISO_2022_JP.dummy? #=> true 00405 * Encoding::UTF_8.dummy? #=> false 00406 * 00407 */ 00408 static VALUE 00409 enc_dummy_p(VALUE enc) 00410 { 00411 return ENC_DUMMY_P(enc_table.list[must_encoding(enc)].enc) ? 
Qtrue : Qfalse; 00412 } 00413 00414 /* 00415 * call-seq: 00416 * enc.ascii_compatible? -> true or false 00417 * 00418 * Returns whether ASCII-compatible or not. 00419 * 00420 * Encoding::UTF_8.ascii_compatible? #=> true 00421 * Encoding::UTF_16BE.ascii_compatible? #=> false 00422 * 00423 */ 00424 static VALUE 00425 enc_ascii_compatible_p(VALUE enc) 00426 { 00427 return rb_enc_asciicompat(enc_table.list[must_encoding(enc)].enc) ? Qtrue : Qfalse; 00428 } 00429 00430 /* 00431 * Returns 1 when the encoding is Unicode series other than UTF-7 else 0. 00432 */ 00433 int 00434 rb_enc_unicode_p(rb_encoding *enc) 00435 { 00436 const char *name = rb_enc_name(enc); 00437 return name[0] == 'U' && name[1] == 'T' && name[2] == 'F' && name[4] != '7'; 00438 } 00439 00440 /* 00441 * Returns copied alias name when the key is added for st_table, 00442 * else returns NULL. 00443 */ 00444 static int 00445 enc_alias_internal(const char *alias, int idx) 00446 { 00447 return st_insert2(enc_table.names, (st_data_t)alias, (st_data_t)idx, 00448 (st_data_t(*)(st_data_t))strdup); 00449 } 00450 00451 static int 00452 enc_alias(const char *alias, int idx) 00453 { 00454 if (!valid_encoding_name_p(alias)) return -1; 00455 if (!enc_alias_internal(alias, idx)) 00456 set_encoding_const(alias, rb_enc_from_index(idx)); 00457 return idx; 00458 } 00459 00460 int 00461 rb_enc_alias(const char *alias, const char *orig) 00462 { 00463 int idx; 00464 00465 enc_check_duplication(alias); 00466 if (!enc_table.list) { 00467 rb_enc_init(); 00468 } 00469 if ((idx = rb_enc_find_index(orig)) < 0) { 00470 return -1; 00471 } 00472 return enc_alias(alias, idx); 00473 } 00474 00475 int 00476 rb_encdb_alias(const char *alias, const char *orig) 00477 { 00478 int idx = rb_enc_registered(orig); 00479 00480 if (idx < 0) { 00481 idx = enc_register(orig, 0); 00482 } 00483 return enc_alias(alias, idx); 00484 } 00485 00486 enum { 00487 ENCINDEX_ASCII, 00488 ENCINDEX_UTF_8, 00489 ENCINDEX_US_ASCII, 00490 ENCINDEX_BUILTIN_MAX 00491 
}; 00492 00493 extern rb_encoding OnigEncodingUTF_8; 00494 extern rb_encoding OnigEncodingUS_ASCII; 00495 00496 void 00497 rb_enc_init(void) 00498 { 00499 enc_table_expand(ENCODING_COUNT + 1); 00500 if (!enc_table.names) { 00501 enc_table.names = st_init_strcasetable(); 00502 } 00503 #define ENC_REGISTER(enc) enc_register_at(ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc) 00504 ENC_REGISTER(ASCII); 00505 ENC_REGISTER(UTF_8); 00506 ENC_REGISTER(US_ASCII); 00507 #undef ENC_REGISTER 00508 enc_table.count = ENCINDEX_BUILTIN_MAX; 00509 } 00510 00511 rb_encoding * 00512 rb_enc_from_index(int index) 00513 { 00514 if (!enc_table.list) { 00515 rb_enc_init(); 00516 } 00517 if (index < 0 || enc_table.count <= index) { 00518 return 0; 00519 } 00520 return enc_table.list[index].enc; 00521 } 00522 00523 int 00524 rb_enc_registered(const char *name) 00525 { 00526 st_data_t idx = 0; 00527 00528 if (!name) return -1; 00529 if (!enc_table.list) return -1; 00530 if (st_lookup(enc_table.names, (st_data_t)name, &idx)) { 00531 return (int)idx; 00532 } 00533 return -1; 00534 } 00535 00536 static VALUE 00537 require_enc(VALUE enclib) 00538 { 00539 int safe = rb_safe_level(); 00540 return rb_require_safe(enclib, safe > 3 ? 
3 : safe); 00541 } 00542 00543 static int 00544 load_encoding(const char *name) 00545 { 00546 VALUE enclib = rb_sprintf("enc/%s.so", name); 00547 VALUE verbose = ruby_verbose; 00548 VALUE debug = ruby_debug; 00549 VALUE loaded; 00550 char *s = RSTRING_PTR(enclib) + 4, *e = RSTRING_END(enclib) - 3; 00551 int idx; 00552 00553 while (s < e) { 00554 if (!ISALNUM(*s)) *s = '_'; 00555 else if (ISUPPER(*s)) *s = TOLOWER(*s); 00556 ++s; 00557 } 00558 FL_UNSET(enclib, FL_TAINT|FL_UNTRUSTED); 00559 OBJ_FREEZE(enclib); 00560 ruby_verbose = Qfalse; 00561 ruby_debug = Qfalse; 00562 loaded = rb_protect(require_enc, enclib, 0); 00563 ruby_verbose = verbose; 00564 ruby_debug = debug; 00565 rb_set_errinfo(Qnil); 00566 if (NIL_P(loaded)) return -1; 00567 if ((idx = rb_enc_registered(name)) < 0) return -1; 00568 if (enc_autoload_p(enc_table.list[idx].enc)) return -1; 00569 return idx; 00570 } 00571 00572 static int 00573 enc_autoload(rb_encoding *enc) 00574 { 00575 int i; 00576 rb_encoding *base = enc_table.list[ENC_TO_ENCINDEX(enc)].base; 00577 00578 if (base) { 00579 i = 0; 00580 do { 00581 if (i >= enc_table.count) return -1; 00582 } while (enc_table.list[i].enc != base && (++i, 1)); 00583 if (enc_autoload_p(base)) { 00584 if (enc_autoload(base) < 0) return -1; 00585 } 00586 i = ENC_TO_ENCINDEX(enc); 00587 enc_register_at(i, rb_enc_name(enc), base); 00588 } 00589 else { 00590 i = load_encoding(rb_enc_name(enc)); 00591 } 00592 return i; 00593 } 00594 00595 int 00596 rb_enc_find_index(const char *name) 00597 { 00598 int i = rb_enc_registered(name); 00599 rb_encoding *enc; 00600 00601 if (i < 0) { 00602 i = load_encoding(name); 00603 } 00604 else if (!(enc = rb_enc_from_index(i))) { 00605 if (i != UNSPECIFIED_ENCODING) { 00606 rb_raise(rb_eArgError, "encoding %s is not registered", name); 00607 } 00608 } 00609 else if (enc_autoload_p(enc)) { 00610 if (enc_autoload(enc) < 0) { 00611 rb_warn("failed to load encoding (%s); use ASCII-8BIT instead", 00612 name); 00613 return 0; 00614 } 
00615 } 00616 return i; 00617 } 00618 00619 rb_encoding * 00620 rb_enc_find(const char *name) 00621 { 00622 int idx = rb_enc_find_index(name); 00623 if (idx < 0) idx = 0; 00624 return rb_enc_from_index(idx); 00625 } 00626 00627 static inline int 00628 enc_capable(VALUE obj) 00629 { 00630 if (SPECIAL_CONST_P(obj)) return SYMBOL_P(obj); 00631 switch (BUILTIN_TYPE(obj)) { 00632 case T_STRING: 00633 case T_REGEXP: 00634 case T_FILE: 00635 return TRUE; 00636 case T_DATA: 00637 if (is_data_encoding(obj)) return TRUE; 00638 default: 00639 return FALSE; 00640 } 00641 } 00642 00643 ID 00644 rb_id_encoding(void) 00645 { 00646 CONST_ID(id_encoding, "encoding"); 00647 return id_encoding; 00648 } 00649 00650 int 00651 rb_enc_get_index(VALUE obj) 00652 { 00653 int i = -1; 00654 VALUE tmp; 00655 00656 if (SPECIAL_CONST_P(obj)) { 00657 if (!SYMBOL_P(obj)) return -1; 00658 obj = rb_id2str(SYM2ID(obj)); 00659 } 00660 switch (BUILTIN_TYPE(obj)) { 00661 as_default: 00662 default: 00663 case T_STRING: 00664 case T_REGEXP: 00665 i = ENCODING_GET_INLINED(obj); 00666 if (i == ENCODING_INLINE_MAX) { 00667 VALUE iv; 00668 00669 iv = rb_ivar_get(obj, rb_id_encoding()); 00670 i = NUM2INT(iv); 00671 } 00672 break; 00673 case T_FILE: 00674 tmp = rb_funcall(obj, rb_intern("internal_encoding"), 0, 0); 00675 if (NIL_P(tmp)) obj = rb_funcall(obj, rb_intern("external_encoding"), 0, 0); 00676 else obj = tmp; 00677 if (NIL_P(obj)) break; 00678 case T_DATA: 00679 if (is_data_encoding(obj)) { 00680 i = enc_check_encoding(obj); 00681 } 00682 else { 00683 goto as_default; 00684 } 00685 break; 00686 } 00687 return i; 00688 } 00689 00690 static void 00691 enc_set_index(VALUE obj, int idx) 00692 { 00693 if (idx < ENCODING_INLINE_MAX) { 00694 ENCODING_SET_INLINED(obj, idx); 00695 return; 00696 } 00697 ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX); 00698 rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx)); 00699 } 00700 00701 void 00702 rb_enc_set_index(VALUE obj, int idx) 00703 { 00704 rb_check_frozen(obj); 
00705 enc_set_index(obj, idx); 00706 } 00707 00708 VALUE 00709 rb_enc_associate_index(VALUE obj, int idx) 00710 { 00711 /* enc_check_capable(obj);*/ 00712 rb_check_frozen(obj); 00713 if (rb_enc_get_index(obj) == idx) 00714 return obj; 00715 if (SPECIAL_CONST_P(obj)) { 00716 rb_raise(rb_eArgError, "cannot set encoding"); 00717 } 00718 if (!ENC_CODERANGE_ASCIIONLY(obj) || 00719 !rb_enc_asciicompat(rb_enc_from_index(idx))) { 00720 ENC_CODERANGE_CLEAR(obj); 00721 } 00722 enc_set_index(obj, idx); 00723 return obj; 00724 } 00725 00726 VALUE 00727 rb_enc_associate(VALUE obj, rb_encoding *enc) 00728 { 00729 return rb_enc_associate_index(obj, rb_enc_to_index(enc)); 00730 } 00731 00732 rb_encoding* 00733 rb_enc_get(VALUE obj) 00734 { 00735 return rb_enc_from_index(rb_enc_get_index(obj)); 00736 } 00737 00738 rb_encoding* 00739 rb_enc_check(VALUE str1, VALUE str2) 00740 { 00741 rb_encoding *enc = rb_enc_compatible(str1, str2); 00742 if (!enc) 00743 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", 00744 rb_enc_name(rb_enc_get(str1)), 00745 rb_enc_name(rb_enc_get(str2))); 00746 return enc; 00747 } 00748 00749 rb_encoding* 00750 rb_enc_compatible(VALUE str1, VALUE str2) 00751 { 00752 int idx1, idx2; 00753 rb_encoding *enc1, *enc2; 00754 int isstr1, isstr2; 00755 00756 idx1 = rb_enc_get_index(str1); 00757 idx2 = rb_enc_get_index(str2); 00758 00759 if (idx1 < 0 || idx2 < 0) 00760 return 0; 00761 00762 if (idx1 == idx2) { 00763 return rb_enc_from_index(idx1); 00764 } 00765 enc1 = rb_enc_from_index(idx1); 00766 enc2 = rb_enc_from_index(idx2); 00767 00768 isstr2 = RB_TYPE_P(str2, T_STRING); 00769 if (isstr2 && RSTRING_LEN(str2) == 0) 00770 return enc1; 00771 isstr1 = RB_TYPE_P(str1, T_STRING); 00772 if (isstr1 && RSTRING_LEN(str1) == 0) 00773 return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? 
enc1 : enc2; 00774 if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) { 00775 return 0; 00776 } 00777 00778 /* objects whose encoding is the same of contents */ 00779 if (!isstr2 && idx2 == ENCINDEX_US_ASCII) 00780 return enc1; 00781 if (!isstr1 && idx1 == ENCINDEX_US_ASCII) 00782 return enc2; 00783 00784 if (!isstr1) { 00785 VALUE tmp = str1; 00786 int idx0 = idx1; 00787 str1 = str2; 00788 str2 = tmp; 00789 idx1 = idx2; 00790 idx2 = idx0; 00791 idx0 = isstr1; 00792 isstr1 = isstr2; 00793 isstr2 = idx0; 00794 } 00795 if (isstr1) { 00796 int cr1, cr2; 00797 00798 cr1 = rb_enc_str_coderange(str1); 00799 if (isstr2) { 00800 cr2 = rb_enc_str_coderange(str2); 00801 if (cr1 != cr2) { 00802 /* may need to handle ENC_CODERANGE_BROKEN */ 00803 if (cr1 == ENC_CODERANGE_7BIT) return enc2; 00804 if (cr2 == ENC_CODERANGE_7BIT) return enc1; 00805 } 00806 if (cr2 == ENC_CODERANGE_7BIT) { 00807 return enc1; 00808 } 00809 } 00810 if (cr1 == ENC_CODERANGE_7BIT) 00811 return enc2; 00812 } 00813 return 0; 00814 } 00815 00816 void 00817 rb_enc_copy(VALUE obj1, VALUE obj2) 00818 { 00819 rb_enc_associate_index(obj1, rb_enc_get_index(obj2)); 00820 } 00821 00822 00823 /* 00824 * call-seq: 00825 * obj.encoding -> encoding 00826 * 00827 * Returns the Encoding object that represents the encoding of obj. 
00828 */ 00829 00830 VALUE 00831 rb_obj_encoding(VALUE obj) 00832 { 00833 rb_encoding *enc = rb_enc_get(obj); 00834 if (!enc) { 00835 rb_raise(rb_eTypeError, "unknown encoding"); 00836 } 00837 return rb_enc_from_encoding(enc); 00838 } 00839 00840 int 00841 rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc) 00842 { 00843 return ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); 00844 } 00845 00846 int 00847 rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc) 00848 { 00849 int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); 00850 if (MBCLEN_CHARFOUND_P(n) && MBCLEN_CHARFOUND_LEN(n) <= e-p) 00851 return MBCLEN_CHARFOUND_LEN(n); 00852 else { 00853 int min = rb_enc_mbminlen(enc); 00854 return min <= e-p ? min : (int)(e-p); 00855 } 00856 } 00857 00858 int 00859 rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc) 00860 { 00861 int n; 00862 if (e <= p) 00863 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1); 00864 n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); 00865 if (e-p < n) 00866 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n-(int)(e-p)); 00867 return n; 00868 } 00869 00870 int 00871 rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc) 00872 { 00873 unsigned int c, l; 00874 if (e <= p) 00875 return -1; 00876 if (rb_enc_asciicompat(enc)) { 00877 c = (unsigned char)*p; 00878 if (!ISASCII(c)) 00879 return -1; 00880 if (len) *len = 1; 00881 return c; 00882 } 00883 l = rb_enc_precise_mbclen(p, e, enc); 00884 if (!MBCLEN_CHARFOUND_P(l)) 00885 return -1; 00886 c = rb_enc_mbc_to_codepoint(p, e, enc); 00887 if (!rb_enc_isascii(c, enc)) 00888 return -1; 00889 if (len) *len = l; 00890 return c; 00891 } 00892 00893 unsigned int 00894 rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc) 00895 { 00896 int r; 00897 if (e <= p) 00898 rb_raise(rb_eArgError, "empty string"); 00899 r = rb_enc_precise_mbclen(p, e, enc); 00900 if (MBCLEN_CHARFOUND_P(r)) { 00901 if (len_p) *len_p = 
MBCLEN_CHARFOUND_LEN(r); 00902 return rb_enc_mbc_to_codepoint(p, e, enc); 00903 } 00904 else 00905 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc)); 00906 } 00907 00908 #undef rb_enc_codepoint 00909 unsigned int 00910 rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc) 00911 { 00912 return rb_enc_codepoint_len(p, e, 0, enc); 00913 } 00914 00915 int 00916 rb_enc_codelen(int c, rb_encoding *enc) 00917 { 00918 int n = ONIGENC_CODE_TO_MBCLEN(enc,c); 00919 if (n == 0) { 00920 rb_raise(rb_eArgError, "invalid codepoint 0x%x in %s", c, rb_enc_name(enc)); 00921 } 00922 return n; 00923 } 00924 00925 int 00926 rb_enc_toupper(int c, rb_encoding *enc) 00927 { 00928 return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c):(c)); 00929 } 00930 00931 int 00932 rb_enc_tolower(int c, rb_encoding *enc) 00933 { 00934 return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c):(c)); 00935 } 00936 00937 /* 00938 * call-seq: 00939 * enc.inspect -> string 00940 * 00941 * Returns a string which represents the encoding for programmers. 00942 * 00943 * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>" 00944 * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>" 00945 */ 00946 static VALUE 00947 enc_inspect(VALUE self) 00948 { 00949 VALUE str = rb_sprintf("#<%s:%s%s>", rb_obj_classname(self), 00950 rb_enc_name((rb_encoding*)DATA_PTR(self)), 00951 (enc_dummy_p(self) ? " (dummy)" : "")); 00952 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT); 00953 return str; 00954 } 00955 00956 /* 00957 * call-seq: 00958 * enc.name -> string 00959 * 00960 * Returns the name of the encoding. 
00961 * 00962 * Encoding::UTF_8.name #=> "UTF-8" 00963 */ 00964 static VALUE 00965 enc_name(VALUE self) 00966 { 00967 return rb_usascii_str_new2(rb_enc_name((rb_encoding*)DATA_PTR(self))); 00968 } 00969 00970 static int 00971 enc_names_i(st_data_t name, st_data_t idx, st_data_t args) 00972 { 00973 VALUE *arg = (VALUE *)args; 00974 00975 if ((int)idx == (int)arg[0]) { 00976 VALUE str = rb_usascii_str_new2((char *)name); 00977 OBJ_FREEZE(str); 00978 rb_ary_push(arg[1], str); 00979 } 00980 return ST_CONTINUE; 00981 } 00982 00983 /* 00984 * call-seq: 00985 * enc.names -> array 00986 * 00987 * Returns the list of name and aliases of the encoding. 00988 * 00989 * Encoding::WINDOWS_31J.names #=> ["Windows-31J", "CP932", "csWindows31J"] 00990 */ 00991 static VALUE 00992 enc_names(VALUE self) 00993 { 00994 VALUE args[2]; 00995 00996 args[0] = (VALUE)rb_to_encoding_index(self); 00997 args[1] = rb_ary_new2(0); 00998 st_foreach(enc_table.names, enc_names_i, (st_data_t)args); 00999 return args[1]; 01000 } 01001 01002 /* 01003 * call-seq: 01004 * Encoding.list -> [enc1, enc2, ...] 01005 * 01006 * Returns the list of loaded encodings. 01007 * 01008 * Encoding.list 01009 * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>, 01010 * #<Encoding:ISO-2022-JP (dummy)>] 01011 * 01012 * Encoding.find("US-ASCII") 01013 * #=> #<Encoding:US-ASCII> 01014 * 01015 * Encoding.list 01016 * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>, 01017 * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>] 01018 * 01019 */ 01020 static VALUE 01021 enc_list(VALUE klass) 01022 { 01023 VALUE ary = rb_ary_new2(0); 01024 rb_ary_replace(ary, rb_encoding_list); 01025 return ary; 01026 } 01027 01028 /* 01029 * call-seq: 01030 * Encoding.find(string) -> enc 01031 * Encoding.find(symbol) -> enc 01032 * 01033 * Search the encoding with specified <i>name</i>. 01034 * <i>name</i> should be a string or symbol. 
01035 * 01036 * Encoding.find("US-ASCII") #=> #<Encoding:US-ASCII> 01037 * Encoding.find(:Shift_JIS) #=> #<Encoding:Shift_JIS> 01038 * 01039 * Names which this method accept are encoding names and aliases 01040 * including following special aliases 01041 * 01042 * "external":: default external encoding 01043 * "internal":: default internal encoding 01044 * "locale":: locale encoding 01045 * "filesystem":: filesystem encoding 01046 * 01047 * An ArgumentError is raised when no encoding with <i>name</i>. 01048 * Only <code>Encoding.find("internal")</code> however returns nil 01049 * when no encoding named "internal", in other words, when Ruby has no 01050 * default internal encoding. 01051 */ 01052 static VALUE 01053 enc_find(VALUE klass, VALUE enc) 01054 { 01055 return rb_enc_from_encoding(rb_to_encoding(enc)); 01056 } 01057 01058 /* 01059 * call-seq: 01060 * Encoding.compatible?(obj1, obj2) -> enc or nil 01061 * 01062 * Checks the compatibility of two objects. 01063 * 01064 * If the objects are both strings they are compatible when they are 01065 * concatenatable. The encoding of the concatenated string will be returned 01066 * if they are compatible, nil if they are not. 
01067 * 01068 * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b") 01069 * #=> #<Encoding:ISO-8859-1> 01070 * 01071 * Encoding.compatible?( 01072 * "\xa1".force_encoding("iso-8859-1"), 01073 * "\xa1\xa1".force_encoding("euc-jp")) 01074 * #=> nil 01075 * 01076 * If the objects are non-strings their encodings are compatible when they 01077 * have an encoding and: 01078 * * Either encoding is US-ASCII compatible 01079 * * One of the encodings is a 7-bit encoding 01080 * 01081 */ 01082 static VALUE 01083 enc_compatible_p(VALUE klass, VALUE str1, VALUE str2) 01084 { 01085 rb_encoding *enc; 01086 01087 if (!enc_capable(str1)) return Qnil; 01088 if (!enc_capable(str2)) return Qnil; 01089 enc = rb_enc_compatible(str1, str2); 01090 if (!enc) return Qnil; 01091 return rb_enc_from_encoding(enc); 01092 } 01093 01094 /* :nodoc: */ 01095 static VALUE 01096 enc_dump(int argc, VALUE *argv, VALUE self) 01097 { 01098 rb_scan_args(argc, argv, "01", 0); 01099 return enc_name(self); 01100 } 01101 01102 /* :nodoc: */ 01103 static VALUE 01104 enc_load(VALUE klass, VALUE str) 01105 { 01106 return enc_find(klass, str); 01107 } 01108 01109 rb_encoding * 01110 rb_ascii8bit_encoding(void) 01111 { 01112 if (!enc_table.list) { 01113 rb_enc_init(); 01114 } 01115 return enc_table.list[ENCINDEX_ASCII].enc; 01116 } 01117 01118 int 01119 rb_ascii8bit_encindex(void) 01120 { 01121 return ENCINDEX_ASCII; 01122 } 01123 01124 rb_encoding * 01125 rb_utf8_encoding(void) 01126 { 01127 if (!enc_table.list) { 01128 rb_enc_init(); 01129 } 01130 return enc_table.list[ENCINDEX_UTF_8].enc; 01131 } 01132 01133 int 01134 rb_utf8_encindex(void) 01135 { 01136 return ENCINDEX_UTF_8; 01137 } 01138 01139 rb_encoding * 01140 rb_usascii_encoding(void) 01141 { 01142 if (!enc_table.list) { 01143 rb_enc_init(); 01144 } 01145 return enc_table.list[ENCINDEX_US_ASCII].enc; 01146 } 01147 01148 int 01149 rb_usascii_encindex(void) 01150 { 01151 return ENCINDEX_US_ASCII; 01152 } 01153 01154 int 01155 
rb_locale_encindex(void) 01156 { 01157 VALUE charmap = rb_locale_charmap(rb_cEncoding); 01158 int idx; 01159 01160 if (NIL_P(charmap)) 01161 idx = rb_usascii_encindex(); 01162 else if ((idx = rb_enc_find_index(StringValueCStr(charmap))) < 0) 01163 idx = rb_ascii8bit_encindex(); 01164 01165 if (rb_enc_registered("locale") < 0) enc_alias_internal("locale", idx); 01166 01167 return idx; 01168 } 01169 01170 rb_encoding * 01171 rb_locale_encoding(void) 01172 { 01173 return rb_enc_from_index(rb_locale_encindex()); 01174 } 01175 01176 static int 01177 enc_set_filesystem_encoding(void) 01178 { 01179 int idx; 01180 #if defined NO_LOCALE_CHARMAP 01181 idx = rb_enc_to_index(rb_default_external_encoding()); 01182 #elif defined _WIN32 || defined __CYGWIN__ 01183 char cp[sizeof(int) * 8 / 3 + 4]; 01184 snprintf(cp, sizeof cp, "CP%d", AreFileApisANSI() ? GetACP() : GetOEMCP()); 01185 idx = rb_enc_find_index(cp); 01186 if (idx < 0) idx = rb_ascii8bit_encindex(); 01187 #else 01188 idx = rb_enc_to_index(rb_default_external_encoding()); 01189 #endif 01190 01191 enc_alias_internal("filesystem", idx); 01192 return idx; 01193 } 01194 01195 int 01196 rb_filesystem_encindex(void) 01197 { 01198 int idx = rb_enc_registered("filesystem"); 01199 if (idx < 0) 01200 idx = rb_ascii8bit_encindex(); 01201 return idx; 01202 } 01203 01204 rb_encoding * 01205 rb_filesystem_encoding(void) 01206 { 01207 return rb_enc_from_index(rb_filesystem_encindex()); 01208 } 01209 01210 struct default_encoding { 01211 int index; /* -2 => not yet set, -1 => nil */ 01212 rb_encoding *enc; 01213 }; 01214 01215 static struct default_encoding default_external = {0}; 01216 01217 static int 01218 enc_set_default_encoding(struct default_encoding *def, VALUE encoding, const char *name) 01219 { 01220 int overridden = FALSE; 01221 01222 if (def->index != -2) 01223 /* Already set */ 01224 overridden = TRUE; 01225 01226 if (NIL_P(encoding)) { 01227 def->index = -1; 01228 def->enc = 0; 01229 st_insert(enc_table.names, 
(st_data_t)strdup(name), 01230 (st_data_t)UNSPECIFIED_ENCODING); 01231 } 01232 else { 01233 def->index = rb_enc_to_index(rb_to_encoding(encoding)); 01234 def->enc = 0; 01235 enc_alias_internal(name, def->index); 01236 } 01237 01238 if (def == &default_external) 01239 enc_set_filesystem_encoding(); 01240 01241 return overridden; 01242 } 01243 01244 rb_encoding * 01245 rb_default_external_encoding(void) 01246 { 01247 if (default_external.enc) return default_external.enc; 01248 01249 if (default_external.index >= 0) { 01250 default_external.enc = rb_enc_from_index(default_external.index); 01251 return default_external.enc; 01252 } 01253 else { 01254 return rb_locale_encoding(); 01255 } 01256 } 01257 01258 VALUE 01259 rb_enc_default_external(void) 01260 { 01261 return rb_enc_from_encoding(rb_default_external_encoding()); 01262 } 01263 01264 /* 01265 * call-seq: 01266 * Encoding.default_external -> enc 01267 * 01268 * Returns default external encoding. 01269 * 01270 * The default external encoding is used by default for strings created from 01271 * the following locations: 01272 * 01273 * * CSV 01274 * * File data read from disk 01275 * * SDBM 01276 * * StringIO 01277 * * Zlib::GzipReader 01278 * * Zlib::GzipWriter 01279 * * String#inspect 01280 * * Regexp#inspect 01281 * 01282 * While strings created from these locations will have this encoding, the 01283 * encoding may not be valid. Be sure to check String#valid_encoding?. 01284 * 01285 * File data written to disk will be transcoded to the default external 01286 * encoding when written. 01287 * 01288 * The default external encoding is initialized by the locale or -E option. 
01289 */ 01290 static VALUE 01291 get_default_external(VALUE klass) 01292 { 01293 return rb_enc_default_external(); 01294 } 01295 01296 void 01297 rb_enc_set_default_external(VALUE encoding) 01298 { 01299 if (NIL_P(encoding)) { 01300 rb_raise(rb_eArgError, "default external can not be nil"); 01301 } 01302 enc_set_default_encoding(&default_external, encoding, 01303 "external"); 01304 } 01305 01306 /* 01307 * call-seq: 01308 * Encoding.default_external = enc 01309 * 01310 * Sets default external encoding. You should not set 01311 * Encoding::default_external in ruby code as strings created before changing 01312 * the value may have a different encoding from strings created after thevalue 01313 * was changed., instead you should use <tt>ruby -E</tt> to invoke ruby with 01314 * the correct default_external. 01315 * 01316 * See Encoding::default_external for information on how the default external 01317 * encoding is used. 01318 */ 01319 static VALUE 01320 set_default_external(VALUE klass, VALUE encoding) 01321 { 01322 rb_warning("setting Encoding.default_external"); 01323 rb_enc_set_default_external(encoding); 01324 return encoding; 01325 } 01326 01327 static struct default_encoding default_internal = {-2}; 01328 01329 rb_encoding * 01330 rb_default_internal_encoding(void) 01331 { 01332 if (!default_internal.enc && default_internal.index >= 0) { 01333 default_internal.enc = rb_enc_from_index(default_internal.index); 01334 } 01335 return default_internal.enc; /* can be NULL */ 01336 } 01337 01338 VALUE 01339 rb_enc_default_internal(void) 01340 { 01341 /* Note: These functions cope with default_internal not being set */ 01342 return rb_enc_from_encoding(rb_default_internal_encoding()); 01343 } 01344 01345 /* 01346 * call-seq: 01347 * Encoding.default_internal -> enc 01348 * 01349 * Returns default internal encoding. 
Strings will be transcoded to the 01350 * default internal encoding in the following places if the default internal 01351 * encoding is not nil: 01352 * 01353 * * CSV 01354 * * Etc.sysconfdir and Etc.systmpdir 01355 * * File data read from disk 01356 * * File names from Dir 01357 * * Integer#chr 01358 * * String#inspect and Regexp#inspect 01359 * * Strings returned from Curses 01360 * * Strings returned from Readline 01361 * * Strings returned from SDBM 01362 * * Time#zone 01363 * * Values from ENV 01364 * * Values in ARGV including $PROGRAM_NAME 01365 * * __FILE__ 01366 * 01367 * Additionally String#encode and String#encode! use the default internal 01368 * encoding if no encoding is given. 01369 * 01370 * The locale encoding (__ENCODING__), not default_internal, is used as the 01371 * encoding of created strings. 01372 * 01373 * Encoding::default_internal is initialized by the source file's 01374 * internal_encoding or -E option. 01375 */ 01376 static VALUE 01377 get_default_internal(VALUE klass) 01378 { 01379 return rb_enc_default_internal(); 01380 } 01381 01382 void 01383 rb_enc_set_default_internal(VALUE encoding) 01384 { 01385 enc_set_default_encoding(&default_internal, encoding, 01386 "internal"); 01387 } 01388 01389 /* 01390 * call-seq: 01391 * Encoding.default_internal = enc or nil 01392 * 01393 * Sets default internal encoding or removes default internal encoding when 01394 * passed nil. You should not set Encoding::default_internal in ruby code as 01395 * strings created before changing the value may have a different encoding 01396 * from strings created after the change. Instead you should use 01397 * <tt>ruby -E</tt> to invoke ruby with the correct default_internal. 01398 * 01399 * See Encoding::default_internal for information on how the default internal 01400 * encoding is used. 
01401 */ 01402 static VALUE 01403 set_default_internal(VALUE klass, VALUE encoding) 01404 { 01405 rb_warning("setting Encoding.default_internal"); 01406 rb_enc_set_default_internal(encoding); 01407 return encoding; 01408 } 01409 01410 /* 01411 * call-seq: 01412 * Encoding.locale_charmap -> string 01413 * 01414 * Returns the locale charmap name. 01415 * It returns nil if no appropriate information. 01416 * 01417 * Debian GNU/Linux 01418 * LANG=C 01419 * Encoding.locale_charmap #=> "ANSI_X3.4-1968" 01420 * LANG=ja_JP.EUC-JP 01421 * Encoding.locale_charmap #=> "EUC-JP" 01422 * 01423 * SunOS 5 01424 * LANG=C 01425 * Encoding.locale_charmap #=> "646" 01426 * LANG=ja 01427 * Encoding.locale_charmap #=> "eucJP" 01428 * 01429 * The result is highly platform dependent. 01430 * So Encoding.find(Encoding.locale_charmap) may cause an error. 01431 * If you need some encoding object even for unknown locale, 01432 * Encoding.find("locale") can be used. 01433 * 01434 */ 01435 VALUE 01436 rb_locale_charmap(VALUE klass) 01437 { 01438 #if defined NO_LOCALE_CHARMAP 01439 return rb_usascii_str_new2("ASCII-8BIT"); 01440 #elif defined _WIN32 || defined __CYGWIN__ 01441 const char *nl_langinfo_codeset(void); 01442 const char *codeset = nl_langinfo_codeset(); 01443 char cp[sizeof(int) * 3 + 4]; 01444 if (!codeset) { 01445 UINT codepage = GetConsoleCP(); 01446 if(!codepage) codepage = GetACP(); 01447 snprintf(cp, sizeof(cp), "CP%d", codepage); 01448 codeset = cp; 01449 } 01450 return rb_usascii_str_new2(codeset); 01451 #elif defined HAVE_LANGINFO_H 01452 char *codeset; 01453 codeset = nl_langinfo(CODESET); 01454 return rb_usascii_str_new2(codeset); 01455 #else 01456 return Qnil; 01457 #endif 01458 } 01459 01460 static void 01461 set_encoding_const(const char *name, rb_encoding *enc) 01462 { 01463 VALUE encoding = rb_enc_from_encoding(enc); 01464 char *s = (char *)name; 01465 int haslower = 0, hasupper = 0, valid = 0; 01466 01467 if (ISDIGIT(*s)) return; 01468 if (ISUPPER(*s)) { 01469 
hasupper = 1; 01470 while (*++s && (ISALNUM(*s) || *s == '_')) { 01471 if (ISLOWER(*s)) haslower = 1; 01472 } 01473 } 01474 if (!*s) { 01475 if (s - name > ENCODING_NAMELEN_MAX) return; 01476 valid = 1; 01477 rb_define_const(rb_cEncoding, name, encoding); 01478 } 01479 if (!valid || haslower) { 01480 size_t len = s - name; 01481 if (len > ENCODING_NAMELEN_MAX) return; 01482 if (!haslower || !hasupper) { 01483 do { 01484 if (ISLOWER(*s)) haslower = 1; 01485 if (ISUPPER(*s)) hasupper = 1; 01486 } while (*++s && (!haslower || !hasupper)); 01487 len = s - name; 01488 } 01489 len += strlen(s); 01490 if (len++ > ENCODING_NAMELEN_MAX) return; 01491 MEMCPY(s = ALLOCA_N(char, len), name, char, len); 01492 name = s; 01493 if (!valid) { 01494 if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s); 01495 for (; *s; ++s) { 01496 if (!ISALNUM(*s)) *s = '_'; 01497 } 01498 if (hasupper) { 01499 rb_define_const(rb_cEncoding, name, encoding); 01500 } 01501 } 01502 if (haslower) { 01503 for (s = (char *)name; *s; ++s) { 01504 if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s); 01505 } 01506 rb_define_const(rb_cEncoding, name, encoding); 01507 } 01508 } 01509 } 01510 01511 static int 01512 rb_enc_name_list_i(st_data_t name, st_data_t idx, st_data_t arg) 01513 { 01514 VALUE ary = (VALUE)arg; 01515 VALUE str = rb_usascii_str_new2((char *)name); 01516 OBJ_FREEZE(str); 01517 rb_ary_push(ary, str); 01518 return ST_CONTINUE; 01519 } 01520 01521 /* 01522 * call-seq: 01523 * Encoding.name_list -> ["enc1", "enc2", ...] 01524 * 01525 * Returns the list of available encoding names. 
01526 * 01527 * Encoding.name_list 01528 * #=> ["US-ASCII", "ASCII-8BIT", "UTF-8", 01529 * "ISO-8859-1", "Shift_JIS", "EUC-JP", 01530 * "Windows-31J", 01531 * "BINARY", "CP932", "eucJP"] 01532 * 01533 */ 01534 01535 static VALUE 01536 rb_enc_name_list(VALUE klass) 01537 { 01538 VALUE ary = rb_ary_new2(enc_table.names->num_entries); 01539 st_foreach(enc_table.names, rb_enc_name_list_i, (st_data_t)ary); 01540 return ary; 01541 } 01542 01543 static int 01544 rb_enc_aliases_enc_i(st_data_t name, st_data_t orig, st_data_t arg) 01545 { 01546 VALUE *p = (VALUE *)arg; 01547 VALUE aliases = p[0], ary = p[1]; 01548 int idx = (int)orig; 01549 VALUE key, str = rb_ary_entry(ary, idx); 01550 01551 if (NIL_P(str)) { 01552 rb_encoding *enc = rb_enc_from_index(idx); 01553 01554 if (!enc) return ST_CONTINUE; 01555 if (STRCASECMP((char*)name, rb_enc_name(enc)) == 0) { 01556 return ST_CONTINUE; 01557 } 01558 str = rb_usascii_str_new2(rb_enc_name(enc)); 01559 OBJ_FREEZE(str); 01560 rb_ary_store(ary, idx, str); 01561 } 01562 key = rb_usascii_str_new2((char *)name); 01563 OBJ_FREEZE(key); 01564 rb_hash_aset(aliases, key, str); 01565 return ST_CONTINUE; 01566 } 01567 01568 /* 01569 * call-seq: 01570 * Encoding.aliases -> {"alias1" => "orig1", "alias2" => "orig2", ...} 01571 * 01572 * Returns the hash of available encoding alias and original encoding name. 
01573 * 01574 * Encoding.aliases 01575 * #=> {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1986"=>"US-ASCII", 01576 * "SJIS"=>"Shift_JIS", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"} 01577 * 01578 */ 01579 01580 static VALUE 01581 rb_enc_aliases(VALUE klass) 01582 { 01583 VALUE aliases[2]; 01584 aliases[0] = rb_hash_new(); 01585 aliases[1] = rb_ary_new(); 01586 st_foreach(enc_table.names, rb_enc_aliases_enc_i, (st_data_t)aliases); 01587 return aliases[0]; 01588 } 01589 01590 void 01591 Init_Encoding(void) 01592 { 01593 #undef rb_intern 01594 #define rb_intern(str) rb_intern_const(str) 01595 VALUE list; 01596 int i; 01597 01598 rb_cEncoding = rb_define_class("Encoding", rb_cObject); 01599 rb_undef_alloc_func(rb_cEncoding); 01600 rb_undef_method(CLASS_OF(rb_cEncoding), "new"); 01601 rb_define_method(rb_cEncoding, "to_s", enc_name, 0); 01602 rb_define_method(rb_cEncoding, "inspect", enc_inspect, 0); 01603 rb_define_method(rb_cEncoding, "name", enc_name, 0); 01604 rb_define_method(rb_cEncoding, "names", enc_names, 0); 01605 rb_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0); 01606 rb_define_method(rb_cEncoding, "ascii_compatible?", enc_ascii_compatible_p, 0); 01607 rb_define_method(rb_cEncoding, "replicate", enc_replicate, 1); 01608 rb_define_singleton_method(rb_cEncoding, "list", enc_list, 0); 01609 rb_define_singleton_method(rb_cEncoding, "name_list", rb_enc_name_list, 0); 01610 rb_define_singleton_method(rb_cEncoding, "aliases", rb_enc_aliases, 0); 01611 rb_define_singleton_method(rb_cEncoding, "find", enc_find, 1); 01612 rb_define_singleton_method(rb_cEncoding, "compatible?", enc_compatible_p, 2); 01613 01614 rb_define_method(rb_cEncoding, "_dump", enc_dump, -1); 01615 rb_define_singleton_method(rb_cEncoding, "_load", enc_load, 1); 01616 01617 rb_define_singleton_method(rb_cEncoding, "default_external", get_default_external, 0); 01618 rb_define_singleton_method(rb_cEncoding, "default_external=", set_default_external, 1); 01619 
rb_define_singleton_method(rb_cEncoding, "default_internal", get_default_internal, 0); 01620 rb_define_singleton_method(rb_cEncoding, "default_internal=", set_default_internal, 1); 01621 rb_define_singleton_method(rb_cEncoding, "locale_charmap", rb_locale_charmap, 0); 01622 01623 list = rb_ary_new2(enc_table.count); 01624 RBASIC(list)->klass = 0; 01625 rb_encoding_list = list; 01626 rb_gc_register_mark_object(list); 01627 01628 for (i = 0; i < enc_table.count; ++i) { 01629 rb_ary_push(list, enc_new(enc_table.list[i].enc)); 01630 } 01631 } 01632 01633 /* locale insensitive ctype functions */ 01634 01635 #define ctype_test(c, ctype) \ 01636 (rb_isascii(c) && ONIGENC_IS_ASCII_CODE_CTYPE((c), (ctype))) 01637 01638 int rb_isalnum(int c) { return ctype_test(c, ONIGENC_CTYPE_ALNUM); } 01639 int rb_isalpha(int c) { return ctype_test(c, ONIGENC_CTYPE_ALPHA); } 01640 int rb_isblank(int c) { return ctype_test(c, ONIGENC_CTYPE_BLANK); } 01641 int rb_iscntrl(int c) { return ctype_test(c, ONIGENC_CTYPE_CNTRL); } 01642 int rb_isdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_DIGIT); } 01643 int rb_isgraph(int c) { return ctype_test(c, ONIGENC_CTYPE_GRAPH); } 01644 int rb_islower(int c) { return ctype_test(c, ONIGENC_CTYPE_LOWER); } 01645 int rb_isprint(int c) { return ctype_test(c, ONIGENC_CTYPE_PRINT); } 01646 int rb_ispunct(int c) { return ctype_test(c, ONIGENC_CTYPE_PUNCT); } 01647 int rb_isspace(int c) { return ctype_test(c, ONIGENC_CTYPE_SPACE); } 01648 int rb_isupper(int c) { return ctype_test(c, ONIGENC_CTYPE_UPPER); } 01649 int rb_isxdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_XDIGIT); } 01650 01651 int 01652 rb_tolower(int c) 01653 { 01654 return rb_isascii(c) ? ONIGENC_ASCII_CODE_TO_LOWER_CASE(c) : c; 01655 } 01656 01657 int 01658 rb_toupper(int c) 01659 { 01660 return rb_isascii(c) ? ONIGENC_ASCII_CODE_TO_UPPER_CASE(c) : c; 01661 } 01662 01663