Ruby 1.9.3p327(2012-11-10revision37606)
ext/iconv/iconv.c
Go to the documentation of this file.
00001 /* -*- mode:c; c-file-style:"ruby" -*- */
00002 /**********************************************************************
00003 
00004   iconv.c -
00005 
00006   $Author: nobu $
00007   created at: Wed Dec  1 20:28:09 JST 1999
00008 
00009   All the files in this distribution are covered under the Ruby's
00010   license (see the file COPYING).
00011 
00012   Documentation by Yukihiro Matsumoto and Gavin Sinclair.
00013 
00014 **********************************************************************/
00015 
00016 #include "ruby/ruby.h"
00017 #include <errno.h>
00018 #include <iconv.h>
00019 #include <assert.h>
00020 #include "ruby/st.h"
00021 #include "ruby/encoding.h"
00022 
00023 /*
00024  * Document-class: Iconv
00025  *
00026  * == Summary
00027  *
00028  * Ruby extension for charset conversion.
00029  *
00030  * == Abstract
00031  *
00032  * Iconv is a wrapper class for the UNIX 95 <tt>iconv()</tt> function family,
00033  * which translates string between various encoding systems.
00034  *
00035  * See Open Group's on-line documents for more details.
00036  * * <tt>iconv.h</tt>:       http://www.opengroup.org/onlinepubs/007908799/xsh/iconv.h.html
00037  * * <tt>iconv_open()</tt>:  http://www.opengroup.org/onlinepubs/007908799/xsh/iconv_open.html
00038  * * <tt>iconv()</tt>:       http://www.opengroup.org/onlinepubs/007908799/xsh/iconv.html
00039  * * <tt>iconv_close()</tt>: http://www.opengroup.org/onlinepubs/007908799/xsh/iconv_close.html
00040  *
00041  * Which coding systems are available is platform-dependent.
00042  *
00043  * == Examples
00044  *
00045  * 1. Simple conversion between two charsets.
00046  *
00047  *      converted_text = Iconv.conv('iso-8859-15', 'utf-8', text)
00048  *
00049  * 2. Instantiate a new Iconv and use method Iconv#iconv.
00050  *
00051  *      cd = Iconv.new(to, from)
00052  *      begin
00053  *        input.each { |s| output << cd.iconv(s) }
00054  *        output << cd.iconv(nil)                   # Don't forget this!
00055  *      ensure
00056  *        cd.close
00057  *      end
00058  *
00059  * 3. Invoke Iconv.open with a block.
00060  *
00061  *      Iconv.open(to, from) do |cd|
00062  *        input.each { |s| output << cd.iconv(s) }
00063  *        output << cd.iconv(nil)
00064  *      end
00065  *
00066  * 4. Shorthand for (3).
00067  *
00068  *      Iconv.iconv(to, from, *input.to_a)
00069  *
00070  * == Attentions
00071  *
00072  * Even if some implementation-dependent extensions are useful,
00073  * DON'T USE those extensions in libraries and scripts meant for wide distribution.
00074  * If you want to use those features, use String#encode.
00075  */
00076 
00077 /* The invalid value for iconv_t is -1 but the empty VALUE is 0, so handles are
00078    stored XOR-ed with -1.  This hopes VALUE is big enough to hold an iconv_t. */
00079 #define VALUE2ICONV(v) ((iconv_t)((VALUE)(v) ^ -1))
00080 #define ICONV2VALUE(c) ((VALUE)(c) ^ -1)
00081 
00082 struct iconv_env_t      /* state shared across one Iconv.iconv / Iconv.conv call */
00083 {
00084     iconv_t cd;         /* conversion descriptor handed to iconv_convert() */
00085     int argc;           /* number of input strings still to be converted */
00086     VALUE *argv;        /* cursor into the remaining input strings */
00087     VALUE ret;          /* accumulated result; pieces are added through append */
00088     int toidx;          /* destination encoding index filled in by iconv_create() */
00089     VALUE (*append)_((VALUE, VALUE)); /* adds one converted piece to ret */
00090 };
00091 
00092 struct rb_iconv_opt_t   /* converter options gathered by get_iconv_opt() */
00093 {
00094     VALUE transliterate;    /* Qundef unless the caller supplied the option */
00095     VALUE discard_ilseq;    /* Qundef unless the caller supplied the option */
00096 };
00097 
00098 static ID id_transliterate, id_discard_ilseq;
00099 
00100 static VALUE rb_eIconvInvalidEncoding;  /* raised by iconv_create() when iconv_open() fails with EINVAL */
00101 static VALUE rb_eIconvFailure;          /* NOTE(review): not referenced in this part of the file */
00102 static VALUE rb_eIconvIllegalSeq;       /* iconv_try(): EILSEQ, or input left over after a successful call */
00103 static VALUE rb_eIconvInvalidChar;      /* iconv_try(): EINVAL */
00104 static VALUE rb_eIconvOutOfRange;       /* iconv_convert(): iconv() reported an impossible output length */
00105 static VALUE rb_eIconvBrokenLibrary;    /* a call failed while leaving errno at 0 */
00106 
00107 static ID rb_success, rb_failed;
00108 static VALUE iconv_fail _((VALUE error, VALUE success, VALUE failed, struct iconv_env_t* env, const char *mesg));
00109 static VALUE iconv_fail_retry _((VALUE error, VALUE success, VALUE failed, struct iconv_env_t* env, const char *mesg));
00110 static VALUE iconv_failure_initialize _((VALUE error, VALUE mesg, VALUE success, VALUE failed));
00111 static VALUE iconv_failure_success _((VALUE self));
00112 static VALUE iconv_failure_failed _((VALUE self));
00113 
00114 static iconv_t iconv_create _((VALUE to, VALUE from, struct rb_iconv_opt_t *opt, int *idx));
00115 static void iconv_dfree _((void *cd));
00116 static VALUE iconv_free _((VALUE cd));
00117 static VALUE iconv_try _((iconv_t cd, const char **inptr, size_t *inlen, char **outptr, size_t *outlen));
00118 static VALUE rb_str_derive _((VALUE str, const char* ptr, long len));
00119 static VALUE iconv_convert _((iconv_t cd, VALUE str, long start, long length, int toidx,
00120                               struct iconv_env_t* env));
00121 static VALUE iconv_s_allocate _((VALUE klass));
00122 static VALUE iconv_initialize _((int argc, VALUE *argv, VALUE self));
00123 static VALUE iconv_s_open _((int argc, VALUE *argv, VALUE self));
00124 static VALUE iconv_s_convert _((struct iconv_env_t* env));
00125 static VALUE iconv_s_iconv _((int argc, VALUE *argv, VALUE self));
00126 static VALUE iconv_init_state _((VALUE cd));
00127 static VALUE iconv_finish _((VALUE self));
00128 static VALUE iconv_iconv _((int argc, VALUE *argv, VALUE self));
00129 static VALUE iconv_conv _((int argc, VALUE *argv, VALUE self));
00130 
00131 static VALUE charset_map;
00132 
00133 /*
00134  * Document-method: charset_map
00135  * call-seq: Iconv.charset_map
00136  *
00137  * Returns the map from canonical name to system dependent name.
00138  */
00139 static VALUE
00140 charset_map_get(void)
00141 {
00142     return charset_map;
00143 }
00144 
00145 static VALUE
00146 strip_glibc_option(VALUE *code)
00147 {
00148     VALUE val = StringValue(*code);
00149     const char *ptr = RSTRING_PTR(val), *pend = RSTRING_END(val);
00150     const char *slash = memchr(ptr, '/', pend - ptr);
00151 
00152     if (slash && slash < pend - 1 && slash[1] ==  '/') {
00153         VALUE opt = rb_str_subseq(val, slash - ptr, pend - slash);
00154         val = rb_str_subseq(val, 0, slash - ptr);
00155         *code = val;
00156         return opt;
00157     }
00158     return 0;
00159 }
00160 
00161 static char *
00162 map_charset(VALUE *code)
00163 {
00164     VALUE val = StringValue(*code);
00165 
00166     if (RHASH_SIZE(charset_map)) {
00167         st_data_t data;
00168         VALUE key = rb_funcall2(val, rb_intern("downcase"), 0, 0);
00169         StringValuePtr(key);
00170         if (st_lookup(RHASH_TBL(charset_map), key, &data)) {
00171             *code = (VALUE)data;
00172         }
00173     }
00174     return StringValuePtr(*code);
00175 }
00176 
00177 NORETURN(static void rb_iconv_sys_fail(const char *s));
00178 static void
00179 rb_iconv_sys_fail(const char *s)
00180 {
00181     if (errno == 0) {
00182         rb_exc_raise(iconv_fail(rb_eIconvBrokenLibrary, Qnil, Qnil, NULL, s));
00183     }
00184     rb_sys_fail(s);
00185 }
00186 
00187 #define rb_sys_fail(s) rb_iconv_sys_fail(s)
00188 
00189 static iconv_t
00190 iconv_create(VALUE to, VALUE from, struct rb_iconv_opt_t *opt, int *idx)
00191 {
00192     VALUE toopt = strip_glibc_option(&to);
00193     VALUE fromopt = strip_glibc_option(&from);
00194     VALUE toenc = 0, fromenc = 0;
00195     const char* tocode = map_charset(&to);
00196     const char* fromcode = map_charset(&from);
00197     iconv_t cd;
00198     int retry = 0;
00199 
00200     *idx = rb_enc_find_index(tocode);
00201 
00202     if (toopt) {
00203         toenc = rb_str_plus(to, toopt);
00204         tocode = RSTRING_PTR(toenc);
00205     }
00206     if (fromopt) {
00207         fromenc = rb_str_plus(from, fromopt);
00208         fromcode = RSTRING_PTR(fromenc);
00209     }
00210     while ((cd = iconv_open(tocode, fromcode)) == (iconv_t)-1) {
00211         int inval = 0;
00212         switch (errno) {
00213           case EMFILE:
00214           case ENFILE:
00215           case ENOMEM:
00216             if (!retry++) {
00217                 rb_gc();
00218                 continue;
00219             }
00220             break;
00221           case EINVAL:
00222             retry = 0;
00223             inval = 1;
00224             if (toenc) {
00225                 tocode = RSTRING_PTR(to);
00226                 rb_str_resize(toenc, 0);
00227                 toenc = 0;
00228                 continue;
00229             }
00230             if (fromenc) {
00231                 fromcode = RSTRING_PTR(from);
00232                 rb_str_resize(fromenc, 0);
00233                 fromenc = 0;
00234                 continue;
00235             }
00236             break;
00237         }
00238         {
00239             const char *s = inval ? "invalid encoding " : "iconv";
00240             volatile VALUE msg = rb_str_new(0, strlen(s) + RSTRING_LEN(to) +
00241                                             RSTRING_LEN(from) + 8);
00242 
00243             sprintf(RSTRING_PTR(msg), "%s(\"%s\", \"%s\")",
00244                     s, RSTRING_PTR(to), RSTRING_PTR(from));
00245             s = RSTRING_PTR(msg);
00246             rb_str_set_len(msg, strlen(s));
00247             if (!inval) rb_sys_fail(s);
00248             rb_exc_raise(iconv_fail(rb_eIconvInvalidEncoding, Qnil,
00249                                     rb_ary_new3(2, to, from), NULL, s));
00250         }
00251     }
00252 
00253     if (toopt || fromopt) {
00254         if (toopt && fromopt && RTEST(rb_str_equal(toopt, fromopt))) {
00255             fromopt = 0;
00256         }
00257         if (toopt && fromopt) {
00258             rb_warning("encoding option isn't portable: %s, %s",
00259                        RSTRING_PTR(toopt) + 2, RSTRING_PTR(fromopt) + 2);
00260         }
00261         else {
00262             rb_warning("encoding option isn't portable: %s",
00263                        (toopt ? RSTRING_PTR(toopt) : RSTRING_PTR(fromopt)) + 2);
00264         }
00265     }
00266 
00267     if (opt) {
00268 #ifdef ICONV_SET_TRANSLITERATE
00269         if (opt->transliterate != Qundef) {
00270             int flag = RTEST(opt->transliterate);
00271             rb_warning("encoding option isn't portable: transliterate");
00272             if (iconvctl(cd, ICONV_SET_TRANSLITERATE, (void *)&flag))
00273                 rb_sys_fail("ICONV_SET_TRANSLITERATE");
00274         }
00275 #endif
00276 #ifdef ICONV_SET_DISCARD_ILSEQ
00277         if (opt->discard_ilseq != Qundef) {
00278             int flag = RTEST(opt->discard_ilseq);
00279             rb_warning("encoding option isn't portable: discard_ilseq");
00280             if (iconvctl(cd, ICONV_SET_DISCARD_ILSEQ, (void *)&flag))
00281                 rb_sys_fail("ICONV_SET_DISCARD_ILSEQ");
00282         }
00283 #endif
00284     }
00285 
00286     return cd;
00287 }
00288 
/*
 * Finalizer for wrapped Iconv objects: closes the descriptor stored (in
 * VALUE-encoded form) in +cd+.  The result of iconv_close() is ignored.
 */
static void
iconv_dfree(void *cd)
{
    iconv_t handle = VALUE2ICONV(cd);

    iconv_close(handle);
}
00294 
00295 #define ICONV_FREE iconv_dfree
00296 
00297 static VALUE
00298 iconv_free(VALUE cd)
00299 {
00300     if (cd && iconv_close(VALUE2ICONV(cd)) == -1)
00301         rb_sys_fail("iconv_close");
00302     return Qnil;
00303 }
00304 
00305 static VALUE
00306 check_iconv(VALUE obj)
00307 {
00308     Check_Type(obj, T_DATA);
00309     if (RDATA(obj)->dfree != ICONV_FREE) {
00310         rb_raise(rb_eArgError, "Iconv expected (%s)", rb_class2name(CLASS_OF(obj)));
00311     }
00312     return (VALUE)DATA_PTR(obj);
00313 }
00314 
00315 static VALUE
00316 iconv_try(iconv_t cd, const char **inptr, size_t *inlen, char **outptr, size_t *outlen)
00317 {
00318 #ifdef ICONV_INPTR_CONST
00319 #define ICONV_INPTR_CAST
00320 #else
00321 #define ICONV_INPTR_CAST (char **)
00322 #endif
00323     size_t ret;
00324 
00325     errno = 0;
00326     ret = iconv(cd, ICONV_INPTR_CAST inptr, inlen, outptr, outlen);
00327     if (ret == (size_t)-1) {
00328         if (!*inlen)
00329             return Qfalse;
00330         switch (errno) {
00331           case E2BIG:
00332             /* try the left in next loop */
00333             break;
00334           case EILSEQ:
00335             return rb_eIconvIllegalSeq;
00336           case EINVAL:
00337             return rb_eIconvInvalidChar;
00338           case 0:
00339             return rb_eIconvBrokenLibrary;
00340           default:
00341             rb_sys_fail("iconv");
00342         }
00343     }
00344     else if (*inlen > 0) {
00345         /* something goes wrong */
00346         return rb_eIconvIllegalSeq;
00347     }
00348     else if (ret) {
00349         return Qnil;            /* conversion */
00350     }
00351     return Qfalse;
00352 }
00353 
00354 #define FAILED_MAXLEN 16
00355 
00356 static VALUE
00357 iconv_failure_initialize(VALUE error, VALUE mesg, VALUE success, VALUE failed)
00358 {
00359     rb_call_super(1, &mesg);
00360     rb_ivar_set(error, rb_success, success);
00361     rb_ivar_set(error, rb_failed, failed);
00362     return error;
00363 }
00364 
00365 static VALUE
00366 iconv_fail(VALUE error, VALUE success, VALUE failed, struct iconv_env_t* env, const char *mesg)
00367 {
00368     VALUE args[3];
00369 
00370     if (mesg && *mesg) {
00371         args[0] = rb_str_new2(mesg);
00372     }
00373     else if (TYPE(failed) != T_STRING || RSTRING_LEN(failed) < FAILED_MAXLEN) {
00374         args[0] = rb_inspect(failed);
00375     }
00376     else {
00377         args[0] = rb_inspect(rb_str_substr(failed, 0, FAILED_MAXLEN));
00378         rb_str_cat2(args[0], "...");
00379     }
00380     args[1] = success;
00381     args[2] = failed;
00382     if (env) {
00383         args[1] = env->append(rb_obj_dup(env->ret), success);
00384         if (env->argc > 0) {
00385             *(env->argv) = failed;
00386             args[2] = rb_ary_new4(env->argc, env->argv);
00387         }
00388     }
00389     return rb_class_new_instance(3, args, error);
00390 }
00391 
00392 static VALUE
00393 iconv_fail_retry(VALUE error, VALUE success, VALUE failed, struct iconv_env_t* env, const char *mesg)
00394 {
00395     error = iconv_fail(error, success, failed, env, mesg);
00396     if (!rb_block_given_p()) rb_exc_raise(error);
00397     rb_set_errinfo(error);
00398     return rb_yield(failed);
00399 }
00400 
00401 static VALUE
00402 rb_str_derive(VALUE str, const char* ptr, long len)
00403 {
00404     VALUE ret;
00405 
00406     if (NIL_P(str))
00407         return rb_str_new(ptr, len);
00408     if (RSTRING_PTR(str) + RSTRING_LEN(str) == ptr + len)
00409         ret = rb_str_subseq(str, ptr - RSTRING_PTR(str), len);
00410     else
00411         ret = rb_str_new(ptr, len);
00412     OBJ_INFECT(ret, str);
00413     return ret;
00414 }
00415 
00416 static VALUE
00417 iconv_convert(iconv_t cd, VALUE str, long start, long length, int toidx, struct iconv_env_t* env)
00418 {
00419     VALUE ret = Qfalse;
00420     VALUE error = Qfalse;
00421     VALUE rescue;
00422     const char *inptr, *instart;
00423     size_t inlen;
00424     /* I believe ONE CHARACTER never exceed this. */
00425     char buffer[BUFSIZ];
00426     char *outptr;
00427     size_t outlen;
00428 
00429     if (cd == (iconv_t)-1)
00430         rb_raise(rb_eArgError, "closed iconv");
00431 
00432     if (NIL_P(str)) {
00433         /* Reset output pointer or something. */
00434         inptr = "";
00435         inlen = 0;
00436         outptr = buffer;
00437         outlen = sizeof(buffer);
00438         error = iconv_try(cd, &inptr, &inlen, &outptr, &outlen);
00439         if (RTEST(error)) {
00440             unsigned int i;
00441             rescue = iconv_fail_retry(error, Qnil, Qnil, env, 0);
00442             if (TYPE(rescue) == T_ARRAY) {
00443                 str = RARRAY_LEN(rescue) > 0 ? RARRAY_PTR(rescue)[0] : Qnil;
00444             }
00445             if (FIXNUM_P(str) && (i = FIX2INT(str)) <= 0xff) {
00446                 char c = i;
00447                 str = rb_str_new(&c, 1);
00448             }
00449             else if (!NIL_P(str)) {
00450                 StringValue(str);
00451             }
00452         }
00453 
00454         inptr = NULL;
00455         length = 0;
00456     }
00457     else {
00458         long slen;
00459 
00460         StringValue(str);
00461         slen = RSTRING_LEN(str);
00462         inptr = RSTRING_PTR(str);
00463 
00464         inptr += start;
00465         if (length < 0 || length > start + slen)
00466             length = slen - start;
00467     }
00468     instart = inptr;
00469     inlen = length;
00470 
00471     do {
00472         char errmsg[50];
00473         const char *tmpstart = inptr;
00474         outptr = buffer;
00475         outlen = sizeof(buffer);
00476 
00477         errmsg[0] = 0;
00478         error = iconv_try(cd, &inptr, &inlen, &outptr, &outlen);
00479 
00480         if (
00481 #if SIGNEDNESS_OF_SIZE_T < 0
00482             0 <= outlen &&
00483 #endif
00484             outlen <= sizeof(buffer)) {
00485             outlen = sizeof(buffer) - outlen;
00486             if (NIL_P(error) || /* something converted */
00487                 outlen > (size_t)(inptr - tmpstart) || /* input can't contain output */
00488                 (outlen < (size_t)(inptr - tmpstart) && inlen > 0) || /* something skipped */
00489                 memcmp(buffer, tmpstart, outlen)) /* something differs */
00490             {
00491                 if (NIL_P(str)) {
00492                     ret = rb_str_new(buffer, outlen);
00493                     if (toidx >= 0) rb_enc_associate_index(ret, toidx);
00494                 }
00495                 else {
00496                     if (ret) {
00497                         ret = rb_str_buf_cat(ret, instart, tmpstart - instart);
00498                     }
00499                     else {
00500                         ret = rb_str_new(instart, tmpstart - instart);
00501                         if (toidx >= 0) rb_enc_associate_index(ret, toidx);
00502                         OBJ_INFECT(ret, str);
00503                     }
00504                     ret = rb_str_buf_cat(ret, buffer, outlen);
00505                     instart = inptr;
00506                 }
00507             }
00508             else if (!inlen) {
00509                 inptr = tmpstart + outlen;
00510             }
00511         }
00512         else {
00513             /* Some iconv() have a bug, return *outlen out of range */
00514             sprintf(errmsg, "bug?(output length = %ld)", (long)(sizeof(buffer) - outlen));
00515             error = rb_eIconvOutOfRange;
00516         }
00517 
00518         if (RTEST(error)) {
00519             long len = 0;
00520 
00521             if (!ret) {
00522                 ret = rb_str_derive(str, instart, inptr - instart);
00523                 if (toidx >= 0) rb_enc_associate_index(ret, toidx);
00524             }
00525             else if (inptr > instart) {
00526                 rb_str_cat(ret, instart, inptr - instart);
00527             }
00528             str = rb_str_derive(str, inptr, inlen);
00529             rescue = iconv_fail_retry(error, ret, str, env, errmsg);
00530             if (TYPE(rescue) == T_ARRAY) {
00531                 if ((len = RARRAY_LEN(rescue)) > 0)
00532                     rb_str_concat(ret, RARRAY_PTR(rescue)[0]);
00533                 if (len > 1 && !NIL_P(str = RARRAY_PTR(rescue)[1])) {
00534                     StringValue(str);
00535                     inlen = length = RSTRING_LEN(str);
00536                     instart = inptr = RSTRING_PTR(str);
00537                     continue;
00538                 }
00539             }
00540             else if (!NIL_P(rescue)) {
00541                 rb_str_concat(ret, rescue);
00542             }
00543             break;
00544         }
00545     } while (inlen > 0);
00546 
00547     if (!ret) {
00548         ret = rb_str_derive(str, instart, inptr - instart);
00549         if (toidx >= 0) rb_enc_associate_index(ret, toidx);
00550     }
00551     else if (inptr > instart) {
00552         rb_str_cat(ret, instart, inptr - instart);
00553     }
00554     return ret;
00555 }
00556 
00557 static VALUE
00558 iconv_s_allocate(VALUE klass)
00559 {
00560     return Data_Wrap_Struct(klass, 0, ICONV_FREE, 0);
00561 }
00562 
00563 static VALUE
00564 get_iconv_opt_i(VALUE i, VALUE arg)
00565 {
00566     struct rb_iconv_opt_t *opt = (struct rb_iconv_opt_t *)arg;
00567     VALUE name, val;
00568 
00569     (void)opt;
00570     i = rb_Array(i);
00571     name = rb_ary_entry(i, 0);
00572     val = rb_ary_entry(i, 1);
00573     do {
00574         if (SYMBOL_P(name)) {
00575             ID id = SYM2ID(name);
00576             if (id == id_transliterate) {
00577 #ifdef ICONV_SET_TRANSLITERATE
00578                 opt->transliterate = val;
00579 #else
00580                 rb_notimplement();
00581 #endif
00582                 break;
00583             }
00584             if (id == id_discard_ilseq) {
00585 #ifdef ICONV_SET_DISCARD_ILSEQ
00586                 opt->discard_ilseq = val;
00587 #else
00588                 rb_notimplement();
00589 #endif
00590                 break;
00591             }
00592         }
00593         else {
00594             const char *s = StringValueCStr(name);
00595             if (strcmp(s, "transliterate") == 0) {
00596 #ifdef ICONV_SET_TRANSLITERATE
00597                 opt->transliterate = val;
00598 #else
00599                 rb_notimplement();
00600 #endif
00601                 break;
00602             }
00603             if (strcmp(s, "discard_ilseq") == 0) {
00604 #ifdef ICONV_SET_DISCARD_ILSEQ
00605                 opt->discard_ilseq = val;
00606 #else
00607                 rb_notimplement();
00608 #endif
00609                 break;
00610             }
00611         }
00612         name = rb_inspect(name);
00613         rb_raise(rb_eArgError, "unknown option - %s", StringValueCStr(name));
00614     } while (0);
00615     return Qnil;
00616 }
00617 
00618 static void
00619 get_iconv_opt(struct rb_iconv_opt_t *opt, VALUE options)
00620 {
00621     opt->transliterate = Qundef;
00622     opt->discard_ilseq = Qundef;
00623     if (!NIL_P(options)) {
00624         rb_block_call(options, rb_intern("each"), 0, 0, get_iconv_opt_i, (VALUE)opt);
00625     }
00626 }
00627 
00628 #define iconv_ctl(self, func, val) (\
00629         iconvctl(VALUE2ICONV(check_iconv(self)), func, (void *)&(val)) ? \
00630         rb_sys_fail(#func) : (void)0)
00631 
00632 /*
00633  * Document-method: new
00634  * call-seq: Iconv.new(to, from, [options])
00635  *
00636  * Creates new code converter from a coding-system designated with +from+
00637  * to another one designated with +to+.
00638  *
00639  * === Parameters
00640  *
00641  * +to+::   encoding name for destination
00642  * +from+:: encoding name for source
00643  * +options+:: options for converter
00644  *
00645  * === Exceptions
00646  *
00647  * TypeError::       if +to+ or +from+ aren't String
00648  * InvalidEncoding:: if designated converter couldn't find out
00649  * SystemCallError:: if <tt>iconv_open(3)</tt> fails
00650  */
00651 static VALUE
00652 iconv_initialize(int argc, VALUE *argv, VALUE self)
00653 {
00654     VALUE to, from, options;
00655     struct rb_iconv_opt_t opt;
00656     int idx;
00657 
00658     rb_scan_args(argc, argv, "21", &to, &from, &options);
00659     get_iconv_opt(&opt, options);
00660     iconv_free(check_iconv(self));
00661     DATA_PTR(self) = NULL;
00662     DATA_PTR(self) = (void *)ICONV2VALUE(iconv_create(to, from, &opt, &idx));
00663     if (idx >= 0) ENCODING_SET(self, idx);
00664     return self;
00665 }
00666 
00667 /*
00668  * Document-method: open
00669  * call-seq: Iconv.open(to, from) { |iconv| ... }
00670  *
00671  * Equivalent to Iconv.new except that when it is called with a block, it
00672  * yields with the new instance and closes it, and returns the result which
00673  * returned from the block.
00674  */
00675 static VALUE
00676 iconv_s_open(int argc, VALUE *argv, VALUE self)
00677 {
00678     VALUE to, from, options, cd;
00679     struct rb_iconv_opt_t opt;
00680     int idx;
00681 
00682     rb_scan_args(argc, argv, "21", &to, &from, &options);
00683     get_iconv_opt(&opt, options);
00684     cd = ICONV2VALUE(iconv_create(to, from, &opt, &idx));
00685 
00686     self = Data_Wrap_Struct(self, NULL, ICONV_FREE, (void *)cd);
00687     if (idx >= 0) ENCODING_SET(self, idx);
00688 
00689     if (rb_block_given_p()) {
00690         return rb_ensure(rb_yield, self, (VALUE(*)())iconv_finish, self);
00691     }
00692     else {
00693         return self;
00694     }
00695 }
00696 
00697 static VALUE
00698 iconv_s_convert(struct iconv_env_t* env)
00699 {
00700     VALUE last = 0;
00701 
00702     for (; env->argc > 0; --env->argc, ++env->argv) {
00703         VALUE s = iconv_convert(env->cd, last = *(env->argv),
00704                                 0, -1, env->toidx, env);
00705         env->append(env->ret, s);
00706     }
00707 
00708     if (!NIL_P(last)) {
00709         VALUE s = iconv_convert(env->cd, Qnil, 0, 0, env->toidx, env);
00710         if (RSTRING_LEN(s))
00711             env->append(env->ret, s);
00712     }
00713 
00714     return env->ret;
00715 }
00716 
00717 /*
00718  * Document-method: Iconv::iconv
00719  * call-seq: Iconv.iconv(to, from, *strs)
00720  *
00721  * Shorthand for
00722  *   Iconv.open(to, from) { |cd|
00723  *     (strs + [nil]).collect { |s| cd.iconv(s) }
00724  *   }
00725  *
00726  * === Parameters
00727  *
00728  * <tt>to, from</tt>:: see Iconv.new
00729  * <tt>strs</tt>:: strings to be converted
00730  *
00731  * === Exceptions
00732  *
00733  * Exceptions thrown by Iconv.new, Iconv.open and Iconv#iconv.
00734  */
00735 static VALUE
00736 iconv_s_iconv(int argc, VALUE *argv, VALUE self)
00737 {
00738     struct iconv_env_t arg;
00739 
00740     if (argc < 2)               /* needs `to' and `from' arguments at least */
00741         rb_raise(rb_eArgError, "wrong number of arguments (%d for %d)", argc, 2);
00742 
00743     arg.argc = argc -= 2;
00744     arg.argv = argv + 2;
00745     arg.append = rb_ary_push;
00746     arg.ret = rb_ary_new2(argc);
00747     arg.cd = iconv_create(argv[0], argv[1], NULL, &arg.toidx);
00748     return rb_ensure(iconv_s_convert, (VALUE)&arg, iconv_free, ICONV2VALUE(arg.cd));
00749 }
00750 
00751 /*
00752  * Document-method: Iconv::conv
00753  * call-seq: Iconv.conv(to, from, str)
00754  *
00755  * Shorthand for
00756  *   Iconv.iconv(to, from, str).join
00757  * See Iconv.iconv.
00758  */
00759 static VALUE
00760 iconv_s_conv(VALUE self, VALUE to, VALUE from, VALUE str)
00761 {
00762     struct iconv_env_t arg;
00763 
00764     arg.argc = 1;
00765     arg.argv = &str;
00766     arg.append = rb_str_append;
00767     arg.ret = rb_str_new(0, 0);
00768     arg.cd = iconv_create(to, from, NULL, &arg.toidx);
00769     return rb_ensure(iconv_s_convert, (VALUE)&arg, iconv_free, ICONV2VALUE(arg.cd));
00770 }
00771 
00772 /*
00773  * Document-method: list
00774  * call-seq: Iconv.list {|*aliases| ... }
00775  *
00776  * Iterates over each set of aliases.
00777  */
00778 
00779 #ifdef HAVE_ICONVLIST
00780 struct iconv_name_list  /* one alias set handed from list_iconv() to list_iconv_i() */
00781 {
00782     unsigned int namescount;    /* number of entries in names */
00783     const char *const *names;   /* the names received by the iconvlist() callback */
00784     VALUE array;                /* collecting array, or 0 to yield each set instead */
00785 };
00786 
00787 static VALUE
00788 list_iconv_i(VALUE ptr)
00789 {
00790     struct iconv_name_list *p = (struct iconv_name_list *)ptr;
00791     unsigned int i, namescount = p->namescount;
00792     const char *const *names = p->names;
00793     VALUE ary = rb_ary_new2(namescount);
00794 
00795     for (i = 0; i < namescount; i++) {
00796         rb_ary_push(ary, rb_str_new2(names[i]));
00797     }
00798     if (p->array) {
00799         return rb_ary_push(p->array, ary);
00800     }
00801     return rb_yield(ary);
00802 }
00803 
00804 static int
00805 list_iconv(unsigned int namescount, const char *const *names, void *data)
00806 {
00807     int *state = data;
00808     struct iconv_name_list list;
00809 
00810     list.namescount = namescount;
00811     list.names = names;
00812     list.array = ((VALUE *)data)[1];
00813     rb_protect(list_iconv_i, (VALUE)&list, state);
00814     return *state;
00815 }
00816 #endif
00817 
#if defined(HAVE_ICONVLIST) || defined(HAVE___ICONV_FREE_LIST)
/*
 * Implementation of Iconv.list.
 *
 * With iconvlist(): each alias set is yielded to the block, or collected
 * into an array that is returned when no block is given.  An exception
 * caught while handling an alias set is re-raised after iconvlist()
 * returns.
 * With __iconv_get_list(): returns the array of names without a block, or
 * yields each name; returns nil if the list cannot be obtained.
 */
static VALUE
iconv_s_list(void)
{
#ifdef HAVE_ICONVLIST
    int state;
    VALUE args[2];

    /* Slot 0 is read back as the rb_protect() state; clear it so that the
       read is defined even if iconvlist() never invokes the callback. */
    args[0] = 0;
    args[1] = rb_block_given_p() ? 0 : rb_ary_new();
    iconvlist(list_iconv, args);
    state = *(int *)args;
    if (state) rb_jump_tag(state);
    if (args[1]) return args[1];
#elif defined(HAVE___ICONV_FREE_LIST)
    char **list;
    size_t sz, i;
    VALUE ary;

    if (__iconv_get_list(&list, &sz)) return Qnil;

    ary = rb_ary_new2(sz);
    for (i = 0; i < sz; i++) {
        rb_ary_push(ary, rb_str_new2(list[i]));
    }
    __iconv_free_list(list, sz);

    if (!rb_block_given_p())
        return ary;
    for (i = 0; i < (size_t)RARRAY_LEN(ary); i++) {
        rb_yield(RARRAY_PTR(ary)[i]);
    }
#endif
    return Qnil;
}
#else
#define iconv_s_list rb_f_notimplement
#endif
00855 
00856 /*
00857  * Document-method: close
00858  *
00859  * Finishes conversion.
00860  *
00861  * After calling this, calling Iconv#iconv will cause an exception, but
00862  * multiple calls of #close are guaranteed to end successfully.
00863  *
00864  * Returns a string containing the byte sequence to change the output buffer to
00865  * its initial shift state.
00866  */
00867 static VALUE
00868 iconv_init_state(VALUE self)
00869 {
00870     iconv_t cd = VALUE2ICONV((VALUE)DATA_PTR(self));
00871     DATA_PTR(self) = NULL;
00872     return iconv_convert(cd, Qnil, 0, 0, ENCODING_GET(self), NULL);
00873 }
00874 
00875 static VALUE
00876 iconv_finish(VALUE self)
00877 {
00878     VALUE cd = check_iconv(self);
00879 
00880     if (!cd) return Qnil;
00881     return rb_ensure(iconv_init_state, self, iconv_free, cd);
00882 }
00883 
00884 /*
00885  * Document-method: Iconv#iconv
00886  * call-seq: iconv(str, start=0, length=-1)
00887  *
00888  * Converts string and returns the result.
00889  * * If +str+ is a String, converts <tt>str[start, length]</tt> and returns the converted string.
00890  * * If +str+ is +nil+, places converter itself into initial shift state and
00891  *   just returns a string containing the byte sequence to change the output
00892  *   buffer to its initial shift state.
00893  * * Otherwise, raises an exception.
00894  *
00895  * === Parameters
00896  *
00897  * str::    string to be converted, or nil
00898  * start::  starting offset
00899  * length:: conversion length; nil or -1 means whole the string from start
00900  *
00901  * === Exceptions
00902  *
00903  * * IconvIllegalSequence
00904  * * IconvInvalidCharacter
00905  * * IconvOutOfRange
00906  *
00907  * === Examples
00908  *
00909  * See the Iconv documentation.
00910  */
00911 static VALUE
00912 iconv_iconv(int argc, VALUE *argv, VALUE self)
00913 {
00914     VALUE str, n1, n2;
00915     VALUE cd = check_iconv(self);
00916     long start = 0, length = 0, slen = 0;
00917 
00918     rb_scan_args(argc, argv, "12", &str, &n1, &n2);
00919     if (!NIL_P(str)) {
00920         VALUE n = rb_str_length(StringValue(str));
00921         slen = NUM2LONG(n);
00922     }
00923     if (argc != 2 || !RTEST(rb_range_beg_len(n1, &start, &length, slen, 0))) {
00924         if (NIL_P(n1) || ((start = NUM2LONG(n1)) < 0 ? (start += slen) >= 0 : start < slen)) {
00925             length = NIL_P(n2) ? -1 : NUM2LONG(n2);
00926         }
00927     }
00928     if (start > 0 || length > 0) {
00929         rb_encoding *enc = rb_enc_get(str);
00930         const char *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
00931         const char *ps = s;
00932         if (start > 0) {
00933             start = (ps = rb_enc_nth(s, e, start, enc)) - s;
00934         }
00935         if (length > 0) {
00936             length = rb_enc_nth(ps, e, length, enc) - ps;
00937         }
00938     }
00939 
00940     return iconv_convert(VALUE2ICONV(cd), str, start, length, ENCODING_GET(self), NULL);
00941 }
00942 
00943 /*
00944  * Document-method: conv
00945  * call-seq: conv(str...)
00946  *
00947  * Equivalent to
00948  *
00949  *   iconv(nil, str..., nil).join
00950  */
00951 static VALUE
00952 iconv_conv(int argc, VALUE *argv, VALUE self)
00953 {
00954     iconv_t cd = VALUE2ICONV(check_iconv(self));
00955     VALUE str, s;
00956     int toidx = ENCODING_GET(self);
00957 
00958     str = iconv_convert(cd, Qnil, 0, 0, toidx, NULL);
00959     if (argc > 0) {
00960         do {
00961             s = iconv_convert(cd, *argv++, 0, -1, toidx, NULL);
00962             if (RSTRING_LEN(s))
00963                 rb_str_buf_append(str, s);
00964         } while (--argc);
00965         s = iconv_convert(cd, Qnil, 0, 0, toidx, NULL);
00966         if (RSTRING_LEN(s))
00967             rb_str_buf_append(str, s);
00968     }
00969 
00970     return str;
00971 }
00972 
#ifdef ICONV_TRIVIALP
/*
 * Document-method: trivial?
 * call-seq: trivial?
 *
 * Returns the trivial flag of the converter as +true+ or +false+.
 */
static VALUE
iconv_trivialp(VALUE self)
{
    int trivial = 0;

    /* iconv_ctl is expected to store the ICONV_TRIVIALP result in trivial */
    iconv_ctl(self, ICONV_TRIVIALP, trivial);
    return trivial ? Qtrue : Qfalse;
}
#else
/* ICONV_TRIVIALP is not defined: map the method to rb_f_notimplement */
#define iconv_trivialp rb_f_notimplement
#endif
00991 
#ifdef ICONV_GET_TRANSLITERATE
/*
 * Document-method: transliterate?
 * call-seq: transliterate?
 *
 * Returns the transliterate flag of the converter as +true+ or +false+.
 */
static VALUE
iconv_get_transliterate(VALUE self)
{
    int trans = 0;

    /* iconv_ctl is expected to store the queried flag in trans */
    iconv_ctl(self, ICONV_GET_TRANSLITERATE, trans);
    return trans ? Qtrue : Qfalse;
}
#else
/* ICONV_GET_TRANSLITERATE is not defined: map the method to rb_f_notimplement */
#define iconv_get_transliterate rb_f_notimplement
#endif
01010 
#ifdef ICONV_SET_TRANSLITERATE
/*
 * Document-method: transliterate=
 * call-seq: cd.transliterate = flag
 *
 * Sets the transliterate flag from the truthiness of +flag+.
 */
static VALUE
iconv_set_transliterate(VALUE self, VALUE transliterate)
{
    /* reduce the Ruby value to a C int before handing it to iconv_ctl */
    int trans = RTEST(transliterate);

    iconv_ctl(self, ICONV_SET_TRANSLITERATE, trans);

    return self;
}
#else
/* ICONV_SET_TRANSLITERATE is not defined: map the method to rb_f_notimplement */
#define iconv_set_transliterate rb_f_notimplement
#endif
01028 
#ifdef ICONV_GET_DISCARD_ILSEQ
/*
 * Document-method: discard_ilseq?
 * call-seq: discard_ilseq?
 *
 * Returns the discard_ilseq flag of the converter as +true+ or +false+.
 */
static VALUE
iconv_get_discard_ilseq(VALUE self)
{
    int dis = 0;

    /* iconv_ctl is expected to store the queried flag in dis */
    iconv_ctl(self, ICONV_GET_DISCARD_ILSEQ, dis);
    return dis ? Qtrue : Qfalse;
}
#else
/* ICONV_GET_DISCARD_ILSEQ is not defined: map the method to rb_f_notimplement */
#define iconv_get_discard_ilseq rb_f_notimplement
#endif
01047 
#ifdef ICONV_SET_DISCARD_ILSEQ
/*
 * Document-method: discard_ilseq=
 * call-seq: cd.discard_ilseq = flag
 *
 * Sets the discard_ilseq flag from the truthiness of +flag+.
 */
static VALUE
iconv_set_discard_ilseq(VALUE self, VALUE discard_ilseq)
{
    /* reduce the Ruby value to a C int before handing it to iconv_ctl */
    int dis = RTEST(discard_ilseq);

    iconv_ctl(self, ICONV_SET_DISCARD_ILSEQ, dis);

    return self;
}
#else
/* ICONV_SET_DISCARD_ILSEQ is not defined: map the method to rb_f_notimplement */
#define iconv_set_discard_ilseq rb_f_notimplement
#endif
01065 
01066 /*
01067  * Document-method: ctlmethods
01068  * call-seq: Iconv.ctlmethods => array
01069  *
01070  * Returns available iconvctl() method list.
01071  */
01072 static VALUE
01073 iconv_s_ctlmethods(VALUE klass)
01074 {
01075     VALUE ary = rb_ary_new();
01076 #ifdef ICONV_TRIVIALP
01077     rb_ary_push(ary, ID2SYM(rb_intern("trivial?")));
01078 #endif
01079 #ifdef ICONV_GET_TRANSLITERATE
01080     rb_ary_push(ary, ID2SYM(rb_intern("transliterate?")));
01081 #endif
01082 #ifdef ICONV_SET_TRANSLITERATE
01083     rb_ary_push(ary, ID2SYM(rb_intern("transliterate=")));
01084 #endif
01085 #ifdef ICONV_GET_DISCARD_ILSEQ
01086     rb_ary_push(ary, ID2SYM(rb_intern("discard_ilseq?")));
01087 #endif
01088 #ifdef ICONV_SET_DISCARD_ILSEQ
01089     rb_ary_push(ary, ID2SYM(rb_intern("discard_ilseq=")));
01090 #endif
01091     return ary;
01092 }
01093 
01094 /*
01095  * Document-class: Iconv::Failure
01096  *
01097  * Base attributes for Iconv exceptions.
01098  */
01099 
01100 /*
01101  * Document-method: success
01102  * call-seq: success
01103  *
01104  * Returns string(s) translated successfully until the exception occurred.
01105  * * In the case of failure occurred within Iconv.iconv, returned
01106  *   value is an array of strings translated successfully preceding
01107  *   failure and the last element is string on the way.
01108  */
01109 static VALUE
01110 iconv_failure_success(VALUE self)
01111 {
01112     return rb_attr_get(self, rb_success);
01113 }
01114 
01115 /*
01116  * Document-method: failed
01117  * call-seq: failed
01118  *
01119  * Returns substring of the original string passed to Iconv that starts at the
01120  * character caused the exception.
01121  */
01122 static VALUE
01123 iconv_failure_failed(VALUE self)
01124 {
01125     return rb_attr_get(self, rb_failed);
01126 }
01127 
01128 /*
01129  * Document-method: inspect
01130  * call-seq: inspect
01131  *
01132  * Returns inspected string like as: #<_class_: _success_, _failed_>
01133  */
01134 static VALUE
01135 iconv_failure_inspect(VALUE self)
01136 {
01137     const char *cname = rb_class2name(CLASS_OF(self));
01138     VALUE success = rb_attr_get(self, rb_success);
01139     VALUE failed = rb_attr_get(self, rb_failed);
01140     VALUE str = rb_str_buf_cat2(rb_str_new2("#<"), cname);
01141     str = rb_str_buf_cat(str, ": ", 2);
01142     str = rb_str_buf_append(str, rb_inspect(success));
01143     str = rb_str_buf_cat(str, ", ", 2);
01144     str = rb_str_buf_append(str, rb_inspect(failed));
01145     return rb_str_buf_cat(str, ">", 1);
01146 }
01147 
01148 /*
01149  * Document-class: Iconv::InvalidEncoding
01150  *
01151  * Requested coding-system is not available on this system.
01152  */
01153 
01154 /*
01155  * Document-class: Iconv::IllegalSequence
01156  *
01157  * Input conversion stopped due to an input byte that does not belong to
01158  * the input codeset, or the output codeset does not contain the
01159  * character.
01160  */
01161 
01162 /*
01163  * Document-class: Iconv::InvalidCharacter
01164  *
01165  * Input conversion stopped due to an incomplete character or shift
01166  * sequence at the end of the input buffer.
01167  */
01168 
01169 /*
01170  * Document-class: Iconv::OutOfRange
01171  *
01172  * Iconv library internal error.  Must not occur.
01173  */
01174 
01175 /*
01176  * Document-class: Iconv::BrokenLibrary
01177  *
01178  * Detected a bug in the underlying iconv(3) library.
01179  * * returns an error without setting errno properly
01180  */
01181 
01182 static void
01183 warn_deprecated(void)
01184 {
01185     static const char message[] =
01186         ": iconv will be deprecated in the future, use String#encode instead.\n";
01187     VALUE msg = Qnil, caller = rb_make_backtrace();
01188     long i;
01189 
01190     for (i = 1; i < RARRAY_LEN(caller); ++i) {
01191         VALUE s = RARRAY_PTR(caller)[i];
01192         if (strncmp(RSTRING_PTR(s), "<internal:", 10) != 0) {
01193             msg = s;
01194             break;
01195         }
01196     }
01197     if (NIL_P(msg)) {
01198         msg = rb_str_new_cstr(message + 2);
01199     }
01200     else {
01201         rb_str_cat(msg, message, sizeof(message) - 1);
01202     }
01203     rb_io_puts(1, &msg, rb_stderr);
01204 }
01205 
01206 void
01207 Init_iconv(void)
01208 {
01209     VALUE rb_cIconv = rb_define_class("Iconv", rb_cData);
01210 
01211     if (!NIL_P(ruby_verbose)) {
01212         warn_deprecated();
01213     }
01214     rb_define_alloc_func(rb_cIconv, iconv_s_allocate);
01215     rb_define_singleton_method(rb_cIconv, "open", iconv_s_open, -1);
01216     rb_define_singleton_method(rb_cIconv, "iconv", iconv_s_iconv, -1);
01217     rb_define_singleton_method(rb_cIconv, "conv", iconv_s_conv, 3);
01218     rb_define_singleton_method(rb_cIconv, "list", iconv_s_list, 0);
01219     rb_define_singleton_method(rb_cIconv, "ctlmethods", iconv_s_ctlmethods, 0);
01220     rb_define_method(rb_cIconv, "initialize", iconv_initialize, -1);
01221     rb_define_method(rb_cIconv, "close", iconv_finish, 0);
01222     rb_define_method(rb_cIconv, "iconv", iconv_iconv, -1);
01223     rb_define_method(rb_cIconv, "conv", iconv_conv, -1);
01224     rb_define_method(rb_cIconv, "trivial?", iconv_trivialp, 0);
01225     rb_define_method(rb_cIconv, "transliterate?", iconv_get_transliterate, 0);
01226     rb_define_method(rb_cIconv, "transliterate=", iconv_set_transliterate, 1);
01227     rb_define_method(rb_cIconv, "discard_ilseq?", iconv_get_discard_ilseq, 0);
01228     rb_define_method(rb_cIconv, "discard_ilseq=", iconv_set_discard_ilseq, 1);
01229 
01230     rb_eIconvFailure = rb_define_module_under(rb_cIconv, "Failure");
01231     rb_define_method(rb_eIconvFailure, "initialize", iconv_failure_initialize, 3);
01232     rb_define_method(rb_eIconvFailure, "success", iconv_failure_success, 0);
01233     rb_define_method(rb_eIconvFailure, "failed", iconv_failure_failed, 0);
01234     rb_define_method(rb_eIconvFailure, "inspect", iconv_failure_inspect, 0);
01235 
01236     rb_eIconvInvalidEncoding = rb_define_class_under(rb_cIconv, "InvalidEncoding", rb_eArgError);
01237     rb_eIconvIllegalSeq = rb_define_class_under(rb_cIconv, "IllegalSequence", rb_eArgError);
01238     rb_eIconvInvalidChar = rb_define_class_under(rb_cIconv, "InvalidCharacter", rb_eArgError);
01239     rb_eIconvOutOfRange = rb_define_class_under(rb_cIconv, "OutOfRange", rb_eRuntimeError);
01240     rb_eIconvBrokenLibrary = rb_define_class_under(rb_cIconv, "BrokenLibrary", rb_eRuntimeError);
01241     rb_include_module(rb_eIconvInvalidEncoding, rb_eIconvFailure);
01242     rb_include_module(rb_eIconvIllegalSeq, rb_eIconvFailure);
01243     rb_include_module(rb_eIconvInvalidChar, rb_eIconvFailure);
01244     rb_include_module(rb_eIconvOutOfRange, rb_eIconvFailure);
01245     rb_include_module(rb_eIconvBrokenLibrary, rb_eIconvFailure);
01246 
01247     rb_success = rb_intern("success");
01248     rb_failed = rb_intern("failed");
01249     id_transliterate = rb_intern("transliterate");
01250     id_discard_ilseq = rb_intern("discard_ilseq");
01251 
01252     rb_gc_register_address(&charset_map);
01253     charset_map = rb_hash_new();
01254     rb_define_singleton_method(rb_cIconv, "charset_map", charset_map_get, 0);
01255 }
01256 
01257