Ruby 1.9.3p327 (2012-11-10 revision 37606)
transcode.c
Go to the documentation of this file.
00001 /**********************************************************************
00002 
00003   transcode.c -
00004 
00005   $Author: naruse $
00006   created at: Tue Oct 30 16:10:22 JST 2007
00007 
00008   Copyright (C) 2007 Martin Duerst
00009 
00010 **********************************************************************/
00011 
00012 #include "ruby/ruby.h"
00013 #include "ruby/encoding.h"
00014 #include "internal.h"
00015 #include "transcode_data.h"
00016 #include <ctype.h>
00017 
/* Compile-time switch: when defined, the extra newline-related symbols
 * below (sym_newline, sym_universal, ...) are declared. */
#define ENABLE_ECONV_NEWLINE_OPTION 1

/* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
/* Exported exception-class handles.  They are only declared here;
 * NOTE(review): presumably assigned during initialization later in the
 * file -- confirm. */
VALUE rb_eUndefinedConversionError;
VALUE rb_eInvalidByteSequenceError;
VALUE rb_eConverterNotFoundError;

/* Exported class handle for the converter class (assigned elsewhere). */
VALUE rb_cEncodingConverter;

/* File-local cached VALUEs.  NOTE(review): by their names these look like
 * option-key / option-value symbols; their initialization and use are
 * outside this excerpt. */
static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback, sym_aref;
static VALUE sym_xml, sym_text, sym_attr;
static VALUE sym_universal_newline;
static VALUE sym_crlf_newline;
static VALUE sym_cr_newline;
#ifdef ENABLE_ECONV_NEWLINE_OPTION
static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
#endif
static VALUE sym_partial_input;

/* The names below parallel the econv_* result codes returned by
 * transcode_restartable0 (econv_invalid_byte_sequence, ...,
 * econv_incomplete_input).  NOTE(review): presumably their Symbol
 * counterparts -- confirm where they are initialized. */
static VALUE sym_invalid_byte_sequence;
static VALUE sym_undefined_conversion;
static VALUE sym_destination_buffer_full;
static VALUE sym_source_buffer_empty;
static VALUE sym_finished;
static VALUE sym_after_output;
static VALUE sym_incomplete_input;
00044 
/*
 * Forward declaration; the definition is not part of this excerpt.
 * NOTE(review): judging by the parameter names it converts `len` bytes at
 * `str` from encoding `sname` to `dname`, may use the caller-supplied
 * buffer, and reports the result length through dst_len_ptr -- confirm
 * against the definition (including who frees the returned pointer).
 */
static unsigned char *
allocate_converted_string(const char *sname, const char *dname,
        const unsigned char *str, size_t len,
        unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
        size_t *dst_len_ptr);
00050 
/* dynamic structure, one per conversion (similar to iconv_t) */
/* may carry conversion state (e.g. for iso-2022-jp) */
typedef struct rb_transcoding {
    /* Static descriptor driving this conversion (lookup tables, callback
     * functions, max_input/max_output/state_size limits). */
    const rb_transcoder *transcoder;

    /* Stored verbatim by rb_transcoding_open_by_transcoder. */
    int flags;

    /* Number of the SUSPEND label at which transcode_restartable0 resumes
     * on its next call; 0 means "start from the top". */
    int resume_position;
    /* Table-walk state of transcode_restartable0.  Kept here rather than
     * in locals so that it survives a suspend/resume cycle. */
    unsigned int next_table;
    VALUE next_info;
    unsigned char next_byte;
    /* Index of the next byte to emit while copying a STR1 output string. */
    unsigned int output_index;

    ssize_t recognized_len; /* already interpreted */
    ssize_t readagain_len; /* not yet interpreted */
    /* Input bytes of the current character saved across calls.  `ary` is
     * used when transcoder->max_input fits in it; otherwise `ptr` points
     * to an xmalloc'ed buffer (see TRANSCODING_READBUF). */
    union {
        unsigned char ary[8]; /* max_input <= sizeof(ary) */
        unsigned char *ptr; /* length: max_input */
    } readbuf; /* recognized_len + readagain_len used */

    /* Output produced into writebuf but not yet copied to the caller's
     * buffer: bytes [writebuf_off, writebuf_len) are still pending. */
    ssize_t writebuf_off;
    ssize_t writebuf_len;
    /* Same embedded-or-heap scheme as readbuf, keyed on max_output
     * (see TRANSCODING_WRITEBUF). */
    union {
        unsigned char ary[8]; /* max_output <= sizeof(ary) */
        unsigned char *ptr; /* length: max_output */
    } writebuf;

    /* Per-conversion state handed to the transcoder callbacks.  Embedded
     * in `ary` when state_size fits, otherwise `ptr` is xmalloc'ed
     * (see TRANSCODING_STATE). */
    union rb_transcoding_state_t { /* opaque data for stateful encoding */
        void *ptr;
        char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
        double dummy_for_alignment;
    } state;
} rb_transcoding;
/* Address of tc's read buffer: the embedded array when max_input fits in
 * it, otherwise the heap pointer.  Evaluates `tc` more than once. */
#define TRANSCODING_READBUF(tc) \
    ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
     (tc)->readbuf.ary : \
     (tc)->readbuf.ptr)
/* Address of tc's write buffer, selected the same way by max_output. */
#define TRANSCODING_WRITEBUF(tc) \
    ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
     (tc)->writebuf.ary : \
     (tc)->writebuf.ptr)
/* Capacity in bytes of the buffer TRANSCODING_WRITEBUF(tc) yields:
 * sizeof(ary) when embedded, max_output when heap-allocated. */
#define TRANSCODING_WRITEBUF_SIZE(tc) \
    ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
     sizeof((tc)->writebuf.ary) : \
     (size_t)(tc)->transcoder->max_output)
/* Largest state_size that is stored inside the rb_transcoding itself. */
#define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
/* Address of the transcoder's private state: embedded storage when
 * state_size fits in the union, otherwise the heap pointer. */
#define TRANSCODING_STATE(tc) \
    ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
     (tc)->state.ary : \
     (tc)->state.ptr)
00101 
/*
 * One element of rb_econv_t.elems: a transcoding together with output
 * buffer pointers and the result of its most recent run.
 * NOTE(review): the code using these fields is outside this excerpt; by
 * name, [out_buf_start, out_buf_end) looks like the allocated buffer and
 * [out_data_start, out_data_end) the pending data inside it -- confirm.
 */
typedef struct {
    struct rb_transcoding *tc;
    unsigned char *out_buf_start;
    unsigned char *out_data_start;
    unsigned char *out_data_end;
    unsigned char *out_buf_end;
    rb_econv_result_t last_result;
} rb_econv_elem_t;
00110 
/*
 * Converter object.  rb_econv_alloc creates it with every pointer NULL,
 * every counter 0 and `elems` sized to `num_allocated` entries.
 */
struct rb_econv_t {
    int flags;
    const char *source_encoding_name;
    const char *destination_encoding_name;

    int started;

    /* Replacement string settings.  NOTE(review): replacement_allocated
     * presumably records whether replacement_str is owned by this object
     * and must be freed -- confirm against the code that sets it. */
    const unsigned char *replacement_str;
    size_t replacement_len;
    const char *replacement_enc;
    int replacement_allocated;

    /* Input-side buffer pointers (all NULL after rb_econv_alloc). */
    unsigned char *in_buf_start;
    unsigned char *in_data_start;
    unsigned char *in_data_end;
    unsigned char *in_buf_end;
    /* Array of elements: num_allocated is its capacity; num_trans starts
     * at 0 and is presumably the number of entries in use (its updates
     * are outside this excerpt). */
    rb_econv_elem_t *elems;
    int num_allocated;
    int num_trans;
    int num_finished;
    struct rb_transcoding *last_tc;

    /* last error */
    struct {
        rb_econv_result_t result; /* econv_source_buffer_empty initially */
        struct rb_transcoding *error_tc;
        const char *source_encoding;
        const char *destination_encoding;
        const unsigned char *error_bytes_start;
        size_t error_bytes_len;
        size_t readagain_len;
    } last_error;

    /* The following fields are only for Encoding::Converter.
     * rb_econv_open sets them to NULL. */
    rb_encoding *source_encoding;
    rb_encoding *destination_encoding;
};
00149 
00150 /*
00151  *  Dispatch data and logic
00152  */
00153 
/* True when `sname` is the empty string; `dname` is accepted but never
 * examined. */
#define DECORATOR_P(sname, dname) (*(sname) == '\0')
00155 
/*
 * Registry record for one (source, destination) encoding pair.  Created by
 * make_transcoder_entry with lib and transcoder both NULL.
 */
typedef struct {
    const char *sname; /* source encoding name (pointer stored, not copied) */
    const char *dname; /* destination encoding name (pointer stored, not copied) */
    const char *lib; /* null means no need to load a library */
    const rb_transcoder *transcoder; /* NULL until rb_register_transcoder sets it */
} transcoder_entry_t;
00162 
/* Two-level registry: source name -> (st_table: destination name ->
 * transcoder_entry_t *).  Filled by make_transcoder_entry; the table
 * itself is created outside this excerpt. */
static st_table *transcoder_table;
00164 
00165 static transcoder_entry_t *
00166 make_transcoder_entry(const char *sname, const char *dname)
00167 {
00168     st_data_t val;
00169     st_table *table2;
00170 
00171     if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
00172         val = (st_data_t)st_init_strcasetable();
00173         st_add_direct(transcoder_table, (st_data_t)sname, val);
00174     }
00175     table2 = (st_table *)val;
00176     if (!st_lookup(table2, (st_data_t)dname, &val)) {
00177         transcoder_entry_t *entry = ALLOC(transcoder_entry_t);
00178         entry->sname = sname;
00179         entry->dname = dname;
00180         entry->lib = NULL;
00181         entry->transcoder = NULL;
00182         val = (st_data_t)entry;
00183         st_add_direct(table2, (st_data_t)dname, val);
00184     }
00185     return (transcoder_entry_t *)val;
00186 }
00187 
00188 static transcoder_entry_t *
00189 get_transcoder_entry(const char *sname, const char *dname)
00190 {
00191     st_data_t val;
00192     st_table *table2;
00193 
00194     if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
00195         return NULL;
00196     }
00197     table2 = (st_table *)val;
00198     if (!st_lookup(table2, (st_data_t)dname, &val)) {
00199         return NULL;
00200     }
00201     return (transcoder_entry_t *)val;
00202 }
00203 
00204 void
00205 rb_register_transcoder(const rb_transcoder *tr)
00206 {
00207     const char *const sname = tr->src_encoding;
00208     const char *const dname = tr->dst_encoding;
00209 
00210     transcoder_entry_t *entry;
00211 
00212     entry = make_transcoder_entry(sname, dname);
00213     if (entry->transcoder) {
00214         rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
00215                  sname, dname);
00216     }
00217 
00218     entry->transcoder = tr;
00219 }
00220 
00221 static void
00222 declare_transcoder(const char *sname, const char *dname, const char *lib)
00223 {
00224     transcoder_entry_t *entry;
00225 
00226     entry = make_transcoder_entry(sname, dname);
00227     entry->lib = lib;
00228 }
00229 
/* Longest library name (in bytes, excluding the terminator) accepted by
 * rb_declare_transcoder and load_transcoder_entry. */
#define MAX_TRANSCODER_LIBNAME_LEN 64
/* Prepended to the library name to form the path that
 * load_transcoder_entry requires. */
static const char transcoder_lib_prefix[] = "enc/trans/";
00232 
00233 void
00234 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
00235 {
00236     if (!lib || strlen(lib) > MAX_TRANSCODER_LIBNAME_LEN) {
00237         rb_raise(rb_eArgError, "invalid library name - %s",
00238                  lib ? lib : "(null)");
00239     }
00240     declare_transcoder(enc1, enc2, lib);
00241 }
00242 
/* Nonzero when the two encoding names compare equal under STRCASECMP. */
#define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
00244 
/* Singly linked FIFO node used by transcode_search_path: one encoding
 * name still waiting to be expanded. */
typedef struct search_path_queue_tag {
    struct search_path_queue_tag *next;
    const char *enc;
} search_path_queue_t;
00249 
/* Working state of the search in transcode_search_path. */
typedef struct {
    st_table *visited;                    /* encoding name -> name it was reached from (NULL for the start) */
    search_path_queue_t *queue;           /* head of the pending FIFO */
    search_path_queue_t **queue_last_ptr; /* where the next node gets linked (tail's `next`, or &queue) */
    const char *base_enc;                 /* encoding currently being expanded by st_foreach */
} search_path_bfs_t;
00256 
00257 static int
00258 transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
00259 {
00260     const char *dname = (const char *)key;
00261     search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
00262     search_path_queue_t *q;
00263 
00264     if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
00265         return ST_CONTINUE;
00266     }
00267 
00268     q = ALLOC(search_path_queue_t);
00269     q->enc = dname;
00270     q->next = NULL;
00271     *bfs->queue_last_ptr = q;
00272     bfs->queue_last_ptr = &q->next;
00273 
00274     st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
00275     return ST_CONTINUE;
00276 }
00277 
00278 static int
00279 transcode_search_path(const char *sname, const char *dname,
00280     void (*callback)(const char *sname, const char *dname, int depth, void *arg),
00281     void *arg)
00282 {
00283     search_path_bfs_t bfs;
00284     search_path_queue_t *q;
00285     st_data_t val;
00286     st_table *table2;
00287     int found;
00288     int pathlen = -1;
00289 
00290     if (encoding_equal(sname, dname))
00291         return -1;
00292 
00293     q = ALLOC(search_path_queue_t);
00294     q->enc = sname;
00295     q->next = NULL;
00296     bfs.queue_last_ptr = &q->next;
00297     bfs.queue = q;
00298 
00299     bfs.visited = st_init_strcasetable();
00300     st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
00301 
00302     while (bfs.queue) {
00303         q = bfs.queue;
00304         bfs.queue = q->next;
00305         if (!bfs.queue)
00306             bfs.queue_last_ptr = &bfs.queue;
00307 
00308         if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
00309             xfree(q);
00310             continue;
00311         }
00312         table2 = (st_table *)val;
00313 
00314         if (st_lookup(table2, (st_data_t)dname, &val)) {
00315             st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
00316             xfree(q);
00317             found = 1;
00318             goto cleanup;
00319         }
00320 
00321         bfs.base_enc = q->enc;
00322         st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
00323         bfs.base_enc = NULL;
00324 
00325         xfree(q);
00326     }
00327     found = 0;
00328 
00329   cleanup:
00330     while (bfs.queue) {
00331         q = bfs.queue;
00332         bfs.queue = q->next;
00333         xfree(q);
00334     }
00335 
00336     if (found) {
00337         const char *enc = dname;
00338         int depth;
00339         pathlen = 0;
00340         while (1) {
00341             st_lookup(bfs.visited, (st_data_t)enc, &val);
00342             if (!val)
00343                 break;
00344             pathlen++;
00345             enc = (const char *)val;
00346         }
00347         depth = pathlen;
00348         enc = dname;
00349         while (1) {
00350             st_lookup(bfs.visited, (st_data_t)enc, &val);
00351             if (!val)
00352                 break;
00353             callback((const char *)val, enc, --depth, arg);
00354             enc = (const char *)val;
00355         }
00356     }
00357 
00358     st_free_table(bfs.visited);
00359 
00360     return pathlen; /* is -1 if not found */
00361 }
00362 
00363 static const rb_transcoder *
00364 load_transcoder_entry(transcoder_entry_t *entry)
00365 {
00366     if (entry->transcoder)
00367         return entry->transcoder;
00368 
00369     if (entry->lib) {
00370         const char *lib = entry->lib;
00371         size_t len = strlen(lib);
00372         char path[sizeof(transcoder_lib_prefix) + MAX_TRANSCODER_LIBNAME_LEN];
00373         VALUE fn;
00374         const int safe = rb_safe_level();
00375 
00376         entry->lib = NULL;
00377 
00378         if (len > MAX_TRANSCODER_LIBNAME_LEN)
00379             return NULL;
00380         memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
00381         memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1);
00382         fn = rb_str_new2(path);
00383         FL_UNSET(fn, FL_TAINT|FL_UNTRUSTED);
00384         OBJ_FREEZE(fn);
00385         if (!rb_require_safe(fn, safe > 3 ? 3 : safe))
00386             return NULL;
00387     }
00388 
00389     if (entry->transcoder)
00390         return entry->transcoder;
00391 
00392     return NULL;
00393 }
00394 
/*
 * Pick the default replacement string for encoding `encname`.
 *
 * For "UTF-8" (compared case-insensitively) this is the three bytes
 * EF BF BD, reported as encoded in "UTF-8"; for every other encoding it is
 * "?" reported as "US-ASCII".  The byte length is stored in *len_ret and
 * the encoding name of the returned bytes in *repl_encname_ptr.  The
 * returned pointer refers to a string literal.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    if (!encoding_equal(encname, "UTF-8")) {
        *repl_encname_ptr = "US-ASCII";
        *len_ret = 1;
        return "?";
    }

    *repl_encname_ptr = "UTF-8";
    *len_ret = 3;
    return "\xEF\xBF\xBD";
}
00409 
00410 /*
00411  *  Transcoding engine logic
00412  */
00413 
00414 static const unsigned char *
00415 transcode_char_start(rb_transcoding *tc,
00416                          const unsigned char *in_start,
00417                          const unsigned char *inchar_start,
00418                          const unsigned char *in_p,
00419                          size_t *char_len_ptr)
00420 {
00421     const unsigned char *ptr;
00422     if (inchar_start - in_start < tc->recognized_len) {
00423         MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
00424                inchar_start, unsigned char, in_p - inchar_start);
00425         ptr = TRANSCODING_READBUF(tc);
00426     }
00427     else {
00428         ptr = inchar_start - tc->recognized_len;
00429     }
00430     *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
00431     return ptr;
00432 }
00433 
/*
 * Core conversion loop, written as a resumable state machine.
 *
 * Reads bytes from [*in_pos, in_stop), walks the transcoder's lookup
 * tables and writes converted bytes to [*out_pos, out_stop).  Whenever it
 * has to hand control back (output full, input exhausted, error, ...) the
 * SUSPEND macro stores a label number in tc->resume_position, publishes
 * the current positions through *in_pos / *out_pos and returns; the next
 * call jumps straight back to that label through the switch below.
 *
 * The locals in_p, inchar_start, out_p and readagain_len are
 * re-initialized on every call, so anything that must survive a
 * suspension is kept in *tc (the next_table/next_info/next_byte/
 * writebuf_* names below are macros for tc fields).
 *
 * opt bits examined here:
 *   ECONV_PARTIAL_INPUT - running out of input suspends with
 *                         econv_source_buffer_empty instead of finishing
 *                         (or reporting econv_incomplete_input).
 *   ECONV_AFTER_OUTPUT  - suspend with econv_after_output at the check
 *                         points once some output has been produced.
 *
 * Returns one of the econv_* codes passed to SUSPEND:
 * econv_destination_buffer_full, econv_source_buffer_empty,
 * econv_after_output, econv_invalid_byte_sequence,
 * econv_undefined_conversion, econv_incomplete_input or econv_finished.
 */
static rb_econv_result_t
transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
                      const unsigned char *in_stop, unsigned char *out_stop,
                      rb_transcoding *tc,
                      const int opt)
{
    const rb_transcoder *tr = tc->transcoder;
    int unitlen = tr->input_unit_length;
    ssize_t readagain_len = 0;

    const unsigned char *inchar_start;
    const unsigned char *in_p;

    unsigned char *out_p;

    in_p = inchar_start = *in_pos;

    out_p = *out_pos;

/* SUSPEND(ret, num): save the bytes of the current character consumed in
 * this call into the read buffer, publish positions, remember label `num`
 * and return `ret`.  When readagain_len is set, that many trailing bytes
 * are moved from the "recognized" count to tc->readagain_len.  Execution
 * continues after the macro when the function is called again. */
#define SUSPEND(ret, num) \
    do { \
        tc->resume_position = (num); \
        if (0 < in_p - inchar_start) \
            MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
                   inchar_start, unsigned char, in_p - inchar_start); \
        *in_pos = in_p; \
        *out_pos = out_p; \
        tc->recognized_len += in_p - inchar_start; \
        if (readagain_len) { \
            tc->recognized_len -= readagain_len; \
            tc->readagain_len = readagain_len; \
        } \
        return (ret); \
        resume_label ## num:; \
    } while (0)
/* SUSPEND_OBUF(num): keep suspending with econv_destination_buffer_full
 * until at least one byte of output space is available. */
#define SUSPEND_OBUF(num) \
    do { \
        while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
    } while (0)

/* SUSPEND_AFTER_OUTPUT(num): with ECONV_AFTER_OUTPUT set, suspend once
 * this call has written anything (out_p moved past *out_pos). */
#define SUSPEND_AFTER_OUTPUT(num) \
    if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
        SUSPEND(econv_after_output, num); \
    }

/* Shorthands for state kept in *tc so that it survives suspension. */
#define next_table (tc->next_table)
#define next_info (tc->next_info)
#define next_byte (tc->next_byte)
#define writebuf_len (tc->writebuf_len)
#define writebuf_off (tc->writebuf_off)

    /* Resume dispatch: jump to the label of the SUSPEND that returned
     * last time (0 = first call, start at the top). */
    switch (tc->resume_position) {
      case 0: break;
      case 1: goto resume_label1;
      case 2: goto resume_label2;
      case 3: goto resume_label3;
      case 4: goto resume_label4;
      case 5: goto resume_label5;
      case 6: goto resume_label6;
      case 7: goto resume_label7;
      case 8: goto resume_label8;
      case 9: goto resume_label9;
      case 10: goto resume_label10;
      case 11: goto resume_label11;
      case 12: goto resume_label12;
      case 13: goto resume_label13;
      case 14: goto resume_label14;
      case 15: goto resume_label15;
      case 16: goto resume_label16;
      case 17: goto resume_label17;
      case 18: goto resume_label18;
      case 19: goto resume_label19;
      case 20: goto resume_label20;
      case 21: goto resume_label21;
      case 22: goto resume_label22;
      case 23: goto resume_label23;
      case 24: goto resume_label24;
      case 25: goto resume_label25;
      case 26: goto resume_label26;
      case 27: goto resume_label27;
      case 28: goto resume_label28;
      case 29: goto resume_label29;
      case 30: goto resume_label30;
      case 31: goto resume_label31;
      case 32: goto resume_label32;
      case 33: goto resume_label33;
      case 34: goto resume_label34;
    }

    /* One iteration per input character: start at the root table. */
    while (1) {
        inchar_start = in_p;
        tc->recognized_len = 0;
        next_table = tr->conv_tree_start;

        SUSPEND_AFTER_OUTPUT(24);

        if (in_stop <= in_p) {
            /* No more input: finish, unless the caller promised more. */
            if (!(opt & ECONV_PARTIAL_INPUT))
                break;
            SUSPEND(econv_source_buffer_empty, 7);
            continue;
        }

/* Accessors for the lookup node selected by next_table: BL_BASE[0] and
 * BL_BASE[1] give the accepted byte range, the following bytes map each
 * accepted byte to an index into the BL_INFO word array. */
#define BYTE_ADDR(index) (tr->byte_array + (index))
#define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
#define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
#define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
#define BL_MIN_BYTE     (BL_BASE[0])
#define BL_MAX_BYTE     (BL_BASE[1])
#define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
#define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])

        next_byte = (unsigned char)*in_p++;
      follow_byte:
        /* Look next_byte up in the current table; out of range = INVALID. */
        if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
            next_info = INVALID;
        else {
            next_info = (VALUE)BL_ACTION(next_byte);
        }
      follow_info:
        /* The low five bits of next_info select the action. */
        switch (next_info & 0x1F) {
          case NOMAP:
            /* Copy the input bytes of this character to the output
             * unchanged, staging them in the write buffer first. */
            {
                const unsigned char *p = inchar_start;
                writebuf_off = 0;
                while (p < in_p) {
                    TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
                }
                writebuf_len = writebuf_off;
                writebuf_off = 0;
                while (writebuf_off < writebuf_len) {
                    SUSPEND_OBUF(3);
                    *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
                }
            }
            continue;
          case 0x00: case 0x04: case 0x08: case 0x0C:
          case 0x10: case 0x14: case 0x18: case 0x1C:
            /* next_info designates a further lookup table: fetch one more
             * input byte and descend into it. */
            SUSPEND_AFTER_OUTPUT(25);
            while (in_p >= in_stop) {
                if (!(opt & ECONV_PARTIAL_INPUT))
                    goto incomplete;
                SUSPEND(econv_source_buffer_empty, 5);
            }
            next_byte = (unsigned char)*in_p++;
            next_table = (unsigned int)next_info;
            goto follow_byte;
          case ZERObt: /* drop input */
            continue;
          /* ONEbt..GB4bt: the output bytes are packed into next_info and
           * emitted one at a time; each byte has its own resume label. */
          case ONEbt:
            SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
            continue;
          case TWObt:
            SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
            SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
            continue;
          case THREEbt:
            SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
            SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
            SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
            continue;
          case FOURbt:
            SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
            SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
            SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
            SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
            continue;
          case GB4bt:
            SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
            SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
            SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
            SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
            continue;
          case STR1:
            /* Output string stored in the byte array; tc->output_index
             * tracks progress so the copy can be resumed. */
            tc->output_index = 0;
            while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
                SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
                tc->output_index++;
            }
            continue;
          case FUNii:
            /* Callback maps next_info to a new next_info; re-dispatch. */
            next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
            goto follow_info;
          case FUNsi:
            /* Callback maps the character's input bytes to a new
             * next_info; re-dispatch. */
            {
                const unsigned char *char_start;
                size_t char_len;
                char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
                next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
                goto follow_info;
            }
          case FUNio:
            /* Callback writes output derived from next_info: directly into
             * the caller's buffer when max_output bytes fit, otherwise via
             * the write buffer.  `break` leaves the switch and reaches the
             * `continue` after it. */
            SUSPEND_OBUF(13);
            if (tr->max_output <= out_stop - out_p)
                out_p += tr->func_io(TRANSCODING_STATE(tc),
                    next_info, out_p, out_stop - out_p);
            else {
                writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
                    next_info,
                    TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
                writebuf_off = 0;
                while (writebuf_off < writebuf_len) {
                    SUSPEND_OBUF(20);
                    *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
                }
            }
            break;
          case FUNso:
            /* As FUNio, but the callback receives the character's input
             * bytes instead of next_info. */
            {
                const unsigned char *char_start;
                size_t char_len;
                SUSPEND_OBUF(14);
                if (tr->max_output <= out_stop - out_p) {
                    char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
                    out_p += tr->func_so(TRANSCODING_STATE(tc),
                        char_start, (size_t)char_len,
                        out_p, out_stop - out_p);
                }
                else {
                    char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
                    writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
                        char_start, (size_t)char_len,
                        TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
                    writebuf_off = 0;
                    while (writebuf_off < writebuf_len) {
                        SUSPEND_OBUF(22);
                        *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
                    }
                }
                break;
            }
      case FUNsio:
            /* As FUNso, with next_info passed to the callback as well. */
            {
                const unsigned char *char_start;
                size_t char_len;
                SUSPEND_OBUF(33);
                if (tr->max_output <= out_stop - out_p) {
                    char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
                    out_p += tr->func_sio(TRANSCODING_STATE(tc),
                        char_start, (size_t)char_len, next_info,
                        out_p, out_stop - out_p);
                }
                else {
                    char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
                    writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
                        char_start, (size_t)char_len, next_info,
                        TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
                    writebuf_off = 0;
                    while (writebuf_off < writebuf_len) {
                        SUSPEND_OBUF(34);
                        *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
                    }
                }
                break;
            }
          case INVALID:
            /* Decide how many bytes the invalid sequence covers, in terms
             * of the input unit length.  If at most one unit has been
             * read, extend the error to a whole unit (or to the end of
             * input); otherwise keep whole units in the error and hand the
             * remainder back through readagain_len so it is read again. */
            if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
                if (tc->recognized_len + (in_p - inchar_start) < unitlen)
                    SUSPEND_AFTER_OUTPUT(26);
                while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
                    in_p = in_stop;
                    SUSPEND(econv_source_buffer_empty, 8);
                }
                if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
                    in_p = in_stop;
                }
                else {
                    in_p = inchar_start + (unitlen - tc->recognized_len);
                }
            }
            else {
                ssize_t invalid_len; /* including the last byte which causes invalid */
                ssize_t discard_len;
                invalid_len = tc->recognized_len + (in_p - inchar_start);
                discard_len = ((invalid_len - 1) / unitlen) * unitlen;
                readagain_len = invalid_len - discard_len;
            }
            goto invalid;
          case UNDEF:
            goto undef;
          default:
            rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
        }
        continue;

      /* Error exits: report the condition; when called again, carry on
       * with the next character. */
      invalid:
        SUSPEND(econv_invalid_byte_sequence, 1);
        continue;

      incomplete:
        SUSPEND(econv_incomplete_input, 27);
        continue;

      undef:
        SUSPEND(econv_undefined_conversion, 2);
        continue;
    }

    /* cleanup */
    /* Reached only when the input is exhausted without ECONV_PARTIAL_INPUT:
     * let the transcoder emit its trailing output, if it has a
     * finish_func. */
    if (tr->finish_func) {
        SUSPEND_OBUF(4);
        if (tr->max_output <= out_stop - out_p) {
            out_p += tr->finish_func(TRANSCODING_STATE(tc),
                out_p, out_stop - out_p);
        }
        else {
            writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
                TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
            writebuf_off = 0;
            while (writebuf_off < writebuf_len) {
                SUSPEND_OBUF(23);
                *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
            }
        }
    }
    /* From here on every call reports econv_finished again. */
    while (1)
        SUSPEND(econv_finished, 6);
#undef SUSPEND
#undef next_table
#undef next_info
#undef next_byte
#undef writebuf_len
#undef writebuf_off
}
00758 
00759 static rb_econv_result_t
00760 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
00761                       const unsigned char *in_stop, unsigned char *out_stop,
00762                       rb_transcoding *tc,
00763                       const int opt)
00764 {
00765     if (tc->readagain_len) {
00766         unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
00767         const unsigned char *readagain_pos = readagain_buf;
00768         const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
00769         rb_econv_result_t res;
00770 
00771         MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
00772                unsigned char, tc->readagain_len);
00773         tc->readagain_len = 0;
00774         res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
00775         if (res != econv_source_buffer_empty) {
00776             MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
00777                    readagain_pos, unsigned char, readagain_stop - readagain_pos);
00778             tc->readagain_len += readagain_stop - readagain_pos;
00779             return res;
00780         }
00781     }
00782     return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
00783 }
00784 
00785 static rb_transcoding *
00786 rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
00787 {
00788     rb_transcoding *tc;
00789 
00790     tc = ALLOC(rb_transcoding);
00791     tc->transcoder = tr;
00792     tc->flags = flags;
00793     if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
00794         tc->state.ptr = xmalloc(tr->state_size);
00795     if (tr->state_init_func) {
00796         (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
00797     }
00798     tc->resume_position = 0;
00799     tc->recognized_len = 0;
00800     tc->readagain_len = 0;
00801     tc->writebuf_len = 0;
00802     tc->writebuf_off = 0;
00803     if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
00804         tc->readbuf.ptr = xmalloc(tr->max_input);
00805     }
00806     if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
00807         tc->writebuf.ptr = xmalloc(tr->max_output);
00808     }
00809     return tc;
00810 }
00811 
00812 static rb_econv_result_t
00813 rb_transcoding_convert(rb_transcoding *tc,
00814   const unsigned char **input_ptr, const unsigned char *input_stop,
00815   unsigned char **output_ptr, unsigned char *output_stop,
00816   int flags)
00817 {
00818     return transcode_restartable(
00819                 input_ptr, output_ptr,
00820                 input_stop, output_stop,
00821                 tc, flags);
00822 }
00823 
00824 static void
00825 rb_transcoding_close(rb_transcoding *tc)
00826 {
00827     const rb_transcoder *tr = tc->transcoder;
00828     if (tr->state_fini_func) {
00829         (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
00830     }
00831     if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
00832         xfree(tc->state.ptr);
00833     if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
00834         xfree(tc->readbuf.ptr);
00835     if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
00836         xfree(tc->writebuf.ptr);
00837     xfree(tc);
00838 }
00839 
00840 static size_t
00841 rb_transcoding_memsize(rb_transcoding *tc)
00842 {
00843     size_t size = sizeof(rb_transcoding);
00844     const rb_transcoder *tr = tc->transcoder;
00845 
00846     if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
00847         size += tr->state_size;
00848     }
00849     if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
00850         size += tr->max_input;
00851     }
00852     if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
00853         size += tr->max_output;
00854     }
00855     return size;
00856 }
00857 
00858 static rb_econv_t *
00859 rb_econv_alloc(int n_hint)
00860 {
00861     rb_econv_t *ec;
00862 
00863     if (n_hint <= 0)
00864         n_hint = 1;
00865 
00866     ec = ALLOC(rb_econv_t);
00867     ec->flags = 0;
00868     ec->source_encoding_name = NULL;
00869     ec->destination_encoding_name = NULL;
00870     ec->started = 0;
00871     ec->replacement_str = NULL;
00872     ec->replacement_len = 0;
00873     ec->replacement_enc = NULL;
00874     ec->replacement_allocated = 0;
00875     ec->in_buf_start = NULL;
00876     ec->in_data_start = NULL;
00877     ec->in_data_end = NULL;
00878     ec->in_buf_end = NULL;
00879     ec->num_allocated = n_hint;
00880     ec->num_trans = 0;
00881     ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
00882     ec->num_finished = 0;
00883     ec->last_tc = NULL;
00884     ec->last_error.result = econv_source_buffer_empty;
00885     ec->last_error.error_tc = NULL;
00886     ec->last_error.source_encoding = NULL;
00887     ec->last_error.destination_encoding = NULL;
00888     ec->last_error.error_bytes_start = NULL;
00889     ec->last_error.error_bytes_len = 0;
00890     ec->last_error.readagain_len = 0;
00891     ec->source_encoding = NULL;
00892     ec->destination_encoding = NULL;
00893     return ec;
00894 }
00895 
00896 static int
00897 rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
00898 {
00899     int n, j;
00900     int bufsize = 4096;
00901     unsigned char *p;
00902 
00903     if (ec->num_trans == ec->num_allocated) {
00904         n = ec->num_allocated * 2;
00905         REALLOC_N(ec->elems, rb_econv_elem_t, n);
00906         ec->num_allocated = n;
00907     }
00908 
00909     p = xmalloc(bufsize);
00910 
00911     MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
00912 
00913     ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
00914     ec->elems[i].out_buf_start = p;
00915     ec->elems[i].out_buf_end = p + bufsize;
00916     ec->elems[i].out_data_start = p;
00917     ec->elems[i].out_data_end = p;
00918     ec->elems[i].last_result = econv_source_buffer_empty;
00919 
00920     ec->num_trans++;
00921 
00922     if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
00923         for (j = ec->num_trans-1; i <= j; j--) {
00924             rb_transcoding *tc = ec->elems[j].tc;
00925             const rb_transcoder *tr2 = tc->transcoder;
00926             if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
00927                 ec->last_tc = tc;
00928                 break;
00929             }
00930         }
00931 
00932     return 0;
00933 }
00934 
00935 static rb_econv_t *
00936 rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
00937 {
00938     rb_econv_t *ec;
00939     int i, ret;
00940 
00941     for (i = 0; i < n; i++) {
00942         const rb_transcoder *tr;
00943         tr = load_transcoder_entry(entries[i]);
00944         if (!tr)
00945             return NULL;
00946     }
00947 
00948     ec = rb_econv_alloc(n);
00949 
00950     for (i = 0; i < n; i++) {
00951         const rb_transcoder *tr = load_transcoder_entry(entries[i]);
00952         ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
00953         if (ret == -1) {
00954             rb_econv_close(ec);
00955             return NULL;
00956         }
00957     }
00958 
00959     return ec;
00960 }
00961 
00962 struct trans_open_t {
00963     transcoder_entry_t **entries;
00964     int num_additional;
00965 };
00966 
00967 static void
00968 trans_open_i(const char *sname, const char *dname, int depth, void *arg)
00969 {
00970     struct trans_open_t *toarg = arg;
00971 
00972     if (!toarg->entries) {
00973         toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
00974     }
00975     toarg->entries[depth] = get_transcoder_entry(sname, dname);
00976 }
00977 
00978 static rb_econv_t *
00979 rb_econv_open0(const char *sname, const char *dname, int ecflags)
00980 {
00981     transcoder_entry_t **entries = NULL;
00982     int num_trans;
00983     rb_econv_t *ec;
00984 
00985     rb_encoding *senc, *denc;
00986     int sidx, didx;
00987 
00988     senc = NULL;
00989     if (*sname) {
00990         sidx = rb_enc_find_index(sname);
00991         if (0 <= sidx) {
00992             senc = rb_enc_from_index(sidx);
00993         }
00994     }
00995 
00996     denc = NULL;
00997     if (*dname) {
00998         didx = rb_enc_find_index(dname);
00999         if (0 <= didx) {
01000             denc = rb_enc_from_index(didx);
01001         }
01002     }
01003 
01004     if (*sname == '\0' && *dname == '\0') {
01005         num_trans = 0;
01006         entries = NULL;
01007     }
01008     else {
01009         struct trans_open_t toarg;
01010         toarg.entries = NULL;
01011         toarg.num_additional = 0;
01012         num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
01013         entries = toarg.entries;
01014         if (num_trans < 0) {
01015             xfree(entries);
01016             return NULL;
01017         }
01018     }
01019 
01020     ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
01021     xfree(entries);
01022     if (!ec)
01023         return NULL;
01024 
01025     ec->flags = ecflags;
01026     ec->source_encoding_name = sname;
01027     ec->destination_encoding_name = dname;
01028 
01029     return ec;
01030 }
01031 
01032 #define MAX_ECFLAGS_DECORATORS 32
01033 
01034 static int
01035 decorator_names(int ecflags, const char **decorators_ret)
01036 {
01037     int num_decorators;
01038 
01039     switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
01040       case ECONV_UNIVERSAL_NEWLINE_DECORATOR:
01041       case ECONV_CRLF_NEWLINE_DECORATOR:
01042       case ECONV_CR_NEWLINE_DECORATOR:
01043       case 0:
01044         break;
01045       default:
01046         return -1;
01047     }
01048 
01049     if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
01050         (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR))
01051         return -1;
01052 
01053     num_decorators = 0;
01054 
01055     if (ecflags & ECONV_XML_TEXT_DECORATOR)
01056         decorators_ret[num_decorators++] = "xml_text_escape";
01057     if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
01058         decorators_ret[num_decorators++] = "xml_attr_content_escape";
01059     if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
01060         decorators_ret[num_decorators++] = "xml_attr_quote";
01061 
01062     if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
01063         decorators_ret[num_decorators++] = "crlf_newline";
01064     if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
01065         decorators_ret[num_decorators++] = "cr_newline";
01066     if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
01067         decorators_ret[num_decorators++] = "universal_newline";
01068 
01069     return num_decorators;
01070 }
01071 
01072 rb_econv_t *
01073 rb_econv_open(const char *sname, const char *dname, int ecflags)
01074 {
01075     rb_econv_t *ec;
01076     int num_decorators;
01077     const char *decorators[MAX_ECFLAGS_DECORATORS];
01078     int i;
01079 
01080     num_decorators = decorator_names(ecflags, decorators);
01081     if (num_decorators == -1)
01082         return NULL;
01083 
01084     ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
01085     if (!ec)
01086         return NULL;
01087 
01088     for (i = 0; i < num_decorators; i++)
01089         if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
01090             rb_econv_close(ec);
01091             return NULL;
01092         }
01093 
01094     ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
01095 
01096     return ec;
01097 }
01098 
/*
 * Push data through the transcoder chain ec->elems[start .. num_trans-1],
 * repeating full passes until one pass moves neither an input pointer nor
 * an output pointer.
 *
 * Element 0 reads from the caller's input; every other element reads the
 * pending data in the previous element's output buffer.  The last element
 * writes to the caller's output; every other element writes into its own
 * output buffer.
 *
 * Returns the index of the first element that reports
 * econv_invalid_byte_sequence, econv_incomplete_input,
 * econv_undefined_conversion or econv_after_output, or -1 when the loop
 * ends without any such report.
 */
01099 static int
01100 trans_sweep(rb_econv_t *ec,
01101     const unsigned char **input_ptr, const unsigned char *input_stop,
01102     unsigned char **output_ptr, unsigned char *output_stop,
01103     int flags,
01104     int start)
01105 {
01106     int try;
01107     int i, f;
01108 
01109     const unsigned char **ipp, *is, *iold;
01110     unsigned char **opp, *os, *oold;
01111     rb_econv_result_t res;
01112 
01113     try = 1;
01114     while (try) {
01115         try = 0;
01116         for (i = start; i < ec->num_trans; i++) {
01117             rb_econv_elem_t *te = &ec->elems[i];
01118 
              /* Pick the input: caller's buffer for the first element,
               * otherwise the previous element's pending output. */
01119             if (i == 0) {
01120                 ipp = input_ptr;
01121                 is = input_stop;
01122             }
01123             else {
01124                 rb_econv_elem_t *prev_te = &ec->elems[i-1];
01125                 ipp = (const unsigned char **)&prev_te->out_data_start;
01126                 is = prev_te->out_data_end;
01127             }
01128 
              /* Pick the output: caller's buffer for the last element,
               * otherwise this element's own buffer. */
01129             if (i == ec->num_trans-1) {
01130                 opp = output_ptr;
01131                 os = output_stop;
01132             }
01133             else {
                  /* Slide pending data to the start of the buffer so the
                   * free space at the tail is as large as possible. */
01134                 if (te->out_buf_start != te->out_data_start) {
01135                     ssize_t len = te->out_data_end - te->out_data_start;
01136                     ssize_t off = te->out_data_start - te->out_buf_start;
01137                     MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
01138                     te->out_data_start = te->out_buf_start;
01139                     te->out_data_end -= off;
01140                 }
01141                 opp = &te->out_data_end;
01142                 os = te->out_buf_end;
01143             }
01144 
01145             f = flags;
              /* num_finished is set to i+1 when element i reports
               * econv_finished; unless it equals i, this element is told
               * that more input may follow. */
01146             if (ec->num_finished != i)
01147                 f |= ECONV_PARTIAL_INPUT;
              /* ECONV_AFTER_OUTPUT is handed to element 0 once (f already
               * carries it); afterwards the flag is dropped and later
               * passes begin at element 1. */
01148             if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
01149                 start = 1;
01150                 flags &= ~ECONV_AFTER_OUTPUT;
01151             }
01152             if (i != 0)
01153                 f &= ~ECONV_AFTER_OUTPUT;
01154             iold = *ipp;
01155             oold = *opp;
01156             te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
              /* Any pointer movement counts as progress: sweep again. */
01157             if (iold != *ipp || oold != *opp)
01158                 try = 1;
01159 
01160             switch (res) {
01161               case econv_invalid_byte_sequence:
01162               case econv_incomplete_input:
01163               case econv_undefined_conversion:
01164               case econv_after_output:
                  /* These results stop the sweep and are reported by index. */
01165                 return i;
01166 
01167               case econv_destination_buffer_full:
01168               case econv_source_buffer_empty:
01169                 break;
01170 
01171               case econv_finished:
01172                 ec->num_finished = i+1;
01173                 break;
01174             }
01175         }
01176     }
01177     return -1;
01178 }
01179 
/*
 * Drive the whole transcoder chain once and report the outcome.
 *
 * input_ptr / output_ptr may be NULL; an empty dummy buffer is
 * substituted in that case.
 *
 * On return, *result_position_ptr (when non-NULL) holds the index of the
 * element whose result is being returned, or -1 when every element's last
 * result is econv_source_buffer_empty.
 *
 * NOTE(review): ec->elems[0] is read unconditionally, so this presumably
 * must only be called with ec->num_trans >= 1; confirm against callers.
 */
01180 static rb_econv_result_t
01181 rb_trans_conv(rb_econv_t *ec,
01182     const unsigned char **input_ptr, const unsigned char *input_stop,
01183     unsigned char **output_ptr, unsigned char *output_stop,
01184     int flags,
01185     int *result_position_ptr)
01186 {
01187     int i;
01188     int needreport_index;
01189     int sweep_start;
01190 
01191     unsigned char empty_buf;
01192     unsigned char *empty_ptr = &empty_buf;
01193 
01194     if (!input_ptr) {
01195         input_ptr = (const unsigned char **)&empty_ptr;
01196         input_stop = empty_ptr;
01197     }
01198 
01199     if (!output_ptr) {
01200         output_ptr = &empty_ptr;
01201         output_stop = empty_ptr;
01202     }
01203 
      /* A stale after_output on the first element is cleared before the scan. */
01204     if (ec->elems[0].last_result == econv_after_output)
01205         ec->elems[0].last_result = econv_source_buffer_empty;
01206 
      /* Scan from the last element backwards for one whose previous result
       * was an error, after_output or finished; sweeping then starts at the
       * element after it. */
01207     needreport_index = -1;
01208     for (i = ec->num_trans-1; 0 <= i; i--) {
01209         switch (ec->elems[i].last_result) {
01210           case econv_invalid_byte_sequence:
01211           case econv_incomplete_input:
01212           case econv_undefined_conversion:
01213           case econv_after_output:
01214           case econv_finished:
01215             sweep_start = i+1;
01216             needreport_index = i;
01217             goto found_needreport;
01218 
01219           case econv_destination_buffer_full:
01220           case econv_source_buffer_empty:
01221             break;
01222 
01223           default:
01224             rb_bug("unexpected transcode last result");
01225         }
01226     }
01227 
01228     /* /^[sd]+$/ is confirmed.  but actually /^s*d*$/. */
01229 
      /* The last element is still full and the caller asked for
       * after_output: recurse with no input, ECONV_AFTER_OUTPUT cleared and
       * ECONV_PARTIAL_INPUT set, then report source_buffer_empty from that
       * call as after_output. */
01230     if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
01231         (flags & ECONV_AFTER_OUTPUT)) {
01232         rb_econv_result_t res;
01233 
01234         res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
01235                 (flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT,
01236                 result_position_ptr);
01237 
01238         if (res == econv_source_buffer_empty)
01239             return econv_after_output;
01240         return res;
01241     }
01242 
01243     sweep_start = 0;
01244 
01245   found_needreport:
01246 
      /* Sweep repeatedly, each time resuming after the element that
       * reported, until nothing is reported (-1) or the reporting element
       * is the last one in the chain. */
01247     do {
01248         needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
01249         sweep_start = needreport_index + 1;
01250     } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
01251 
      /* Return the result of the last element whose result is not
       * source_buffer_empty.  Error and after_output results are reset to
       * source_buffer_empty here; buffer-full and finished are left as is. */
01252     for (i = ec->num_trans-1; 0 <= i; i--) {
01253         if (ec->elems[i].last_result != econv_source_buffer_empty) {
01254             rb_econv_result_t res = ec->elems[i].last_result;
01255             if (res == econv_invalid_byte_sequence ||
01256                 res == econv_incomplete_input ||
01257                 res == econv_undefined_conversion ||
01258                 res == econv_after_output) {
01259                 ec->elems[i].last_result = econv_source_buffer_empty;
01260             }
01261             if (result_position_ptr)
01262                 *result_position_ptr = i;
01263             return res;
01264         }
01265     }
01266     if (result_position_ptr)
01267         *result_position_ptr = -1;
01268     return econv_source_buffer_empty;
01269 }
01270 
/*
 * Core conversion step behind rb_econv_convert(): convert from the
 * caller's input to the caller's output and record error details in
 * ec->last_error.
 *
 * ec->last_error is cleared on entry; on exit last_error.result holds the
 * returned result, and the remaining fields are filled only for
 * econv_invalid_byte_sequence, econv_incomplete_input and
 * econv_undefined_conversion.
 *
 * With no transcoders (ec->num_trans == 0) bytes are copied through
 * unchanged.
 */
01271 static rb_econv_result_t
01272 rb_econv_convert0(rb_econv_t *ec,
01273     const unsigned char **input_ptr, const unsigned char *input_stop,
01274     unsigned char **output_ptr, unsigned char *output_stop,
01275     int flags)
01276 {
01277     rb_econv_result_t res;
01278     int result_position;
01279     int has_output = 0;
01280 
01281     memset(&ec->last_error, 0, sizeof(ec->last_error));
01282 
      /* Pass-through mode: no transcoders in the chain. */
01283     if (ec->num_trans == 0) {
01284         size_t len;
          /* First flush data pending in the converter's own input buffer. */
01285         if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
01286             if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
                  /* Not everything fits: copy what does and report full. */
01287                 len = output_stop - *output_ptr;
01288                 memcpy(*output_ptr, ec->in_data_start, len);
01289                 *output_ptr = output_stop;
01290                 ec->in_data_start += len;
01291                 res = econv_destination_buffer_full;
01292                 goto gotresult;
01293             }
01294             len = ec->in_data_end - ec->in_data_start;
01295             memcpy(*output_ptr, ec->in_data_start, len);
01296             *output_ptr += len;
01297             ec->in_data_start = ec->in_data_end = ec->in_buf_start;
01298             if (flags & ECONV_AFTER_OUTPUT) {
01299                 res = econv_after_output;
01300                 goto gotresult;
01301             }
01302         }
          /* len = min(free output space, remaining input). */
01303         if (output_stop - *output_ptr < input_stop - *input_ptr) {
01304             len = output_stop - *output_ptr;
01305         }
01306         else {
01307             len = input_stop - *input_ptr;
01308         }
          /* With ECONV_AFTER_OUTPUT only a single byte is copied per call. */
01309         if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
01310             *(*output_ptr)++ = *(*input_ptr)++;
01311             res = econv_after_output;
01312             goto gotresult;
01313         }
01314         memcpy(*output_ptr, *input_ptr, len);
01315         *output_ptr += len;
01316         *input_ptr += len;
01317         if (*input_ptr != input_stop)
01318             res = econv_destination_buffer_full;
01319         else if (flags & ECONV_PARTIAL_INPUT)
01320             res = econv_source_buffer_empty;
01321         else
01322             res = econv_finished;
01323         goto gotresult;
01324     }
01325 
      /* Flush output left in the last element's buffer before converting
       * anything new. */
01326     if (ec->elems[ec->num_trans-1].out_data_start) {
01327         unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
01328         unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
01329         if (data_start != data_end) {
01330             size_t len;
01331             if (output_stop - *output_ptr < data_end - data_start) {
01332                 len = output_stop - *output_ptr;
01333                 memcpy(*output_ptr, data_start, len);
01334                 *output_ptr = output_stop;
01335                 ec->elems[ec->num_trans-1].out_data_start += len;
01336                 res = econv_destination_buffer_full;
01337                 goto gotresult;
01338             }
01339             len = data_end - data_start;
01340             memcpy(*output_ptr, data_start, len);
01341             *output_ptr += len;
01342             ec->elems[ec->num_trans-1].out_data_start =
01343                 ec->elems[ec->num_trans-1].out_data_end =
01344                 ec->elems[ec->num_trans-1].out_buf_start;
01345             has_output = 1;
01346         }
01347     }
01348 
      /* Convert data pending in the converter's own input buffer first,
       * always as partial input and without ECONV_AFTER_OUTPUT. */
01349     if (ec->in_buf_start &&
01350         ec->in_data_start != ec->in_data_end) {
01351         res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
01352                 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
01353         if (res != econv_source_buffer_empty)
01354             goto gotresult;
01355     }
01356 
      /* Output was already produced above and the caller wants
       * after_output: run the chain with an empty input range
       * (input_stop is pulled back to *input_ptr) and report
       * source_buffer_empty as after_output. */
01357     if (has_output &&
01358         (flags & ECONV_AFTER_OUTPUT) &&
01359         *input_ptr != input_stop) {
01360         input_stop = *input_ptr;
01361         res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01362         if (res == econv_source_buffer_empty)
01363             res = econv_after_output;
01364     }
01365     else if ((flags & ECONV_AFTER_OUTPUT) ||
01366         ec->num_trans == 1) {
01367         res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01368     }
01369     else {
          /* Multi-element chain without a caller-requested after_output:
           * ECONV_AFTER_OUTPUT is set internally and the call repeated
           * while it keeps returning after_output. */
01370         flags |= ECONV_AFTER_OUTPUT;
01371         do {
01372             res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01373         } while (res == econv_after_output);
01374     }
01375 
01376   gotresult:
01377     ec->last_error.result = res;
      /* result_position is only read for the three error results; those
       * can only come from an rb_trans_conv() call, which sets it. */
01378     if (res == econv_invalid_byte_sequence ||
01379         res == econv_incomplete_input ||
01380         res == econv_undefined_conversion) {
01381         rb_transcoding *error_tc = ec->elems[result_position].tc;
01382         ec->last_error.error_tc = error_tc;
01383         ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
01384         ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
01385         ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
01386         ec->last_error.error_bytes_len = error_tc->recognized_len;
01387         ec->last_error.readagain_len = error_tc->readagain_len;
01388     }
01389 
01390     return res;
01391 }
01392 
01393 static int output_replacement_character(rb_econv_t *ec);
01394 
01395 static int
01396 output_hex_charref(rb_econv_t *ec)
01397 {
01398     int ret;
01399     unsigned char utfbuf[1024];
01400     const unsigned char *utf;
01401     size_t utf_len;
01402     int utf_allocated = 0;
01403     char charef_buf[16];
01404     const unsigned char *p;
01405 
01406     if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
01407         utf = ec->last_error.error_bytes_start;
01408         utf_len = ec->last_error.error_bytes_len;
01409     }
01410     else {
01411         utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
01412                 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
01413                 utfbuf, sizeof(utfbuf),
01414                 &utf_len);
01415         if (!utf)
01416             return -1;
01417         if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
01418             utf_allocated = 1;
01419     }
01420 
01421     if (utf_len % 4 != 0)
01422         goto fail;
01423 
01424     p = utf;
01425     while (4 <= utf_len) {
01426         unsigned int u = 0;
01427         u += p[0] << 24;
01428         u += p[1] << 16;
01429         u += p[2] << 8;
01430         u += p[3];
01431         snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
01432 
01433         ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
01434         if (ret == -1)
01435             goto fail;
01436 
01437         p += 4;
01438         utf_len -= 4;
01439     }
01440 
01441     if (utf_allocated)
01442         xfree((void *)utf);
01443     return 0;
01444 
01445   fail:
01446     if (utf_allocated)
01447         xfree((void *)utf);
01448     return -1;
01449 }
01450 
01451 rb_econv_result_t
01452 rb_econv_convert(rb_econv_t *ec,
01453     const unsigned char **input_ptr, const unsigned char *input_stop,
01454     unsigned char **output_ptr, unsigned char *output_stop,
01455     int flags)
01456 {
01457     rb_econv_result_t ret;
01458 
01459     unsigned char empty_buf;
01460     unsigned char *empty_ptr = &empty_buf;
01461 
01462     ec->started = 1;
01463 
01464     if (!input_ptr) {
01465         input_ptr = (const unsigned char **)&empty_ptr;
01466         input_stop = empty_ptr;
01467     }
01468 
01469     if (!output_ptr) {
01470         output_ptr = &empty_ptr;
01471         output_stop = empty_ptr;
01472     }
01473 
01474   resume:
01475     ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
01476 
01477     if (ret == econv_invalid_byte_sequence ||
01478         ret == econv_incomplete_input) {
01479         /* deal with invalid byte sequence */
01480         /* todo: add more alternative behaviors */
01481         switch (ec->flags & ECONV_INVALID_MASK) {
01482           case ECONV_INVALID_REPLACE:
01483             if (output_replacement_character(ec) == 0)
01484                 goto resume;
01485         }
01486     }
01487 
01488     if (ret == econv_undefined_conversion) {
01489         /* valid character in source encoding
01490          * but no related character(s) in destination encoding */
01491         /* todo: add more alternative behaviors */
01492         switch (ec->flags & ECONV_UNDEF_MASK) {
01493           case ECONV_UNDEF_REPLACE:
01494             if (output_replacement_character(ec) == 0)
01495                 goto resume;
01496             break;
01497 
01498           case ECONV_UNDEF_HEX_CHARREF:
01499             if (output_hex_charref(ec) == 0)
01500                 goto resume;
01501             break;
01502         }
01503     }
01504 
01505     return ret;
01506 }
01507 
01508 const char *
01509 rb_econv_encoding_to_insert_output(rb_econv_t *ec)
01510 {
01511     rb_transcoding *tc = ec->last_tc;
01512     const rb_transcoder *tr;
01513 
01514     if (tc == NULL)
01515         return "";
01516 
01517     tr = tc->transcoder;
01518 
01519     if (tr->asciicompat_type == asciicompat_encoder)
01520         return tr->src_encoding;
01521     return tr->dst_encoding;
01522 }
01523 
01524 static unsigned char *
01525 allocate_converted_string(const char *sname, const char *dname,
01526         const unsigned char *str, size_t len,
01527         unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
01528         size_t *dst_len_ptr)
01529 {
01530     unsigned char *dst_str;
01531     size_t dst_len;
01532     size_t dst_bufsize;
01533 
01534     rb_econv_t *ec;
01535     rb_econv_result_t res;
01536 
01537     const unsigned char *sp;
01538     unsigned char *dp;
01539 
01540     if (caller_dst_buf)
01541         dst_bufsize = caller_dst_bufsize;
01542     else if (len == 0)
01543         dst_bufsize = 1;
01544     else
01545         dst_bufsize = len;
01546 
01547     ec = rb_econv_open(sname, dname, 0);
01548     if (ec == NULL)
01549         return NULL;
01550     if (caller_dst_buf)
01551         dst_str = caller_dst_buf;
01552     else
01553         dst_str = xmalloc(dst_bufsize);
01554     dst_len = 0;
01555     sp = str;
01556     dp = dst_str+dst_len;
01557     res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
01558     dst_len = dp - dst_str;
01559     while (res == econv_destination_buffer_full) {
01560         if (SIZE_MAX/2 < dst_bufsize) {
01561             goto fail;
01562         }
01563         dst_bufsize *= 2;
01564         if (dst_str == caller_dst_buf) {
01565             unsigned char *tmp;
01566             tmp = xmalloc(dst_bufsize);
01567             memcpy(tmp, dst_str, dst_bufsize/2);
01568             dst_str = tmp;
01569         }
01570         else {
01571             dst_str = xrealloc(dst_str, dst_bufsize);
01572         }
01573         dp = dst_str+dst_len;
01574         res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
01575         dst_len = dp - dst_str;
01576     }
01577     if (res != econv_finished) {
01578         goto fail;
01579     }
01580     rb_econv_close(ec);
01581     *dst_len_ptr = dst_len;
01582     return dst_str;
01583 
01584   fail:
01585     if (dst_str != caller_dst_buf)
01586         xfree(dst_str);
01587     rb_econv_close(ec);
01588     return NULL;
01589 }
01590 
01591 /* result: 0:success -1:failure */
01592 int
01593 rb_econv_insert_output(rb_econv_t *ec,
01594     const unsigned char *str, size_t len, const char *str_encoding)
01595 {
01596     const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
01597     unsigned char insert_buf[4096];
01598     const unsigned char *insert_str = NULL;
01599     size_t insert_len;
01600 
01601     int last_trans_index;
01602     rb_transcoding *tc;
01603 
01604     unsigned char **buf_start_p;
01605     unsigned char **data_start_p;
01606     unsigned char **data_end_p;
01607     unsigned char **buf_end_p;
01608 
01609     size_t need;
01610 
01611     ec->started = 1;
01612 
01613     if (len == 0)
01614         return 0;
01615 
01616     if (encoding_equal(insert_encoding, str_encoding)) {
01617         insert_str = str;
01618         insert_len = len;
01619     }
01620     else {
01621         insert_str = allocate_converted_string(str_encoding, insert_encoding,
01622                 str, len, insert_buf, sizeof(insert_buf), &insert_len);
01623         if (insert_str == NULL)
01624             return -1;
01625     }
01626 
01627     need = insert_len;
01628 
01629     last_trans_index = ec->num_trans-1;
01630     if (ec->num_trans == 0) {
01631         tc = NULL;
01632         buf_start_p = &ec->in_buf_start;
01633         data_start_p = &ec->in_data_start;
01634         data_end_p = &ec->in_data_end;
01635         buf_end_p = &ec->in_buf_end;
01636     }
01637     else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
01638         tc = ec->elems[last_trans_index].tc;
01639         need += tc->readagain_len;
01640         if (need < insert_len)
01641             goto fail;
01642         if (last_trans_index == 0) {
01643             buf_start_p = &ec->in_buf_start;
01644             data_start_p = &ec->in_data_start;
01645             data_end_p = &ec->in_data_end;
01646             buf_end_p = &ec->in_buf_end;
01647         }
01648         else {
01649             rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
01650             buf_start_p = &ee->out_buf_start;
01651             data_start_p = &ee->out_data_start;
01652             data_end_p = &ee->out_data_end;
01653             buf_end_p = &ee->out_buf_end;
01654         }
01655     }
01656     else {
01657         rb_econv_elem_t *ee = &ec->elems[last_trans_index];
01658         buf_start_p = &ee->out_buf_start;
01659         data_start_p = &ee->out_data_start;
01660         data_end_p = &ee->out_data_end;
01661         buf_end_p = &ee->out_buf_end;
01662         tc = ec->elems[last_trans_index].tc;
01663     }
01664 
01665     if (*buf_start_p == NULL) {
01666         unsigned char *buf = xmalloc(need);
01667         *buf_start_p = buf;
01668         *data_start_p = buf;
01669         *data_end_p = buf;
01670         *buf_end_p = buf+need;
01671     }
01672     else if ((size_t)(*buf_end_p - *data_end_p) < need) {
01673         MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
01674         *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
01675         *data_start_p = *buf_start_p;
01676         if ((size_t)(*buf_end_p - *data_end_p) < need) {
01677             unsigned char *buf;
01678             size_t s = (*data_end_p - *buf_start_p) + need;
01679             if (s < need)
01680                 goto fail;
01681             buf = xrealloc(*buf_start_p, s);
01682             *data_start_p = buf;
01683             *data_end_p = buf + (*data_end_p - *buf_start_p);
01684             *buf_start_p = buf;
01685             *buf_end_p = buf + s;
01686         }
01687     }
01688 
01689     memcpy(*data_end_p, insert_str, insert_len);
01690     *data_end_p += insert_len;
01691     if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
01692         memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
01693         *data_end_p += tc->readagain_len;
01694         tc->readagain_len = 0;
01695     }
01696 
01697     if (insert_str != str && insert_str != insert_buf)
01698         xfree((void*)insert_str);
01699     return 0;
01700 
01701   fail:
01702     if (insert_str != str && insert_str != insert_buf)
01703         xfree((void*)insert_str);
01704     return -1;
01705 }
01706 
01707 void
01708 rb_econv_close(rb_econv_t *ec)
01709 {
01710     int i;
01711 
01712     if (ec->replacement_allocated) {
01713         xfree((void *)ec->replacement_str);
01714     }
01715     for (i = 0; i < ec->num_trans; i++) {
01716         rb_transcoding_close(ec->elems[i].tc);
01717         if (ec->elems[i].out_buf_start)
01718             xfree(ec->elems[i].out_buf_start);
01719     }
01720     xfree(ec->in_buf_start);
01721     xfree(ec->elems);
01722     xfree(ec);
01723 }
01724 
01725 size_t
01726 rb_econv_memsize(rb_econv_t *ec)
01727 {
01728     size_t size = sizeof(rb_econv_t);
01729     int i;
01730 
01731     if (ec->replacement_allocated) {
01732         size += ec->replacement_len;
01733     }
01734     for (i = 0; i < ec->num_trans; i++) {
01735         size += rb_transcoding_memsize(ec->elems[i].tc);
01736 
01737         if (ec->elems[i].out_buf_start) {
01738             size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
01739         }
01740     }
01741     size += ec->in_buf_end - ec->in_buf_start;
01742     size += sizeof(rb_econv_elem_t) * ec->num_allocated;
01743 
01744     return size;
01745 }
01746 
01747 int
01748 rb_econv_putbackable(rb_econv_t *ec)
01749 {
01750     if (ec->num_trans == 0)
01751         return 0;
01752 #if SIZEOF_SIZE_T > SIZEOF_INT
01753     if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
01754 #endif
01755     return (int)ec->elems[0].tc->readagain_len;
01756 }
01757 
01758 void
01759 rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
01760 {
01761     rb_transcoding *tc;
01762     if (ec->num_trans == 0 || n == 0)
01763         return;
01764     tc = ec->elems[0].tc;
01765     memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
01766     tc->readagain_len -= n;
01767 }
01768 
/* In/out record passed through st_foreach by rb_econv_asciicompat_encoding. */
struct asciicompat_encoding_t {
    const char *ascii_compat_name;      /* result: destination encoding name, or NULL if none found */
    const char *ascii_incompat_name;    /* input: the encoding name being looked up */
};
01773 
01774 static int
01775 asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
01776 {
01777     struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
01778     transcoder_entry_t *entry = (transcoder_entry_t *)val;
01779     const rb_transcoder *tr;
01780 
01781     if (DECORATOR_P(entry->sname, entry->dname))
01782         return ST_CONTINUE;
01783     tr = load_transcoder_entry(entry);
01784     if (tr && tr->asciicompat_type == asciicompat_decoder) {
01785         data->ascii_compat_name = tr->dst_encoding;
01786         return ST_STOP;
01787     }
01788     return ST_CONTINUE;
01789 }
01790 
01791 const char *
01792 rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
01793 {
01794     st_data_t v;
01795     st_table *table2;
01796     struct asciicompat_encoding_t data;
01797 
01798     if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
01799         return NULL;
01800     table2 = (st_table *)v;
01801 
01802     /*
01803      * Assumption:
01804      * There is at most one transcoder for
01805      * converting from ASCII incompatible encoding.
01806      *
01807      * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
01808      */
01809     if (table2->num_entries != 1)
01810         return NULL;
01811 
01812     data.ascii_incompat_name = ascii_incompat_name;
01813     data.ascii_compat_name = NULL;
01814     st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
01815     return data.ascii_compat_name;
01816 }
01817 
01818 VALUE
01819 rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
01820 {
01821     unsigned const char *ss, *sp, *se;
01822     unsigned char *ds, *dp, *de;
01823     rb_econv_result_t res;
01824     int max_output;
01825 
01826     if (NIL_P(dst)) {
01827         dst = rb_str_buf_new(len);
01828         if (ec->destination_encoding)
01829             rb_enc_associate(dst, ec->destination_encoding);
01830     }
01831 
01832     if (ec->last_tc)
01833         max_output = ec->last_tc->transcoder->max_output;
01834     else
01835         max_output = 1;
01836 
01837     res = econv_destination_buffer_full;
01838     while (res == econv_destination_buffer_full) {
01839         long dlen = RSTRING_LEN(dst);
01840         if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
01841             unsigned long new_capa = (unsigned long)dlen + len + max_output;
01842             if (LONG_MAX < new_capa)
01843                 rb_raise(rb_eArgError, "too long string");
01844             rb_str_resize(dst, new_capa);
01845             rb_str_set_len(dst, dlen);
01846         }
01847         ss = sp = (const unsigned char *)RSTRING_PTR(src) + off;
01848         se = ss + len;
01849         ds = (unsigned char *)RSTRING_PTR(dst);
01850         de = ds + rb_str_capacity(dst);
01851         dp = ds += dlen;
01852         res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
01853         off += sp - ss;
01854         len -= sp - ss;
01855         rb_str_set_len(dst, dlen + (dp - ds));
01856         rb_econv_check_error(ec);
01857     }
01858 
01859     return dst;
01860 }
01861 
01862 VALUE
01863 rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
01864 {
01865     return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
01866 }
01867 
01868 VALUE
01869 rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
01870 {
01871     return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
01872 }
01873 
01874 VALUE
01875 rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
01876 {
01877     return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
01878 }
01879 
01880 static int
01881 rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
01882 {
01883     transcoder_entry_t *entry;
01884     const rb_transcoder *tr;
01885 
01886     if (ec->started != 0)
01887         return -1;
01888 
01889     entry = get_transcoder_entry(sname, dname);
01890     if (!entry)
01891         return -1;
01892 
01893     tr = load_transcoder_entry(entry);
01894     if (!tr) return -1;
01895 
01896     return rb_econv_add_transcoder_at(ec, tr, n);
01897 }
01898 
01899 static int
01900 rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
01901 {
01902     return rb_econv_add_converter(ec, "", decorator_name, n);
01903 }
01904 
01905 int
01906 rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
01907 {
01908     const rb_transcoder *tr;
01909 
01910     if (ec->num_trans == 0)
01911         return rb_econv_decorate_at(ec, decorator_name, 0);
01912 
01913     tr = ec->elems[0].tc->transcoder;
01914 
01915     if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
01916         tr->asciicompat_type == asciicompat_decoder)
01917         return rb_econv_decorate_at(ec, decorator_name, 1);
01918 
01919     return rb_econv_decorate_at(ec, decorator_name, 0);
01920 }
01921 
01922 int
01923 rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
01924 {
01925     const rb_transcoder *tr;
01926 
01927     if (ec->num_trans == 0)
01928         return rb_econv_decorate_at(ec, decorator_name, 0);
01929 
01930     tr = ec->elems[ec->num_trans-1].tc->transcoder;
01931 
01932     if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
01933         tr->asciicompat_type == asciicompat_encoder)
01934         return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
01935 
01936     return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
01937 }
01938 
01939 void
01940 rb_econv_binmode(rb_econv_t *ec)
01941 {
01942     const rb_transcoder *trs[3];
01943     int n, i, j;
01944     transcoder_entry_t *entry;
01945     int num_trans;
01946 
01947     n = 0;
01948     if (ec->flags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
01949         entry = get_transcoder_entry("", "universal_newline");
01950         if (entry->transcoder)
01951             trs[n++] = entry->transcoder;
01952     }
01953     if (ec->flags & ECONV_CRLF_NEWLINE_DECORATOR) {
01954         entry = get_transcoder_entry("", "crlf_newline");
01955         if (entry->transcoder)
01956             trs[n++] = entry->transcoder;
01957     }
01958     if (ec->flags & ECONV_CR_NEWLINE_DECORATOR) {
01959         entry = get_transcoder_entry("", "cr_newline");
01960         if (entry->transcoder)
01961             trs[n++] = entry->transcoder;
01962     }
01963 
01964     num_trans = ec->num_trans;
01965     j = 0;
01966     for (i = 0; i < num_trans; i++) {
01967         int k;
01968         for (k = 0; k < n; k++)
01969             if (trs[k] == ec->elems[i].tc->transcoder)
01970                 break;
01971         if (k == n) {
01972             ec->elems[j] = ec->elems[i];
01973             j++;
01974         }
01975         else {
01976             rb_transcoding_close(ec->elems[i].tc);
01977             xfree(ec->elems[i].out_buf_start);
01978             ec->num_trans--;
01979         }
01980     }
01981 
01982     ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
01983 
01984 }
01985 
01986 static VALUE
01987 econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
01988 {
01989     int has_description = 0;
01990 
01991     if (NIL_P(mesg))
01992         mesg = rb_str_new(NULL, 0);
01993 
01994     if (*sname != '\0' || *dname != '\0') {
01995         if (*sname == '\0')
01996             rb_str_cat2(mesg, dname);
01997         else if (*dname == '\0')
01998             rb_str_cat2(mesg, sname);
01999         else
02000             rb_str_catf(mesg, "%s to %s", sname, dname);
02001         has_description = 1;
02002     }
02003 
02004     if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
02005                    ECONV_XML_TEXT_DECORATOR|
02006                    ECONV_XML_ATTR_CONTENT_DECORATOR|
02007                    ECONV_XML_ATTR_QUOTE_DECORATOR)) {
02008         const char *pre = "";
02009         if (has_description)
02010             rb_str_cat2(mesg, " with ");
02011         if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)  {
02012             rb_str_cat2(mesg, pre); pre = ",";
02013             rb_str_cat2(mesg, "universal_newline");
02014         }
02015         if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
02016             rb_str_cat2(mesg, pre); pre = ",";
02017             rb_str_cat2(mesg, "crlf_newline");
02018         }
02019         if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
02020             rb_str_cat2(mesg, pre); pre = ",";
02021             rb_str_cat2(mesg, "cr_newline");
02022         }
02023         if (ecflags & ECONV_XML_TEXT_DECORATOR) {
02024             rb_str_cat2(mesg, pre); pre = ",";
02025             rb_str_cat2(mesg, "xml_text");
02026         }
02027         if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
02028             rb_str_cat2(mesg, pre); pre = ",";
02029             rb_str_cat2(mesg, "xml_attr_content");
02030         }
02031         if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
02032             rb_str_cat2(mesg, pre); pre = ",";
02033             rb_str_cat2(mesg, "xml_attr_quote");
02034         }
02035         has_description = 1;
02036     }
02037     if (!has_description) {
02038         rb_str_cat2(mesg, "no-conversion");
02039     }
02040 
02041     return mesg;
02042 }
02043 
02044 VALUE
02045 rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
02046 {
02047     VALUE mesg, exc;
02048     mesg = rb_str_new_cstr("code converter not found (");
02049     econv_description(sname, dname, ecflags, mesg);
02050     rb_str_cat2(mesg, ")");
02051     exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
02052     return exc;
02053 }
02054 
02055 static VALUE
02056 make_econv_exception(rb_econv_t *ec)
02057 {
02058     VALUE mesg, exc;
02059     if (ec->last_error.result == econv_invalid_byte_sequence ||
02060         ec->last_error.result == econv_incomplete_input) {
02061         const char *err = (const char *)ec->last_error.error_bytes_start;
02062         size_t error_len = ec->last_error.error_bytes_len;
02063         VALUE bytes = rb_str_new(err, error_len);
02064         VALUE dumped = rb_str_dump(bytes);
02065         size_t readagain_len = ec->last_error.readagain_len;
02066         VALUE bytes2 = Qnil;
02067         VALUE dumped2;
02068         int idx;
02069         if (ec->last_error.result == econv_incomplete_input) {
02070             mesg = rb_sprintf("incomplete %s on %s",
02071                     StringValueCStr(dumped),
02072                     ec->last_error.source_encoding);
02073         }
02074         else if (readagain_len) {
02075             bytes2 = rb_str_new(err+error_len, readagain_len);
02076             dumped2 = rb_str_dump(bytes2);
02077             mesg = rb_sprintf("%s followed by %s on %s",
02078                     StringValueCStr(dumped),
02079                     StringValueCStr(dumped2),
02080                     ec->last_error.source_encoding);
02081         }
02082         else {
02083             mesg = rb_sprintf("%s on %s",
02084                     StringValueCStr(dumped),
02085                     ec->last_error.source_encoding);
02086         }
02087 
02088         exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
02089         rb_ivar_set(exc, rb_intern("error_bytes"), bytes);
02090         rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2);
02091         rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse);
02092 
02093       set_encs:
02094         rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding));
02095         rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding));
02096         idx = rb_enc_find_index(ec->last_error.source_encoding);
02097         if (0 <= idx)
02098             rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
02099         idx = rb_enc_find_index(ec->last_error.destination_encoding);
02100         if (0 <= idx)
02101             rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
02102         return exc;
02103     }
02104     if (ec->last_error.result == econv_undefined_conversion) {
02105         VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
02106                                  ec->last_error.error_bytes_len);
02107         VALUE dumped = Qnil;
02108         int idx;
02109         if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
02110             rb_encoding *utf8 = rb_utf8_encoding();
02111             const char *start, *end;
02112             int n;
02113             start = (const char *)ec->last_error.error_bytes_start;
02114             end = start + ec->last_error.error_bytes_len;
02115             n = rb_enc_precise_mbclen(start, end, utf8);
02116             if (MBCLEN_CHARFOUND_P(n) &&
02117                 (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
02118                 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
02119                 dumped = rb_sprintf("U+%04X", cc);
02120             }
02121         }
02122         if (dumped == Qnil)
02123             dumped = rb_str_dump(bytes);
02124         if (strcmp(ec->last_error.source_encoding,
02125                    ec->source_encoding_name) == 0 &&
02126             strcmp(ec->last_error.destination_encoding,
02127                    ec->destination_encoding_name) == 0) {
02128             mesg = rb_sprintf("%s from %s to %s",
02129                     StringValueCStr(dumped),
02130                     ec->last_error.source_encoding,
02131                     ec->last_error.destination_encoding);
02132         }
02133         else {
02134             int i;
02135             mesg = rb_sprintf("%s to %s in conversion from %s",
02136                     StringValueCStr(dumped),
02137                     ec->last_error.destination_encoding,
02138                     ec->source_encoding_name);
02139             for (i = 0; i < ec->num_trans; i++) {
02140                 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
02141                 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
02142                     rb_str_catf(mesg, " to %s",
02143                                 ec->elems[i].tc->transcoder->dst_encoding);
02144             }
02145         }
02146         exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
02147         idx = rb_enc_find_index(ec->last_error.source_encoding);
02148         if (0 <= idx)
02149             rb_enc_associate_index(bytes, idx);
02150         rb_ivar_set(exc, rb_intern("error_char"), bytes);
02151         goto set_encs;
02152     }
02153     return Qnil;
02154 }
02155 
02156 static void
02157 more_output_buffer(
02158         VALUE destination,
02159         unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02160         int max_output,
02161         unsigned char **out_start_ptr,
02162         unsigned char **out_pos,
02163         unsigned char **out_stop_ptr)
02164 {
02165     size_t len = (*out_pos - *out_start_ptr);
02166     size_t new_len = (len + max_output) * 2;
02167     *out_start_ptr = resize_destination(destination, len, new_len);
02168     *out_pos = *out_start_ptr + len;
02169     *out_stop_ptr = *out_start_ptr + new_len;
02170 }
02171 
02172 static int
02173 make_replacement(rb_econv_t *ec)
02174 {
02175     rb_transcoding *tc;
02176     const rb_transcoder *tr;
02177     rb_encoding *enc;
02178     const unsigned char *replacement;
02179     const char *repl_enc;
02180     const char *ins_enc;
02181     size_t len;
02182 
02183     if (ec->replacement_str)
02184         return 0;
02185 
02186     ins_enc = rb_econv_encoding_to_insert_output(ec);
02187 
02188     tc = ec->last_tc;
02189     if (*ins_enc) {
02190         tr = tc->transcoder;
02191         enc = rb_enc_find(tr->dst_encoding);
02192         replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
02193     }
02194     else {
02195         replacement = (unsigned char *)"?";
02196         len = 1;
02197         repl_enc = "";
02198     }
02199 
02200     ec->replacement_str = replacement;
02201     ec->replacement_len = len;
02202     ec->replacement_enc = repl_enc;
02203     ec->replacement_allocated = 0;
02204     return 0;
02205 }
02206 
02207 int
02208 rb_econv_set_replacement(rb_econv_t *ec,
02209     const unsigned char *str, size_t len, const char *encname)
02210 {
02211     unsigned char *str2;
02212     size_t len2;
02213     const char *encname2;
02214 
02215     encname2 = rb_econv_encoding_to_insert_output(ec);
02216 
02217     if (encoding_equal(encname, encname2)) {
02218         str2 = xmalloc(len);
02219         MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
02220         len2 = len;
02221         encname2 = encname;
02222     }
02223     else {
02224         str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
02225         if (!str2)
02226             return -1;
02227     }
02228 
02229     if (ec->replacement_allocated) {
02230         xfree((void *)ec->replacement_str);
02231     }
02232     ec->replacement_allocated = 1;
02233     ec->replacement_str = str2;
02234     ec->replacement_len = len2;
02235     ec->replacement_enc = encname2;
02236     return 0;
02237 }
02238 
02239 static int
02240 output_replacement_character(rb_econv_t *ec)
02241 {
02242     int ret;
02243 
02244     if (make_replacement(ec) == -1)
02245         return -1;
02246 
02247     ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
02248     if (ret == -1)
02249         return -1;
02250 
02251     return 0;
02252 }
02253 
02254 #if 1
02255 #define hash_fallback rb_hash_aref
02256 
02257 static VALUE
02258 proc_fallback(VALUE fallback, VALUE c)
02259 {
02260     return rb_proc_call(fallback, rb_ary_new4(1, &c));
02261 }
02262 
02263 static VALUE
02264 method_fallback(VALUE fallback, VALUE c)
02265 {
02266     return rb_method_call(1, &c, fallback);
02267 }
02268 
02269 static VALUE
02270 aref_fallback(VALUE fallback, VALUE c)
02271 {
02272     return rb_funcall3(fallback, sym_aref, 1, &c);
02273 }
02274 
02275 static void
02276 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
02277                const unsigned char *in_stop, unsigned char *out_stop,
02278                VALUE destination,
02279                unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02280                const char *src_encoding,
02281                const char *dst_encoding,
02282                int ecflags,
02283                VALUE ecopts)
02284 {
02285     rb_econv_t *ec;
02286     rb_transcoding *last_tc;
02287     rb_econv_result_t ret;
02288     unsigned char *out_start = *out_pos;
02289     int max_output;
02290     VALUE exc;
02291     VALUE fallback = Qnil;
02292     VALUE (*fallback_func)(VALUE, VALUE) = 0;
02293 
02294     ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
02295     if (!ec)
02296         rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
02297 
02298     if (!NIL_P(ecopts) && TYPE(ecopts) == T_HASH) {
02299         fallback = rb_hash_aref(ecopts, sym_fallback);
02300         if (RB_TYPE_P(fallback, T_HASH)) {
02301             fallback_func = hash_fallback;
02302         }
02303         else if (rb_obj_is_proc(fallback)) {
02304             fallback_func = proc_fallback;
02305         }
02306         else if (rb_obj_is_method(fallback)) {
02307             fallback_func = method_fallback;
02308         }
02309         else {
02310             fallback_func = aref_fallback;
02311         }
02312     }
02313     last_tc = ec->last_tc;
02314     max_output = last_tc ? last_tc->transcoder->max_output : 1;
02315 
02316   resume:
02317     ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
02318 
02319     if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
02320         VALUE rep = rb_enc_str_new(
02321                 (const char *)ec->last_error.error_bytes_start,
02322                 ec->last_error.error_bytes_len,
02323                 rb_enc_find(ec->last_error.source_encoding));
02324         rep = (*fallback_func)(fallback, rep);
02325         if (rep != Qundef && !NIL_P(rep)) {
02326             StringValue(rep);
02327             ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
02328                     RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
02329             if ((int)ret == -1) {
02330                 rb_raise(rb_eArgError, "too big fallback string");
02331             }
02332             goto resume;
02333         }
02334     }
02335 
02336     if (ret == econv_invalid_byte_sequence ||
02337         ret == econv_incomplete_input ||
02338         ret == econv_undefined_conversion) {
02339         exc = make_econv_exception(ec);
02340         rb_econv_close(ec);
02341         rb_exc_raise(exc);
02342     }
02343 
02344     if (ret == econv_destination_buffer_full) {
02345         more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
02346         goto resume;
02347     }
02348 
02349     rb_econv_close(ec);
02350     return;
02351 }
02352 #else
02353 /* sample transcode_loop implementation in byte-by-byte stream style */
02354 static void
02355 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
02356                const unsigned char *in_stop, unsigned char *out_stop,
02357                VALUE destination,
02358                unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02359                const char *src_encoding,
02360                const char *dst_encoding,
02361                int ecflags,
02362                VALUE ecopts)
02363 {
02364     rb_econv_t *ec;
02365     rb_transcoding *last_tc;
02366     rb_econv_result_t ret;
02367     unsigned char *out_start = *out_pos;
02368     const unsigned char *ptr;
02369     int max_output;
02370     VALUE exc;
02371 
02372     ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
02373     if (!ec)
02374         rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
02375 
02376     last_tc = ec->last_tc;
02377     max_output = last_tc ? last_tc->transcoder->max_output : 1;
02378 
02379     ret = econv_source_buffer_empty;
02380     ptr = *in_pos;
02381     while (ret != econv_finished) {
02382         unsigned char input_byte;
02383         const unsigned char *p = &input_byte;
02384 
02385         if (ret == econv_source_buffer_empty) {
02386             if (ptr < in_stop) {
02387                 input_byte = *ptr;
02388                 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
02389             }
02390             else {
02391                 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
02392             }
02393         }
02394         else {
02395             ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
02396         }
02397         if (&input_byte != p)
02398             ptr += p - &input_byte;
02399         switch (ret) {
02400           case econv_invalid_byte_sequence:
02401           case econv_incomplete_input:
02402           case econv_undefined_conversion:
02403             exc = make_econv_exception(ec);
02404             rb_econv_close(ec);
02405             rb_exc_raise(exc);
02406             break;
02407 
02408           case econv_destination_buffer_full:
02409             more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
02410             break;
02411 
02412           case econv_source_buffer_empty:
02413             break;
02414 
02415           case econv_finished:
02416             break;
02417         }
02418     }
02419     rb_econv_close(ec);
02420     *in_pos = in_stop;
02421     return;
02422 }
02423 #endif
02424 
02425 
02426 /*
02427  *  String-specific code
02428  */
02429 
02430 static unsigned char *
02431 str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
02432 {
02433     rb_str_resize(destination, new_len);
02434     return (unsigned char *)RSTRING_PTR(destination);
02435 }
02436 
02437 static int
02438 econv_opts(VALUE opt, int ecflags)
02439 {
02440     VALUE v;
02441 
02442     v = rb_hash_aref(opt, sym_invalid);
02443     if (NIL_P(v)) {
02444     }
02445     else if (v==sym_replace) {
02446         ecflags |= ECONV_INVALID_REPLACE;
02447     }
02448     else {
02449         rb_raise(rb_eArgError, "unknown value for invalid character option");
02450     }
02451 
02452     v = rb_hash_aref(opt, sym_undef);
02453     if (NIL_P(v)) {
02454     }
02455     else if (v==sym_replace) {
02456         ecflags |= ECONV_UNDEF_REPLACE;
02457     }
02458     else {
02459         rb_raise(rb_eArgError, "unknown value for undefined character option");
02460     }
02461 
02462     v = rb_hash_aref(opt, sym_replace);
02463     if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
02464         ecflags |= ECONV_UNDEF_REPLACE;
02465     }
02466 
02467     v = rb_hash_aref(opt, sym_xml);
02468     if (!NIL_P(v)) {
02469         if (v==sym_text) {
02470             ecflags |= ECONV_XML_TEXT_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
02471         }
02472         else if (v==sym_attr) {
02473             ecflags |= ECONV_XML_ATTR_CONTENT_DECORATOR|ECONV_XML_ATTR_QUOTE_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
02474         }
02475         else if (TYPE(v) == T_SYMBOL) {
02476             rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v)));
02477         }
02478         else {
02479             rb_raise(rb_eArgError, "unexpected value for xml option");
02480         }
02481     }
02482 
02483 #ifdef ENABLE_ECONV_NEWLINE_OPTION
02484     v = rb_hash_aref(opt, sym_newline);
02485     if (!NIL_P(v)) {
02486         ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
02487         if (v == sym_universal) {
02488             ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
02489         }
02490         else if (v == sym_crlf) {
02491             ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
02492         }
02493         else if (v == sym_cr) {
02494             ecflags |= ECONV_CR_NEWLINE_DECORATOR;
02495         }
02496         else if (v == sym_lf) {
02497             /* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */
02498         }
02499         else if (SYMBOL_P(v)) {
02500             rb_raise(rb_eArgError, "unexpected value for newline option: %s",
02501                      rb_id2name(SYM2ID(v)));
02502         }
02503         else {
02504             rb_raise(rb_eArgError, "unexpected value for newline option");
02505         }
02506     }
02507     else
02508 #endif
02509     {
02510         int setflags = 0, newlineflag = 0;
02511 
02512         v = rb_hash_aref(opt, sym_universal_newline);
02513         if (RTEST(v))
02514             setflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
02515         newlineflag |= !NIL_P(v);
02516 
02517         v = rb_hash_aref(opt, sym_crlf_newline);
02518         if (RTEST(v))
02519             setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
02520         newlineflag |= !NIL_P(v);
02521 
02522         v = rb_hash_aref(opt, sym_cr_newline);
02523         if (RTEST(v))
02524             setflags |= ECONV_CR_NEWLINE_DECORATOR;
02525         newlineflag |= !NIL_P(v);
02526 
02527         if (newlineflag) {
02528             ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
02529             ecflags |= setflags;
02530         }
02531     }
02532 
02533     return ecflags;
02534 }
02535 
02536 int
02537 rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
02538 {
02539     VALUE newhash = Qnil;
02540     VALUE v;
02541 
02542     if (NIL_P(opthash)) {
02543         *opts = Qnil;
02544         return ecflags;
02545     }
02546     ecflags = econv_opts(opthash, ecflags);
02547 
02548     v = rb_hash_aref(opthash, sym_replace);
02549     if (!NIL_P(v)) {
02550         StringValue(v);
02551         if (rb_enc_str_coderange(v) == ENC_CODERANGE_BROKEN) {
02552             VALUE dumped = rb_str_dump(v);
02553             rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
02554                      StringValueCStr(dumped),
02555                      rb_enc_name(rb_enc_get(v)));
02556         }
02557         v = rb_str_new_frozen(v);
02558         newhash = rb_hash_new();
02559         rb_hash_aset(newhash, sym_replace, v);
02560     }
02561 
02562     v = rb_hash_aref(opthash, sym_fallback);
02563     if (!NIL_P(v)) {
02564         VALUE h = rb_check_hash_type(v);
02565         if (NIL_P(h)
02566             ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, sym_aref))
02567             : (v = h, 1)) {
02568             if (NIL_P(newhash))
02569                 newhash = rb_hash_new();
02570             rb_hash_aset(newhash, sym_fallback, v);
02571         }
02572     }
02573 
02574     if (!NIL_P(newhash))
02575         rb_hash_freeze(newhash);
02576     *opts = newhash;
02577 
02578     return ecflags;
02579 }
02580 
02581 int
02582 rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
02583 {
02584     return rb_econv_prepare_options(opthash, opts, 0);
02585 }
02586 
02587 rb_econv_t *
02588 rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
02589 {
02590     rb_econv_t *ec;
02591     VALUE replacement;
02592 
02593     if (NIL_P(opthash)) {
02594         replacement = Qnil;
02595     }
02596     else {
02597         if (TYPE(opthash) != T_HASH || !OBJ_FROZEN(opthash))
02598             rb_bug("rb_econv_open_opts called with invalid opthash");
02599         replacement = rb_hash_aref(opthash, sym_replace);
02600     }
02601 
02602     ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
02603     if (!ec)
02604         return ec;
02605 
02606     if (!NIL_P(replacement)) {
02607         int ret;
02608         rb_encoding *enc = rb_enc_get(replacement);
02609 
02610         ret = rb_econv_set_replacement(ec,
02611                 (const unsigned char *)RSTRING_PTR(replacement),
02612                 RSTRING_LEN(replacement),
02613                 rb_enc_name(enc));
02614         if (ret == -1) {
02615             rb_econv_close(ec);
02616             return NULL;
02617         }
02618     }
02619     return ec;
02620 }
02621 
02622 static int
02623 enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p)
02624 {
02625     rb_encoding *enc;
02626     const char *n;
02627     int encidx;
02628     VALUE encval;
02629 
02630     if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
02631         !(enc = rb_enc_from_index(encidx))) {
02632         enc = NULL;
02633         encidx = 0;
02634         n = StringValueCStr(*arg);
02635     }
02636     else {
02637         n = rb_enc_name(enc);
02638     }
02639 
02640     *name_p = n;
02641     *enc_p = enc;
02642 
02643     return encidx;
02644 }
02645 
02646 static int
02647 str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2,
02648         const char **sname_p, rb_encoding **senc_p,
02649         const char **dname_p, rb_encoding **denc_p)
02650 {
02651     rb_encoding *senc, *denc;
02652     const char *sname, *dname;
02653     int sencidx, dencidx;
02654 
02655     dencidx = enc_arg(arg1, &dname, &denc);
02656 
02657     if (NIL_P(*arg2)) {
02658         sencidx = rb_enc_get_index(str);
02659         senc = rb_enc_from_index(sencidx);
02660         sname = rb_enc_name(senc);
02661     }
02662     else {
02663         sencidx = enc_arg(arg2, &sname, &senc);
02664     }
02665 
02666     *sname_p = sname;
02667     *senc_p = senc;
02668     *dname_p = dname;
02669     *denc_p = denc;
02670     return dencidx;
02671 }
02672 
02673 static int
02674 str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
02675 {
02676     VALUE dest;
02677     VALUE str = *self;
02678     volatile VALUE arg1, arg2;
02679     long blen, slen;
02680     unsigned char *buf, *bp, *sp;
02681     const unsigned char *fromp;
02682     rb_encoding *senc, *denc;
02683     const char *sname, *dname;
02684     int dencidx;
02685 
02686     if (argc <0 || argc > 2) {
02687         rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc);
02688     }
02689 
02690     if (argc == 0) {
02691         arg1 = rb_enc_default_internal();
02692         if (NIL_P(arg1)) {
02693             if (!ecflags) return -1;
02694             arg1 = rb_obj_encoding(str);
02695         }
02696         ecflags |= ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE;
02697     }
02698     else {
02699         arg1 = argv[0];
02700     }
02701     arg2 = argc<=1 ? Qnil : argv[1];
02702     dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
02703 
02704     if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
02705                     ECONV_XML_TEXT_DECORATOR|
02706                     ECONV_XML_ATTR_CONTENT_DECORATOR|
02707                     ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) {
02708         if (senc && senc == denc) {
02709             return NIL_P(arg2) ? -1 : dencidx;
02710         }
02711         if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
02712             if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02713                 return dencidx;
02714             }
02715         }
02716         if (encoding_equal(sname, dname)) {
02717             return NIL_P(arg2) ? -1 : dencidx;
02718         }
02719     }
02720     else {
02721         if (encoding_equal(sname, dname)) {
02722             sname = "";
02723             dname = "";
02724         }
02725     }
02726 
02727     fromp = sp = (unsigned char *)RSTRING_PTR(str);
02728     slen = RSTRING_LEN(str);
02729     blen = slen + 30; /* len + margin */
02730     dest = rb_str_tmp_new(blen);
02731     bp = (unsigned char *)RSTRING_PTR(dest);
02732 
02733     transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
02734     if (fromp != sp+slen) {
02735         rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
02736     }
02737     buf = (unsigned char *)RSTRING_PTR(dest);
02738     *bp = '\0';
02739     rb_str_set_len(dest, bp - buf);
02740 
02741     /* set encoding */
02742     if (!denc) {
02743         dencidx = rb_define_dummy_encoding(dname);
02744     }
02745     *self = dest;
02746 
02747     return dencidx;
02748 }
02749 
02750 static int
02751 str_transcode(int argc, VALUE *argv, VALUE *self)
02752 {
02753     VALUE opt;
02754     int ecflags = 0;
02755     VALUE ecopts = Qnil;
02756 
02757     argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
02758     if (!NIL_P(opt)) {
02759         ecflags = rb_econv_prepare_opts(opt, &ecopts);
02760     }
02761     return str_transcode0(argc, argv, self, ecflags, ecopts);
02762 }
02763 
02764 static inline VALUE
02765 str_encode_associate(VALUE str, int encidx)
02766 {
02767     int cr = 0;
02768 
02769     rb_enc_associate_index(str, encidx);
02770 
02771     /* transcoded string never be broken. */
02772     if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
02773         rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
02774     }
02775     else {
02776         cr = ENC_CODERANGE_VALID;
02777     }
02778     ENC_CODERANGE_SET(str, cr);
02779     return str;
02780 }
02781 
02782 /*
02783  *  call-seq:
02784  *     str.encode!(encoding [, options] )   -> str
02785  *     str.encode!(dst_encoding, src_encoding [, options] )   -> str
02786  *
02787  *  The first form transcodes the contents of <i>str</i> from
02788  *  str.encoding to +encoding+.
02789  *  The second form transcodes the contents of <i>str</i> from
02790  *  src_encoding to dst_encoding.
02791  *  The options Hash gives details for conversion. See String#encode
02792  *  for details.
02793  *  Returns the string even if no changes were made.
02794  */
02795 
02796 static VALUE
02797 str_encode_bang(int argc, VALUE *argv, VALUE str)
02798 {
02799     VALUE newstr;
02800     int encidx;
02801 
02802     rb_check_frozen(str);
02803 
02804     newstr = str;
02805     encidx = str_transcode(argc, argv, &newstr);
02806 
02807     if (encidx < 0) return str;
02808     rb_str_shared_replace(str, newstr);
02809     return str_encode_associate(str, encidx);
02810 }
02811 
02812 static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
02813 
02814 /*
02815  *  call-seq:
02816  *     str.encode(encoding [, options] )   -> str
02817  *     str.encode(dst_encoding, src_encoding [, options] )   -> str
02818  *     str.encode([options])   -> str
02819  *
02820  *  The first form returns a copy of +str+ transcoded
02821  *  to encoding +encoding+.
02822  *  The second form returns a copy of +str+ transcoded
02823  *  from src_encoding to dst_encoding.
02824  *  The last form returns a copy of +str+ transcoded to
02825  *  <tt>Encoding.default_internal</tt>.
02826  *
02827  *  By default, the first and second form raise
02828  *  Encoding::UndefinedConversionError for characters that are
02829  *  undefined in the destination encoding, and
02830  *  Encoding::InvalidByteSequenceError for invalid byte sequences
02831  *  in the source encoding. The last form by default does not raise
02832  *  exceptions but uses replacement strings.
02833  *
02834  *  The +options+ Hash gives details for conversion and can have the following
02835  *  keys:
02836  *
02837  *  :invalid ::
02838  *    If the value is +:replace+, #encode replaces invalid byte sequences in
02839  *    +str+ with the replacement character.  The default is to raise the
02840  *    Encoding::InvalidByteSequenceError exception
02841  *  :undef ::
02842  *    If the value is +:replace+, #encode replaces characters which are
02843  *    undefined in the destination encoding with the replacement character.
02844  *    The default is to raise the Encoding::UndefinedConversionError.
02845  *  :replace ::
02846  *    Sets the replacement string to the given value. The default replacement
02847  *    string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
02848  *  :fallback ::
02849  *    Sets the replacement string by the given object for undefined
02850  *    character.  The object should be a Hash, a Proc, a Method, or an
02851  *    object which has [] method.
02852  *    Its key is an undefined character encoded in the source encoding
02853  *    of the current transcoder. Its value can be in any encoding as long
02854  *    as it can be converted into the destination encoding of the transcoder.
02855  *  :xml ::
02856  *    The value must be +:text+ or +:attr+.
02857  *    If the value is +:text+ #encode replaces undefined characters with their
02858  *    (upper-case hexadecimal) numeric character references. '&', '<', and '>'
02859  *    are converted to "&amp;", "&lt;", and "&gt;", respectively.
02860  *    If the value is +:attr+, #encode also quotes the replacement result
02861  *    (using '"'), and replaces '"' with "&quot;".
02862  *  :cr_newline ::
02863  *    Replaces LF ("\n") with CR ("\r") if value is true.
02864  *  :crlf_newline ::
02865  *    Replaces LF ("\n") with CRLF ("\r\n") if value is true.
02866  *  :universal_newline ::
02867  *    Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true.
02868  */
02869 
02870 static VALUE
02871 str_encode(int argc, VALUE *argv, VALUE str)
02872 {
02873     VALUE newstr = str;
02874     int encidx = str_transcode(argc, argv, &newstr);
02875     return encoded_dup(newstr, str, encidx);
02876 }
02877 
02878 VALUE
02879 rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
02880 {
02881     int argc = 1;
02882     VALUE *argv = &to;
02883     VALUE newstr = str;
02884     int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
02885     return encoded_dup(newstr, str, encidx);
02886 }
02887 
02888 static VALUE
02889 encoded_dup(VALUE newstr, VALUE str, int encidx)
02890 {
02891     if (encidx < 0) return rb_str_dup(str);
02892     if (newstr == str) {
02893         newstr = rb_str_dup(str);
02894     }
02895     else {
02896         RBASIC(newstr)->klass = rb_obj_class(str);
02897     }
02898     return str_encode_associate(newstr, encidx);
02899 }
02900 
02901 static void
02902 econv_free(void *ptr)
02903 {
02904     rb_econv_t *ec = ptr;
02905     rb_econv_close(ec);
02906 }
02907 
02908 static size_t
02909 econv_memsize(const void *ptr)
02910 {
02911     return ptr ? sizeof(rb_econv_t) : 0;
02912 }
02913 
02914 static const rb_data_type_t econv_data_type = {
02915     "econv",
02916     {NULL, econv_free, econv_memsize,},
02917 };
02918 
02919 static VALUE
02920 econv_s_allocate(VALUE klass)
02921 {
02922     return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
02923 }
02924 
02925 static rb_encoding *
02926 make_dummy_encoding(const char *name)
02927 {
02928     rb_encoding *enc;
02929     int idx;
02930     idx = rb_define_dummy_encoding(name);
02931     enc = rb_enc_from_index(idx);
02932     return enc;
02933 }
02934 
02935 static rb_encoding *
02936 make_encoding(const char *name)
02937 {
02938     rb_encoding *enc;
02939     enc = rb_enc_find(name);
02940     if (!enc)
02941         enc = make_dummy_encoding(name);
02942     return enc;
02943 }
02944 
02945 static VALUE
02946 make_encobj(const char *name)
02947 {
02948     return rb_enc_from_encoding(make_encoding(name));
02949 }
02950 
02951 /*
02952  * call-seq:
02953  *   Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
02954  *   Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
02955  *
02956  * Returns the corresponding ASCII compatible encoding.
02957  *
02958  * Returns nil if the argument is an ASCII compatible encoding.
02959  *
02960  * "corresponding ASCII compatible encoding" is a ASCII compatible encoding which
02961  * can represents exactly the same characters as the given ASCII incompatible encoding.
02962  * So, no conversion undefined error occurs when converting between the two encodings.
02963  *
02964  *   Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
02965  *   Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
02966  *   Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
02967  *
02968  */
02969 static VALUE
02970 econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
02971 {
02972     const char *arg_name, *result_name;
02973     rb_encoding *arg_enc, *result_enc;
02974 
02975     enc_arg(&arg, &arg_name, &arg_enc);
02976 
02977     result_name = rb_econv_asciicompat_encoding(arg_name);
02978 
02979     if (result_name == NULL)
02980         return Qnil;
02981 
02982     result_enc = make_encoding(result_name);
02983 
02984     return rb_enc_from_encoding(result_enc);
02985 }
02986 
02987 static void
02988 econv_args(int argc, VALUE *argv,
02989     volatile VALUE *snamev_p, volatile VALUE *dnamev_p,
02990     const char **sname_p, const char **dname_p,
02991     rb_encoding **senc_p, rb_encoding **denc_p,
02992     int *ecflags_p,
02993     VALUE *ecopts_p)
02994 {
02995     VALUE opt, flags_v, ecopts;
02996     int sidx, didx;
02997     const char *sname, *dname;
02998     rb_encoding *senc, *denc;
02999     int ecflags;
03000 
03001     argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
03002 
03003     if (!NIL_P(flags_v)) {
03004         if (!NIL_P(opt)) {
03005             rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)",
03006                 argc + 1);
03007         }
03008         ecflags = NUM2INT(rb_to_int(flags_v));
03009         ecopts = Qnil;
03010     }
03011     else if (!NIL_P(opt)) {
03012         ecflags = rb_econv_prepare_opts(opt, &ecopts);
03013     }
03014     else {
03015         ecflags = 0;
03016         ecopts = Qnil;
03017     }
03018 
03019     senc = NULL;
03020     sidx = rb_to_encoding_index(*snamev_p);
03021     if (0 <= sidx) {
03022         senc = rb_enc_from_index(sidx);
03023     }
03024     else {
03025         StringValue(*snamev_p);
03026     }
03027 
03028     denc = NULL;
03029     didx = rb_to_encoding_index(*dnamev_p);
03030     if (0 <= didx) {
03031         denc = rb_enc_from_index(didx);
03032     }
03033     else {
03034         StringValue(*dnamev_p);
03035     }
03036 
03037     sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
03038     dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
03039 
03040     *sname_p = sname;
03041     *dname_p = dname;
03042     *senc_p = senc;
03043     *denc_p = denc;
03044     *ecflags_p = ecflags;
03045     *ecopts_p = ecopts;
03046 }
03047 
03048 static int
03049 decorate_convpath(VALUE convpath, int ecflags)
03050 {
03051     int num_decorators;
03052     const char *decorators[MAX_ECFLAGS_DECORATORS];
03053     int i;
03054     int n, len;
03055 
03056     num_decorators = decorator_names(ecflags, decorators);
03057     if (num_decorators == -1)
03058         return -1;
03059 
03060     len = n = RARRAY_LENINT(convpath);
03061     if (n != 0) {
03062         VALUE pair = RARRAY_PTR(convpath)[n-1];
03063         if (TYPE(pair) == T_ARRAY) {
03064             const char *sname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[0]));
03065             const char *dname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[1]));
03066             transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
03067             const rb_transcoder *tr = load_transcoder_entry(entry);
03068             if (!tr)
03069                 return -1;
03070             if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
03071                     tr->asciicompat_type == asciicompat_encoder) {
03072                 n--;
03073                 rb_ary_store(convpath, len + num_decorators - 1, pair);
03074             }
03075         }
03076         else {
03077             rb_ary_store(convpath, len + num_decorators - 1, pair);
03078         }
03079     }
03080 
03081     for (i = 0; i < num_decorators; i++)
03082         rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
03083 
03084     return 0;
03085 }
03086 
03087 static void
03088 search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
03089 {
03090     VALUE *ary_p = arg;
03091     VALUE v;
03092 
03093     if (*ary_p == Qnil) {
03094         *ary_p = rb_ary_new();
03095     }
03096 
03097     if (DECORATOR_P(sname, dname)) {
03098         v = rb_str_new_cstr(dname);
03099     }
03100     else {
03101         v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
03102     }
03103     rb_ary_store(*ary_p, depth, v);
03104 }
03105 
03106 /*
03107  * call-seq:
03108  *   Encoding::Converter.search_convpath(source_encoding, destination_encoding)         -> ary
03109  *   Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt)    -> ary
03110  *
03111  *  Returns a conversion path.
03112  *
03113  *   p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
03114  *   #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
03115  *   #    [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
03116  *
03117  *   p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
03118  *   or
03119  *   p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
03120  *   #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
03121  *   #    [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
03122  *   #    "universal_newline"]
03123  *
03124  *   p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
03125  *   or
03126  *   p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
03127  *   #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
03128  *   #    "universal_newline",
03129  *   #    [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
03130  */
03131 static VALUE
03132 econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
03133 {
03134     volatile VALUE snamev, dnamev;
03135     const char *sname, *dname;
03136     rb_encoding *senc, *denc;
03137     int ecflags;
03138     VALUE ecopts;
03139     VALUE convpath;
03140 
03141     econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
03142 
03143     convpath = Qnil;
03144     transcode_search_path(sname, dname, search_convpath_i, &convpath);
03145 
03146     if (NIL_P(convpath))
03147         rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03148 
03149     if (decorate_convpath(convpath, ecflags) == -1)
03150         rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03151 
03152     return convpath;
03153 }
03154 
03155 /*
03156  * Check the existence of a conversion path.
03157  * Returns the number of converters in the conversion path.
03158  * result: >=0:success -1:failure
03159  */
03160 int
03161 rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
03162 {
03163     VALUE convpath = Qnil;
03164     transcode_search_path(from_encoding, to_encoding, search_convpath_i,
03165                           &convpath);
03166     return RTEST(convpath);
03167 }
03168 
03169 struct rb_econv_init_by_convpath_t {
03170     rb_econv_t *ec;
03171     int index;
03172     int ret;
03173 };
03174 
03175 static void
03176 rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
03177 {
03178     struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
03179     int ret;
03180 
03181     if (a->ret == -1)
03182         return;
03183 
03184     ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
03185 
03186     a->ret = ret;
03187     return;
03188 }
03189 
03190 static rb_econv_t *
03191 rb_econv_init_by_convpath(VALUE self, VALUE convpath,
03192     const char **sname_p, const char **dname_p,
03193     rb_encoding **senc_p, rb_encoding**denc_p)
03194 {
03195     rb_econv_t *ec;
03196     long i;
03197     int ret, first=1;
03198     VALUE elt;
03199     rb_encoding *senc = 0, *denc = 0;
03200     const char *sname, *dname;
03201 
03202     ec = rb_econv_alloc(RARRAY_LENINT(convpath));
03203     DATA_PTR(self) = ec;
03204 
03205     for (i = 0; i < RARRAY_LEN(convpath); i++) {
03206         volatile VALUE snamev, dnamev;
03207         VALUE pair;
03208         elt = rb_ary_entry(convpath, i);
03209         if (!NIL_P(pair = rb_check_array_type(elt))) {
03210             if (RARRAY_LEN(pair) != 2)
03211                 rb_raise(rb_eArgError, "not a 2-element array in convpath");
03212             snamev = rb_ary_entry(pair, 0);
03213             enc_arg(&snamev, &sname, &senc);
03214             dnamev = rb_ary_entry(pair, 1);
03215             enc_arg(&dnamev, &dname, &denc);
03216         }
03217         else {
03218             sname = "";
03219             dname = StringValueCStr(elt);
03220         }
03221         if (DECORATOR_P(sname, dname)) {
03222             ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
03223             if (ret == -1)
03224                 rb_raise(rb_eArgError, "decoration failed: %s", dname);
03225         }
03226         else {
03227             int j = ec->num_trans;
03228             struct rb_econv_init_by_convpath_t arg;
03229             arg.ec = ec;
03230             arg.index = ec->num_trans;
03231             arg.ret = 0;
03232             ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
03233             if (ret == -1 || arg.ret == -1)
03234                 rb_raise(rb_eArgError, "adding conversion failed: %s to %s", sname, dname);
03235             if (first) {
03236                 first = 0;
03237                 *senc_p = senc;
03238                 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
03239             }
03240             *denc_p = denc;
03241             *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
03242         }
03243     }
03244 
03245     if (first) {
03246       *senc_p = NULL;
03247       *denc_p = NULL;
03248       *sname_p = "";
03249       *dname_p = "";
03250     }
03251 
03252     ec->source_encoding_name = *sname_p;
03253     ec->destination_encoding_name = *dname_p;
03254 
03255     return ec;
03256 }
03257 
03258 /*
03259  * call-seq:
03260  *   Encoding::Converter.new(source_encoding, destination_encoding)
03261  *   Encoding::Converter.new(source_encoding, destination_encoding, opt)
03262  *   Encoding::Converter.new(convpath)
03263  *
03264  * possible options elements:
03265  *   hash form:
03266  *     :invalid => nil            # raise error on invalid byte sequence (default)
03267  *     :invalid => :replace       # replace invalid byte sequence
03268  *     :undef => nil              # raise error on undefined conversion (default)
03269  *     :undef => :replace         # replace undefined conversion
03270  *     :replace => string         # replacement string ("?" or "\uFFFD" if not specified)
03271  *     :newline => :universal     # decorator for converting CRLF and CR to LF
03272  *     :newline => :crlf          # decorator for converting LF to CRLF
03273  *     :newline => :cr            # decorator for converting LF to CR
03274  *     :universal_newline => true # decorator for converting CRLF and CR to LF
03275  *     :crlf_newline => true      # decorator for converting LF to CRLF
03276  *     :cr_newline => true        # decorator for converting LF to CR
03277  *     :xml => :text              # escape as XML CharData.
03278  *     :xml => :attr              # escape as XML AttValue
03279  *   integer form:
03280  *     Encoding::Converter::INVALID_REPLACE
03281  *     Encoding::Converter::UNDEF_REPLACE
03282  *     Encoding::Converter::UNDEF_HEX_CHARREF
03283  *     Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
03284  *     Encoding::Converter::CRLF_NEWLINE_DECORATOR
03285  *     Encoding::Converter::CR_NEWLINE_DECORATOR
03286  *     Encoding::Converter::XML_TEXT_DECORATOR
03287  *     Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
03288  *     Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
03289  *
03290  * Encoding::Converter.new creates an instance of Encoding::Converter.
03291  *
03292  * Source_encoding and destination_encoding should be a string or
03293  * Encoding object.
03294  *
03295  * opt should be nil, a hash or an integer.
03296  *
03297  * convpath should be an array.
03298  * convpath may contain
03299  * - two-element arrays which contain encodings or encoding names, or
03300  * - strings representing decorator names.
03301  *
03302  * Encoding::Converter.new optionally takes an option.
03303  * The option should be a hash or an integer.
03304  * The option hash can contain :invalid => nil, etc.
03305  * The option integer should be logical-or of constants such as
03306  * Encoding::Converter::INVALID_REPLACE, etc.
03307  *
03308  * [:invalid => nil]
03309  *   Raise error on invalid byte sequence.  This is a default behavior.
03310  * [:invalid => :replace]
03311  *   Replace invalid byte sequence by replacement string.
03312  * [:undef => nil]
03313  *   Raise an error if a character in source_encoding is not defined in destination_encoding.
03314  *   This is a default behavior.
03315  * [:undef => :replace]
03316  *   Replace undefined character in destination_encoding with replacement string.
03317  * [:replace => string]
03318  *   Specify the replacement string.
03319  *   If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
03320  * [:universal_newline => true]
03321  *   Convert CRLF and CR to LF.
03322  * [:crlf_newline => true]
03323  *   Convert LF to CRLF.
03324  * [:cr_newline => true]
03325  *   Convert LF to CR.
03326  * [:xml => :text]
03327  *   Escape as XML CharData.
03328  *   This form can be used as a HTML 4.0 #PCDATA.
03329  *   - '&' -> '&amp;'
03330  *   - '<' -> '&lt;'
03331  *   - '>' -> '&gt;'
03332  *   - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
03333  * [:xml => :attr]
03334  *   Escape as XML AttValue.
03335  *   The converted result is quoted as "...".
03336  *   This form can be used as a HTML 4.0 attribute value.
03337  *   - '&' -> '&amp;'
03338  *   - '<' -> '&lt;'
03339  *   - '>' -> '&gt;'
03340  *   - '"' -> '&quot;'
03341  *   - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
03342  *
03343  * Examples:
03344  *   # UTF-16BE to UTF-8
03345  *   ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
03346  *
03347  *   # Usually, decorators such as newline conversion are inserted last.
03348  *   ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
03349  *   p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
03350  *                 #    "universal_newline"]
03351  *
03352  *   # But, if the last encoding is ASCII incompatible,
03353  *   # decorators are inserted before the last conversion.
03354  *   ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
03355  *   p ec.convpath #=> ["crlf_newline",
03356  *                 #    [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
03357  *
03358  *   # Conversion path can be specified directly.
03359  *   ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
03360  *   p ec.convpath #=> ["universal_newline",
03361  *                 #    [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
03362  *                 #    [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
03363  */
03364 static VALUE
03365 econv_init(int argc, VALUE *argv, VALUE self)
03366 {
03367     VALUE ecopts;
03368     volatile VALUE snamev, dnamev;
03369     const char *sname, *dname;
03370     rb_encoding *senc, *denc;
03371     rb_econv_t *ec;
03372     int ecflags;
03373     VALUE convpath;
03374 
03375     if (rb_check_typeddata(self, &econv_data_type)) {
03376         rb_raise(rb_eTypeError, "already initialized");
03377     }
03378 
03379     if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
03380         ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
03381         ecflags = 0;
03382         ecopts = Qnil;
03383     }
03384     else {
03385         econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
03386         ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
03387     }
03388 
03389     if (!ec) {
03390         rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03391     }
03392 
03393     if (!DECORATOR_P(sname, dname)) {
03394         if (!senc)
03395             senc = make_dummy_encoding(sname);
03396         if (!denc)
03397             denc = make_dummy_encoding(dname);
03398     }
03399 
03400     ec->source_encoding = senc;
03401     ec->destination_encoding = denc;
03402 
03403     DATA_PTR(self) = ec;
03404 
03405     return self;
03406 }
03407 
03408 /*
03409  * call-seq:
03410  *   ec.inspect         -> string
03411  *
03412  * Returns a printable version of <i>ec</i>
03413  *
03414  *   ec = Encoding::Converter.new("iso-8859-1", "utf-8")
03415  *   puts ec.inspect    #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
03416  *
03417  */
03418 static VALUE
03419 econv_inspect(VALUE self)
03420 {
03421     const char *cname = rb_obj_classname(self);
03422     rb_econv_t *ec;
03423 
03424     TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
03425     if (!ec)
03426         return rb_sprintf("#<%s: uninitialized>", cname);
03427     else {
03428         const char *sname = ec->source_encoding_name;
03429         const char *dname = ec->destination_encoding_name;
03430         VALUE str;
03431         str = rb_sprintf("#<%s: ", cname);
03432         econv_description(sname, dname, ec->flags, str);
03433         rb_str_cat2(str, ">");
03434         return str;
03435     }
03436 }
03437 
03438 static rb_econv_t *
03439 check_econv(VALUE self)
03440 {
03441     rb_econv_t *ec;
03442 
03443     TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
03444     if (!ec) {
03445         rb_raise(rb_eTypeError, "uninitialized encoding converter");
03446     }
03447     return ec;
03448 }
03449 
03450 /*
03451  * call-seq:
03452  *   ec.source_encoding -> encoding
03453  *
03454  * Returns the source encoding as an Encoding object.
03455  */
03456 static VALUE
03457 econv_source_encoding(VALUE self)
03458 {
03459     rb_econv_t *ec = check_econv(self);
03460     if (!ec->source_encoding)
03461         return Qnil;
03462     return rb_enc_from_encoding(ec->source_encoding);
03463 }
03464 
03465 /*
03466  * call-seq:
03467  *   ec.destination_encoding -> encoding
03468  *
03469  * Returns the destination encoding as an Encoding object.
03470  */
03471 static VALUE
03472 econv_destination_encoding(VALUE self)
03473 {
03474     rb_econv_t *ec = check_econv(self);
03475     if (!ec->destination_encoding)
03476         return Qnil;
03477     return rb_enc_from_encoding(ec->destination_encoding);
03478 }
03479 
03480 /*
03481  * call-seq:
03482  *   ec.convpath        -> ary
03483  *
03484  * Returns the conversion path of ec.
03485  *
03486  * The result is an array of conversions.
03487  *
03488  *   ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
03489  *   p ec.convpath
03490  *   #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
03491  *   #    [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
03492  *   #    "crlf_newline"]
03493  *
03494  * Each element of the array is a pair of encodings or a string.
03495  * A pair means an encoding conversion.
03496  * A string means a decorator.
03497  *
03498  * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
03499  * a converter from ISO-8859-1 to UTF-8.
03500  * "crlf_newline" means newline converter from LF to CRLF.
03501  */
03502 static VALUE
03503 econv_convpath(VALUE self)
03504 {
03505     rb_econv_t *ec = check_econv(self);
03506     VALUE result;
03507     int i;
03508 
03509     result = rb_ary_new();
03510     for (i = 0; i < ec->num_trans; i++) {
03511         const rb_transcoder *tr = ec->elems[i].tc->transcoder;
03512         VALUE v;
03513         if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
03514             v = rb_str_new_cstr(tr->dst_encoding);
03515         else
03516             v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
03517         rb_ary_push(result, v);
03518     }
03519     return result;
03520 }
03521 
03522 /*
03523  * call-seq:
03524  *   ec == other        -> true or false
03525  */
03526 static VALUE
03527 econv_equal(VALUE self, VALUE other)
03528 {
03529     rb_econv_t *ec1 = check_econv(self);
03530     rb_econv_t *ec2;
03531     int i;
03532 
03533     /* A non-converter operand yields nil rather than false; kept as-is for compatibility. */
03534     if (!rb_typeddata_is_kind_of(other, &econv_data_type)) return Qnil;
03535     ec2 = DATA_PTR(other);
03536     if (!ec2) return Qfalse;
03537     if (ec1->source_encoding_name != ec2->source_encoding_name &&
03538         strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
03539         return Qfalse;
03540     if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
03541         strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
03542         return Qfalse;
03543     if (ec1->flags != ec2->flags) return Qfalse;
03544     /* NOTE(review): the replacement fields look lazily set (econv_get_replacement calls
03545      * make_replacement() first), so either side may be NULL: a NULL vs non-NULL
03546      * mismatch means "not equal" and must never reach strcmp()/memcmp(). */
03547     if (ec1->replacement_enc != ec2->replacement_enc &&
03548         (!ec1->replacement_enc || !ec2->replacement_enc ||
03549          strcmp(ec1->replacement_enc, ec2->replacement_enc)))
03550         return Qfalse;
03551     if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
03552     if (ec1->replacement_str != ec2->replacement_str && ec1->replacement_len > 0 &&
03553         (!ec1->replacement_str || !ec2->replacement_str ||
03554          memcmp(ec1->replacement_str, ec2->replacement_str, ec1->replacement_len)))
03555         return Qfalse;
03556     if (ec1->num_trans != ec2->num_trans) return Qfalse;
03557     for (i = 0; i < ec1->num_trans; i++) /* every step must use the very same transcoder */
03558         if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder) return Qfalse;
03559     return Qtrue;
03560 }
03560 
03561 static VALUE
03562 econv_result_to_symbol(rb_econv_result_t res)
03563 {
03564     /* Map a conversion result code to the symbol of the same name. */
03565     if (res == econv_invalid_byte_sequence) return sym_invalid_byte_sequence;
03566     if (res == econv_incomplete_input) return sym_incomplete_input;
03567     if (res == econv_undefined_conversion) return sym_undefined_conversion;
03568     if (res == econv_destination_buffer_full) return sym_destination_buffer_full;
03569     if (res == econv_source_buffer_empty) return sym_source_buffer_empty;
03570     if (res == econv_finished) return sym_finished;
03571     if (res == econv_after_output) return sym_after_output;
03572     /* Any other code falls back to its integer value. */
03573     return INT2NUM(res); /* should not be reached */
03574 }
03575 
03576 /*
03577  * call-seq:
03578  *   ec.primitive_convert(source_buffer, destination_buffer) -> symbol
03579  *   ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
03580  *   ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
03581  *   ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
03582  *
03583  * possible opt elements:
03584  *   hash form:
03585  *     :partial_input => true           # source buffer may be part of larger source
03586  *     :after_output => true            # stop conversion after output before input
03587  *   integer form:
03588  *     Encoding::Converter::PARTIAL_INPUT
03589  *     Encoding::Converter::AFTER_OUTPUT
03590  *
03591  * possible results:
03592  *    :invalid_byte_sequence
03593  *    :incomplete_input
03594  *    :undefined_conversion
03595  *    :after_output
03596  *    :destination_buffer_full
03597  *    :source_buffer_empty
03598  *    :finished
03599  *
03600  * primitive_convert converts source_buffer into destination_buffer.
03601  *
03602  * source_buffer should be a string or nil.
03603  * nil means an empty string.
03604  *
03605  * destination_buffer should be a string.
03606  *
03607  * destination_byteoffset should be an integer or nil.
03608  * nil means the end of destination_buffer.
03609  * If it is omitted, nil is assumed.
03610  *
03611  * destination_bytesize should be an integer or nil.
03612  * nil means unlimited.
03613  * If it is omitted, nil is assumed.
03614  *
03615  * opt should be nil, a hash or an integer.
03616  * nil means no flags.
03617  * If it is omitted, nil is assumed.
03618  *
03619  * primitive_convert converts the content of source_buffer from the beginning
03620  * and stores the result into destination_buffer.
03621  *
03622  * destination_byteoffset and destination_bytesize specify the region which
03623  * the converted result is stored.
03624  * destination_byteoffset specifies the start position in destination_buffer in bytes.
03625  * If destination_byteoffset is nil,
03626  * destination_buffer.bytesize is used for appending the result.
03627  * destination_bytesize specifies maximum number of bytes.
03628  * If destination_bytesize is nil,
03629  * destination size is unlimited.
03630  * After conversion, destination_buffer is resized to
03631  * destination_byteoffset + actually produced number of bytes.
03632  * Also destination_buffer's encoding is set to destination_encoding.
03633  *
03634  * primitive_convert drops the converted part of source_buffer.
03635  * the dropped part is converted in destination_buffer or
03636  * buffered in Encoding::Converter object.
03637  *
03638  * primitive_convert stops conversion when one of the following conditions is met.
03639  * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
03640  * - unexpected end of source buffer (:incomplete_input)
03641  *   this occurs only when :partial_input is not specified.
03642  * - character not representable in output encoding (:undefined_conversion)
03643  * - after some output is generated, before input is done (:after_output)
03644  *   this occurs only when :after_output is specified.
03645  * - destination buffer is full (:destination_buffer_full)
03646  *   this occurs only when destination_bytesize is non-nil.
03647  * - source buffer is empty (:source_buffer_empty)
03648  *   this occurs only when :partial_input is specified.
03649  * - conversion is finished (:finished)
03650  *
03651  * example:
03652  *   ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
03653  *   ret = ec.primitive_convert(src="pi", dst="", nil, 100)
03654  *   p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
03655  *
03656  *   ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
03657  *   ret = ec.primitive_convert(src="pi", dst="", nil, 1)
03658  *   p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
03659  *   ret = ec.primitive_convert(src, dst="", nil, 1)
03660  *   p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
03661  *   ret = ec.primitive_convert(src, dst="", nil, 1)
03662  *   p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
03663  *   ret = ec.primitive_convert(src, dst="", nil, 1)
03664  *   p [ret, src, dst] #=> [:finished, "", "i"]
03665  *
03666  */
03667 static VALUE
03668 econv_primitive_convert(int argc, VALUE *argv, VALUE self)
03669 {
03670     VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
03671     rb_econv_t *ec = check_econv(self);
03672     rb_econv_result_t res;
03673     const unsigned char *ip, *is; /* read cursor into input, and the end of input */
03674     unsigned char *op, *os; /* write cursor into output, and the end of the writable region */
03675     long output_byteoffset, output_bytesize;
03676     unsigned long output_byteend;
03677     int flags;
03678 
03679     argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt); /* 2 required, 3 optional, plus an optional trailing hash */
03680 
03681     if (NIL_P(output_byteoffset_v))
03682         output_byteoffset = 0; /* placeholder; set from output's length after the retry label */
03683     else
03684         output_byteoffset = NUM2LONG(output_byteoffset_v);
03685 
03686     if (NIL_P(output_bytesize_v))
03687         output_bytesize = 0; /* placeholder; an initial size is chosen below */
03688     else
03689         output_bytesize = NUM2LONG(output_bytesize_v);
03690 
03691     if (!NIL_P(flags_v)) {
03692         if (!NIL_P(opt)) { /* integer flags and an option hash cannot both be given */
03693             rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..5)",
03694                 argc + 1);
03695         }
03696         flags = NUM2INT(rb_to_int(flags_v)); /* integer form of opt */
03697     }
03698     else if (!NIL_P(opt)) { /* hash form: each truthy entry sets the matching flag bit */
03699         VALUE v;
03700         flags = 0;
03701         v = rb_hash_aref(opt, sym_partial_input);
03702         if (RTEST(v))
03703             flags |= ECONV_PARTIAL_INPUT;
03704         v = rb_hash_aref(opt, sym_after_output);
03705         if (RTEST(v))
03706             flags |= ECONV_AFTER_OUTPUT;
03707     }
03708     else {
03709         flags = 0;
03710     }
03711 
03712     StringValue(output);
03713     if (!NIL_P(input))
03714         StringValue(input);
03715     rb_str_modify(output); /* output is written in place below */
03716 
03717     if (NIL_P(output_bytesize_v)) { /* no size given: start from the larger of RSTRING_EMBED_LEN_MAX and the input length */
03718         output_bytesize = RSTRING_EMBED_LEN_MAX;
03719         if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
03720             output_bytesize = RSTRING_LEN(input);
03721     }
03722 
03723   retry: /* re-entered with a doubled output_bytesize when an unlimited destination fills up */
03724 
03725     if (NIL_P(output_byteoffset_v))
03726         output_byteoffset = RSTRING_LEN(output); /* nil offset: append at the current end of output */
03727 
03728     if (output_byteoffset < 0)
03729         rb_raise(rb_eArgError, "negative output_byteoffset");
03730 
03731     if (RSTRING_LEN(output) < output_byteoffset)
03732         rb_raise(rb_eArgError, "output_byteoffset too big");
03733 
03734     if (output_bytesize < 0)
03735         rb_raise(rb_eArgError, "negative output_bytesize");
03736 
03737     output_byteend = (unsigned long)output_byteoffset + /* unsigned sum, so wraparound can be detected below */
03738                      (unsigned long)output_bytesize;
03739 
03740     if (output_byteend < (unsigned long)output_byteoffset ||
03741         LONG_MAX < output_byteend)
03742         rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
03743 
03744     if (rb_str_capacity(output) < output_byteend) /* grow output when its capacity is below the requested end */
03745         rb_str_resize(output, output_byteend);
03746 
03747     if (NIL_P(input)) {
03748         ip = is = NULL; /* nil input: no source bytes */
03749     }
03750     else {
03751         ip = (const unsigned char *)RSTRING_PTR(input);
03752         is = ip + RSTRING_LEN(input);
03753     }
03754 
03755     op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset; /* pointers are recomputed on every pass through retry */
03756     os = op + output_bytesize;
03757 
03758     res = rb_econv_convert(ec, &ip, is, &op, os, flags);
03759     rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output)); /* output now ends at the write cursor */
03760     if (!NIL_P(input))
03761         rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input)); /* remove everything before the read cursor from input */
03762 
03763     if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) { /* caller set no size limit: enlarge and continue */
03764         if (LONG_MAX / 2 < output_bytesize) /* guards the doubling below against long overflow */
03765             rb_raise(rb_eArgError, "too long conversion result");
03766         output_bytesize *= 2;
03767         output_byteoffset_v = Qnil; /* makes retry recompute the offset as output's current length */
03768         goto retry;
03769     }
03770 
03771     if (ec->destination_encoding) { /* tag output only when a destination encoding is set */
03772         rb_enc_associate(output, ec->destination_encoding);
03773     }
03774 
03775     return econv_result_to_symbol(res);
03776 }
03777 
03778 /*
03779  * call-seq:
03780  *   ec.convert(source_string) -> destination_string
03781  *
03782  * Convert source_string and return destination_string.
03783  *
03784  * source_string is assumed as a part of source.
03785  * i.e.  :partial_input=>true is specified internally.
03786  * finish method should be used last.
03787  *
03788  *   ec = Encoding::Converter.new("utf-8", "euc-jp")
03789  *   puts ec.convert("\u3042").dump     #=> "\xA4\xA2"
03790  *   puts ec.finish.dump                #=> ""
03791  *
03792  *   ec = Encoding::Converter.new("euc-jp", "utf-8")
03793  *   puts ec.convert("\xA4").dump       #=> ""
03794  *   puts ec.convert("\xA2").dump       #=> "\xE3\x81\x82"
03795  *   puts ec.finish.dump                #=> ""
03796  *
03797  *   ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
03798  *   puts ec.convert("\xE3").dump       #=> "".force_encoding("ISO-2022-JP")
03799  *   puts ec.convert("\x81").dump       #=> "".force_encoding("ISO-2022-JP")
03800  *   puts ec.convert("\x82").dump       #=> "\e$B$\"".force_encoding("ISO-2022-JP")
03801  *   puts ec.finish.dump                #=> "\e(B".force_encoding("ISO-2022-JP")
03802  *
03803  * If a conversion error occurs,
03804  * Encoding::UndefinedConversionError or
03805  * Encoding::InvalidByteSequenceError is raised.
03806  * Encoding::Converter#convert doesn't supply methods to recover or restart
03807  * from these exceptions.
03808  * When you want to handle these conversion errors,
03809  * use Encoding::Converter#primitive_convert.
03810  *
03811  */
03812 static VALUE
03813 econv_convert(VALUE self, VALUE source_string)
03814 {
03815     rb_econv_t *ec = check_econv(self);
03816     VALUE args[5];
03817     VALUE status, converted;
03818 
03819     StringValue(source_string);
03820     converted = rb_str_new(NULL, 0);
03821 
03822     /*
03823      * primitive_convert drops consumed bytes from its input, so it is given
03824      * a duplicate of the caller's string.  Offset and size are nil, and
03825      * ECONV_PARTIAL_INPUT is passed as the integer flags argument.
03826      */
03827     args[0] = rb_str_dup(source_string);
03828     args[1] = converted;
03829     args[2] = Qnil;
03830     args[3] = Qnil;
03831     args[4] = INT2NUM(ECONV_PARTIAL_INPUT);
03832     status = econv_primitive_convert(5, args, self);
03833 
03834     if (status != sym_source_buffer_empty) {
03835         /* Conversion errors are raised as the converter's own exception. */
03836         if (status == sym_invalid_byte_sequence ||
03837             status == sym_undefined_conversion ||
03838             status == sym_incomplete_input)
03839             rb_exc_raise(make_econv_exception(ec));
03840 
03841         /* :finished is rejected with an ArgumentError. */
03842         if (status == sym_finished)
03843             rb_raise(rb_eArgError, "converter already finished");
03844 
03845         rb_bug("unexpected result of econv_primitive_convert");
03846     }
03847 
03848     return converted;
03849 }
03850 
03851 /*
03852  * call-seq:
03853  *   ec.finish -> string
03854  *
03855  * Finishes the converter.
03856  * It returns the last part of the converted string.
03857  *
03858  *   ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
03859  *   p ec.convert("\u3042")     #=> "\e$B$\""
03860  *   p ec.finish                #=> "\e(B"
03861  */
03862 static VALUE
03863 econv_finish(VALUE self)
03864 {
03865     rb_econv_t *ec = check_econv(self);
03866     VALUE args[5];
03867     VALUE status, tail;
03868 
03869     tail = rb_str_new(NULL, 0);
03870 
03871     /*
03872      * One more primitive_convert call with nil input, nil offset/size and
03873      * integer flags 0 (so ECONV_PARTIAL_INPUT is not set), writing into `tail`.
03874      */
03875     args[0] = Qnil;
03876     args[1] = tail;
03877     args[2] = Qnil;
03878     args[3] = Qnil;
03879     args[4] = INT2NUM(0);
03880     status = econv_primitive_convert(5, args, self);
03881 
03882     if (status != sym_finished) {
03883         /* Conversion errors are raised as the converter's own exception. */
03884         if (status == sym_invalid_byte_sequence ||
03885             status == sym_undefined_conversion ||
03886             status == sym_incomplete_input)
03887             rb_exc_raise(make_econv_exception(ec));
03888 
03889         rb_bug("unexpected result of econv_primitive_convert");
03890     }
03891 
03892     return tail;
03893 }
03894 
03895 /*
03896  * call-seq:
03897  *   ec.primitive_errinfo -> array
03898  *
03899  * primitive_errinfo returns important information regarding the last error
03900  * as a 5-element array:
03901  *
03902  *   [result, enc1, enc2, error_bytes, readagain_bytes]
03903  *
03904  * result is the last result of primitive_convert.
03905  *
03906  * Other elements are only meaningful when result is
03907  * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
03908  *
03909  * enc1 and enc2 indicate a conversion step as a pair of strings.
03910  * For example, a converter from EUC-JP to ISO-8859-1 converts
03911  * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
03912  * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
03913  *
03914  * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
03915  * error_bytes is discarded portion.
03916  * readagain_bytes is buffered portion which is read again on next conversion.
03917  *
03918  * Example:
03919  *
03920  *   # \xff is invalid as EUC-JP.
03921  *   ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
03922  *   ec.primitive_convert(src="\xff", dst="", nil, 10)
03923  *   p ec.primitive_errinfo
03924  *   #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""]
03925  *
03926  *   # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
03927  *   # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
03928  *   # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
03929  *   ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
03930  *   ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
03931  *   p ec.primitive_errinfo
03932  *   #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
03933  *
03934  *   # partial character is invalid
03935  *   ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
03936  *   ec.primitive_convert(src="\xa4", dst="", nil, 10)
03937  *   p ec.primitive_errinfo
03938  *   #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
03939  *
03940  *   # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
03941  *   # partial characters.
03942  *   ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
03943  *   ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
03944  *   p ec.primitive_errinfo
03945  *   #=> [:source_buffer_empty, nil, nil, nil, nil]
03946  *
03947  *   # \xd8\x00\x00@ is invalid as UTF-16BE because
03948  *   # no low surrogate after high surrogate (\xd8\x00).
03949  *   # It is detected by 3rd byte (\00) which is part of next character.
03950  *   # So the high surrogate (\xd8\x00) is discarded and
03951  *   # the 3rd byte is read again later.
03952  *   # Since the byte is buffered in ec, it is dropped from src.
03953  *   ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
03954  *   ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
03955  *   p ec.primitive_errinfo
03956  *   #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
03957  *   p src
03958  *   #=> "@"
03959  *
03960  *   # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
03961  *   # The problem is detected by 4th byte.
03962  *   ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
03963  *   ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
03964  *   p ec.primitive_errinfo
03965  *   #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
03966  *   p src
03967  *   #=> ""
03968  *
03969  */
03970 static VALUE
03971 econv_primitive_errinfo(VALUE self)
03972 {
03973     rb_econv_t *ec = check_econv(self);
03974 
03975     VALUE ary;
03976 
03977     ary = rb_ary_new2(5);
03978 
03979     rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
03980     rb_ary_store(ary, 4, Qnil);
03981 
03982     if (ec->last_error.source_encoding)
03983         rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
03984 
03985     if (ec->last_error.destination_encoding)
03986         rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
03987 
03988     if (ec->last_error.error_bytes_start) {
03989         rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
03990         rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
03991     }
03992 
03993     return ary;
03994 }
03995 
03996 /*
03997  * call-seq:
03998  *   ec.insert_output(string) -> nil
03999  *
04000  * Inserts string into the encoding converter.
04001  * The string will be converted to the destination encoding and
04002  * output on later conversions.
04003  *
04004  * If the destination encoding is stateful,
04005  * string is converted according to the state and the state is updated.
04006  *
04007  * This method should be used only when a conversion error occurs.
04008  *
04009  *  ec = Encoding::Converter.new("utf-8", "iso-8859-1")
04010  *  src = "HIRAGANA LETTER A is \u{3042}."
04011  *  dst = ""
04012  *  p ec.primitive_convert(src, dst)    #=> :undefined_conversion
04013  *  puts "[#{dst.dump}, #{src.dump}]"   #=> ["HIRAGANA LETTER A is ", "."]
04014  *  ec.insert_output("<err>")
04015  *  p ec.primitive_convert(src, dst)    #=> :finished
04016  *  puts "[#{dst.dump}, #{src.dump}]"   #=> ["HIRAGANA LETTER A is <err>.", ""]
04017  *
04018  *  ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
04019  *  src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
04020  *  dst = ""
04021  *  p ec.primitive_convert(src, dst)    #=> :undefined_conversion
04022  *  puts "[#{dst.dump}, #{src.dump}]"   #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
04023  *  ec.insert_output "?"                # state change required to output "?".
04024  *  p ec.primitive_convert(src, dst)    #=> :finished
04025  *  puts "[#{dst.dump}, #{src.dump}]"   #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
04026  *
04027  */
04028 static VALUE
04029 econv_insert_output(VALUE self, VALUE string)
04030 {
04031     rb_econv_t *ec = check_econv(self);
04032     const char *enc_name;
04033     VALUE enc_obj;
04034 
04035     StringValue(string);
04036 
04037     /* Re-encode the argument into the encoding named by rb_econv_encoding_to_insert_output(). */
04038     enc_name = rb_econv_encoding_to_insert_output(ec);
04039     enc_obj = rb_enc_from_encoding(rb_enc_find(enc_name));
04040     string = rb_str_encode(string, enc_obj, 0, Qnil);
04041 
04042     /* A -1 result from rb_econv_insert_output() is reported as an ArgumentError. */
04043     if (rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string),
04044                                RSTRING_LEN(string), enc_name) == -1)
04045         rb_raise(rb_eArgError, "too big string");
04046     return Qnil;
04047 }
04048 
04049 /*
04050  * call-seq:
04051  *   ec.putback                    -> string
04052  *   ec.putback(max_numbytes)      -> string
04053  *
04054  * Put back the bytes which will be converted.
04055  *
04056  * The bytes are left over from an invalid_byte_sequence error.
04057  * When an invalid_byte_sequence error occurs, some bytes are discarded and
04058  * some bytes are buffered to be converted later.
04059  * The latter bytes can be put back.
04060  * It can be observed by
04061  * Encoding::InvalidByteSequenceError#readagain_bytes and
04062  * Encoding::Converter#primitive_errinfo.
04063  *
04064  *   ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
04065  *   src = "\x00\xd8\x61\x00"
04066  *   dst = ""
04067  *   p ec.primitive_convert(src, dst)   #=> :invalid_byte_sequence
04068  *   p ec.primitive_errinfo     #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
04069  *   p ec.putback               #=> "a\x00"
04070  *   p ec.putback               #=> ""          # no more bytes to put back
04071  *
04072  */
04073 static VALUE
04074 econv_putback(int argc, VALUE *argv, VALUE self)
04075 {
04076     rb_econv_t *ec = check_econv(self);
04077     VALUE limit, bytes;
04078     int count;
04079 
04080     rb_scan_args(argc, argv, "01", &limit);
04081 
04082     if (!NIL_P(limit)) {
04083         /* Explicit maximum: never ask for more than rb_econv_putbackable() reports. */
04084         int available;
04085         count = NUM2INT(limit);
04086         available = rb_econv_putbackable(ec);
04087         count = (available < count) ? available : count;
04088     }
04089     else {
04090         /* No maximum given: use the full putbackable amount. */
04091         count = rb_econv_putbackable(ec);
04092     }
04093 
04094     /* New string of `count` bytes whose buffer is handed to rb_econv_putback(). */
04095     bytes = rb_str_new(NULL, count);
04096     rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(bytes), count);
04097     if (ec->source_encoding != NULL)
04098         rb_enc_associate(bytes, ec->source_encoding);
04099     return bytes;
04100 }
04101 
04102 /*
04103  * call-seq:
04104  *   ec.last_error -> exception or nil
04105  *
04106  * Returns an exception object for the last conversion.
04107  * Returns nil if the last conversion did not produce an error.
04108  *
04109  * Here "error" means
04110  * Encoding::InvalidByteSequenceError or Encoding::UndefinedConversionError for
04111  * Encoding::Converter#convert, and
04112  * :invalid_byte_sequence, :incomplete_input or :undefined_conversion for
04113  * Encoding::Converter#primitive_convert.
04114  *
04115  *  ec = Encoding::Converter.new("utf-8", "iso-8859-1")
04116  *  p ec.primitive_convert(src="\xf1abcd", dst="")       #=> :invalid_byte_sequence
04117  *  p ec.last_error      #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
04118  *  p ec.primitive_convert(src, dst, nil, 1)             #=> :destination_buffer_full
04119  *  p ec.last_error      #=> nil
04120  *
04121  */
04122 static VALUE
04123 econv_last_error(VALUE self)
04124 {
04125     rb_econv_t *ec = check_econv(self);
04126     VALUE err = make_econv_exception(ec);
04127 
04128     /*
04129      * A nil from make_econv_exception() is returned as nil; anything else is returned as-is.
04130      */
04131     return NIL_P(err) ? Qnil : err;
04132 }
04133 
04134 /*
04135  * call-seq:
04136  *   ec.replacement -> string
04137  *
04138  * Returns the replacement string.
04139  *
04140  *  ec = Encoding::Converter.new("euc-jp", "us-ascii")
04141  *  p ec.replacement    #=> "?"
04142  *
04143  *  ec = Encoding::Converter.new("euc-jp", "utf-8")
04144  *  p ec.replacement    #=> "\uFFFD"
04145  */
04146 static VALUE
04147 econv_get_replacement(VALUE self)
04148 {
04149     rb_econv_t *ec = check_econv(self);
04150     rb_encoding *repl_enc;
04151 
04152     /* make_replacement() runs before the replacement fields are read; -1 is raised as an error. */
04153     if (make_replacement(ec) == -1)
04154         rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
04155 
04156     /* Build a new string from the stored bytes, using the encoding found by name. */
04157     repl_enc = rb_enc_find(ec->replacement_enc);
04158     return rb_enc_str_new((const char *)ec->replacement_str,
04159                           (long)ec->replacement_len, repl_enc);
04160 }
04161 
04162 /*
04163  * call-seq:
04164  *   ec.replacement = string
04165  *
04166  * Sets the replacement string.
04167  *
04168  *  ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
04169  *  ec.replacement = "<undef>"
04170  *  p ec.convert("a \u3042 b")      #=> "a <undef> b"
04171  */
04172 static VALUE
04173 econv_set_replacement(VALUE self, VALUE arg)
04174 {
04175     rb_econv_t *ec = check_econv(self);
04176     VALUE repl = arg;
04177     const char *enc_name;
04178     int status;
04179 
04180     /* StringValue() works on the local copy; `arg` itself is what gets returned. */
04181     StringValue(repl);
04182     enc_name = rb_enc_name(rb_enc_get(repl));
04183 
04184     /* Pass the string's bytes, length and encoding name to the converter. */
04185     status = rb_econv_set_replacement(ec, (const unsigned char *)RSTRING_PTR(repl),
04186                                       RSTRING_LEN(repl), enc_name);
04187     if (status == -1) {
04188         /* xxx: rb_eInvalidByteSequenceError? */
04189         rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
04190     }
04191 
04192     /* The setter returns the object it was given. */
04193     return arg;
04194 }
04195 
04196 VALUE rb_econv_make_exception(rb_econv_t *ec)
04197 {
04198     VALUE exc = make_econv_exception(ec); /* plain delegation; the result is returned untouched */
04199     return exc;
04200 }
04201 
04202 void
04203 rb_econv_check_error(rb_econv_t *ec)
04204 {
04205     VALUE pending = make_econv_exception(ec);
04206 
04207     /* Raise only when an exception object was produced; nil means there is nothing to do. */
04208     if (!NIL_P(pending)) {
04209         rb_exc_raise(pending);
04210     }
04211 }
04212 
04213 /*
04214  * call-seq:
04215  *   ecerr.source_encoding_name         -> string
04216  *
04217  * Returns the source encoding name as a string.
04218  */
04219 static VALUE ecerr_source_encoding_name(VALUE self)
04220 {
04221     const ID attr = rb_intern("source_encoding_name"); /* attribute read from the exception */
04222     return rb_attr_get(self, attr);
04223 }
04224 
04225 /*
04226  * call-seq:
04227  *   ecerr.source_encoding              -> encoding
04228  *
04229  * Returns the source encoding as an encoding object.
04230  *
04231  * Note that the result may not be equal to the source encoding of
04232  * the encoding converter if the conversion has multiple steps.
04233  *
04234  *  ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
04235  *  begin
04236  *    ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
04237  *  rescue Encoding::UndefinedConversionError
04238  *    p $!.source_encoding              #=> #<Encoding:UTF-8>
04239  *    p $!.destination_encoding         #=> #<Encoding:EUC-JP>
04240  *    p $!.source_encoding_name         #=> "UTF-8"
04241  *    p $!.destination_encoding_name    #=> "EUC-JP"
04242  *  end
04243  *
04244  */
04245 static VALUE ecerr_source_encoding(VALUE self)
04246 {
04247     const ID attr = rb_intern("source_encoding"); /* attribute read from the exception */
04248     return rb_attr_get(self, attr);
04249 }
04250 
04251 /*
04252  * call-seq:
04253  *   ecerr.destination_encoding_name         -> string
04254  *
04255  * Returns the destination encoding name as a string.
04256  */
04257 static VALUE ecerr_destination_encoding_name(VALUE self)
04258 {
04259     const ID attr = rb_intern("destination_encoding_name"); /* attribute read from the exception */
04260     return rb_attr_get(self, attr);
04261 }
04262 
04263 /*
04264  * call-seq:
04265  *   ecerr.destination_encoding         -> encoding
04266  *
04267  * Returns the destination encoding as an encoding object.
04268  */
04269 static VALUE ecerr_destination_encoding(VALUE self)
04270 {
04271     const ID attr = rb_intern("destination_encoding"); /* attribute read from the exception */
04272     return rb_attr_get(self, attr);
04273 }
04274 
04275 /*
04276  * call-seq:
04277  *   ecerr.error_char         -> string
04278  *
04279  * Returns the one-character string which caused Encoding::UndefinedConversionError.
04280  *
04281  *  ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
04282  *  begin
04283  *    ec.convert("\xa0")
04284  *  rescue Encoding::UndefinedConversionError
04285  *    puts $!.error_char.dump   #=> "\xC2\xA0"
04286  *    p $!.error_char.encoding  #=> #<Encoding:UTF-8>
04287  *  end
04288  *
04289  */
04290 static VALUE ecerr_error_char(VALUE self)
04291 {
04292     const ID attr = rb_intern("error_char"); /* attribute read from the exception */
04293     return rb_attr_get(self, attr);
04294 }
04295 
04296 /*
04297  * call-seq:
04298  *   ecerr.error_bytes         -> string
04299  *
04300  * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
04301  *
04302  *  ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
04303  *  begin
04304  *    ec.convert("abc\xA1\xFFdef")
04305  *  rescue Encoding::InvalidByteSequenceError
04306  *    p $!      #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
04307  *    puts $!.error_bytes.dump          #=> "\xA1"
04308  *    puts $!.readagain_bytes.dump      #=> "\xFF"
04309  *  end
04310  */
04311 static VALUE ecerr_error_bytes(VALUE self)
04312 {
04313     const ID attr = rb_intern("error_bytes"); /* attribute read from the exception */
04314     return rb_attr_get(self, attr);
04315 }
04316 
04317 /*
04318  * call-seq:
04319  *   ecerr.readagain_bytes         -> string
04320  *
04321  * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
04322  */
04323 static VALUE ecerr_readagain_bytes(VALUE self)
04324 {
04325     const ID attr = rb_intern("readagain_bytes"); /* attribute read from the exception */
04326     return rb_attr_get(self, attr);
04327 }
04328 
04329 /*
04330  * call-seq:
04331  *   ecerr.incomplete_input?         -> true or false
04332  *
04333  * Returns true if the invalid byte sequence error is caused by
04334  * premature end of string.
04335  *
04336  *  ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
04337  *
04338  *  begin
04339  *    ec.convert("abc\xA1z")
04340  *  rescue Encoding::InvalidByteSequenceError
04341  *    p $!      #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
04342  *    p $!.incomplete_input?    #=> false
04343  *  end
04344  *
04345  *  begin
04346  *    ec.convert("abc\xA1")
04347  *    ec.finish
04348  *  rescue Encoding::InvalidByteSequenceError
04349  *    p $!      #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
04350  *    p $!.incomplete_input?    #=> true
04351  *  end
04352  */
04353 static VALUE ecerr_incomplete_input(VALUE self)
04354 {
04355     const ID attr = rb_intern("incomplete_input"); /* attribute read from the exception */
04356     return rb_attr_get(self, attr);
04357 }
04358 
04359 /*
04360  *  Document-class: Encoding::UndefinedConversionError
04361  *
04362  *  Raised by Encoding and String methods when a transcoding operation
04363  *  fails.
04364  */
04365 
04366 /*
04367  *  Document-class: Encoding::InvalidByteSequenceError
04368  *
04369  *  Raised by Encoding and String methods when the string being
04370  *  transcoded contains a byte invalid for either the source or
04371  *  target encoding.
04372  */
04373 
04374 /*
04375  *  Document-class: Encoding::ConverterNotFoundError
04376  *
04377  *  Raised by transcoding methods when a named encoding does not
04378  *  correspond with a known converter.
04379  */
04380 
04381 void
04382 Init_transcode(void)
04383 {
04384     rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
04385     rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
04386     rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
04387 
04388     transcoder_table = st_init_strcasetable();
04389 
04390     sym_invalid = ID2SYM(rb_intern("invalid"));
04391     sym_undef = ID2SYM(rb_intern("undef"));
04392     sym_replace = ID2SYM(rb_intern("replace"));
04393     sym_fallback = ID2SYM(rb_intern("fallback"));
04394     sym_aref = ID2SYM(rb_intern("[]"));
04395     sym_xml = ID2SYM(rb_intern("xml"));
04396     sym_text = ID2SYM(rb_intern("text"));
04397     sym_attr = ID2SYM(rb_intern("attr"));
04398 
04399     sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence"));
04400     sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion"));
04401     sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full"));
04402     sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty"));
04403     sym_finished = ID2SYM(rb_intern("finished"));
04404     sym_after_output = ID2SYM(rb_intern("after_output"));
04405     sym_incomplete_input = ID2SYM(rb_intern("incomplete_input"));
04406     sym_universal_newline = ID2SYM(rb_intern("universal_newline"));
04407     sym_crlf_newline = ID2SYM(rb_intern("crlf_newline"));
04408     sym_cr_newline = ID2SYM(rb_intern("cr_newline"));
04409     sym_partial_input = ID2SYM(rb_intern("partial_input"));
04410 
04411 #ifdef ENABLE_ECONV_NEWLINE_OPTION
04412     sym_newline = ID2SYM(rb_intern("newline"));
04413     sym_universal = ID2SYM(rb_intern("universal"));
04414     sym_crlf = ID2SYM(rb_intern("crlf"));
04415     sym_cr = ID2SYM(rb_intern("cr"));
04416     sym_lf = ID2SYM(rb_intern("lf"));
04417 #endif
04418 
04419     rb_define_method(rb_cString, "encode", str_encode, -1);
04420     rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
04421 
04422     rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cData);
04423     rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
04424     rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
04425     rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
04426     rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
04427     rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
04428     rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
04429     rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
04430     rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
04431     rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
04432     rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
04433     rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
04434     rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
04435     rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
04436     rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
04437     rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
04438     rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
04439     rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
04440     rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
04441 
04442     rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
04443     rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
04444     rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
04445     rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
04446     rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
04447     rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
04448     rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
04449     rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
04450     rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
04451     rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
04452     rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
04453     rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
04454     rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
04455 
04456     rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
04457     rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
04458     rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
04459     rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
04460     rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
04461 
04462     rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
04463     rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
04464     rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
04465     rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
04466     rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
04467     rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
04468     rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
04469 
04470     Init_newline();
04471 }
04472