Ruby 1.9.3p327 (2012-11-10 revision 37606)
|
00001 /********************************************************************** 00002 00003 transcode.c - 00004 00005 $Author: naruse $ 00006 created at: Tue Oct 30 16:10:22 JST 2007 00007 00008 Copyright (C) 2007 Martin Duerst 00009 00010 **********************************************************************/ 00011 00012 #include "ruby/ruby.h" 00013 #include "ruby/encoding.h" 00014 #include "internal.h" 00015 #include "transcode_data.h" 00016 #include <ctype.h> 00017 00018 #define ENABLE_ECONV_NEWLINE_OPTION 1 00019 00020 /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */ 00021 VALUE rb_eUndefinedConversionError; 00022 VALUE rb_eInvalidByteSequenceError; 00023 VALUE rb_eConverterNotFoundError; 00024 00025 VALUE rb_cEncodingConverter; 00026 00027 static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback, sym_aref; 00028 static VALUE sym_xml, sym_text, sym_attr; 00029 static VALUE sym_universal_newline; 00030 static VALUE sym_crlf_newline; 00031 static VALUE sym_cr_newline; 00032 #ifdef ENABLE_ECONV_NEWLINE_OPTION 00033 static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf; 00034 #endif 00035 static VALUE sym_partial_input; 00036 00037 static VALUE sym_invalid_byte_sequence; 00038 static VALUE sym_undefined_conversion; 00039 static VALUE sym_destination_buffer_full; 00040 static VALUE sym_source_buffer_empty; 00041 static VALUE sym_finished; 00042 static VALUE sym_after_output; 00043 static VALUE sym_incomplete_input; 00044 00045 static unsigned char * 00046 allocate_converted_string(const char *sname, const char *dname, 00047 const unsigned char *str, size_t len, 00048 unsigned char *caller_dst_buf, size_t caller_dst_bufsize, 00049 size_t *dst_len_ptr); 00050 00051 /* dynamic structure, one per conversion (similar to iconv_t) */ 00052 /* may carry conversion state (e.g. 
for iso-2022-jp) */ 00053 typedef struct rb_transcoding { 00054 const rb_transcoder *transcoder; 00055 00056 int flags; 00057 00058 int resume_position; 00059 unsigned int next_table; 00060 VALUE next_info; 00061 unsigned char next_byte; 00062 unsigned int output_index; 00063 00064 ssize_t recognized_len; /* already interpreted */ 00065 ssize_t readagain_len; /* not yet interpreted */ 00066 union { 00067 unsigned char ary[8]; /* max_input <= sizeof(ary) */ 00068 unsigned char *ptr; /* length: max_input */ 00069 } readbuf; /* recognized_len + readagain_len used */ 00070 00071 ssize_t writebuf_off; 00072 ssize_t writebuf_len; 00073 union { 00074 unsigned char ary[8]; /* max_output <= sizeof(ary) */ 00075 unsigned char *ptr; /* length: max_output */ 00076 } writebuf; 00077 00078 union rb_transcoding_state_t { /* opaque data for stateful encoding */ 00079 void *ptr; 00080 char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)]; 00081 double dummy_for_alignment; 00082 } state; 00083 } rb_transcoding; 00084 #define TRANSCODING_READBUF(tc) \ 00085 ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \ 00086 (tc)->readbuf.ary : \ 00087 (tc)->readbuf.ptr) 00088 #define TRANSCODING_WRITEBUF(tc) \ 00089 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \ 00090 (tc)->writebuf.ary : \ 00091 (tc)->writebuf.ptr) 00092 #define TRANSCODING_WRITEBUF_SIZE(tc) \ 00093 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \ 00094 sizeof((tc)->writebuf.ary) : \ 00095 (size_t)(tc)->transcoder->max_output) 00096 #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t)) 00097 #define TRANSCODING_STATE(tc) \ 00098 ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? 
\ 00099 (tc)->state.ary : \ 00100 (tc)->state.ptr) 00101 00102 typedef struct { 00103 struct rb_transcoding *tc; 00104 unsigned char *out_buf_start; 00105 unsigned char *out_data_start; 00106 unsigned char *out_data_end; 00107 unsigned char *out_buf_end; 00108 rb_econv_result_t last_result; 00109 } rb_econv_elem_t; 00110 00111 struct rb_econv_t { 00112 int flags; 00113 const char *source_encoding_name; 00114 const char *destination_encoding_name; 00115 00116 int started; 00117 00118 const unsigned char *replacement_str; 00119 size_t replacement_len; 00120 const char *replacement_enc; 00121 int replacement_allocated; 00122 00123 unsigned char *in_buf_start; 00124 unsigned char *in_data_start; 00125 unsigned char *in_data_end; 00126 unsigned char *in_buf_end; 00127 rb_econv_elem_t *elems; 00128 int num_allocated; 00129 int num_trans; 00130 int num_finished; 00131 struct rb_transcoding *last_tc; 00132 00133 /* last error */ 00134 struct { 00135 rb_econv_result_t result; 00136 struct rb_transcoding *error_tc; 00137 const char *source_encoding; 00138 const char *destination_encoding; 00139 const unsigned char *error_bytes_start; 00140 size_t error_bytes_len; 00141 size_t readagain_len; 00142 } last_error; 00143 00144 /* The following fields are only for Encoding::Converter. 00145 * rb_econv_open set them NULL. 
*/ 00146 rb_encoding *source_encoding; 00147 rb_encoding *destination_encoding; 00148 }; 00149 00150 /* 00151 * Dispatch data and logic 00152 */ 00153 00154 #define DECORATOR_P(sname, dname) (*(sname) == '\0') 00155 00156 typedef struct { 00157 const char *sname; 00158 const char *dname; 00159 const char *lib; /* null means means no need to load a library */ 00160 const rb_transcoder *transcoder; 00161 } transcoder_entry_t; 00162 00163 static st_table *transcoder_table; 00164 00165 static transcoder_entry_t * 00166 make_transcoder_entry(const char *sname, const char *dname) 00167 { 00168 st_data_t val; 00169 st_table *table2; 00170 00171 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) { 00172 val = (st_data_t)st_init_strcasetable(); 00173 st_add_direct(transcoder_table, (st_data_t)sname, val); 00174 } 00175 table2 = (st_table *)val; 00176 if (!st_lookup(table2, (st_data_t)dname, &val)) { 00177 transcoder_entry_t *entry = ALLOC(transcoder_entry_t); 00178 entry->sname = sname; 00179 entry->dname = dname; 00180 entry->lib = NULL; 00181 entry->transcoder = NULL; 00182 val = (st_data_t)entry; 00183 st_add_direct(table2, (st_data_t)dname, val); 00184 } 00185 return (transcoder_entry_t *)val; 00186 } 00187 00188 static transcoder_entry_t * 00189 get_transcoder_entry(const char *sname, const char *dname) 00190 { 00191 st_data_t val; 00192 st_table *table2; 00193 00194 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) { 00195 return NULL; 00196 } 00197 table2 = (st_table *)val; 00198 if (!st_lookup(table2, (st_data_t)dname, &val)) { 00199 return NULL; 00200 } 00201 return (transcoder_entry_t *)val; 00202 } 00203 00204 void 00205 rb_register_transcoder(const rb_transcoder *tr) 00206 { 00207 const char *const sname = tr->src_encoding; 00208 const char *const dname = tr->dst_encoding; 00209 00210 transcoder_entry_t *entry; 00211 00212 entry = make_transcoder_entry(sname, dname); 00213 if (entry->transcoder) { 00214 rb_raise(rb_eArgError, "transcoder from %s 
to %s has been already registered", 00215 sname, dname); 00216 } 00217 00218 entry->transcoder = tr; 00219 } 00220 00221 static void 00222 declare_transcoder(const char *sname, const char *dname, const char *lib) 00223 { 00224 transcoder_entry_t *entry; 00225 00226 entry = make_transcoder_entry(sname, dname); 00227 entry->lib = lib; 00228 } 00229 00230 #define MAX_TRANSCODER_LIBNAME_LEN 64 00231 static const char transcoder_lib_prefix[] = "enc/trans/"; 00232 00233 void 00234 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib) 00235 { 00236 if (!lib || strlen(lib) > MAX_TRANSCODER_LIBNAME_LEN) { 00237 rb_raise(rb_eArgError, "invalid library name - %s", 00238 lib ? lib : "(null)"); 00239 } 00240 declare_transcoder(enc1, enc2, lib); 00241 } 00242 00243 #define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0) 00244 00245 typedef struct search_path_queue_tag { 00246 struct search_path_queue_tag *next; 00247 const char *enc; 00248 } search_path_queue_t; 00249 00250 typedef struct { 00251 st_table *visited; 00252 search_path_queue_t *queue; 00253 search_path_queue_t **queue_last_ptr; 00254 const char *base_enc; 00255 } search_path_bfs_t; 00256 00257 static int 00258 transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg) 00259 { 00260 const char *dname = (const char *)key; 00261 search_path_bfs_t *bfs = (search_path_bfs_t *)arg; 00262 search_path_queue_t *q; 00263 00264 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) { 00265 return ST_CONTINUE; 00266 } 00267 00268 q = ALLOC(search_path_queue_t); 00269 q->enc = dname; 00270 q->next = NULL; 00271 *bfs->queue_last_ptr = q; 00272 bfs->queue_last_ptr = &q->next; 00273 00274 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc); 00275 return ST_CONTINUE; 00276 } 00277 00278 static int 00279 transcode_search_path(const char *sname, const char *dname, 00280 void (*callback)(const char *sname, const char *dname, int depth, void *arg), 00281 void *arg) 00282 { 
00283 search_path_bfs_t bfs; 00284 search_path_queue_t *q; 00285 st_data_t val; 00286 st_table *table2; 00287 int found; 00288 int pathlen = -1; 00289 00290 if (encoding_equal(sname, dname)) 00291 return -1; 00292 00293 q = ALLOC(search_path_queue_t); 00294 q->enc = sname; 00295 q->next = NULL; 00296 bfs.queue_last_ptr = &q->next; 00297 bfs.queue = q; 00298 00299 bfs.visited = st_init_strcasetable(); 00300 st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL); 00301 00302 while (bfs.queue) { 00303 q = bfs.queue; 00304 bfs.queue = q->next; 00305 if (!bfs.queue) 00306 bfs.queue_last_ptr = &bfs.queue; 00307 00308 if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) { 00309 xfree(q); 00310 continue; 00311 } 00312 table2 = (st_table *)val; 00313 00314 if (st_lookup(table2, (st_data_t)dname, &val)) { 00315 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc); 00316 xfree(q); 00317 found = 1; 00318 goto cleanup; 00319 } 00320 00321 bfs.base_enc = q->enc; 00322 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs); 00323 bfs.base_enc = NULL; 00324 00325 xfree(q); 00326 } 00327 found = 0; 00328 00329 cleanup: 00330 while (bfs.queue) { 00331 q = bfs.queue; 00332 bfs.queue = q->next; 00333 xfree(q); 00334 } 00335 00336 if (found) { 00337 const char *enc = dname; 00338 int depth; 00339 pathlen = 0; 00340 while (1) { 00341 st_lookup(bfs.visited, (st_data_t)enc, &val); 00342 if (!val) 00343 break; 00344 pathlen++; 00345 enc = (const char *)val; 00346 } 00347 depth = pathlen; 00348 enc = dname; 00349 while (1) { 00350 st_lookup(bfs.visited, (st_data_t)enc, &val); 00351 if (!val) 00352 break; 00353 callback((const char *)val, enc, --depth, arg); 00354 enc = (const char *)val; 00355 } 00356 } 00357 00358 st_free_table(bfs.visited); 00359 00360 return pathlen; /* is -1 if not found */ 00361 } 00362 00363 static const rb_transcoder * 00364 load_transcoder_entry(transcoder_entry_t *entry) 00365 { 00366 if (entry->transcoder) 00367 return 
entry->transcoder; 00368 00369 if (entry->lib) { 00370 const char *lib = entry->lib; 00371 size_t len = strlen(lib); 00372 char path[sizeof(transcoder_lib_prefix) + MAX_TRANSCODER_LIBNAME_LEN]; 00373 VALUE fn; 00374 const int safe = rb_safe_level(); 00375 00376 entry->lib = NULL; 00377 00378 if (len > MAX_TRANSCODER_LIBNAME_LEN) 00379 return NULL; 00380 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1); 00381 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1); 00382 fn = rb_str_new2(path); 00383 FL_UNSET(fn, FL_TAINT|FL_UNTRUSTED); 00384 OBJ_FREEZE(fn); 00385 if (!rb_require_safe(fn, safe > 3 ? 3 : safe)) 00386 return NULL; 00387 } 00388 00389 if (entry->transcoder) 00390 return entry->transcoder; 00391 00392 return NULL; 00393 } 00394 00395 static const char* 00396 get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr) 00397 { 00398 if (encoding_equal(encname, "UTF-8")) { 00399 *len_ret = 3; 00400 *repl_encname_ptr = "UTF-8"; 00401 return "\xEF\xBF\xBD"; 00402 } 00403 else { 00404 *len_ret = 1; 00405 *repl_encname_ptr = "US-ASCII"; 00406 return "?"; 00407 } 00408 } 00409 00410 /* 00411 * Transcoding engine logic 00412 */ 00413 00414 static const unsigned char * 00415 transcode_char_start(rb_transcoding *tc, 00416 const unsigned char *in_start, 00417 const unsigned char *inchar_start, 00418 const unsigned char *in_p, 00419 size_t *char_len_ptr) 00420 { 00421 const unsigned char *ptr; 00422 if (inchar_start - in_start < tc->recognized_len) { 00423 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len, 00424 inchar_start, unsigned char, in_p - inchar_start); 00425 ptr = TRANSCODING_READBUF(tc); 00426 } 00427 else { 00428 ptr = inchar_start - tc->recognized_len; 00429 } 00430 *char_len_ptr = tc->recognized_len + (in_p - inchar_start); 00431 return ptr; 00432 } 00433 00434 static rb_econv_result_t 00435 transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, 00436 const 
unsigned char *in_stop, unsigned char *out_stop, 00437 rb_transcoding *tc, 00438 const int opt) 00439 { 00440 const rb_transcoder *tr = tc->transcoder; 00441 int unitlen = tr->input_unit_length; 00442 ssize_t readagain_len = 0; 00443 00444 const unsigned char *inchar_start; 00445 const unsigned char *in_p; 00446 00447 unsigned char *out_p; 00448 00449 in_p = inchar_start = *in_pos; 00450 00451 out_p = *out_pos; 00452 00453 #define SUSPEND(ret, num) \ 00454 do { \ 00455 tc->resume_position = (num); \ 00456 if (0 < in_p - inchar_start) \ 00457 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \ 00458 inchar_start, unsigned char, in_p - inchar_start); \ 00459 *in_pos = in_p; \ 00460 *out_pos = out_p; \ 00461 tc->recognized_len += in_p - inchar_start; \ 00462 if (readagain_len) { \ 00463 tc->recognized_len -= readagain_len; \ 00464 tc->readagain_len = readagain_len; \ 00465 } \ 00466 return (ret); \ 00467 resume_label ## num:; \ 00468 } while (0) 00469 #define SUSPEND_OBUF(num) \ 00470 do { \ 00471 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \ 00472 } while (0) 00473 00474 #define SUSPEND_AFTER_OUTPUT(num) \ 00475 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \ 00476 SUSPEND(econv_after_output, num); \ 00477 } 00478 00479 #define next_table (tc->next_table) 00480 #define next_info (tc->next_info) 00481 #define next_byte (tc->next_byte) 00482 #define writebuf_len (tc->writebuf_len) 00483 #define writebuf_off (tc->writebuf_off) 00484 00485 switch (tc->resume_position) { 00486 case 0: break; 00487 case 1: goto resume_label1; 00488 case 2: goto resume_label2; 00489 case 3: goto resume_label3; 00490 case 4: goto resume_label4; 00491 case 5: goto resume_label5; 00492 case 6: goto resume_label6; 00493 case 7: goto resume_label7; 00494 case 8: goto resume_label8; 00495 case 9: goto resume_label9; 00496 case 10: goto resume_label10; 00497 case 11: goto resume_label11; 00498 case 12: goto resume_label12; 00499 case 13: goto 
resume_label13; 00500 case 14: goto resume_label14; 00501 case 15: goto resume_label15; 00502 case 16: goto resume_label16; 00503 case 17: goto resume_label17; 00504 case 18: goto resume_label18; 00505 case 19: goto resume_label19; 00506 case 20: goto resume_label20; 00507 case 21: goto resume_label21; 00508 case 22: goto resume_label22; 00509 case 23: goto resume_label23; 00510 case 24: goto resume_label24; 00511 case 25: goto resume_label25; 00512 case 26: goto resume_label26; 00513 case 27: goto resume_label27; 00514 case 28: goto resume_label28; 00515 case 29: goto resume_label29; 00516 case 30: goto resume_label30; 00517 case 31: goto resume_label31; 00518 case 32: goto resume_label32; 00519 case 33: goto resume_label33; 00520 case 34: goto resume_label34; 00521 } 00522 00523 while (1) { 00524 inchar_start = in_p; 00525 tc->recognized_len = 0; 00526 next_table = tr->conv_tree_start; 00527 00528 SUSPEND_AFTER_OUTPUT(24); 00529 00530 if (in_stop <= in_p) { 00531 if (!(opt & ECONV_PARTIAL_INPUT)) 00532 break; 00533 SUSPEND(econv_source_buffer_empty, 7); 00534 continue; 00535 } 00536 00537 #define BYTE_ADDR(index) (tr->byte_array + (index)) 00538 #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index)) 00539 #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table))) 00540 #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table))) 00541 #define BL_MIN_BYTE (BL_BASE[0]) 00542 #define BL_MAX_BYTE (BL_BASE[1]) 00543 #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE]) 00544 #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))]) 00545 00546 next_byte = (unsigned char)*in_p++; 00547 follow_byte: 00548 if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte) 00549 next_info = INVALID; 00550 else { 00551 next_info = (VALUE)BL_ACTION(next_byte); 00552 } 00553 follow_info: 00554 switch (next_info & 0x1F) { 00555 case NOMAP: 00556 { 00557 const unsigned char *p = inchar_start; 00558 writebuf_off = 0; 00559 while (p < in_p) { 00560 
TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++; 00561 } 00562 writebuf_len = writebuf_off; 00563 writebuf_off = 0; 00564 while (writebuf_off < writebuf_len) { 00565 SUSPEND_OBUF(3); 00566 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; 00567 } 00568 } 00569 continue; 00570 case 0x00: case 0x04: case 0x08: case 0x0C: 00571 case 0x10: case 0x14: case 0x18: case 0x1C: 00572 SUSPEND_AFTER_OUTPUT(25); 00573 while (in_p >= in_stop) { 00574 if (!(opt & ECONV_PARTIAL_INPUT)) 00575 goto incomplete; 00576 SUSPEND(econv_source_buffer_empty, 5); 00577 } 00578 next_byte = (unsigned char)*in_p++; 00579 next_table = (unsigned int)next_info; 00580 goto follow_byte; 00581 case ZERObt: /* drop input */ 00582 continue; 00583 case ONEbt: 00584 SUSPEND_OBUF(9); *out_p++ = getBT1(next_info); 00585 continue; 00586 case TWObt: 00587 SUSPEND_OBUF(10); *out_p++ = getBT1(next_info); 00588 SUSPEND_OBUF(21); *out_p++ = getBT2(next_info); 00589 continue; 00590 case THREEbt: 00591 SUSPEND_OBUF(11); *out_p++ = getBT1(next_info); 00592 SUSPEND_OBUF(15); *out_p++ = getBT2(next_info); 00593 SUSPEND_OBUF(16); *out_p++ = getBT3(next_info); 00594 continue; 00595 case FOURbt: 00596 SUSPEND_OBUF(12); *out_p++ = getBT0(next_info); 00597 SUSPEND_OBUF(17); *out_p++ = getBT1(next_info); 00598 SUSPEND_OBUF(18); *out_p++ = getBT2(next_info); 00599 SUSPEND_OBUF(19); *out_p++ = getBT3(next_info); 00600 continue; 00601 case GB4bt: 00602 SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info); 00603 SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info); 00604 SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info); 00605 SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info); 00606 continue; 00607 case STR1: 00608 tc->output_index = 0; 00609 while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) { 00610 SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index]; 00611 tc->output_index++; 00612 } 00613 continue; 00614 case FUNii: 00615 next_info = 
(VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info); 00616 goto follow_info; 00617 case FUNsi: 00618 { 00619 const unsigned char *char_start; 00620 size_t char_len; 00621 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); 00622 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len); 00623 goto follow_info; 00624 } 00625 case FUNio: 00626 SUSPEND_OBUF(13); 00627 if (tr->max_output <= out_stop - out_p) 00628 out_p += tr->func_io(TRANSCODING_STATE(tc), 00629 next_info, out_p, out_stop - out_p); 00630 else { 00631 writebuf_len = tr->func_io(TRANSCODING_STATE(tc), 00632 next_info, 00633 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc)); 00634 writebuf_off = 0; 00635 while (writebuf_off < writebuf_len) { 00636 SUSPEND_OBUF(20); 00637 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; 00638 } 00639 } 00640 break; 00641 case FUNso: 00642 { 00643 const unsigned char *char_start; 00644 size_t char_len; 00645 SUSPEND_OBUF(14); 00646 if (tr->max_output <= out_stop - out_p) { 00647 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); 00648 out_p += tr->func_so(TRANSCODING_STATE(tc), 00649 char_start, (size_t)char_len, 00650 out_p, out_stop - out_p); 00651 } 00652 else { 00653 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); 00654 writebuf_len = tr->func_so(TRANSCODING_STATE(tc), 00655 char_start, (size_t)char_len, 00656 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc)); 00657 writebuf_off = 0; 00658 while (writebuf_off < writebuf_len) { 00659 SUSPEND_OBUF(22); 00660 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; 00661 } 00662 } 00663 break; 00664 } 00665 case FUNsio: 00666 { 00667 const unsigned char *char_start; 00668 size_t char_len; 00669 SUSPEND_OBUF(33); 00670 if (tr->max_output <= out_stop - out_p) { 00671 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); 00672 out_p += 
tr->func_sio(TRANSCODING_STATE(tc), 00673 char_start, (size_t)char_len, next_info, 00674 out_p, out_stop - out_p); 00675 } 00676 else { 00677 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); 00678 writebuf_len = tr->func_sio(TRANSCODING_STATE(tc), 00679 char_start, (size_t)char_len, next_info, 00680 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc)); 00681 writebuf_off = 0; 00682 while (writebuf_off < writebuf_len) { 00683 SUSPEND_OBUF(34); 00684 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; 00685 } 00686 } 00687 break; 00688 } 00689 case INVALID: 00690 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) { 00691 if (tc->recognized_len + (in_p - inchar_start) < unitlen) 00692 SUSPEND_AFTER_OUTPUT(26); 00693 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) { 00694 in_p = in_stop; 00695 SUSPEND(econv_source_buffer_empty, 8); 00696 } 00697 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) { 00698 in_p = in_stop; 00699 } 00700 else { 00701 in_p = inchar_start + (unitlen - tc->recognized_len); 00702 } 00703 } 00704 else { 00705 ssize_t invalid_len; /* including the last byte which causes invalid */ 00706 ssize_t discard_len; 00707 invalid_len = tc->recognized_len + (in_p - inchar_start); 00708 discard_len = ((invalid_len - 1) / unitlen) * unitlen; 00709 readagain_len = invalid_len - discard_len; 00710 } 00711 goto invalid; 00712 case UNDEF: 00713 goto undef; 00714 default: 00715 rb_raise(rb_eRuntimeError, "unknown transcoding instruction"); 00716 } 00717 continue; 00718 00719 invalid: 00720 SUSPEND(econv_invalid_byte_sequence, 1); 00721 continue; 00722 00723 incomplete: 00724 SUSPEND(econv_incomplete_input, 27); 00725 continue; 00726 00727 undef: 00728 SUSPEND(econv_undefined_conversion, 2); 00729 continue; 00730 } 00731 00732 /* cleanup */ 00733 if (tr->finish_func) { 00734 SUSPEND_OBUF(4); 00735 if (tr->max_output <= out_stop - out_p) { 00736 out_p += 
tr->finish_func(TRANSCODING_STATE(tc), 00737 out_p, out_stop - out_p); 00738 } 00739 else { 00740 writebuf_len = tr->finish_func(TRANSCODING_STATE(tc), 00741 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc)); 00742 writebuf_off = 0; 00743 while (writebuf_off < writebuf_len) { 00744 SUSPEND_OBUF(23); 00745 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; 00746 } 00747 } 00748 } 00749 while (1) 00750 SUSPEND(econv_finished, 6); 00751 #undef SUSPEND 00752 #undef next_table 00753 #undef next_info 00754 #undef next_byte 00755 #undef writebuf_len 00756 #undef writebuf_off 00757 } 00758 00759 static rb_econv_result_t 00760 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos, 00761 const unsigned char *in_stop, unsigned char *out_stop, 00762 rb_transcoding *tc, 00763 const int opt) 00764 { 00765 if (tc->readagain_len) { 00766 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len); 00767 const unsigned char *readagain_pos = readagain_buf; 00768 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len; 00769 rb_econv_result_t res; 00770 00771 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len, 00772 unsigned char, tc->readagain_len); 00773 tc->readagain_len = 0; 00774 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT); 00775 if (res != econv_source_buffer_empty) { 00776 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len, 00777 readagain_pos, unsigned char, readagain_stop - readagain_pos); 00778 tc->readagain_len += readagain_stop - readagain_pos; 00779 return res; 00780 } 00781 } 00782 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt); 00783 } 00784 00785 static rb_transcoding * 00786 rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags) 00787 { 00788 rb_transcoding *tc; 00789 00790 tc = ALLOC(rb_transcoding); 00791 tc->transcoder = tr; 00792 tc->flags = flags; 00793 if 
(TRANSCODING_STATE_EMBED_MAX < tr->state_size) 00794 tc->state.ptr = xmalloc(tr->state_size); 00795 if (tr->state_init_func) { 00796 (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */ 00797 } 00798 tc->resume_position = 0; 00799 tc->recognized_len = 0; 00800 tc->readagain_len = 0; 00801 tc->writebuf_len = 0; 00802 tc->writebuf_off = 0; 00803 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) { 00804 tc->readbuf.ptr = xmalloc(tr->max_input); 00805 } 00806 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) { 00807 tc->writebuf.ptr = xmalloc(tr->max_output); 00808 } 00809 return tc; 00810 } 00811 00812 static rb_econv_result_t 00813 rb_transcoding_convert(rb_transcoding *tc, 00814 const unsigned char **input_ptr, const unsigned char *input_stop, 00815 unsigned char **output_ptr, unsigned char *output_stop, 00816 int flags) 00817 { 00818 return transcode_restartable( 00819 input_ptr, output_ptr, 00820 input_stop, output_stop, 00821 tc, flags); 00822 } 00823 00824 static void 00825 rb_transcoding_close(rb_transcoding *tc) 00826 { 00827 const rb_transcoder *tr = tc->transcoder; 00828 if (tr->state_fini_func) { 00829 (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? 
*/ 00830 } 00831 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) 00832 xfree(tc->state.ptr); 00833 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) 00834 xfree(tc->readbuf.ptr); 00835 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) 00836 xfree(tc->writebuf.ptr); 00837 xfree(tc); 00838 } 00839 00840 static size_t 00841 rb_transcoding_memsize(rb_transcoding *tc) 00842 { 00843 size_t size = sizeof(rb_transcoding); 00844 const rb_transcoder *tr = tc->transcoder; 00845 00846 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) { 00847 size += tr->state_size; 00848 } 00849 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) { 00850 size += tr->max_input; 00851 } 00852 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) { 00853 size += tr->max_output; 00854 } 00855 return size; 00856 } 00857 00858 static rb_econv_t * 00859 rb_econv_alloc(int n_hint) 00860 { 00861 rb_econv_t *ec; 00862 00863 if (n_hint <= 0) 00864 n_hint = 1; 00865 00866 ec = ALLOC(rb_econv_t); 00867 ec->flags = 0; 00868 ec->source_encoding_name = NULL; 00869 ec->destination_encoding_name = NULL; 00870 ec->started = 0; 00871 ec->replacement_str = NULL; 00872 ec->replacement_len = 0; 00873 ec->replacement_enc = NULL; 00874 ec->replacement_allocated = 0; 00875 ec->in_buf_start = NULL; 00876 ec->in_data_start = NULL; 00877 ec->in_data_end = NULL; 00878 ec->in_buf_end = NULL; 00879 ec->num_allocated = n_hint; 00880 ec->num_trans = 0; 00881 ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated); 00882 ec->num_finished = 0; 00883 ec->last_tc = NULL; 00884 ec->last_error.result = econv_source_buffer_empty; 00885 ec->last_error.error_tc = NULL; 00886 ec->last_error.source_encoding = NULL; 00887 ec->last_error.destination_encoding = NULL; 00888 ec->last_error.error_bytes_start = NULL; 00889 ec->last_error.error_bytes_len = 0; 00890 ec->last_error.readagain_len = 0; 00891 ec->source_encoding = NULL; 00892 ec->destination_encoding = NULL; 00893 return ec; 00894 } 00895 00896 static int 00897 
rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i) 00898 { 00899 int n, j; 00900 int bufsize = 4096; 00901 unsigned char *p; 00902 00903 if (ec->num_trans == ec->num_allocated) { 00904 n = ec->num_allocated * 2; 00905 REALLOC_N(ec->elems, rb_econv_elem_t, n); 00906 ec->num_allocated = n; 00907 } 00908 00909 p = xmalloc(bufsize); 00910 00911 MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i); 00912 00913 ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0); 00914 ec->elems[i].out_buf_start = p; 00915 ec->elems[i].out_buf_end = p + bufsize; 00916 ec->elems[i].out_data_start = p; 00917 ec->elems[i].out_data_end = p; 00918 ec->elems[i].last_result = econv_source_buffer_empty; 00919 00920 ec->num_trans++; 00921 00922 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding)) 00923 for (j = ec->num_trans-1; i <= j; j--) { 00924 rb_transcoding *tc = ec->elems[j].tc; 00925 const rb_transcoder *tr2 = tc->transcoder; 00926 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) { 00927 ec->last_tc = tc; 00928 break; 00929 } 00930 } 00931 00932 return 0; 00933 } 00934 00935 static rb_econv_t * 00936 rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries) 00937 { 00938 rb_econv_t *ec; 00939 int i, ret; 00940 00941 for (i = 0; i < n; i++) { 00942 const rb_transcoder *tr; 00943 tr = load_transcoder_entry(entries[i]); 00944 if (!tr) 00945 return NULL; 00946 } 00947 00948 ec = rb_econv_alloc(n); 00949 00950 for (i = 0; i < n; i++) { 00951 const rb_transcoder *tr = load_transcoder_entry(entries[i]); 00952 ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans); 00953 if (ret == -1) { 00954 rb_econv_close(ec); 00955 return NULL; 00956 } 00957 } 00958 00959 return ec; 00960 } 00961 00962 struct trans_open_t { 00963 transcoder_entry_t **entries; 00964 int num_additional; 00965 }; 00966 00967 static void 00968 trans_open_i(const char *sname, const char *dname, int depth, void *arg) 00969 { 00970 struct trans_open_t *toarg = 
arg; 00971 00972 if (!toarg->entries) { 00973 toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional); 00974 } 00975 toarg->entries[depth] = get_transcoder_entry(sname, dname); 00976 } 00977 00978 static rb_econv_t * 00979 rb_econv_open0(const char *sname, const char *dname, int ecflags) 00980 { 00981 transcoder_entry_t **entries = NULL; 00982 int num_trans; 00983 rb_econv_t *ec; 00984 00985 rb_encoding *senc, *denc; 00986 int sidx, didx; 00987 00988 senc = NULL; 00989 if (*sname) { 00990 sidx = rb_enc_find_index(sname); 00991 if (0 <= sidx) { 00992 senc = rb_enc_from_index(sidx); 00993 } 00994 } 00995 00996 denc = NULL; 00997 if (*dname) { 00998 didx = rb_enc_find_index(dname); 00999 if (0 <= didx) { 01000 denc = rb_enc_from_index(didx); 01001 } 01002 } 01003 01004 if (*sname == '\0' && *dname == '\0') { 01005 num_trans = 0; 01006 entries = NULL; 01007 } 01008 else { 01009 struct trans_open_t toarg; 01010 toarg.entries = NULL; 01011 toarg.num_additional = 0; 01012 num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg); 01013 entries = toarg.entries; 01014 if (num_trans < 0) { 01015 xfree(entries); 01016 return NULL; 01017 } 01018 } 01019 01020 ec = rb_econv_open_by_transcoder_entries(num_trans, entries); 01021 xfree(entries); 01022 if (!ec) 01023 return NULL; 01024 01025 ec->flags = ecflags; 01026 ec->source_encoding_name = sname; 01027 ec->destination_encoding_name = dname; 01028 01029 return ec; 01030 } 01031 01032 #define MAX_ECFLAGS_DECORATORS 32 01033 01034 static int 01035 decorator_names(int ecflags, const char **decorators_ret) 01036 { 01037 int num_decorators; 01038 01039 switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) { 01040 case ECONV_UNIVERSAL_NEWLINE_DECORATOR: 01041 case ECONV_CRLF_NEWLINE_DECORATOR: 01042 case ECONV_CR_NEWLINE_DECORATOR: 01043 case 0: 01044 break; 01045 default: 01046 return -1; 01047 } 01048 01049 if ((ecflags & ECONV_XML_TEXT_DECORATOR) && 01050 (ecflags & 
ECONV_XML_ATTR_CONTENT_DECORATOR)) 01051 return -1; 01052 01053 num_decorators = 0; 01054 01055 if (ecflags & ECONV_XML_TEXT_DECORATOR) 01056 decorators_ret[num_decorators++] = "xml_text_escape"; 01057 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) 01058 decorators_ret[num_decorators++] = "xml_attr_content_escape"; 01059 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) 01060 decorators_ret[num_decorators++] = "xml_attr_quote"; 01061 01062 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) 01063 decorators_ret[num_decorators++] = "crlf_newline"; 01064 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) 01065 decorators_ret[num_decorators++] = "cr_newline"; 01066 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) 01067 decorators_ret[num_decorators++] = "universal_newline"; 01068 01069 return num_decorators; 01070 } 01071 01072 rb_econv_t * 01073 rb_econv_open(const char *sname, const char *dname, int ecflags) 01074 { 01075 rb_econv_t *ec; 01076 int num_decorators; 01077 const char *decorators[MAX_ECFLAGS_DECORATORS]; 01078 int i; 01079 01080 num_decorators = decorator_names(ecflags, decorators); 01081 if (num_decorators == -1) 01082 return NULL; 01083 01084 ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK); 01085 if (!ec) 01086 return NULL; 01087 01088 for (i = 0; i < num_decorators; i++) 01089 if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) { 01090 rb_econv_close(ec); 01091 return NULL; 01092 } 01093 01094 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK; 01095 01096 return ec; 01097 } 01098 01099 static int 01100 trans_sweep(rb_econv_t *ec, 01101 const unsigned char **input_ptr, const unsigned char *input_stop, 01102 unsigned char **output_ptr, unsigned char *output_stop, 01103 int flags, 01104 int start) 01105 { 01106 int try; 01107 int i, f; 01108 01109 const unsigned char **ipp, *is, *iold; 01110 unsigned char **opp, *os, *oold; 01111 rb_econv_result_t res; 01112 01113 try = 1; 01114 while (try) { 01115 try = 0; 01116 for (i = start; i < ec->num_trans; 
i++) { 01117 rb_econv_elem_t *te = &ec->elems[i]; 01118 01119 if (i == 0) { 01120 ipp = input_ptr; 01121 is = input_stop; 01122 } 01123 else { 01124 rb_econv_elem_t *prev_te = &ec->elems[i-1]; 01125 ipp = (const unsigned char **)&prev_te->out_data_start; 01126 is = prev_te->out_data_end; 01127 } 01128 01129 if (i == ec->num_trans-1) { 01130 opp = output_ptr; 01131 os = output_stop; 01132 } 01133 else { 01134 if (te->out_buf_start != te->out_data_start) { 01135 ssize_t len = te->out_data_end - te->out_data_start; 01136 ssize_t off = te->out_data_start - te->out_buf_start; 01137 MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len); 01138 te->out_data_start = te->out_buf_start; 01139 te->out_data_end -= off; 01140 } 01141 opp = &te->out_data_end; 01142 os = te->out_buf_end; 01143 } 01144 01145 f = flags; 01146 if (ec->num_finished != i) 01147 f |= ECONV_PARTIAL_INPUT; 01148 if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) { 01149 start = 1; 01150 flags &= ~ECONV_AFTER_OUTPUT; 01151 } 01152 if (i != 0) 01153 f &= ~ECONV_AFTER_OUTPUT; 01154 iold = *ipp; 01155 oold = *opp; 01156 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f); 01157 if (iold != *ipp || oold != *opp) 01158 try = 1; 01159 01160 switch (res) { 01161 case econv_invalid_byte_sequence: 01162 case econv_incomplete_input: 01163 case econv_undefined_conversion: 01164 case econv_after_output: 01165 return i; 01166 01167 case econv_destination_buffer_full: 01168 case econv_source_buffer_empty: 01169 break; 01170 01171 case econv_finished: 01172 ec->num_finished = i+1; 01173 break; 01174 } 01175 } 01176 } 01177 return -1; 01178 } 01179 01180 static rb_econv_result_t 01181 rb_trans_conv(rb_econv_t *ec, 01182 const unsigned char **input_ptr, const unsigned char *input_stop, 01183 unsigned char **output_ptr, unsigned char *output_stop, 01184 int flags, 01185 int *result_position_ptr) 01186 { 01187 int i; 01188 int needreport_index; 01189 int sweep_start; 01190 01191 unsigned char 
empty_buf; 01192 unsigned char *empty_ptr = &empty_buf; 01193 01194 if (!input_ptr) { 01195 input_ptr = (const unsigned char **)&empty_ptr; 01196 input_stop = empty_ptr; 01197 } 01198 01199 if (!output_ptr) { 01200 output_ptr = &empty_ptr; 01201 output_stop = empty_ptr; 01202 } 01203 01204 if (ec->elems[0].last_result == econv_after_output) 01205 ec->elems[0].last_result = econv_source_buffer_empty; 01206 01207 needreport_index = -1; 01208 for (i = ec->num_trans-1; 0 <= i; i--) { 01209 switch (ec->elems[i].last_result) { 01210 case econv_invalid_byte_sequence: 01211 case econv_incomplete_input: 01212 case econv_undefined_conversion: 01213 case econv_after_output: 01214 case econv_finished: 01215 sweep_start = i+1; 01216 needreport_index = i; 01217 goto found_needreport; 01218 01219 case econv_destination_buffer_full: 01220 case econv_source_buffer_empty: 01221 break; 01222 01223 default: 01224 rb_bug("unexpected transcode last result"); 01225 } 01226 } 01227 01228 /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. 
*/ 01229 01230 if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full && 01231 (flags & ECONV_AFTER_OUTPUT)) { 01232 rb_econv_result_t res; 01233 01234 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop, 01235 (flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, 01236 result_position_ptr); 01237 01238 if (res == econv_source_buffer_empty) 01239 return econv_after_output; 01240 return res; 01241 } 01242 01243 sweep_start = 0; 01244 01245 found_needreport: 01246 01247 do { 01248 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start); 01249 sweep_start = needreport_index + 1; 01250 } while (needreport_index != -1 && needreport_index != ec->num_trans-1); 01251 01252 for (i = ec->num_trans-1; 0 <= i; i--) { 01253 if (ec->elems[i].last_result != econv_source_buffer_empty) { 01254 rb_econv_result_t res = ec->elems[i].last_result; 01255 if (res == econv_invalid_byte_sequence || 01256 res == econv_incomplete_input || 01257 res == econv_undefined_conversion || 01258 res == econv_after_output) { 01259 ec->elems[i].last_result = econv_source_buffer_empty; 01260 } 01261 if (result_position_ptr) 01262 *result_position_ptr = i; 01263 return res; 01264 } 01265 } 01266 if (result_position_ptr) 01267 *result_position_ptr = -1; 01268 return econv_source_buffer_empty; 01269 } 01270 01271 static rb_econv_result_t 01272 rb_econv_convert0(rb_econv_t *ec, 01273 const unsigned char **input_ptr, const unsigned char *input_stop, 01274 unsigned char **output_ptr, unsigned char *output_stop, 01275 int flags) 01276 { 01277 rb_econv_result_t res; 01278 int result_position; 01279 int has_output = 0; 01280 01281 memset(&ec->last_error, 0, sizeof(ec->last_error)); 01282 01283 if (ec->num_trans == 0) { 01284 size_t len; 01285 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) { 01286 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) { 01287 len = output_stop - *output_ptr; 01288 
memcpy(*output_ptr, ec->in_data_start, len); 01289 *output_ptr = output_stop; 01290 ec->in_data_start += len; 01291 res = econv_destination_buffer_full; 01292 goto gotresult; 01293 } 01294 len = ec->in_data_end - ec->in_data_start; 01295 memcpy(*output_ptr, ec->in_data_start, len); 01296 *output_ptr += len; 01297 ec->in_data_start = ec->in_data_end = ec->in_buf_start; 01298 if (flags & ECONV_AFTER_OUTPUT) { 01299 res = econv_after_output; 01300 goto gotresult; 01301 } 01302 } 01303 if (output_stop - *output_ptr < input_stop - *input_ptr) { 01304 len = output_stop - *output_ptr; 01305 } 01306 else { 01307 len = input_stop - *input_ptr; 01308 } 01309 if (0 < len && (flags & ECONV_AFTER_OUTPUT)) { 01310 *(*output_ptr)++ = *(*input_ptr)++; 01311 res = econv_after_output; 01312 goto gotresult; 01313 } 01314 memcpy(*output_ptr, *input_ptr, len); 01315 *output_ptr += len; 01316 *input_ptr += len; 01317 if (*input_ptr != input_stop) 01318 res = econv_destination_buffer_full; 01319 else if (flags & ECONV_PARTIAL_INPUT) 01320 res = econv_source_buffer_empty; 01321 else 01322 res = econv_finished; 01323 goto gotresult; 01324 } 01325 01326 if (ec->elems[ec->num_trans-1].out_data_start) { 01327 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start; 01328 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end; 01329 if (data_start != data_end) { 01330 size_t len; 01331 if (output_stop - *output_ptr < data_end - data_start) { 01332 len = output_stop - *output_ptr; 01333 memcpy(*output_ptr, data_start, len); 01334 *output_ptr = output_stop; 01335 ec->elems[ec->num_trans-1].out_data_start += len; 01336 res = econv_destination_buffer_full; 01337 goto gotresult; 01338 } 01339 len = data_end - data_start; 01340 memcpy(*output_ptr, data_start, len); 01341 *output_ptr += len; 01342 ec->elems[ec->num_trans-1].out_data_start = 01343 ec->elems[ec->num_trans-1].out_data_end = 01344 ec->elems[ec->num_trans-1].out_buf_start; 01345 has_output = 1; 01346 } 01347 } 
01348 01349 if (ec->in_buf_start && 01350 ec->in_data_start != ec->in_data_end) { 01351 res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop, 01352 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position); 01353 if (res != econv_source_buffer_empty) 01354 goto gotresult; 01355 } 01356 01357 if (has_output && 01358 (flags & ECONV_AFTER_OUTPUT) && 01359 *input_ptr != input_stop) { 01360 input_stop = *input_ptr; 01361 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position); 01362 if (res == econv_source_buffer_empty) 01363 res = econv_after_output; 01364 } 01365 else if ((flags & ECONV_AFTER_OUTPUT) || 01366 ec->num_trans == 1) { 01367 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position); 01368 } 01369 else { 01370 flags |= ECONV_AFTER_OUTPUT; 01371 do { 01372 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position); 01373 } while (res == econv_after_output); 01374 } 01375 01376 gotresult: 01377 ec->last_error.result = res; 01378 if (res == econv_invalid_byte_sequence || 01379 res == econv_incomplete_input || 01380 res == econv_undefined_conversion) { 01381 rb_transcoding *error_tc = ec->elems[result_position].tc; 01382 ec->last_error.error_tc = error_tc; 01383 ec->last_error.source_encoding = error_tc->transcoder->src_encoding; 01384 ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding; 01385 ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc); 01386 ec->last_error.error_bytes_len = error_tc->recognized_len; 01387 ec->last_error.readagain_len = error_tc->readagain_len; 01388 } 01389 01390 return res; 01391 } 01392 01393 static int output_replacement_character(rb_econv_t *ec); 01394 01395 static int 01396 output_hex_charref(rb_econv_t *ec) 01397 { 01398 int ret; 01399 unsigned char utfbuf[1024]; 01400 const unsigned char *utf; 01401 size_t utf_len; 
01402 int utf_allocated = 0; 01403 char charef_buf[16]; 01404 const unsigned char *p; 01405 01406 if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) { 01407 utf = ec->last_error.error_bytes_start; 01408 utf_len = ec->last_error.error_bytes_len; 01409 } 01410 else { 01411 utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE", 01412 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len, 01413 utfbuf, sizeof(utfbuf), 01414 &utf_len); 01415 if (!utf) 01416 return -1; 01417 if (utf != utfbuf && utf != ec->last_error.error_bytes_start) 01418 utf_allocated = 1; 01419 } 01420 01421 if (utf_len % 4 != 0) 01422 goto fail; 01423 01424 p = utf; 01425 while (4 <= utf_len) { 01426 unsigned int u = 0; 01427 u += p[0] << 24; 01428 u += p[1] << 16; 01429 u += p[2] << 8; 01430 u += p[3]; 01431 snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u); 01432 01433 ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII"); 01434 if (ret == -1) 01435 goto fail; 01436 01437 p += 4; 01438 utf_len -= 4; 01439 } 01440 01441 if (utf_allocated) 01442 xfree((void *)utf); 01443 return 0; 01444 01445 fail: 01446 if (utf_allocated) 01447 xfree((void *)utf); 01448 return -1; 01449 } 01450 01451 rb_econv_result_t 01452 rb_econv_convert(rb_econv_t *ec, 01453 const unsigned char **input_ptr, const unsigned char *input_stop, 01454 unsigned char **output_ptr, unsigned char *output_stop, 01455 int flags) 01456 { 01457 rb_econv_result_t ret; 01458 01459 unsigned char empty_buf; 01460 unsigned char *empty_ptr = &empty_buf; 01461 01462 ec->started = 1; 01463 01464 if (!input_ptr) { 01465 input_ptr = (const unsigned char **)&empty_ptr; 01466 input_stop = empty_ptr; 01467 } 01468 01469 if (!output_ptr) { 01470 output_ptr = &empty_ptr; 01471 output_stop = empty_ptr; 01472 } 01473 01474 resume: 01475 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags); 01476 01477 if (ret == econv_invalid_byte_sequence 
|| 01478 ret == econv_incomplete_input) { 01479 /* deal with invalid byte sequence */ 01480 /* todo: add more alternative behaviors */ 01481 switch (ec->flags & ECONV_INVALID_MASK) { 01482 case ECONV_INVALID_REPLACE: 01483 if (output_replacement_character(ec) == 0) 01484 goto resume; 01485 } 01486 } 01487 01488 if (ret == econv_undefined_conversion) { 01489 /* valid character in source encoding 01490 * but no related character(s) in destination encoding */ 01491 /* todo: add more alternative behaviors */ 01492 switch (ec->flags & ECONV_UNDEF_MASK) { 01493 case ECONV_UNDEF_REPLACE: 01494 if (output_replacement_character(ec) == 0) 01495 goto resume; 01496 break; 01497 01498 case ECONV_UNDEF_HEX_CHARREF: 01499 if (output_hex_charref(ec) == 0) 01500 goto resume; 01501 break; 01502 } 01503 } 01504 01505 return ret; 01506 } 01507 01508 const char * 01509 rb_econv_encoding_to_insert_output(rb_econv_t *ec) 01510 { 01511 rb_transcoding *tc = ec->last_tc; 01512 const rb_transcoder *tr; 01513 01514 if (tc == NULL) 01515 return ""; 01516 01517 tr = tc->transcoder; 01518 01519 if (tr->asciicompat_type == asciicompat_encoder) 01520 return tr->src_encoding; 01521 return tr->dst_encoding; 01522 } 01523 01524 static unsigned char * 01525 allocate_converted_string(const char *sname, const char *dname, 01526 const unsigned char *str, size_t len, 01527 unsigned char *caller_dst_buf, size_t caller_dst_bufsize, 01528 size_t *dst_len_ptr) 01529 { 01530 unsigned char *dst_str; 01531 size_t dst_len; 01532 size_t dst_bufsize; 01533 01534 rb_econv_t *ec; 01535 rb_econv_result_t res; 01536 01537 const unsigned char *sp; 01538 unsigned char *dp; 01539 01540 if (caller_dst_buf) 01541 dst_bufsize = caller_dst_bufsize; 01542 else if (len == 0) 01543 dst_bufsize = 1; 01544 else 01545 dst_bufsize = len; 01546 01547 ec = rb_econv_open(sname, dname, 0); 01548 if (ec == NULL) 01549 return NULL; 01550 if (caller_dst_buf) 01551 dst_str = caller_dst_buf; 01552 else 01553 dst_str = xmalloc(dst_bufsize); 
01554 dst_len = 0; 01555 sp = str; 01556 dp = dst_str+dst_len; 01557 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0); 01558 dst_len = dp - dst_str; 01559 while (res == econv_destination_buffer_full) { 01560 if (SIZE_MAX/2 < dst_bufsize) { 01561 goto fail; 01562 } 01563 dst_bufsize *= 2; 01564 if (dst_str == caller_dst_buf) { 01565 unsigned char *tmp; 01566 tmp = xmalloc(dst_bufsize); 01567 memcpy(tmp, dst_str, dst_bufsize/2); 01568 dst_str = tmp; 01569 } 01570 else { 01571 dst_str = xrealloc(dst_str, dst_bufsize); 01572 } 01573 dp = dst_str+dst_len; 01574 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0); 01575 dst_len = dp - dst_str; 01576 } 01577 if (res != econv_finished) { 01578 goto fail; 01579 } 01580 rb_econv_close(ec); 01581 *dst_len_ptr = dst_len; 01582 return dst_str; 01583 01584 fail: 01585 if (dst_str != caller_dst_buf) 01586 xfree(dst_str); 01587 rb_econv_close(ec); 01588 return NULL; 01589 } 01590 01591 /* result: 0:success -1:failure */ 01592 int 01593 rb_econv_insert_output(rb_econv_t *ec, 01594 const unsigned char *str, size_t len, const char *str_encoding) 01595 { 01596 const char *insert_encoding = rb_econv_encoding_to_insert_output(ec); 01597 unsigned char insert_buf[4096]; 01598 const unsigned char *insert_str = NULL; 01599 size_t insert_len; 01600 01601 int last_trans_index; 01602 rb_transcoding *tc; 01603 01604 unsigned char **buf_start_p; 01605 unsigned char **data_start_p; 01606 unsigned char **data_end_p; 01607 unsigned char **buf_end_p; 01608 01609 size_t need; 01610 01611 ec->started = 1; 01612 01613 if (len == 0) 01614 return 0; 01615 01616 if (encoding_equal(insert_encoding, str_encoding)) { 01617 insert_str = str; 01618 insert_len = len; 01619 } 01620 else { 01621 insert_str = allocate_converted_string(str_encoding, insert_encoding, 01622 str, len, insert_buf, sizeof(insert_buf), &insert_len); 01623 if (insert_str == NULL) 01624 return -1; 01625 } 01626 01627 need = insert_len; 01628 01629 
last_trans_index = ec->num_trans-1; 01630 if (ec->num_trans == 0) { 01631 tc = NULL; 01632 buf_start_p = &ec->in_buf_start; 01633 data_start_p = &ec->in_data_start; 01634 data_end_p = &ec->in_data_end; 01635 buf_end_p = &ec->in_buf_end; 01636 } 01637 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) { 01638 tc = ec->elems[last_trans_index].tc; 01639 need += tc->readagain_len; 01640 if (need < insert_len) 01641 goto fail; 01642 if (last_trans_index == 0) { 01643 buf_start_p = &ec->in_buf_start; 01644 data_start_p = &ec->in_data_start; 01645 data_end_p = &ec->in_data_end; 01646 buf_end_p = &ec->in_buf_end; 01647 } 01648 else { 01649 rb_econv_elem_t *ee = &ec->elems[last_trans_index-1]; 01650 buf_start_p = &ee->out_buf_start; 01651 data_start_p = &ee->out_data_start; 01652 data_end_p = &ee->out_data_end; 01653 buf_end_p = &ee->out_buf_end; 01654 } 01655 } 01656 else { 01657 rb_econv_elem_t *ee = &ec->elems[last_trans_index]; 01658 buf_start_p = &ee->out_buf_start; 01659 data_start_p = &ee->out_data_start; 01660 data_end_p = &ee->out_data_end; 01661 buf_end_p = &ee->out_buf_end; 01662 tc = ec->elems[last_trans_index].tc; 01663 } 01664 01665 if (*buf_start_p == NULL) { 01666 unsigned char *buf = xmalloc(need); 01667 *buf_start_p = buf; 01668 *data_start_p = buf; 01669 *data_end_p = buf; 01670 *buf_end_p = buf+need; 01671 } 01672 else if ((size_t)(*buf_end_p - *data_end_p) < need) { 01673 MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p); 01674 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p); 01675 *data_start_p = *buf_start_p; 01676 if ((size_t)(*buf_end_p - *data_end_p) < need) { 01677 unsigned char *buf; 01678 size_t s = (*data_end_p - *buf_start_p) + need; 01679 if (s < need) 01680 goto fail; 01681 buf = xrealloc(*buf_start_p, s); 01682 *data_start_p = buf; 01683 *data_end_p = buf + (*data_end_p - *buf_start_p); 01684 *buf_start_p = buf; 01685 *buf_end_p = buf + s; 01686 } 01687 } 
01688 01689 memcpy(*data_end_p, insert_str, insert_len); 01690 *data_end_p += insert_len; 01691 if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) { 01692 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len); 01693 *data_end_p += tc->readagain_len; 01694 tc->readagain_len = 0; 01695 } 01696 01697 if (insert_str != str && insert_str != insert_buf) 01698 xfree((void*)insert_str); 01699 return 0; 01700 01701 fail: 01702 if (insert_str != str && insert_str != insert_buf) 01703 xfree((void*)insert_str); 01704 return -1; 01705 } 01706 01707 void 01708 rb_econv_close(rb_econv_t *ec) 01709 { 01710 int i; 01711 01712 if (ec->replacement_allocated) { 01713 xfree((void *)ec->replacement_str); 01714 } 01715 for (i = 0; i < ec->num_trans; i++) { 01716 rb_transcoding_close(ec->elems[i].tc); 01717 if (ec->elems[i].out_buf_start) 01718 xfree(ec->elems[i].out_buf_start); 01719 } 01720 xfree(ec->in_buf_start); 01721 xfree(ec->elems); 01722 xfree(ec); 01723 } 01724 01725 size_t 01726 rb_econv_memsize(rb_econv_t *ec) 01727 { 01728 size_t size = sizeof(rb_econv_t); 01729 int i; 01730 01731 if (ec->replacement_allocated) { 01732 size += ec->replacement_len; 01733 } 01734 for (i = 0; i < ec->num_trans; i++) { 01735 size += rb_transcoding_memsize(ec->elems[i].tc); 01736 01737 if (ec->elems[i].out_buf_start) { 01738 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start; 01739 } 01740 } 01741 size += ec->in_buf_end - ec->in_buf_start; 01742 size += sizeof(rb_econv_elem_t) * ec->num_allocated; 01743 01744 return size; 01745 } 01746 01747 int 01748 rb_econv_putbackable(rb_econv_t *ec) 01749 { 01750 if (ec->num_trans == 0) 01751 return 0; 01752 #if SIZEOF_SIZE_T > SIZEOF_INT 01753 if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX; 01754 #endif 01755 return (int)ec->elems[0].tc->readagain_len; 01756 } 01757 01758 void 01759 rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n) 01760 { 01761 rb_transcoding *tc; 01762 if 
(ec->num_trans == 0 || n == 0) 01763 return; 01764 tc = ec->elems[0].tc; 01765 memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n); 01766 tc->readagain_len -= n; 01767 } 01768 01769 struct asciicompat_encoding_t { 01770 const char *ascii_compat_name; 01771 const char *ascii_incompat_name; 01772 }; 01773 01774 static int 01775 asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg) 01776 { 01777 struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg; 01778 transcoder_entry_t *entry = (transcoder_entry_t *)val; 01779 const rb_transcoder *tr; 01780 01781 if (DECORATOR_P(entry->sname, entry->dname)) 01782 return ST_CONTINUE; 01783 tr = load_transcoder_entry(entry); 01784 if (tr && tr->asciicompat_type == asciicompat_decoder) { 01785 data->ascii_compat_name = tr->dst_encoding; 01786 return ST_STOP; 01787 } 01788 return ST_CONTINUE; 01789 } 01790 01791 const char * 01792 rb_econv_asciicompat_encoding(const char *ascii_incompat_name) 01793 { 01794 st_data_t v; 01795 st_table *table2; 01796 struct asciicompat_encoding_t data; 01797 01798 if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v)) 01799 return NULL; 01800 table2 = (st_table *)v; 01801 01802 /* 01803 * Assumption: 01804 * There is at most one transcoder for 01805 * converting from ASCII incompatible encoding. 01806 * 01807 * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others. 
01808 */ 01809 if (table2->num_entries != 1) 01810 return NULL; 01811 01812 data.ascii_incompat_name = ascii_incompat_name; 01813 data.ascii_compat_name = NULL; 01814 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data); 01815 return data.ascii_compat_name; 01816 } 01817 01818 VALUE 01819 rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags) 01820 { 01821 unsigned const char *ss, *sp, *se; 01822 unsigned char *ds, *dp, *de; 01823 rb_econv_result_t res; 01824 int max_output; 01825 01826 if (NIL_P(dst)) { 01827 dst = rb_str_buf_new(len); 01828 if (ec->destination_encoding) 01829 rb_enc_associate(dst, ec->destination_encoding); 01830 } 01831 01832 if (ec->last_tc) 01833 max_output = ec->last_tc->transcoder->max_output; 01834 else 01835 max_output = 1; 01836 01837 res = econv_destination_buffer_full; 01838 while (res == econv_destination_buffer_full) { 01839 long dlen = RSTRING_LEN(dst); 01840 if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) { 01841 unsigned long new_capa = (unsigned long)dlen + len + max_output; 01842 if (LONG_MAX < new_capa) 01843 rb_raise(rb_eArgError, "too long string"); 01844 rb_str_resize(dst, new_capa); 01845 rb_str_set_len(dst, dlen); 01846 } 01847 ss = sp = (const unsigned char *)RSTRING_PTR(src) + off; 01848 se = ss + len; 01849 ds = (unsigned char *)RSTRING_PTR(dst); 01850 de = ds + rb_str_capacity(dst); 01851 dp = ds += dlen; 01852 res = rb_econv_convert(ec, &sp, se, &dp, de, flags); 01853 off += sp - ss; 01854 len -= sp - ss; 01855 rb_str_set_len(dst, dlen + (dp - ds)); 01856 rb_econv_check_error(ec); 01857 } 01858 01859 return dst; 01860 } 01861 01862 VALUE 01863 rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags) 01864 { 01865 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags); 01866 } 01867 01868 VALUE 01869 rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags) 01870 { 01871 return 
rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags); 01872 } 01873 01874 VALUE 01875 rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags) 01876 { 01877 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags); 01878 } 01879 01880 static int 01881 rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n) 01882 { 01883 transcoder_entry_t *entry; 01884 const rb_transcoder *tr; 01885 01886 if (ec->started != 0) 01887 return -1; 01888 01889 entry = get_transcoder_entry(sname, dname); 01890 if (!entry) 01891 return -1; 01892 01893 tr = load_transcoder_entry(entry); 01894 if (!tr) return -1; 01895 01896 return rb_econv_add_transcoder_at(ec, tr, n); 01897 } 01898 01899 static int 01900 rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n) 01901 { 01902 return rb_econv_add_converter(ec, "", decorator_name, n); 01903 } 01904 01905 int 01906 rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name) 01907 { 01908 const rb_transcoder *tr; 01909 01910 if (ec->num_trans == 0) 01911 return rb_econv_decorate_at(ec, decorator_name, 0); 01912 01913 tr = ec->elems[0].tc->transcoder; 01914 01915 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) && 01916 tr->asciicompat_type == asciicompat_decoder) 01917 return rb_econv_decorate_at(ec, decorator_name, 1); 01918 01919 return rb_econv_decorate_at(ec, decorator_name, 0); 01920 } 01921 01922 int 01923 rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name) 01924 { 01925 const rb_transcoder *tr; 01926 01927 if (ec->num_trans == 0) 01928 return rb_econv_decorate_at(ec, decorator_name, 0); 01929 01930 tr = ec->elems[ec->num_trans-1].tc->transcoder; 01931 01932 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) && 01933 tr->asciicompat_type == asciicompat_encoder) 01934 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1); 01935 01936 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans); 01937 } 01938 01939 void 
01940 rb_econv_binmode(rb_econv_t *ec) 01941 { 01942 const rb_transcoder *trs[3]; 01943 int n, i, j; 01944 transcoder_entry_t *entry; 01945 int num_trans; 01946 01947 n = 0; 01948 if (ec->flags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) { 01949 entry = get_transcoder_entry("", "universal_newline"); 01950 if (entry->transcoder) 01951 trs[n++] = entry->transcoder; 01952 } 01953 if (ec->flags & ECONV_CRLF_NEWLINE_DECORATOR) { 01954 entry = get_transcoder_entry("", "crlf_newline"); 01955 if (entry->transcoder) 01956 trs[n++] = entry->transcoder; 01957 } 01958 if (ec->flags & ECONV_CR_NEWLINE_DECORATOR) { 01959 entry = get_transcoder_entry("", "cr_newline"); 01960 if (entry->transcoder) 01961 trs[n++] = entry->transcoder; 01962 } 01963 01964 num_trans = ec->num_trans; 01965 j = 0; 01966 for (i = 0; i < num_trans; i++) { 01967 int k; 01968 for (k = 0; k < n; k++) 01969 if (trs[k] == ec->elems[i].tc->transcoder) 01970 break; 01971 if (k == n) { 01972 ec->elems[j] = ec->elems[i]; 01973 j++; 01974 } 01975 else { 01976 rb_transcoding_close(ec->elems[i].tc); 01977 xfree(ec->elems[i].out_buf_start); 01978 ec->num_trans--; 01979 } 01980 } 01981 01982 ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK; 01983 01984 } 01985 01986 static VALUE 01987 econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg) 01988 { 01989 int has_description = 0; 01990 01991 if (NIL_P(mesg)) 01992 mesg = rb_str_new(NULL, 0); 01993 01994 if (*sname != '\0' || *dname != '\0') { 01995 if (*sname == '\0') 01996 rb_str_cat2(mesg, dname); 01997 else if (*dname == '\0') 01998 rb_str_cat2(mesg, sname); 01999 else 02000 rb_str_catf(mesg, "%s to %s", sname, dname); 02001 has_description = 1; 02002 } 02003 02004 if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK| 02005 ECONV_XML_TEXT_DECORATOR| 02006 ECONV_XML_ATTR_CONTENT_DECORATOR| 02007 ECONV_XML_ATTR_QUOTE_DECORATOR)) { 02008 const char *pre = ""; 02009 if (has_description) 02010 rb_str_cat2(mesg, " with "); 02011 if (ecflags & 
ECONV_UNIVERSAL_NEWLINE_DECORATOR) { 02012 rb_str_cat2(mesg, pre); pre = ","; 02013 rb_str_cat2(mesg, "universal_newline"); 02014 } 02015 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) { 02016 rb_str_cat2(mesg, pre); pre = ","; 02017 rb_str_cat2(mesg, "crlf_newline"); 02018 } 02019 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) { 02020 rb_str_cat2(mesg, pre); pre = ","; 02021 rb_str_cat2(mesg, "cr_newline"); 02022 } 02023 if (ecflags & ECONV_XML_TEXT_DECORATOR) { 02024 rb_str_cat2(mesg, pre); pre = ","; 02025 rb_str_cat2(mesg, "xml_text"); 02026 } 02027 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) { 02028 rb_str_cat2(mesg, pre); pre = ","; 02029 rb_str_cat2(mesg, "xml_attr_content"); 02030 } 02031 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) { 02032 rb_str_cat2(mesg, pre); pre = ","; 02033 rb_str_cat2(mesg, "xml_attr_quote"); 02034 } 02035 has_description = 1; 02036 } 02037 if (!has_description) { 02038 rb_str_cat2(mesg, "no-conversion"); 02039 } 02040 02041 return mesg; 02042 } 02043 02044 VALUE 02045 rb_econv_open_exc(const char *sname, const char *dname, int ecflags) 02046 { 02047 VALUE mesg, exc; 02048 mesg = rb_str_new_cstr("code converter not found ("); 02049 econv_description(sname, dname, ecflags, mesg); 02050 rb_str_cat2(mesg, ")"); 02051 exc = rb_exc_new3(rb_eConverterNotFoundError, mesg); 02052 return exc; 02053 } 02054 02055 static VALUE 02056 make_econv_exception(rb_econv_t *ec) 02057 { 02058 VALUE mesg, exc; 02059 if (ec->last_error.result == econv_invalid_byte_sequence || 02060 ec->last_error.result == econv_incomplete_input) { 02061 const char *err = (const char *)ec->last_error.error_bytes_start; 02062 size_t error_len = ec->last_error.error_bytes_len; 02063 VALUE bytes = rb_str_new(err, error_len); 02064 VALUE dumped = rb_str_dump(bytes); 02065 size_t readagain_len = ec->last_error.readagain_len; 02066 VALUE bytes2 = Qnil; 02067 VALUE dumped2; 02068 int idx; 02069 if (ec->last_error.result == econv_incomplete_input) { 02070 mesg = 
rb_sprintf("incomplete %s on %s", 02071 StringValueCStr(dumped), 02072 ec->last_error.source_encoding); 02073 } 02074 else if (readagain_len) { 02075 bytes2 = rb_str_new(err+error_len, readagain_len); 02076 dumped2 = rb_str_dump(bytes2); 02077 mesg = rb_sprintf("%s followed by %s on %s", 02078 StringValueCStr(dumped), 02079 StringValueCStr(dumped2), 02080 ec->last_error.source_encoding); 02081 } 02082 else { 02083 mesg = rb_sprintf("%s on %s", 02084 StringValueCStr(dumped), 02085 ec->last_error.source_encoding); 02086 } 02087 02088 exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg); 02089 rb_ivar_set(exc, rb_intern("error_bytes"), bytes); 02090 rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2); 02091 rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse); 02092 02093 set_encs: 02094 rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding)); 02095 rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding)); 02096 idx = rb_enc_find_index(ec->last_error.source_encoding); 02097 if (0 <= idx) 02098 rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx))); 02099 idx = rb_enc_find_index(ec->last_error.destination_encoding); 02100 if (0 <= idx) 02101 rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx))); 02102 return exc; 02103 } 02104 if (ec->last_error.result == econv_undefined_conversion) { 02105 VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start, 02106 ec->last_error.error_bytes_len); 02107 VALUE dumped = Qnil; 02108 int idx; 02109 if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) { 02110 rb_encoding *utf8 = rb_utf8_encoding(); 02111 const char *start, *end; 02112 int n; 02113 start = (const char *)ec->last_error.error_bytes_start; 02114 end = start + ec->last_error.error_bytes_len; 02115 n = 
rb_enc_precise_mbclen(start, end, utf8); 02116 if (MBCLEN_CHARFOUND_P(n) && 02117 (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) { 02118 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8); 02119 dumped = rb_sprintf("U+%04X", cc); 02120 } 02121 } 02122 if (dumped == Qnil) 02123 dumped = rb_str_dump(bytes); 02124 if (strcmp(ec->last_error.source_encoding, 02125 ec->source_encoding_name) == 0 && 02126 strcmp(ec->last_error.destination_encoding, 02127 ec->destination_encoding_name) == 0) { 02128 mesg = rb_sprintf("%s from %s to %s", 02129 StringValueCStr(dumped), 02130 ec->last_error.source_encoding, 02131 ec->last_error.destination_encoding); 02132 } 02133 else { 02134 int i; 02135 mesg = rb_sprintf("%s to %s in conversion from %s", 02136 StringValueCStr(dumped), 02137 ec->last_error.destination_encoding, 02138 ec->source_encoding_name); 02139 for (i = 0; i < ec->num_trans; i++) { 02140 const rb_transcoder *tr = ec->elems[i].tc->transcoder; 02141 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding)) 02142 rb_str_catf(mesg, " to %s", 02143 ec->elems[i].tc->transcoder->dst_encoding); 02144 } 02145 } 02146 exc = rb_exc_new3(rb_eUndefinedConversionError, mesg); 02147 idx = rb_enc_find_index(ec->last_error.source_encoding); 02148 if (0 <= idx) 02149 rb_enc_associate_index(bytes, idx); 02150 rb_ivar_set(exc, rb_intern("error_char"), bytes); 02151 goto set_encs; 02152 } 02153 return Qnil; 02154 } 02155 02156 static void 02157 more_output_buffer( 02158 VALUE destination, 02159 unsigned char *(*resize_destination)(VALUE, size_t, size_t), 02160 int max_output, 02161 unsigned char **out_start_ptr, 02162 unsigned char **out_pos, 02163 unsigned char **out_stop_ptr) 02164 { 02165 size_t len = (*out_pos - *out_start_ptr); 02166 size_t new_len = (len + max_output) * 2; 02167 *out_start_ptr = resize_destination(destination, len, new_len); 02168 *out_pos = *out_start_ptr + len; 02169 *out_stop_ptr = *out_start_ptr + new_len; 02170 } 02171 02172 static int 
02173 make_replacement(rb_econv_t *ec) 02174 { 02175 rb_transcoding *tc; 02176 const rb_transcoder *tr; 02177 rb_encoding *enc; 02178 const unsigned char *replacement; 02179 const char *repl_enc; 02180 const char *ins_enc; 02181 size_t len; 02182 02183 if (ec->replacement_str) 02184 return 0; 02185 02186 ins_enc = rb_econv_encoding_to_insert_output(ec); 02187 02188 tc = ec->last_tc; 02189 if (*ins_enc) { 02190 tr = tc->transcoder; 02191 enc = rb_enc_find(tr->dst_encoding); 02192 replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc); 02193 } 02194 else { 02195 replacement = (unsigned char *)"?"; 02196 len = 1; 02197 repl_enc = ""; 02198 } 02199 02200 ec->replacement_str = replacement; 02201 ec->replacement_len = len; 02202 ec->replacement_enc = repl_enc; 02203 ec->replacement_allocated = 0; 02204 return 0; 02205 } 02206 02207 int 02208 rb_econv_set_replacement(rb_econv_t *ec, 02209 const unsigned char *str, size_t len, const char *encname) 02210 { 02211 unsigned char *str2; 02212 size_t len2; 02213 const char *encname2; 02214 02215 encname2 = rb_econv_encoding_to_insert_output(ec); 02216 02217 if (encoding_equal(encname, encname2)) { 02218 str2 = xmalloc(len); 02219 MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */ 02220 len2 = len; 02221 encname2 = encname; 02222 } 02223 else { 02224 str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2); 02225 if (!str2) 02226 return -1; 02227 } 02228 02229 if (ec->replacement_allocated) { 02230 xfree((void *)ec->replacement_str); 02231 } 02232 ec->replacement_allocated = 1; 02233 ec->replacement_str = str2; 02234 ec->replacement_len = len2; 02235 ec->replacement_enc = encname2; 02236 return 0; 02237 } 02238 02239 static int 02240 output_replacement_character(rb_econv_t *ec) 02241 { 02242 int ret; 02243 02244 if (make_replacement(ec) == -1) 02245 return -1; 02246 02247 ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, 
ec->replacement_enc); 02248 if (ret == -1) 02249 return -1; 02250 02251 return 0; 02252 } 02253 02254 #if 1 02255 #define hash_fallback rb_hash_aref 02256 02257 static VALUE 02258 proc_fallback(VALUE fallback, VALUE c) 02259 { 02260 return rb_proc_call(fallback, rb_ary_new4(1, &c)); 02261 } 02262 02263 static VALUE 02264 method_fallback(VALUE fallback, VALUE c) 02265 { 02266 return rb_method_call(1, &c, fallback); 02267 } 02268 02269 static VALUE 02270 aref_fallback(VALUE fallback, VALUE c) 02271 { 02272 return rb_funcall3(fallback, sym_aref, 1, &c); 02273 } 02274 02275 static void 02276 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, 02277 const unsigned char *in_stop, unsigned char *out_stop, 02278 VALUE destination, 02279 unsigned char *(*resize_destination)(VALUE, size_t, size_t), 02280 const char *src_encoding, 02281 const char *dst_encoding, 02282 int ecflags, 02283 VALUE ecopts) 02284 { 02285 rb_econv_t *ec; 02286 rb_transcoding *last_tc; 02287 rb_econv_result_t ret; 02288 unsigned char *out_start = *out_pos; 02289 int max_output; 02290 VALUE exc; 02291 VALUE fallback = Qnil; 02292 VALUE (*fallback_func)(VALUE, VALUE) = 0; 02293 02294 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts); 02295 if (!ec) 02296 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags)); 02297 02298 if (!NIL_P(ecopts) && TYPE(ecopts) == T_HASH) { 02299 fallback = rb_hash_aref(ecopts, sym_fallback); 02300 if (RB_TYPE_P(fallback, T_HASH)) { 02301 fallback_func = hash_fallback; 02302 } 02303 else if (rb_obj_is_proc(fallback)) { 02304 fallback_func = proc_fallback; 02305 } 02306 else if (rb_obj_is_method(fallback)) { 02307 fallback_func = method_fallback; 02308 } 02309 else { 02310 fallback_func = aref_fallback; 02311 } 02312 } 02313 last_tc = ec->last_tc; 02314 max_output = last_tc ? 
last_tc->transcoder->max_output : 1; 02315 02316 resume: 02317 ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0); 02318 02319 if (!NIL_P(fallback) && ret == econv_undefined_conversion) { 02320 VALUE rep = rb_enc_str_new( 02321 (const char *)ec->last_error.error_bytes_start, 02322 ec->last_error.error_bytes_len, 02323 rb_enc_find(ec->last_error.source_encoding)); 02324 rep = (*fallback_func)(fallback, rep); 02325 if (rep != Qundef && !NIL_P(rep)) { 02326 StringValue(rep); 02327 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep), 02328 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep))); 02329 if ((int)ret == -1) { 02330 rb_raise(rb_eArgError, "too big fallback string"); 02331 } 02332 goto resume; 02333 } 02334 } 02335 02336 if (ret == econv_invalid_byte_sequence || 02337 ret == econv_incomplete_input || 02338 ret == econv_undefined_conversion) { 02339 exc = make_econv_exception(ec); 02340 rb_econv_close(ec); 02341 rb_exc_raise(exc); 02342 } 02343 02344 if (ret == econv_destination_buffer_full) { 02345 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop); 02346 goto resume; 02347 } 02348 02349 rb_econv_close(ec); 02350 return; 02351 } 02352 #else 02353 /* sample transcode_loop implementation in byte-by-byte stream style */ 02354 static void 02355 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, 02356 const unsigned char *in_stop, unsigned char *out_stop, 02357 VALUE destination, 02358 unsigned char *(*resize_destination)(VALUE, size_t, size_t), 02359 const char *src_encoding, 02360 const char *dst_encoding, 02361 int ecflags, 02362 VALUE ecopts) 02363 { 02364 rb_econv_t *ec; 02365 rb_transcoding *last_tc; 02366 rb_econv_result_t ret; 02367 unsigned char *out_start = *out_pos; 02368 const unsigned char *ptr; 02369 int max_output; 02370 VALUE exc; 02371 02372 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts); 02373 if (!ec) 02374 
rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags)); 02375 02376 last_tc = ec->last_tc; 02377 max_output = last_tc ? last_tc->transcoder->max_output : 1; 02378 02379 ret = econv_source_buffer_empty; 02380 ptr = *in_pos; 02381 while (ret != econv_finished) { 02382 unsigned char input_byte; 02383 const unsigned char *p = &input_byte; 02384 02385 if (ret == econv_source_buffer_empty) { 02386 if (ptr < in_stop) { 02387 input_byte = *ptr; 02388 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT); 02389 } 02390 else { 02391 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0); 02392 } 02393 } 02394 else { 02395 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT); 02396 } 02397 if (&input_byte != p) 02398 ptr += p - &input_byte; 02399 switch (ret) { 02400 case econv_invalid_byte_sequence: 02401 case econv_incomplete_input: 02402 case econv_undefined_conversion: 02403 exc = make_econv_exception(ec); 02404 rb_econv_close(ec); 02405 rb_exc_raise(exc); 02406 break; 02407 02408 case econv_destination_buffer_full: 02409 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop); 02410 break; 02411 02412 case econv_source_buffer_empty: 02413 break; 02414 02415 case econv_finished: 02416 break; 02417 } 02418 } 02419 rb_econv_close(ec); 02420 *in_pos = in_stop; 02421 return; 02422 } 02423 #endif 02424 02425 02426 /* 02427 * String-specific code 02428 */ 02429 02430 static unsigned char * 02431 str_transcoding_resize(VALUE destination, size_t len, size_t new_len) 02432 { 02433 rb_str_resize(destination, new_len); 02434 return (unsigned char *)RSTRING_PTR(destination); 02435 } 02436 02437 static int 02438 econv_opts(VALUE opt, int ecflags) 02439 { 02440 VALUE v; 02441 02442 v = rb_hash_aref(opt, sym_invalid); 02443 if (NIL_P(v)) { 02444 } 02445 else if (v==sym_replace) { 02446 ecflags |= ECONV_INVALID_REPLACE; 02447 } 02448 else { 02449 rb_raise(rb_eArgError, "unknown 
value for invalid character option"); 02450 } 02451 02452 v = rb_hash_aref(opt, sym_undef); 02453 if (NIL_P(v)) { 02454 } 02455 else if (v==sym_replace) { 02456 ecflags |= ECONV_UNDEF_REPLACE; 02457 } 02458 else { 02459 rb_raise(rb_eArgError, "unknown value for undefined character option"); 02460 } 02461 02462 v = rb_hash_aref(opt, sym_replace); 02463 if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) { 02464 ecflags |= ECONV_UNDEF_REPLACE; 02465 } 02466 02467 v = rb_hash_aref(opt, sym_xml); 02468 if (!NIL_P(v)) { 02469 if (v==sym_text) { 02470 ecflags |= ECONV_XML_TEXT_DECORATOR|ECONV_UNDEF_HEX_CHARREF; 02471 } 02472 else if (v==sym_attr) { 02473 ecflags |= ECONV_XML_ATTR_CONTENT_DECORATOR|ECONV_XML_ATTR_QUOTE_DECORATOR|ECONV_UNDEF_HEX_CHARREF; 02474 } 02475 else if (TYPE(v) == T_SYMBOL) { 02476 rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v))); 02477 } 02478 else { 02479 rb_raise(rb_eArgError, "unexpected value for xml option"); 02480 } 02481 } 02482 02483 #ifdef ENABLE_ECONV_NEWLINE_OPTION 02484 v = rb_hash_aref(opt, sym_newline); 02485 if (!NIL_P(v)) { 02486 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK; 02487 if (v == sym_universal) { 02488 ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR; 02489 } 02490 else if (v == sym_crlf) { 02491 ecflags |= ECONV_CRLF_NEWLINE_DECORATOR; 02492 } 02493 else if (v == sym_cr) { 02494 ecflags |= ECONV_CR_NEWLINE_DECORATOR; 02495 } 02496 else if (v == sym_lf) { 02497 /* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */ 02498 } 02499 else if (SYMBOL_P(v)) { 02500 rb_raise(rb_eArgError, "unexpected value for newline option: %s", 02501 rb_id2name(SYM2ID(v))); 02502 } 02503 else { 02504 rb_raise(rb_eArgError, "unexpected value for newline option"); 02505 } 02506 } 02507 else 02508 #endif 02509 { 02510 int setflags = 0, newlineflag = 0; 02511 02512 v = rb_hash_aref(opt, sym_universal_newline); 02513 if (RTEST(v)) 02514 setflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR; 02515 newlineflag |= !NIL_P(v); 02516 02517 v = 
rb_hash_aref(opt, sym_crlf_newline); 02518 if (RTEST(v)) 02519 setflags |= ECONV_CRLF_NEWLINE_DECORATOR; 02520 newlineflag |= !NIL_P(v); 02521 02522 v = rb_hash_aref(opt, sym_cr_newline); 02523 if (RTEST(v)) 02524 setflags |= ECONV_CR_NEWLINE_DECORATOR; 02525 newlineflag |= !NIL_P(v); 02526 02527 if (newlineflag) { 02528 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK; 02529 ecflags |= setflags; 02530 } 02531 } 02532 02533 return ecflags; 02534 } 02535 02536 int 02537 rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags) 02538 { 02539 VALUE newhash = Qnil; 02540 VALUE v; 02541 02542 if (NIL_P(opthash)) { 02543 *opts = Qnil; 02544 return ecflags; 02545 } 02546 ecflags = econv_opts(opthash, ecflags); 02547 02548 v = rb_hash_aref(opthash, sym_replace); 02549 if (!NIL_P(v)) { 02550 StringValue(v); 02551 if (rb_enc_str_coderange(v) == ENC_CODERANGE_BROKEN) { 02552 VALUE dumped = rb_str_dump(v); 02553 rb_raise(rb_eArgError, "replacement string is broken: %s as %s", 02554 StringValueCStr(dumped), 02555 rb_enc_name(rb_enc_get(v))); 02556 } 02557 v = rb_str_new_frozen(v); 02558 newhash = rb_hash_new(); 02559 rb_hash_aset(newhash, sym_replace, v); 02560 } 02561 02562 v = rb_hash_aref(opthash, sym_fallback); 02563 if (!NIL_P(v)) { 02564 VALUE h = rb_check_hash_type(v); 02565 if (NIL_P(h) 02566 ? 
(rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, sym_aref)) 02567 : (v = h, 1)) { 02568 if (NIL_P(newhash)) 02569 newhash = rb_hash_new(); 02570 rb_hash_aset(newhash, sym_fallback, v); 02571 } 02572 } 02573 02574 if (!NIL_P(newhash)) 02575 rb_hash_freeze(newhash); 02576 *opts = newhash; 02577 02578 return ecflags; 02579 } 02580 02581 int 02582 rb_econv_prepare_opts(VALUE opthash, VALUE *opts) 02583 { 02584 return rb_econv_prepare_options(opthash, opts, 0); 02585 } 02586 02587 rb_econv_t * 02588 rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash) 02589 { 02590 rb_econv_t *ec; 02591 VALUE replacement; 02592 02593 if (NIL_P(opthash)) { 02594 replacement = Qnil; 02595 } 02596 else { 02597 if (TYPE(opthash) != T_HASH || !OBJ_FROZEN(opthash)) 02598 rb_bug("rb_econv_open_opts called with invalid opthash"); 02599 replacement = rb_hash_aref(opthash, sym_replace); 02600 } 02601 02602 ec = rb_econv_open(source_encoding, destination_encoding, ecflags); 02603 if (!ec) 02604 return ec; 02605 02606 if (!NIL_P(replacement)) { 02607 int ret; 02608 rb_encoding *enc = rb_enc_get(replacement); 02609 02610 ret = rb_econv_set_replacement(ec, 02611 (const unsigned char *)RSTRING_PTR(replacement), 02612 RSTRING_LEN(replacement), 02613 rb_enc_name(enc)); 02614 if (ret == -1) { 02615 rb_econv_close(ec); 02616 return NULL; 02617 } 02618 } 02619 return ec; 02620 } 02621 02622 static int 02623 enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p) 02624 { 02625 rb_encoding *enc; 02626 const char *n; 02627 int encidx; 02628 VALUE encval; 02629 02630 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) || 02631 !(enc = rb_enc_from_index(encidx))) { 02632 enc = NULL; 02633 encidx = 0; 02634 n = StringValueCStr(*arg); 02635 } 02636 else { 02637 n = rb_enc_name(enc); 02638 } 02639 02640 *name_p = n; 02641 *enc_p = enc; 02642 02643 return encidx; 02644 } 02645 02646 static int 02647 
str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2, 02648 const char **sname_p, rb_encoding **senc_p, 02649 const char **dname_p, rb_encoding **denc_p) 02650 { 02651 rb_encoding *senc, *denc; 02652 const char *sname, *dname; 02653 int sencidx, dencidx; 02654 02655 dencidx = enc_arg(arg1, &dname, &denc); 02656 02657 if (NIL_P(*arg2)) { 02658 sencidx = rb_enc_get_index(str); 02659 senc = rb_enc_from_index(sencidx); 02660 sname = rb_enc_name(senc); 02661 } 02662 else { 02663 sencidx = enc_arg(arg2, &sname, &senc); 02664 } 02665 02666 *sname_p = sname; 02667 *senc_p = senc; 02668 *dname_p = dname; 02669 *denc_p = denc; 02670 return dencidx; 02671 } 02672 02673 static int 02674 str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts) 02675 { 02676 VALUE dest; 02677 VALUE str = *self; 02678 volatile VALUE arg1, arg2; 02679 long blen, slen; 02680 unsigned char *buf, *bp, *sp; 02681 const unsigned char *fromp; 02682 rb_encoding *senc, *denc; 02683 const char *sname, *dname; 02684 int dencidx; 02685 02686 if (argc <0 || argc > 2) { 02687 rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc); 02688 } 02689 02690 if (argc == 0) { 02691 arg1 = rb_enc_default_internal(); 02692 if (NIL_P(arg1)) { 02693 if (!ecflags) return -1; 02694 arg1 = rb_obj_encoding(str); 02695 } 02696 ecflags |= ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE; 02697 } 02698 else { 02699 arg1 = argv[0]; 02700 } 02701 arg2 = argc<=1 ? Qnil : argv[1]; 02702 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc); 02703 02704 if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK| 02705 ECONV_XML_TEXT_DECORATOR| 02706 ECONV_XML_ATTR_CONTENT_DECORATOR| 02707 ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) { 02708 if (senc && senc == denc) { 02709 return NIL_P(arg2) ? 
-1 : dencidx; 02710 } 02711 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) { 02712 if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) { 02713 return dencidx; 02714 } 02715 } 02716 if (encoding_equal(sname, dname)) { 02717 return NIL_P(arg2) ? -1 : dencidx; 02718 } 02719 } 02720 else { 02721 if (encoding_equal(sname, dname)) { 02722 sname = ""; 02723 dname = ""; 02724 } 02725 } 02726 02727 fromp = sp = (unsigned char *)RSTRING_PTR(str); 02728 slen = RSTRING_LEN(str); 02729 blen = slen + 30; /* len + margin */ 02730 dest = rb_str_tmp_new(blen); 02731 bp = (unsigned char *)RSTRING_PTR(dest); 02732 02733 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts); 02734 if (fromp != sp+slen) { 02735 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp); 02736 } 02737 buf = (unsigned char *)RSTRING_PTR(dest); 02738 *bp = '\0'; 02739 rb_str_set_len(dest, bp - buf); 02740 02741 /* set encoding */ 02742 if (!denc) { 02743 dencidx = rb_define_dummy_encoding(dname); 02744 } 02745 *self = dest; 02746 02747 return dencidx; 02748 } 02749 02750 static int 02751 str_transcode(int argc, VALUE *argv, VALUE *self) 02752 { 02753 VALUE opt; 02754 int ecflags = 0; 02755 VALUE ecopts = Qnil; 02756 02757 argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt); 02758 if (!NIL_P(opt)) { 02759 ecflags = rb_econv_prepare_opts(opt, &ecopts); 02760 } 02761 return str_transcode0(argc, argv, self, ecflags, ecopts); 02762 } 02763 02764 static inline VALUE 02765 str_encode_associate(VALUE str, int encidx) 02766 { 02767 int cr = 0; 02768 02769 rb_enc_associate_index(str, encidx); 02770 02771 /* transcoded string never be broken. 
*/ 02772 if (rb_enc_asciicompat(rb_enc_from_index(encidx))) { 02773 rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr); 02774 } 02775 else { 02776 cr = ENC_CODERANGE_VALID; 02777 } 02778 ENC_CODERANGE_SET(str, cr); 02779 return str; 02780 } 02781 02782 /* 02783 * call-seq: 02784 * str.encode!(encoding [, options] ) -> str 02785 * str.encode!(dst_encoding, src_encoding [, options] ) -> str 02786 * 02787 * The first form transcodes the contents of <i>str</i> from 02788 * str.encoding to +encoding+. 02789 * The second form transcodes the contents of <i>str</i> from 02790 * src_encoding to dst_encoding. 02791 * The options Hash gives details for conversion. See String#encode 02792 * for details. 02793 * Returns the string even if no changes were made. 02794 */ 02795 02796 static VALUE 02797 str_encode_bang(int argc, VALUE *argv, VALUE str) 02798 { 02799 VALUE newstr; 02800 int encidx; 02801 02802 rb_check_frozen(str); 02803 02804 newstr = str; 02805 encidx = str_transcode(argc, argv, &newstr); 02806 02807 if (encidx < 0) return str; 02808 rb_str_shared_replace(str, newstr); 02809 return str_encode_associate(str, encidx); 02810 } 02811 02812 static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx); 02813 02814 /* 02815 * call-seq: 02816 * str.encode(encoding [, options] ) -> str 02817 * str.encode(dst_encoding, src_encoding [, options] ) -> str 02818 * str.encode([options]) -> str 02819 * 02820 * The first form returns a copy of +str+ transcoded 02821 * to encoding +encoding+. 02822 * The second form returns a copy of +str+ transcoded 02823 * from src_encoding to dst_encoding. 02824 * The last form returns a copy of +str+ transcoded to 02825 * <tt>Encoding.default_internal</tt>. 
 *
 *  By default, the first and second form raise
 *  Encoding::UndefinedConversionError for characters that are
 *  undefined in the destination encoding, and
 *  Encoding::InvalidByteSequenceError for invalid byte sequences
 *  in the source encoding. The last form by default does not raise
 *  exceptions but uses replacement strings.
 *
 *  The +options+ Hash gives details for conversion and can have the following
 *  keys:
 *
 *  :invalid ::
 *    If the value is +:replace+, #encode replaces invalid byte sequences in
 *    +str+ with the replacement character.  The default is to raise the
 *    Encoding::InvalidByteSequenceError exception.
 *  :undef ::
 *    If the value is +:replace+, #encode replaces characters which are
 *    undefined in the destination encoding with the replacement character.
 *    The default is to raise the Encoding::UndefinedConversionError.
 *  :replace ::
 *    Sets the replacement string to the given value. The default replacement
 *    string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
 *  :fallback ::
 *    Sets the replacement string by the given object for undefined
 *    character.  The object should be a Hash, a Proc, a Method, or an
 *    object which has [] method.
 *    Its key is an undefined character encoded in the source encoding
 *    of current transcoder. Its value can be any encoding until it
 *    can be converted into the destination encoding of the transcoder.
 *  :xml ::
 *    The value must be +:text+ or +:attr+.
 *    If the value is +:text+ #encode replaces undefined characters with their
 *    (upper-case hexadecimal) numeric character references. '&', '<', and '>'
 *    are converted to "&amp;", "&lt;", and "&gt;", respectively.
 *    If the value is +:attr+, #encode also quotes the replacement result
 *    (using '"'), and replaces '"' with "&quot;".
02862 * :cr_newline :: 02863 * Replaces LF ("\n") with CR ("\r") if value is true. 02864 * :crlf_newline :: 02865 * Replaces LF ("\n") with CRLF ("\r\n") if value is true. 02866 * :universal_newline :: 02867 * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true. 02868 */ 02869 02870 static VALUE 02871 str_encode(int argc, VALUE *argv, VALUE str) 02872 { 02873 VALUE newstr = str; 02874 int encidx = str_transcode(argc, argv, &newstr); 02875 return encoded_dup(newstr, str, encidx); 02876 } 02877 02878 VALUE 02879 rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts) 02880 { 02881 int argc = 1; 02882 VALUE *argv = &to; 02883 VALUE newstr = str; 02884 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts); 02885 return encoded_dup(newstr, str, encidx); 02886 } 02887 02888 static VALUE 02889 encoded_dup(VALUE newstr, VALUE str, int encidx) 02890 { 02891 if (encidx < 0) return rb_str_dup(str); 02892 if (newstr == str) { 02893 newstr = rb_str_dup(str); 02894 } 02895 else { 02896 RBASIC(newstr)->klass = rb_obj_class(str); 02897 } 02898 return str_encode_associate(newstr, encidx); 02899 } 02900 02901 static void 02902 econv_free(void *ptr) 02903 { 02904 rb_econv_t *ec = ptr; 02905 rb_econv_close(ec); 02906 } 02907 02908 static size_t 02909 econv_memsize(const void *ptr) 02910 { 02911 return ptr ? 
sizeof(rb_econv_t) : 0; 02912 } 02913 02914 static const rb_data_type_t econv_data_type = { 02915 "econv", 02916 {NULL, econv_free, econv_memsize,}, 02917 }; 02918 02919 static VALUE 02920 econv_s_allocate(VALUE klass) 02921 { 02922 return TypedData_Wrap_Struct(klass, &econv_data_type, NULL); 02923 } 02924 02925 static rb_encoding * 02926 make_dummy_encoding(const char *name) 02927 { 02928 rb_encoding *enc; 02929 int idx; 02930 idx = rb_define_dummy_encoding(name); 02931 enc = rb_enc_from_index(idx); 02932 return enc; 02933 } 02934 02935 static rb_encoding * 02936 make_encoding(const char *name) 02937 { 02938 rb_encoding *enc; 02939 enc = rb_enc_find(name); 02940 if (!enc) 02941 enc = make_dummy_encoding(name); 02942 return enc; 02943 } 02944 02945 static VALUE 02946 make_encobj(const char *name) 02947 { 02948 return rb_enc_from_encoding(make_encoding(name)); 02949 } 02950 02951 /* 02952 * call-seq: 02953 * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil 02954 * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil 02955 * 02956 * Returns the corresponding ASCII compatible encoding. 02957 * 02958 * Returns nil if the argument is an ASCII compatible encoding. 02959 * 02960 * "corresponding ASCII compatible encoding" is a ASCII compatible encoding which 02961 * can represents exactly the same characters as the given ASCII incompatible encoding. 02962 * So, no conversion undefined error occurs when converting between the two encodings. 
02963 * 02964 * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP> 02965 * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8> 02966 * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil 02967 * 02968 */ 02969 static VALUE 02970 econv_s_asciicompat_encoding(VALUE klass, VALUE arg) 02971 { 02972 const char *arg_name, *result_name; 02973 rb_encoding *arg_enc, *result_enc; 02974 02975 enc_arg(&arg, &arg_name, &arg_enc); 02976 02977 result_name = rb_econv_asciicompat_encoding(arg_name); 02978 02979 if (result_name == NULL) 02980 return Qnil; 02981 02982 result_enc = make_encoding(result_name); 02983 02984 return rb_enc_from_encoding(result_enc); 02985 } 02986 02987 static void 02988 econv_args(int argc, VALUE *argv, 02989 volatile VALUE *snamev_p, volatile VALUE *dnamev_p, 02990 const char **sname_p, const char **dname_p, 02991 rb_encoding **senc_p, rb_encoding **denc_p, 02992 int *ecflags_p, 02993 VALUE *ecopts_p) 02994 { 02995 VALUE opt, flags_v, ecopts; 02996 int sidx, didx; 02997 const char *sname, *dname; 02998 rb_encoding *senc, *denc; 02999 int ecflags; 03000 03001 argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt); 03002 03003 if (!NIL_P(flags_v)) { 03004 if (!NIL_P(opt)) { 03005 rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", 03006 argc + 1); 03007 } 03008 ecflags = NUM2INT(rb_to_int(flags_v)); 03009 ecopts = Qnil; 03010 } 03011 else if (!NIL_P(opt)) { 03012 ecflags = rb_econv_prepare_opts(opt, &ecopts); 03013 } 03014 else { 03015 ecflags = 0; 03016 ecopts = Qnil; 03017 } 03018 03019 senc = NULL; 03020 sidx = rb_to_encoding_index(*snamev_p); 03021 if (0 <= sidx) { 03022 senc = rb_enc_from_index(sidx); 03023 } 03024 else { 03025 StringValue(*snamev_p); 03026 } 03027 03028 denc = NULL; 03029 didx = rb_to_encoding_index(*dnamev_p); 03030 if (0 <= didx) { 03031 denc = rb_enc_from_index(didx); 03032 } 03033 else { 03034 StringValue(*dnamev_p); 03035 } 
03036 03037 sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p); 03038 dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p); 03039 03040 *sname_p = sname; 03041 *dname_p = dname; 03042 *senc_p = senc; 03043 *denc_p = denc; 03044 *ecflags_p = ecflags; 03045 *ecopts_p = ecopts; 03046 } 03047 03048 static int 03049 decorate_convpath(VALUE convpath, int ecflags) 03050 { 03051 int num_decorators; 03052 const char *decorators[MAX_ECFLAGS_DECORATORS]; 03053 int i; 03054 int n, len; 03055 03056 num_decorators = decorator_names(ecflags, decorators); 03057 if (num_decorators == -1) 03058 return -1; 03059 03060 len = n = RARRAY_LENINT(convpath); 03061 if (n != 0) { 03062 VALUE pair = RARRAY_PTR(convpath)[n-1]; 03063 if (TYPE(pair) == T_ARRAY) { 03064 const char *sname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[0])); 03065 const char *dname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[1])); 03066 transcoder_entry_t *entry = get_transcoder_entry(sname, dname); 03067 const rb_transcoder *tr = load_transcoder_entry(entry); 03068 if (!tr) 03069 return -1; 03070 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) && 03071 tr->asciicompat_type == asciicompat_encoder) { 03072 n--; 03073 rb_ary_store(convpath, len + num_decorators - 1, pair); 03074 } 03075 } 03076 else { 03077 rb_ary_store(convpath, len + num_decorators - 1, pair); 03078 } 03079 } 03080 03081 for (i = 0; i < num_decorators; i++) 03082 rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i])); 03083 03084 return 0; 03085 } 03086 03087 static void 03088 search_convpath_i(const char *sname, const char *dname, int depth, void *arg) 03089 { 03090 VALUE *ary_p = arg; 03091 VALUE v; 03092 03093 if (*ary_p == Qnil) { 03094 *ary_p = rb_ary_new(); 03095 } 03096 03097 if (DECORATOR_P(sname, dname)) { 03098 v = rb_str_new_cstr(dname); 03099 } 03100 else { 03101 v = rb_assoc_new(make_encobj(sname), make_encobj(dname)); 03102 } 03103 rb_ary_store(*ary_p, depth, v); 03104 } 03105 03106 /* 03107 * 
call-seq: 03108 * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary 03109 * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary 03110 * 03111 * Returns a conversion path. 03112 * 03113 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP") 03114 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], 03115 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]] 03116 * 03117 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true) 03118 * or 03119 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal) 03120 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], 03121 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>], 03122 * # "universal_newline"] 03123 * 03124 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true) 03125 * or 03126 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal) 03127 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], 03128 * # "universal_newline", 03129 * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]] 03130 */ 03131 static VALUE 03132 econv_s_search_convpath(int argc, VALUE *argv, VALUE klass) 03133 { 03134 volatile VALUE snamev, dnamev; 03135 const char *sname, *dname; 03136 rb_encoding *senc, *denc; 03137 int ecflags; 03138 VALUE ecopts; 03139 VALUE convpath; 03140 03141 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts); 03142 03143 convpath = Qnil; 03144 transcode_search_path(sname, dname, search_convpath_i, &convpath); 03145 03146 if (NIL_P(convpath)) 03147 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags)); 03148 03149 if (decorate_convpath(convpath, ecflags) == -1) 03150 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags)); 03151 03152 return convpath; 03153 } 03154 03155 /* 03156 * Check the existence of a conversion path. 03157 * Returns the number of converters in the conversion path. 
03158 * result: >=0:success -1:failure 03159 */ 03160 int 03161 rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding) 03162 { 03163 VALUE convpath = Qnil; 03164 transcode_search_path(from_encoding, to_encoding, search_convpath_i, 03165 &convpath); 03166 return RTEST(convpath); 03167 } 03168 03169 struct rb_econv_init_by_convpath_t { 03170 rb_econv_t *ec; 03171 int index; 03172 int ret; 03173 }; 03174 03175 static void 03176 rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg) 03177 { 03178 struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg; 03179 int ret; 03180 03181 if (a->ret == -1) 03182 return; 03183 03184 ret = rb_econv_add_converter(a->ec, sname, dname, a->index); 03185 03186 a->ret = ret; 03187 return; 03188 } 03189 03190 static rb_econv_t * 03191 rb_econv_init_by_convpath(VALUE self, VALUE convpath, 03192 const char **sname_p, const char **dname_p, 03193 rb_encoding **senc_p, rb_encoding**denc_p) 03194 { 03195 rb_econv_t *ec; 03196 long i; 03197 int ret, first=1; 03198 VALUE elt; 03199 rb_encoding *senc = 0, *denc = 0; 03200 const char *sname, *dname; 03201 03202 ec = rb_econv_alloc(RARRAY_LENINT(convpath)); 03203 DATA_PTR(self) = ec; 03204 03205 for (i = 0; i < RARRAY_LEN(convpath); i++) { 03206 volatile VALUE snamev, dnamev; 03207 VALUE pair; 03208 elt = rb_ary_entry(convpath, i); 03209 if (!NIL_P(pair = rb_check_array_type(elt))) { 03210 if (RARRAY_LEN(pair) != 2) 03211 rb_raise(rb_eArgError, "not a 2-element array in convpath"); 03212 snamev = rb_ary_entry(pair, 0); 03213 enc_arg(&snamev, &sname, &senc); 03214 dnamev = rb_ary_entry(pair, 1); 03215 enc_arg(&dnamev, &dname, &denc); 03216 } 03217 else { 03218 sname = ""; 03219 dname = StringValueCStr(elt); 03220 } 03221 if (DECORATOR_P(sname, dname)) { 03222 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans); 03223 if (ret == -1) 03224 rb_raise(rb_eArgError, "decoration failed: %s", dname); 03225 } 03226 else 
{ 03227 int j = ec->num_trans; 03228 struct rb_econv_init_by_convpath_t arg; 03229 arg.ec = ec; 03230 arg.index = ec->num_trans; 03231 arg.ret = 0; 03232 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg); 03233 if (ret == -1 || arg.ret == -1) 03234 rb_raise(rb_eArgError, "adding conversion failed: %s to %s", sname, dname); 03235 if (first) { 03236 first = 0; 03237 *senc_p = senc; 03238 *sname_p = ec->elems[j].tc->transcoder->src_encoding; 03239 } 03240 *denc_p = denc; 03241 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding; 03242 } 03243 } 03244 03245 if (first) { 03246 *senc_p = NULL; 03247 *denc_p = NULL; 03248 *sname_p = ""; 03249 *dname_p = ""; 03250 } 03251 03252 ec->source_encoding_name = *sname_p; 03253 ec->destination_encoding_name = *dname_p; 03254 03255 return ec; 03256 } 03257 03258 /* 03259 * call-seq: 03260 * Encoding::Converter.new(source_encoding, destination_encoding) 03261 * Encoding::Converter.new(source_encoding, destination_encoding, opt) 03262 * Encoding::Converter.new(convpath) 03263 * 03264 * possible options elements: 03265 * hash form: 03266 * :invalid => nil # raise error on invalid byte sequence (default) 03267 * :invalid => :replace # replace invalid byte sequence 03268 * :undef => nil # raise error on undefined conversion (default) 03269 * :undef => :replace # replace undefined conversion 03270 * :replace => string # replacement string ("?" or "\uFFFD" if not specified) 03271 * :newline => :universal # decorator for converting CRLF and CR to LF 03272 * :newline => :crlf # decorator for converting LF to CRLF 03273 * :newline => :cr # decorator for converting LF to CR 03274 * :universal_newline => true # decorator for converting CRLF and CR to LF 03275 * :crlf_newline => true # decorator for converting LF to CRLF 03276 * :cr_newline => true # decorator for converting LF to CR 03277 * :xml => :text # escape as XML CharData. 
03278 * :xml => :attr # escape as XML AttValue 03279 * integer form: 03280 * Encoding::Converter::INVALID_REPLACE 03281 * Encoding::Converter::UNDEF_REPLACE 03282 * Encoding::Converter::UNDEF_HEX_CHARREF 03283 * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR 03284 * Encoding::Converter::CRLF_NEWLINE_DECORATOR 03285 * Encoding::Converter::CR_NEWLINE_DECORATOR 03286 * Encoding::Converter::XML_TEXT_DECORATOR 03287 * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR 03288 * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR 03289 * 03290 * Encoding::Converter.new creates an instance of Encoding::Converter. 03291 * 03292 * Source_encoding and destination_encoding should be a string or 03293 * Encoding object. 03294 * 03295 * opt should be nil, a hash or an integer. 03296 * 03297 * convpath should be an array. 03298 * convpath may contain 03299 * - two-element arrays which contain encodings or encoding names, or 03300 * - strings representing decorator names. 03301 * 03302 * Encoding::Converter.new optionally takes an option. 03303 * The option should be a hash or an integer. 03304 * The option hash can contain :invalid => nil, etc. 03305 * The option integer should be logical-or of constants such as 03306 * Encoding::Converter::INVALID_REPLACE, etc. 03307 * 03308 * [:invalid => nil] 03309 * Raise error on invalid byte sequence. This is a default behavior. 03310 * [:invalid => :replace] 03311 * Replace invalid byte sequence by replacement string. 03312 * [:undef => nil] 03313 * Raise an error if a character in source_encoding is not defined in destination_encoding. 03314 * This is a default behavior. 03315 * [:undef => :replace] 03316 * Replace undefined character in destination_encoding with replacement string. 03317 * [:replace => string] 03318 * Specify the replacement string. 03319 * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others. 03320 * [:universal_newline => true] 03321 * Convert CRLF and CR to LF. 
03322 * [:crlf_newline => true] 03323 * Convert LF to CRLF. 03324 * [:cr_newline => true] 03325 * Convert LF to CR. 03326 * [:xml => :text] 03327 * Escape as XML CharData. 03328 * This form can be used as an HTML 4.0 #PCDATA. 03329 * - '&' -> '&amp;' 03330 * - '<' -> '&lt;' 03331 * - '>' -> '&gt;' 03332 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH; 03333 * [:xml => :attr] 03334 * Escape as XML AttValue. 03335 * The converted result is quoted as "...". 03336 * This form can be used as an HTML 4.0 attribute value. 03337 * - '&' -> '&amp;' 03338 * - '<' -> '&lt;' 03339 * - '>' -> '&gt;' 03340 * - '"' -> '&quot;' 03341 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH; 03342 * 03343 * Examples: 03344 * # UTF-16BE to UTF-8 03345 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8") 03346 * 03347 * # Usually, decorators such as newline conversion are inserted last. 03348 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true) 03349 * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>], 03350 * # "universal_newline"] 03351 * 03352 * # But, if the last encoding is ASCII incompatible, 03353 * # decorators are inserted before the last conversion. 03354 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true) 03355 * p ec.convpath #=> ["crlf_newline", 03356 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]] 03357 * 03358 * # Conversion path can be specified directly.
03359 * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]]) 03360 * p ec.convpath #=> ["universal_newline", 03361 * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>], 03362 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]] 03363 */ 03364 static VALUE 03365 econv_init(int argc, VALUE *argv, VALUE self) 03366 { 03367 VALUE ecopts; 03368 volatile VALUE snamev, dnamev; 03369 const char *sname, *dname; 03370 rb_encoding *senc, *denc; 03371 rb_econv_t *ec; 03372 int ecflags; 03373 VALUE convpath; 03374 03375 if (rb_check_typeddata(self, &econv_data_type)) { 03376 rb_raise(rb_eTypeError, "already initialized"); 03377 } 03378 03379 if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) { 03380 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc); 03381 ecflags = 0; 03382 ecopts = Qnil; 03383 } 03384 else { 03385 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts); 03386 ec = rb_econv_open_opts(sname, dname, ecflags, ecopts); 03387 } 03388 03389 if (!ec) { 03390 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags)); 03391 } 03392 03393 if (!DECORATOR_P(sname, dname)) { 03394 if (!senc) 03395 senc = make_dummy_encoding(sname); 03396 if (!denc) 03397 denc = make_dummy_encoding(dname); 03398 } 03399 03400 ec->source_encoding = senc; 03401 ec->destination_encoding = denc; 03402 03403 DATA_PTR(self) = ec; 03404 03405 return self; 03406 } 03407 03408 /* 03409 * call-seq: 03410 * ec.inspect -> string 03411 * 03412 * Returns a printable version of <i>ec</i> 03413 * 03414 * ec = Encoding::Converter.new("iso-8859-1", "utf-8") 03415 * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8> 03416 * 03417 */ 03418 static VALUE 03419 econv_inspect(VALUE self) 03420 { 03421 const char *cname = rb_obj_classname(self); 03422 rb_econv_t *ec; 03423 03424 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec); 03425 if (!ec) 03426 return rb_sprintf("#<%s: uninitialized>", 
cname); 03427 else { 03428 const char *sname = ec->source_encoding_name; 03429 const char *dname = ec->destination_encoding_name; 03430 VALUE str; 03431 str = rb_sprintf("#<%s: ", cname); 03432 econv_description(sname, dname, ec->flags, str); 03433 rb_str_cat2(str, ">"); 03434 return str; 03435 } 03436 } 03437 03438 static rb_econv_t * 03439 check_econv(VALUE self) 03440 { 03441 rb_econv_t *ec; 03442 03443 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec); 03444 if (!ec) { 03445 rb_raise(rb_eTypeError, "uninitialized encoding converter"); 03446 } 03447 return ec; 03448 } 03449 03450 /* 03451 * call-seq: 03452 * ec.source_encoding -> encoding 03453 * 03454 * Returns the source encoding as an Encoding object. 03455 */ 03456 static VALUE 03457 econv_source_encoding(VALUE self) 03458 { 03459 rb_econv_t *ec = check_econv(self); 03460 if (!ec->source_encoding) 03461 return Qnil; 03462 return rb_enc_from_encoding(ec->source_encoding); 03463 } 03464 03465 /* 03466 * call-seq: 03467 * ec.destination_encoding -> encoding 03468 * 03469 * Returns the destination encoding as an Encoding object. 03470 */ 03471 static VALUE 03472 econv_destination_encoding(VALUE self) 03473 { 03474 rb_econv_t *ec = check_econv(self); 03475 if (!ec->destination_encoding) 03476 return Qnil; 03477 return rb_enc_from_encoding(ec->destination_encoding); 03478 } 03479 03480 /* 03481 * call-seq: 03482 * ec.convpath -> ary 03483 * 03484 * Returns the conversion path of ec. 03485 * 03486 * The result is an array of conversions. 03487 * 03488 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true) 03489 * p ec.convpath 03490 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], 03491 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>], 03492 * # "crlf_newline"] 03493 * 03494 * Each element of the array is a pair of encodings or a string. 03495 * A pair means an encoding conversion. 03496 * A string means a decorator. 
03497 * 03498 * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means 03499 * a converter from ISO-8859-1 to UTF-8. 03500 * "crlf_newline" means newline converter from LF to CRLF. 03501 */ 03502 static VALUE 03503 econv_convpath(VALUE self) 03504 { 03505 rb_econv_t *ec = check_econv(self); 03506 VALUE result; 03507 int i; 03508 03509 result = rb_ary_new(); 03510 for (i = 0; i < ec->num_trans; i++) { 03511 const rb_transcoder *tr = ec->elems[i].tc->transcoder; 03512 VALUE v; 03513 if (DECORATOR_P(tr->src_encoding, tr->dst_encoding)) 03514 v = rb_str_new_cstr(tr->dst_encoding); 03515 else 03516 v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding)); 03517 rb_ary_push(result, v); 03518 } 03519 return result; 03520 } 03521 03522 /* 03523 * call-seq: 03524 * ec == other -> true or false 03525 */ 03526 static VALUE 03527 econv_equal(VALUE self, VALUE other) 03528 { 03529 rb_econv_t *ec1 = check_econv(self); 03530 rb_econv_t *ec2; 03531 int i; 03532 03533 if (!rb_typeddata_is_kind_of(other, &econv_data_type)) { 03534 return Qnil; 03535 } 03536 ec2 = DATA_PTR(other); 03537 if (!ec2) return Qfalse; 03538 if (ec1->source_encoding_name != ec2->source_encoding_name && 03539 strcmp(ec1->source_encoding_name, ec2->source_encoding_name)) 03540 return Qfalse; 03541 if (ec1->destination_encoding_name != ec2->destination_encoding_name && 03542 strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name)) 03543 return Qfalse; 03544 if (ec1->flags != ec2->flags) return Qfalse; 03545 if (ec1->replacement_enc != ec2->replacement_enc && 03546 strcmp(ec1->replacement_enc, ec2->replacement_enc)) 03547 return Qfalse; 03548 if (ec1->replacement_len != ec2->replacement_len) return Qfalse; 03549 if (ec1->replacement_str != ec2->replacement_str && 03550 memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len)) 03551 return Qfalse; 03552 03553 if (ec1->num_trans != ec2->num_trans) return Qfalse; 03554 for (i = 0; i < 
ec1->num_trans; i++) { 03555 if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder) 03556 return Qfalse; 03557 } 03558 return Qtrue; 03559 } 03560 03561 static VALUE 03562 econv_result_to_symbol(rb_econv_result_t res) 03563 { 03564 switch (res) { 03565 case econv_invalid_byte_sequence: return sym_invalid_byte_sequence; 03566 case econv_incomplete_input: return sym_incomplete_input; 03567 case econv_undefined_conversion: return sym_undefined_conversion; 03568 case econv_destination_buffer_full: return sym_destination_buffer_full; 03569 case econv_source_buffer_empty: return sym_source_buffer_empty; 03570 case econv_finished: return sym_finished; 03571 case econv_after_output: return sym_after_output; 03572 default: return INT2NUM(res); /* should not be reached */ 03573 } 03574 } 03575 03576 /* 03577 * call-seq: 03578 * ec.primitive_convert(source_buffer, destination_buffer) -> symbol 03579 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol 03580 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol 03581 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol 03582 * 03583 * possible opt elements: 03584 * hash form: 03585 * :partial_input => true # source buffer may be part of larger source 03586 * :after_output => true # stop conversion after output before input 03587 * integer form: 03588 * Encoding::Converter::PARTIAL_INPUT 03589 * Encoding::Converter::AFTER_OUTPUT 03590 * 03591 * possible results: 03592 * :invalid_byte_sequence 03593 * :incomplete_input 03594 * :undefined_conversion 03595 * :after_output 03596 * :destination_buffer_full 03597 * :source_buffer_empty 03598 * :finished 03599 * 03600 * primitive_convert converts source_buffer into destination_buffer. 03601 * 03602 * source_buffer should be a string or nil. 03603 * nil means a empty string. 
03604 * 03605 * destination_buffer should be a string. 03606 * 03607 * destination_byteoffset should be an integer or nil. 03608 * nil means the end of destination_buffer. 03609 * If it is omitted, nil is assumed. 03610 * 03611 * destination_bytesize should be an integer or nil. 03612 * nil means unlimited. 03613 * If it is omitted, nil is assumed. 03614 * 03615 * opt should be nil, a hash or an integer. 03616 * nil means no flags. 03617 * If it is omitted, nil is assumed. 03618 * 03619 * primitive_convert converts the content of source_buffer from the beginning 03620 * and stores the result into destination_buffer. 03621 * 03622 * destination_byteoffset and destination_bytesize specify the region in which 03623 * the converted result is stored. 03624 * destination_byteoffset specifies the start position in destination_buffer in bytes. 03625 * If destination_byteoffset is nil, 03626 * destination_buffer.bytesize is used for appending the result. 03627 * destination_bytesize specifies the maximum number of bytes. 03628 * If destination_bytesize is nil, 03629 * destination size is unlimited. 03630 * After conversion, destination_buffer is resized to 03631 * destination_byteoffset + actually produced number of bytes. 03632 * Also destination_buffer's encoding is set to destination_encoding. 03633 * 03634 * primitive_convert drops the converted part of source_buffer. 03635 * The dropped part is converted in destination_buffer or 03636 * buffered in Encoding::Converter object. 03637 * 03638 * primitive_convert stops conversion when one of the following conditions is met. 03639 * - invalid byte sequence found in source buffer (:invalid_byte_sequence) 03640 * - unexpected end of source buffer (:incomplete_input) 03641 * this occurs only when :partial_input is not specified. 03642 * - character not representable in output encoding (:undefined_conversion) 03643 * - after some output is generated, before input is done (:after_output) 03644 * this occurs only when :after_output is specified.
03645 * - destination buffer is full (:destination_buffer_full) 03646 * this occurs only when destination_bytesize is non-nil. 03647 * - source buffer is empty (:source_buffer_empty) 03648 * this occurs only when :partial_input is specified. 03649 * - conversion is finished (:finished) 03650 * 03651 * example: 03652 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE") 03653 * ret = ec.primitive_convert(src="pi", dst="", nil, 100) 03654 * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"] 03655 * 03656 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE") 03657 * ret = ec.primitive_convert(src="pi", dst="", nil, 1) 03658 * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"] 03659 * ret = ec.primitive_convert(src, dst="", nil, 1) 03660 * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"] 03661 * ret = ec.primitive_convert(src, dst="", nil, 1) 03662 * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"] 03663 * ret = ec.primitive_convert(src, dst="", nil, 1) 03664 * p [ret, src, dst] #=> [:finished, "", "i"] 03665 * 03666 */
static VALUE
econv_primitive_convert(int argc, VALUE *argv, VALUE self)
{
    VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
    rb_econv_t *ec = check_econv(self);
    rb_econv_result_t res;
    const unsigned char *src_cur, *src_end;
    unsigned char *dst_cur, *dst_end;
    long output_byteoffset, output_bytesize;
    unsigned long output_byteend;
    int flags = 0;

    argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);

    /* nil offset/size only get a placeholder here; the effective values
     * are chosen further down */
    output_byteoffset = NIL_P(output_byteoffset_v) ? 0 : NUM2LONG(output_byteoffset_v);
    output_bytesize = NIL_P(output_bytesize_v) ? 0 : NUM2LONG(output_bytesize_v);

    if (!NIL_P(flags_v)) {
        /* integer flags and an option hash cannot both be given */
        if (!NIL_P(opt)) {
            rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..5)",
                argc + 1);
        }
        flags = NUM2INT(rb_to_int(flags_v));
    }
    else if (!NIL_P(opt)) {
        if (RTEST(rb_hash_aref(opt, sym_partial_input)))
            flags |= ECONV_PARTIAL_INPUT;
        if (RTEST(rb_hash_aref(opt, sym_after_output)))
            flags |= ECONV_AFTER_OUTPUT;
    }

    StringValue(output);
    if (!NIL_P(input))
        StringValue(input);
    rb_str_modify(output);

    if (NIL_P(output_bytesize_v)) {
        /* "unlimited": start with max(RSTRING_EMBED_LEN_MAX, input length)
         * and double it each time the converter reports a full buffer */
        output_bytesize = RSTRING_EMBED_LEN_MAX;
        if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
            output_bytesize = RSTRING_LEN(input);
    }

    for (;;) {
        /* nil offset means "append": recomputed on every pass because the
         * previous pass may have grown output */
        if (NIL_P(output_byteoffset_v))
            output_byteoffset = RSTRING_LEN(output);

        if (output_byteoffset < 0)
            rb_raise(rb_eArgError, "negative output_byteoffset");

        if (RSTRING_LEN(output) < output_byteoffset)
            rb_raise(rb_eArgError, "output_byteoffset too big");

        if (output_bytesize < 0)
            rb_raise(rb_eArgError, "negative output_bytesize");

        /* done in unsigned arithmetic so wrap-around can be detected */
        output_byteend = (unsigned long)output_byteoffset +
                         (unsigned long)output_bytesize;

        if (output_byteend < (unsigned long)output_byteoffset ||
            LONG_MAX < output_byteend)
            rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");

        if (rb_str_capacity(output) < output_byteend)
            rb_str_resize(output, output_byteend);

        /* the pointers are taken only after the resize above */
        if (NIL_P(input)) {
            src_cur = src_end = NULL;
        }
        else {
            src_cur = (const unsigned char *)RSTRING_PTR(input);
            src_end = src_cur + RSTRING_LEN(input);
        }

        dst_cur = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
        dst_end = dst_cur + output_bytesize;

        res = rb_econv_convert(ec, &src_cur, src_end, &dst_cur, dst_end, flags);

        /* output ends where the converter stopped writing; the consumed
         * prefix is removed from input */
        rb_str_set_len(output, dst_cur - (unsigned char *)RSTRING_PTR(output));
        if (!NIL_P(input))
            rb_str_drop_bytes(input, src_cur - (unsigned char *)RSTRING_PTR(input));

        /* only an unlimited destination that filled up is retried */
        if (!(NIL_P(output_bytesize_v) && res == econv_destination_buffer_full))
            break;

        if (LONG_MAX / 2 < output_bytesize)
            rb_raise(rb_eArgError, "too long conversion result");
        output_bytesize *= 2;
        /* continue by appending after what has been produced so far */
        output_byteoffset_v = Qnil;
    }

    if (ec->destination_encoding) {
        rb_enc_associate(output, ec->destination_encoding);
    }

    return econv_result_to_symbol(res);
}

03778 /* 03779 * call-seq: 03780 * ec.convert(source_string) -> destination_string 03781 * 03782 * Convert source_string and return destination_string. 03783 * 03784 * source_string is assumed as a part of source. 03785 * i.e. :partial_input=>true is specified internally. 03786 * finish method should be used last. 03787 * 03788 * ec = Encoding::Converter.new("utf-8", "euc-jp") 03789 * puts ec.convert("\u3042").dump #=> "\xA4\xA2" 03790 * puts ec.finish.dump #=> "" 03791 * 03792 * ec = Encoding::Converter.new("euc-jp", "utf-8") 03793 * puts ec.convert("\xA4").dump #=> "" 03794 * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82" 03795 * puts ec.finish.dump #=> "" 03796 * 03797 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp") 03798 * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP") 03799 * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP") 03800 * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP") 03801 * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP") 03802 * 03803 * If a conversion error occurs, 03804 * Encoding::UndefinedConversionError or 03805 * Encoding::InvalidByteSequenceError is raised. 03806 * Encoding::Converter#convert doesn't supply methods to recover or restart 03807 * from these exceptions. 03808 * When you want to handle these conversion errors, 03809 * use Encoding::Converter#primitive_convert.
03810 * 03811 */ 03812 static VALUE 03813 econv_convert(VALUE self, VALUE source_string) 03814 { 03815 VALUE ret, dst; 03816 VALUE av[5]; 03817 int ac; 03818 rb_econv_t *ec = check_econv(self); 03819 03820 StringValue(source_string); 03821 03822 dst = rb_str_new(NULL, 0); 03823 03824 av[0] = rb_str_dup(source_string); 03825 av[1] = dst; 03826 av[2] = Qnil; 03827 av[3] = Qnil; 03828 av[4] = INT2NUM(ECONV_PARTIAL_INPUT); 03829 ac = 5; 03830 03831 ret = econv_primitive_convert(ac, av, self); 03832 03833 if (ret == sym_invalid_byte_sequence || 03834 ret == sym_undefined_conversion || 03835 ret == sym_incomplete_input) { 03836 VALUE exc = make_econv_exception(ec); 03837 rb_exc_raise(exc); 03838 } 03839 03840 if (ret == sym_finished) { 03841 rb_raise(rb_eArgError, "converter already finished"); 03842 } 03843 03844 if (ret != sym_source_buffer_empty) { 03845 rb_bug("unexpected result of econv_primitive_convert"); 03846 } 03847 03848 return dst; 03849 } 03850 03851 /* 03852 * call-seq: 03853 * ec.finish -> string 03854 * 03855 * Finishes the converter. 03856 * It returns the last part of the converted string. 
03857 * 03858 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp") 03859 * p ec.convert("\u3042") #=> "\e$B$\"" 03860 * p ec.finish #=> "\e(B" 03861 */ 03862 static VALUE 03863 econv_finish(VALUE self) 03864 { 03865 VALUE ret, dst; 03866 VALUE av[5]; 03867 int ac; 03868 rb_econv_t *ec = check_econv(self); 03869 03870 dst = rb_str_new(NULL, 0); 03871 03872 av[0] = Qnil; 03873 av[1] = dst; 03874 av[2] = Qnil; 03875 av[3] = Qnil; 03876 av[4] = INT2NUM(0); 03877 ac = 5; 03878 03879 ret = econv_primitive_convert(ac, av, self); 03880 03881 if (ret == sym_invalid_byte_sequence || 03882 ret == sym_undefined_conversion || 03883 ret == sym_incomplete_input) { 03884 VALUE exc = make_econv_exception(ec); 03885 rb_exc_raise(exc); 03886 } 03887 03888 if (ret != sym_finished) { 03889 rb_bug("unexpected result of econv_primitive_convert"); 03890 } 03891 03892 return dst; 03893 } 03894 03895 /* 03896 * call-seq: 03897 * ec.primitive_errinfo -> array 03898 * 03899 * primitive_errinfo returns important information regarding the last error 03900 * as a 5-element array: 03901 * 03902 * [result, enc1, enc2, error_bytes, readagain_bytes] 03903 * 03904 * result is the last result of primitive_convert. 03905 * 03906 * Other elements are only meaningful when result is 03907 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion. 03908 * 03909 * enc1 and enc2 indicate a conversion step as a pair of strings. 03910 * For example, a converter from EUC-JP to ISO-8859-1 converts 03911 * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1. 03912 * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"]. 03913 * 03914 * error_bytes and readagain_bytes indicate the byte sequences which caused the error. 03915 * error_bytes is discarded portion. 03916 * readagain_bytes is buffered portion which is read again on next conversion. 03917 * 03918 * Example: 03919 * 03920 * # \xff is invalid as EUC-JP. 
03921 * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS") 03922 * ec.primitive_convert(src="\xff", dst="", nil, 10) 03923 * p ec.primitive_errinfo 03924 * #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""] 03925 * 03926 * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1. 03927 * # Since this error occurs in UTF-8 to ISO-8859-1 conversion, 03928 * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82). 03929 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") 03930 * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10) 03931 * p ec.primitive_errinfo 03932 * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""] 03933 * 03934 * # partial character is invalid 03935 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") 03936 * ec.primitive_convert(src="\xa4", dst="", nil, 10) 03937 * p ec.primitive_errinfo 03938 * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""] 03939 * 03940 * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by 03941 * # partial characters. 03942 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") 03943 * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT) 03944 * p ec.primitive_errinfo 03945 * #=> [:source_buffer_empty, nil, nil, nil, nil] 03946 * 03947 * # \xd8\x00\x00@ is invalid as UTF-16BE because 03948 * # no low surrogate after high surrogate (\xd8\x00). 03949 * # It is detected by 3rd byte (\x00) which is part of next character. 03950 * # So the high surrogate (\xd8\x00) is discarded and 03951 * # the 3rd byte is read again later. 03952 * # Since the byte is buffered in ec, it is dropped from src.
03953 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8") 03954 * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10) 03955 * p ec.primitive_errinfo 03956 * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"] 03957 * p src 03958 * #=> "@" 03959 * 03960 * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE. 03961 * # The problem is detected by 4th byte. 03962 * ec = Encoding::Converter.new("UTF-16LE", "UTF-8") 03963 * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10) 03964 * p ec.primitive_errinfo 03965 * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"] 03966 * p src 03967 * #=> "" 03968 * 03969 */ 03970 static VALUE 03971 econv_primitive_errinfo(VALUE self) 03972 { 03973 rb_econv_t *ec = check_econv(self); 03974 03975 VALUE ary; 03976 03977 ary = rb_ary_new2(5); 03978 03979 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result)); 03980 rb_ary_store(ary, 4, Qnil); 03981 03982 if (ec->last_error.source_encoding) 03983 rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding)); 03984 03985 if (ec->last_error.destination_encoding) 03986 rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding)); 03987 03988 if (ec->last_error.error_bytes_start) { 03989 rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len)); 03990 rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len)); 03991 } 03992 03993 return ary; 03994 } 03995 03996 /* 03997 * call-seq: 03998 * ec.insert_output(string) -> nil 03999 * 04000 * Inserts string into the encoding converter. 04001 * The string will be converted to the destination encoding and 04002 * output on later conversions. 04003 * 04004 * If the destination encoding is stateful, 04005 * string is converted according to the state and the state is updated. 
04006 * 04007 * This method should be used only when a conversion error occurs. 04008 * 04009 * ec = Encoding::Converter.new("utf-8", "iso-8859-1") 04010 * src = "HIRAGANA LETTER A is \u{3042}." 04011 * dst = "" 04012 * p ec.primitive_convert(src, dst) #=> :undefined_conversion 04013 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."] 04014 * ec.insert_output("<err>") 04015 * p ec.primitive_convert(src, dst) #=> :finished 04016 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""] 04017 * 04018 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp") 04019 * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp 04020 * dst = "" 04021 * p ec.primitive_convert(src, dst) #=> :undefined_conversion 04022 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"] 04023 * ec.insert_output "?" # state change required to output "?". 04024 * p ec.primitive_convert(src, dst) #=> :finished 04025 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""] 04026 * 04027 */ 04028 static VALUE 04029 econv_insert_output(VALUE self, VALUE string) 04030 { 04031 const char *insert_enc; 04032 04033 int ret; 04034 04035 rb_econv_t *ec = check_econv(self); 04036 04037 StringValue(string); 04038 insert_enc = rb_econv_encoding_to_insert_output(ec); 04039 string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil); 04040 04041 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc); 04042 if (ret == -1) { 04043 rb_raise(rb_eArgError, "too big string"); 04044 } 04045 04046 return Qnil; 04047 } 04048 04049 /* 04050 * call-seq 04051 * ec.putback -> string 04052 * ec.putback(max_numbytes) -> string 04053 * 04054 * Put back the bytes which will be converted. 04055 * 04056 * The bytes are caused by invalid_byte_sequence error. 
04057 * When invalid_byte_sequence error, some bytes are discarded and 04058 * some bytes are buffered to be converted later. 04059 * The latter bytes can be put back. 04060 * It can be observed by 04061 * Encoding::InvalidByteSequenceError#readagain_bytes and 04062 * Encoding::Converter#primitive_errinfo. 04063 * 04064 * ec = Encoding::Converter.new("utf-16le", "iso-8859-1") 04065 * src = "\x00\xd8\x61\x00" 04066 * dst = "" 04067 * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence 04068 * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"] 04069 * p ec.putback #=> "a\x00" 04070 * p ec.putback #=> "" # no more bytes to put back 04071 * 04072 */ 04073 static VALUE 04074 econv_putback(int argc, VALUE *argv, VALUE self) 04075 { 04076 rb_econv_t *ec = check_econv(self); 04077 int n; 04078 int putbackable; 04079 VALUE str, max; 04080 04081 rb_scan_args(argc, argv, "01", &max); 04082 04083 if (NIL_P(max)) 04084 n = rb_econv_putbackable(ec); 04085 else { 04086 n = NUM2INT(max); 04087 putbackable = rb_econv_putbackable(ec); 04088 if (putbackable < n) 04089 n = putbackable; 04090 } 04091 04092 str = rb_str_new(NULL, n); 04093 rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n); 04094 04095 if (ec->source_encoding) { 04096 rb_enc_associate(str, ec->source_encoding); 04097 } 04098 04099 return str; 04100 } 04101 04102 /* 04103 * call-seq: 04104 * ec.last_error -> exception or nil 04105 * 04106 * Returns an exception object for the last conversion. 04107 * Returns nil if the last conversion did not produce an error. 04108 * 04109 * "error" means that 04110 * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for 04111 * Encoding::Converter#convert and 04112 * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for 04113 * Encoding::Converter#primitive_convert. 
04114 * 04115 * ec = Encoding::Converter.new("utf-8", "iso-8859-1") 04116 * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence 04117 * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8> 04118 * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full 04119 * p ec.last_error #=> nil 04120 * 04121 */ 04122 static VALUE 04123 econv_last_error(VALUE self) 04124 { 04125 rb_econv_t *ec = check_econv(self); 04126 VALUE exc; 04127 04128 exc = make_econv_exception(ec); 04129 if (NIL_P(exc)) 04130 return Qnil; 04131 return exc; 04132 } 04133 04134 /* 04135 * call-seq: 04136 * ec.replacement -> string 04137 * 04138 * Returns the replacement string. 04139 * 04140 * ec = Encoding::Converter.new("euc-jp", "us-ascii") 04141 * p ec.replacement #=> "?" 04142 * 04143 * ec = Encoding::Converter.new("euc-jp", "utf-8") 04144 * p ec.replacement #=> "\uFFFD" 04145 */ 04146 static VALUE 04147 econv_get_replacement(VALUE self) 04148 { 04149 rb_econv_t *ec = check_econv(self); 04150 int ret; 04151 rb_encoding *enc; 04152 04153 ret = make_replacement(ec); 04154 if (ret == -1) { 04155 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed"); 04156 } 04157 04158 enc = rb_enc_find(ec->replacement_enc); 04159 return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc); 04160 } 04161 04162 /* 04163 * call-seq: 04164 * ec.replacement = string 04165 * 04166 * Sets the replacement string. 
04167 * 04168 * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace) 04169 * ec.replacement = "<undef>" 04170 * p ec.convert("a \u3042 b") #=> "a <undef> b" 04171 */ 04172 static VALUE 04173 econv_set_replacement(VALUE self, VALUE arg) 04174 { 04175 rb_econv_t *ec = check_econv(self); 04176 VALUE string = arg; 04177 int ret; 04178 rb_encoding *enc; 04179 04180 StringValue(string); 04181 enc = rb_enc_get(string); 04182 04183 ret = rb_econv_set_replacement(ec, 04184 (const unsigned char *)RSTRING_PTR(string), 04185 RSTRING_LEN(string), 04186 rb_enc_name(enc)); 04187 04188 if (ret == -1) { 04189 /* xxx: rb_eInvalidByteSequenceError? */ 04190 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed"); 04191 } 04192 04193 return arg; 04194 } 04195 04196 VALUE 04197 rb_econv_make_exception(rb_econv_t *ec) 04198 { 04199 return make_econv_exception(ec); 04200 } 04201 04202 void 04203 rb_econv_check_error(rb_econv_t *ec) 04204 { 04205 VALUE exc; 04206 04207 exc = make_econv_exception(ec); 04208 if (NIL_P(exc)) 04209 return; 04210 rb_exc_raise(exc); 04211 } 04212 04213 /* 04214 * call-seq: 04215 * ecerr.source_encoding_name -> string 04216 * 04217 * Returns the source encoding name as a string. 04218 */ 04219 static VALUE 04220 ecerr_source_encoding_name(VALUE self) 04221 { 04222 return rb_attr_get(self, rb_intern("source_encoding_name")); 04223 } 04224 04225 /* 04226 * call-seq: 04227 * ecerr.source_encoding -> encoding 04228 * 04229 * Returns the source encoding as an encoding object. 04230 * 04231 * Note that the result may not be equal to the source encoding of 04232 * the encoding converter if the conversion has multiple steps. 04233 * 04234 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP 04235 * begin 04236 * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP. 
04237 * rescue Encoding::UndefinedConversionError 04238 * p $!.source_encoding #=> #<Encoding:UTF-8> 04239 * p $!.destination_encoding #=> #<Encoding:EUC-JP> 04240 * p $!.source_encoding_name #=> "UTF-8" 04241 * p $!.destination_encoding_name #=> "EUC-JP" 04242 * end 04243 * 04244 */ 04245 static VALUE 04246 ecerr_source_encoding(VALUE self) 04247 { 04248 return rb_attr_get(self, rb_intern("source_encoding")); 04249 } 04250 04251 /* 04252 * call-seq: 04253 * ecerr.destination_encoding_name -> string 04254 * 04255 * Returns the destination encoding name as a string. 04256 */ 04257 static VALUE 04258 ecerr_destination_encoding_name(VALUE self) 04259 { 04260 return rb_attr_get(self, rb_intern("destination_encoding_name")); 04261 } 04262 04263 /* 04264 * call-seq: 04265 * ecerr.destination_encoding -> string 04266 * 04267 * Returns the destination encoding as an encoding object. 04268 */ 04269 static VALUE 04270 ecerr_destination_encoding(VALUE self) 04271 { 04272 return rb_attr_get(self, rb_intern("destination_encoding")); 04273 } 04274 04275 /* 04276 * call-seq: 04277 * ecerr.error_char -> string 04278 * 04279 * Returns the one-character string which cause Encoding::UndefinedConversionError. 04280 * 04281 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") 04282 * begin 04283 * ec.convert("\xa0") 04284 * rescue Encoding::UndefinedConversionError 04285 * puts $!.error_char.dump #=> "\xC2\xA0" 04286 * p $!.error_char.encoding #=> #<Encoding:UTF-8> 04287 * end 04288 * 04289 */ 04290 static VALUE 04291 ecerr_error_char(VALUE self) 04292 { 04293 return rb_attr_get(self, rb_intern("error_char")); 04294 } 04295 04296 /* 04297 * call-seq: 04298 * ecerr.error_bytes -> string 04299 * 04300 * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs. 04301 * 04302 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") 04303 * begin 04304 * ec.convert("abc\xA1\xFFdef") 04305 * rescue Encoding::InvalidByteSequenceError 04306 * p $! 
#=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP> 04307 * puts $!.error_bytes.dump #=> "\xA1" 04308 * puts $!.readagain_bytes.dump #=> "\xFF" 04309 * end 04310 */ 04311 static VALUE 04312 ecerr_error_bytes(VALUE self) 04313 { 04314 return rb_attr_get(self, rb_intern("error_bytes")); 04315 } 04316 04317 /* 04318 * call-seq: 04319 * ecerr.readagain_bytes -> string 04320 * 04321 * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs. 04322 */ 04323 static VALUE 04324 ecerr_readagain_bytes(VALUE self) 04325 { 04326 return rb_attr_get(self, rb_intern("readagain_bytes")); 04327 } 04328 04329 /* 04330 * call-seq: 04331 * ecerr.incomplete_input? -> true or false 04332 * 04333 * Returns true if the invalid byte sequence error is caused by 04334 * premature end of string. 04335 * 04336 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") 04337 * 04338 * begin 04339 * ec.convert("abc\xA1z") 04340 * rescue Encoding::InvalidByteSequenceError 04341 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP> 04342 * p $!.incomplete_input? #=> false 04343 * end 04344 * 04345 * begin 04346 * ec.convert("abc\xA1") 04347 * ec.finish 04348 * rescue Encoding::InvalidByteSequenceError 04349 * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP> 04350 * p $!.incomplete_input? #=> true 04351 * end 04352 */ 04353 static VALUE 04354 ecerr_incomplete_input(VALUE self) 04355 { 04356 return rb_attr_get(self, rb_intern("incomplete_input")); 04357 } 04358 04359 /* 04360 * Document-class: Encoding::UndefinedConversionError 04361 * 04362 * Raised by Encoding and String methods when a transcoding operation 04363 * fails. 04364 */ 04365 04366 /* 04367 * Document-class: Encoding::InvalidByteSequenceError 04368 * 04369 * Raised by Encoding and String methods when the string being 04370 * transcoded contains a byte invalid for the either the source or 04371 * target encoding. 
04372 */ 04373 04374 /* 04375 * Document-class: Encoding::ConverterNotFoundError 04376 * 04377 * Raised by transcoding methods when a named encoding does not 04378 * correspond with a known converter. 04379 */ 04380 04381 void 04382 Init_transcode(void) 04383 { 04384 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError); 04385 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError); 04386 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError); 04387 04388 transcoder_table = st_init_strcasetable(); 04389 04390 sym_invalid = ID2SYM(rb_intern("invalid")); 04391 sym_undef = ID2SYM(rb_intern("undef")); 04392 sym_replace = ID2SYM(rb_intern("replace")); 04393 sym_fallback = ID2SYM(rb_intern("fallback")); 04394 sym_aref = ID2SYM(rb_intern("[]")); 04395 sym_xml = ID2SYM(rb_intern("xml")); 04396 sym_text = ID2SYM(rb_intern("text")); 04397 sym_attr = ID2SYM(rb_intern("attr")); 04398 04399 sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence")); 04400 sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion")); 04401 sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full")); 04402 sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty")); 04403 sym_finished = ID2SYM(rb_intern("finished")); 04404 sym_after_output = ID2SYM(rb_intern("after_output")); 04405 sym_incomplete_input = ID2SYM(rb_intern("incomplete_input")); 04406 sym_universal_newline = ID2SYM(rb_intern("universal_newline")); 04407 sym_crlf_newline = ID2SYM(rb_intern("crlf_newline")); 04408 sym_cr_newline = ID2SYM(rb_intern("cr_newline")); 04409 sym_partial_input = ID2SYM(rb_intern("partial_input")); 04410 04411 #ifdef ENABLE_ECONV_NEWLINE_OPTION 04412 sym_newline = ID2SYM(rb_intern("newline")); 04413 sym_universal = ID2SYM(rb_intern("universal")); 04414 sym_crlf = ID2SYM(rb_intern("crlf")); 04415 
sym_cr = ID2SYM(rb_intern("cr")); 04416 sym_lf = ID2SYM(rb_intern("lf")); 04417 #endif 04418 04419 rb_define_method(rb_cString, "encode", str_encode, -1); 04420 rb_define_method(rb_cString, "encode!", str_encode_bang, -1); 04421 04422 rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cData); 04423 rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate); 04424 rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1); 04425 rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1); 04426 rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1); 04427 rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0); 04428 rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0); 04429 rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0); 04430 rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0); 04431 rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1); 04432 rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1); 04433 rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0); 04434 rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0); 04435 rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1); 04436 rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1); 04437 rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0); 04438 rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0); 04439 rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1); 04440 rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1); 04441 04442 rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK)); 04443 
rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE)); 04444 rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK)); 04445 rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE)); 04446 rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF)); 04447 rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT)); 04448 rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT)); 04449 rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR)); 04450 rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR)); 04451 rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR)); 04452 rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR)); 04453 rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR)); 04454 rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR)); 04455 04456 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0); 04457 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0); 04458 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0); 04459 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0); 04460 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0); 04461 04462 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0); 04463 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0); 
04464 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0); 04465 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0); 04466 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0); 04467 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0); 04468 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0); 04469 04470 Init_newline(); 04471 } 04472