Ruby 1.9.3p327(2012-11-10revision37606)
|
00001 /* 00002 * NKF - Ruby extension for Network Kanji Filter 00003 * 00004 * original nkf2.x is maintained at http://sourceforge.jp/projects/nkf/ 00005 * 00006 * $Id: nkf.c 27947 2010-05-21 10:11:44Z nobu $ 00007 * 00008 */ 00009 00010 #define RUBY_NKF_REVISION "$Revision: 27947 $" 00011 #define RUBY_NKF_VERSION NKF_VERSION " (" NKF_RELEASE_DATE ")" 00012 00013 #include "ruby/ruby.h" 00014 #include "ruby/encoding.h" 00015 00016 /* Replace nkf's getchar/putchar for variable modification */ 00017 /* we never use getc, ungetc */ 00018 00019 #undef getc 00020 #undef ungetc 00021 #define getc(f) (input_ctr>=i_len?-1:input[input_ctr++]) 00022 #define ungetc(c,f) input_ctr-- 00023 00024 #define INCSIZE 32 00025 #undef putchar 00026 #undef TRUE 00027 #undef FALSE 00028 #define putchar(c) rb_nkf_putchar(c) 00029 00030 /* Input/Output pointers */ 00031 00032 static unsigned char *output; 00033 static unsigned char *input; 00034 static int input_ctr; 00035 static int i_len; 00036 static int output_ctr; 00037 static int o_len; 00038 static int incsize; 00039 00040 static VALUE result; 00041 00042 static int 00043 rb_nkf_putchar(unsigned int c) 00044 { 00045 if (output_ctr >= o_len) { 00046 o_len += incsize; 00047 rb_str_resize(result, o_len); 00048 incsize *= 2; 00049 output = (unsigned char *)RSTRING_PTR(result); 00050 } 00051 output[output_ctr++] = c; 00052 00053 return c; 00054 } 00055 00056 /* Include kanji filter main part */ 00057 /* getchar and putchar will be replaced during inclusion */ 00058 00059 #define PERL_XS 1 00060 #include "nkf-utf8/config.h" 00061 #include "nkf-utf8/utf8tbl.c" 00062 #include "nkf-utf8/nkf.c" 00063 00064 rb_encoding* rb_nkf_enc_get(const char *name) 00065 { 00066 int idx = rb_enc_find_index(name); 00067 if (idx < 0) { 00068 nkf_encoding *nkf_enc = nkf_enc_find(name); 00069 idx = rb_enc_find_index(nkf_enc_name(nkf_enc_to_base_encoding(nkf_enc))); 00070 if (idx < 0) { 00071 idx = rb_define_dummy_encoding(name); 00072 } 00073 } 00074 return 
rb_enc_from_index(idx); 00075 } 00076 00077 int nkf_split_options(const char *arg) 00078 { 00079 int count = 0; 00080 unsigned char option[256]; 00081 int i = 0, j = 0; 00082 int is_escaped = FALSE; 00083 int is_single_quoted = FALSE; 00084 int is_double_quoted = FALSE; 00085 for(i = 0; arg[i]; i++){ 00086 if(j == 255){ 00087 return -1; 00088 }else if(is_single_quoted){ 00089 if(arg[i] == '\''){ 00090 is_single_quoted = FALSE; 00091 }else{ 00092 option[j++] = arg[i]; 00093 } 00094 }else if(is_escaped){ 00095 is_escaped = FALSE; 00096 option[j++] = arg[i]; 00097 }else if(arg[i] == '\\'){ 00098 is_escaped = TRUE; 00099 }else if(is_double_quoted){ 00100 if(arg[i] == '"'){ 00101 is_double_quoted = FALSE; 00102 }else{ 00103 option[j++] = arg[i]; 00104 } 00105 }else if(arg[i] == '\''){ 00106 is_single_quoted = TRUE; 00107 }else if(arg[i] == '"'){ 00108 is_double_quoted = TRUE; 00109 }else if(arg[i] == ' '){ 00110 option[j] = '\0'; 00111 options(option); 00112 j = 0; 00113 }else{ 00114 option[j++] = arg[i]; 00115 } 00116 } 00117 if(j){ 00118 option[j] = '\0'; 00119 options(option); 00120 } 00121 return count; 00122 } 00123 00124 /* 00125 * call-seq: 00126 * NKF.nkf(opt, str) => string 00127 * 00128 * Convert _str_ and return converted result. 00129 * Conversion details are specified by _opt_ as String. 
00130 * 00131 * require 'nkf' 00132 * output = NKF.nkf("-s", input) 00133 */ 00134 00135 static VALUE 00136 rb_nkf_convert(VALUE obj, VALUE opt, VALUE src) 00137 { 00138 volatile VALUE tmp; 00139 reinit(); 00140 StringValue(opt); 00141 nkf_split_options(RSTRING_PTR(opt)); 00142 if (!output_encoding) rb_raise(rb_eArgError, "no output encoding given"); 00143 00144 switch (nkf_enc_to_index(output_encoding)) { 00145 case UTF_8_BOM: output_encoding = nkf_enc_from_index(UTF_8); break; 00146 case UTF_16BE_BOM: output_encoding = nkf_enc_from_index(UTF_16BE); break; 00147 case UTF_16LE_BOM: output_encoding = nkf_enc_from_index(UTF_16LE); break; 00148 case UTF_32BE_BOM: output_encoding = nkf_enc_from_index(UTF_32BE); break; 00149 case UTF_32LE_BOM: output_encoding = nkf_enc_from_index(UTF_32LE); break; 00150 } 00151 output_bom_f = FALSE; 00152 00153 incsize = INCSIZE; 00154 00155 input_ctr = 0; 00156 StringValue(src); 00157 input = (unsigned char *)RSTRING_PTR(src); 00158 i_len = RSTRING_LENINT(src); 00159 tmp = result = rb_str_new(0, i_len*3 + 10); 00160 00161 output_ctr = 0; 00162 output = (unsigned char *)RSTRING_PTR(result); 00163 o_len = RSTRING_LENINT(result); 00164 *output = '\0'; 00165 00166 kanji_convert(NULL); 00167 rb_str_set_len(result, output_ctr); 00168 OBJ_INFECT(result, src); 00169 00170 if (mimeout_f) 00171 rb_enc_associate(result, rb_usascii_encoding()); 00172 else 00173 rb_enc_associate(result, rb_nkf_enc_get(nkf_enc_name(output_encoding))); 00174 00175 return result; 00176 } 00177 00178 00179 /* 00180 * call-seq: 00181 * NKF.guess(str) => encoding 00182 * 00183 * Returns guessed encoding of _str_ by nkf routine. 
00184 * 00185 */ 00186 00187 static VALUE 00188 rb_nkf_guess(VALUE obj, VALUE src) 00189 { 00190 reinit(); 00191 00192 input_ctr = 0; 00193 StringValue(src); 00194 input = (unsigned char *)RSTRING_PTR(src); 00195 i_len = RSTRING_LENINT(src); 00196 00197 guess_f = TRUE; 00198 kanji_convert( NULL ); 00199 guess_f = FALSE; 00200 00201 return rb_enc_from_encoding(rb_nkf_enc_get(get_guessed_code())); 00202 } 00203 00204 00205 /* 00206 * NKF - Ruby extension for Network Kanji Filter 00207 * 00208 * == Description 00209 * 00210 * This is a Ruby Extension version of nkf (Network Kanji Filter). 00211 * It converts the first argument and returns converted result. Conversion 00212 * details are specified by flags as the first argument. 00213 * 00214 * *Nkf* is a yet another kanji code converter among networks, hosts and terminals. 00215 * It converts input kanji code to designated kanji code 00216 * such as ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8 or UTF-16. 00217 * 00218 * One of the most unique faculty of *nkf* is the guess of the input kanji encodings. 00219 * It currently recognizes ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8 and UTF-16. 00220 * So users needn't set the input kanji code explicitly. 00221 * 00222 * By default, X0201 kana is converted into X0208 kana. 00223 * For X0201 kana, SO/SI, SSO and ESC-(-I methods are supported. 00224 * For automatic code detection, nkf assumes no X0201 kana in Shift_JIS. 00225 * To accept X0201 in Shift_JIS, use <b>-X</b>, <b>-x</b> or <b>-S</b>. 00226 * 00227 * == Flags 00228 * 00229 * === -b -u 00230 * 00231 * Output is buffered (DEFAULT), Output is unbuffered. 00232 * 00233 * === -j -s -e -w -w16 -w32 00234 * 00235 * Output code is ISO-2022-JP (7bit JIS), Shift_JIS, EUC-JP, 00236 * UTF-8N, UTF-16BE, UTF-32BE. 00237 * Without this option and compile option, ISO-2022-JP is assumed. 00238 * 00239 * === -J -S -E -W -W16 -W32 00240 * 00241 * Input assumption is JIS 7 bit, Shift_JIS, EUC-JP, 00242 * UTF-8, UTF-16, UTF-32. 
00243 * 00244 * ==== -J 00245 * 00246 * Assume JIS input. It also accepts EUC-JP. 00247 * This is the default. This flag does not exclude Shift_JIS. 00248 * 00249 * ==== -S 00250 * 00251 * Assume Shift_JIS and X0201 kana input. It also accepts JIS. 00252 * EUC-JP is recognized as X0201 kana. Without <b>-x</b> flag, 00253 * X0201 kana (halfwidth kana) is converted into X0208. 00254 * 00255 * ==== -E 00256 * 00257 * Assume EUC-JP input. It also accepts JIS. 00258 * Same as -J. 00259 * 00260 * === -t 00261 * 00262 * No conversion. 00263 * 00264 * === -i_ 00265 * 00266 * Output sequence to designate JIS-kanji. (DEFAULT B) 00267 * 00268 * === -o_ 00269 * 00270 * Output sequence to designate ASCII. (DEFAULT B) 00271 * 00272 * === -r 00273 * 00274 * {de/en}crypt ROT13/47 00275 * 00276 * === -h[123] --hiragana --katakana --katakana-hiragana 00277 * 00278 * [-h1 --hiragana] Katakana to Hiragana conversion. 00279 * 00280 * [-h2 --katakana] Hiragana to Katakana conversion. 00281 * 00282 * [-h3 --katakana-hiragana] Katakana to Hiragana and Hiragana to Katakana conversion. 00283 * 00284 * === -T 00285 * 00286 * Text mode output (MS-DOS) 00287 * 00288 * === -l 00289 * 00290 * ISO8859-1 (Latin-1) support 00291 * 00292 * === -f[<code>m</code> [- <code>n</code>]] 00293 * 00294 * Folding on <code>m</code> length with <code>n</code> margin in a line. 00295 * Without this option, fold length is 60 and fold margin is 10. 00296 * 00297 * === -F 00298 * 00299 * New line preserving line folding. 00300 * 00301 * === -Z[0-3] 00302 * 00303 * Convert X0208 alphabet (Fullwidth Alphabets) to ASCII. 00304 * 00305 * [-Z -Z0] Convert X0208 alphabet to ASCII. 00306 * 00307 * [-Z1] Converts X0208 kankaku to single ASCII space. 00308 * 00309 * [-Z2] Converts X0208 kankaku to double ASCII spaces. 00310 * 00311 * [-Z3] Replaces fullwidth >, <, ", & with '&gt;', '&lt;', '&quot;', '&amp;' as in HTML. 00312 * 00313 * === -X -x 00314 * 00315 * Assume X0201 kana in MS-Kanji. 
00316 * With <b>-X</b> or without this option, X0201 is converted into X0208 Kana. 00317 * With <b>-x</b>, try to preserve X0208 kana and do not convert X0201 kana to X0208. 00318 * In JIS output, ESC-(-I is used. In EUC output, SSO is used. 00319 * 00320 * === -B[0-2] 00321 * 00322 * Assume broken JIS-Kanji input, which lost ESC. 00323 * Useful when your site is using old B-News Nihongo patch. 00324 * 00325 * [-B1] allows any char after ESC-( or ESC-$. 00326 * 00327 * [-B2] forces ASCII after NL. 00328 * 00329 * === -I 00330 * 00331 * Replacing non iso-2022-jp char into a geta character 00332 * (substitute character in Japanese). 00333 * 00334 * === -d -c 00335 * 00336 * Delete \r in line feed, Add \r in line feed. 00337 * 00338 * === -m[BQN0] 00339 * 00340 * MIME ISO-2022-JP/ISO8859-1 decode. (DEFAULT) 00341 * To see ISO8859-1 (Latin-1) -l is necessary. 00342 * 00343 * [-mB] Decode MIME base64 encoded stream. Remove header or other part before 00344 * conversion. 00345 * 00346 * [-mQ] Decode MIME quoted stream. '_' in quoted stream is converted to space. 00347 * 00348 * [-mN] Non-strict decoding. 00349 * It allows line break in the middle of the base64 encoding. 00350 * 00351 * [-m0] No MIME decode. 00352 * 00353 * === -M 00354 * 00355 * MIME encode. Header style. All ASCII code and control characters are intact. 00356 * Kanji conversion is performed before encoding, so this cannot be used as a picture encoder. 00357 * 00358 * [-MB] MIME encode Base64 stream. 00359 * 00360 * [-MQ] Perform quoted encoding. 00361 * 00362 * === -l 00363 * 00364 * Input and output code is ISO8859-1 (Latin-1) and ISO-2022-JP. 00365 * <b>-s</b>, <b>-e</b> and <b>-x</b> are not compatible with this option. 00366 * 00367 * === -L[uwm] 00368 * 00369 * new line mode 00370 * Without this option, nkf doesn't convert line breaks. 
00371 * 00372 * [-Lu] unix (LF) 00373 * 00374 * [-Lw] windows (CRLF) 00375 * 00376 * [-Lm] mac (CR) 00377 * 00378 * === --fj --unix --mac --msdos --windows 00379 * 00380 * Convert for these systems. 00381 * 00382 * === --jis --euc --sjis --mime --base64 00383 * 00384 * Convert for the named code. 00385 * 00386 * === --jis-input --euc-input --sjis-input --mime-input --base64-input 00387 * 00388 * Assume the named input code. 00389 * 00390 * === --ic=<code>input codeset</code> --oc=<code>output codeset</code> 00391 * 00392 * Set the input or output codeset. 00393 * NKF supports the following codesets, and codeset names are case insensitive. 00394 * 00395 * [ISO-2022-JP] a.k.a. RFC1468, 7bit JIS, JUNET 00396 * 00397 * [EUC-JP (eucJP-nkf)] a.k.a. AT&T JIS, Japanese EUC, UJIS 00398 * 00399 * [eucJP-ascii] a.k.a. x-eucjp-open-19970715-ascii 00400 * 00401 * [eucJP-ms] a.k.a. x-eucjp-open-19970715-ms 00402 * 00403 * [CP51932] Microsoft Version of EUC-JP. 00404 * 00405 * [Shift_JIS] SJIS, MS-Kanji 00406 * 00407 * [Windows-31J] a.k.a. CP932 00408 * 00409 * [UTF-8] same as UTF-8N 00410 * 00411 * [UTF-8N] UTF-8 without BOM 00412 * 00413 * [UTF-8-BOM] UTF-8 with BOM 00414 * 00415 * [UTF-16] same as UTF-16BE 00416 * 00417 * [UTF-16BE] UTF-16 Big Endian without BOM 00418 * 00419 * [UTF-16BE-BOM] UTF-16 Big Endian with BOM 00420 * 00421 * [UTF-16LE] UTF-16 Little Endian without BOM 00422 * 00423 * [UTF-16LE-BOM] UTF-16 Little Endian with BOM 00424 * 00425 * [UTF-32] same as UTF-32BE 00426 * 00427 * [UTF-32BE] UTF-32 Big Endian without BOM 00428 * 00429 * [UTF-32BE-BOM] UTF-32 Big Endian with BOM 00430 * 00431 * [UTF-32LE] UTF-32 Little Endian without BOM 00432 * 00433 * [UTF-32LE-BOM] UTF-32 Little Endian with BOM 00434 * 00435 * [UTF8-MAC] NFD-normalized UTF-8, a.k.a. UTF8-NFD (input only) 00436 * 00437 * === --fb-{skip, html, xml, perl, java, subchar} 00438 * 00439 * Specify the way that nkf handles unassigned characters. 00440 * Without this option, --fb-skip is assumed. 
00441 * 00442 * === --prefix= <code>escape character</code> <code>target character</code> .. 00443 * 00444 * When nkf converts to Shift_JIS, 00445 * nkf adds a specified escape character to specified 2nd byte of Shift_JIS characters. 00446 * 1st byte of argument is the escape character and following bytes are target characters. 00447 * 00448 * === --no-cp932ext 00449 * 00450 * Handle the characters extended in CP932 as unassigned characters. 00451 * 00452 * === --no-best-fit-chars 00453 * 00454 * When Unicode to Encoded byte conversion, 00455 * don't convert characters which are not round trip safe. 00456 * When Unicode to Unicode conversion, 00457 * with this and -x option, nkf can be used as UTF converter. 00458 * (In other words, without this and -x option, nkf doesn't save some characters) 00459 * 00460 * When nkf converts strings related to paths, you should use this option. 00461 * 00462 * === --cap-input 00463 * 00464 * Decode hex encoded characters. 00465 * 00466 * === --url-input 00467 * 00468 * Unescape percent escaped characters. 00469 * 00470 * === -- 00471 * 00472 * Ignore rest of -option. 
00473 */ 00474 00475 void 00476 Init_nkf() 00477 { 00478 VALUE mNKF = rb_define_module("NKF"); 00479 00480 rb_define_module_function(mNKF, "nkf", rb_nkf_convert, 2); 00481 rb_define_module_function(mNKF, "guess", rb_nkf_guess, 1); 00482 rb_define_alias(rb_singleton_class(mNKF), "guess", "guess"); 00483 00484 rb_define_const(mNKF, "AUTO", Qnil); 00485 rb_define_const(mNKF, "NOCONV", Qnil); 00486 rb_define_const(mNKF, "UNKNOWN", Qnil); 00487 rb_define_const(mNKF, "BINARY", rb_enc_from_encoding(rb_nkf_enc_get("BINARY"))); 00488 rb_define_const(mNKF, "ASCII", rb_enc_from_encoding(rb_nkf_enc_get("US-ASCII"))); 00489 rb_define_const(mNKF, "JIS", rb_enc_from_encoding(rb_nkf_enc_get("ISO-2022-JP"))); 00490 rb_define_const(mNKF, "EUC", rb_enc_from_encoding(rb_nkf_enc_get("EUC-JP"))); 00491 rb_define_const(mNKF, "SJIS", rb_enc_from_encoding(rb_nkf_enc_get("Shift_JIS"))); 00492 rb_define_const(mNKF, "UTF8", rb_enc_from_encoding(rb_utf8_encoding())); 00493 rb_define_const(mNKF, "UTF16", rb_enc_from_encoding(rb_nkf_enc_get("UTF-16BE"))); 00494 rb_define_const(mNKF, "UTF32", rb_enc_from_encoding(rb_nkf_enc_get("UTF-32BE"))); 00495 00496 /* Full version string of nkf */ 00497 rb_define_const(mNKF, "VERSION", rb_str_new2(RUBY_NKF_VERSION)); 00498 /* Version of nkf */ 00499 rb_define_const(mNKF, "NKF_VERSION", rb_str_new2(NKF_VERSION)); 00500 /* Release date of nkf */ 00501 rb_define_const(mNKF, "NKF_RELEASE_DATE", rb_str_new2(NKF_RELEASE_DATE)); 00502 } 00503