Ruby 1.9.3p327(2012-11-10revision37606)
|
00001 /* 00002 * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA). 00003 * Copyright (c) 1996-2010, The nkf Project. 00004 * 00005 * This software is provided 'as-is', without any express or implied 00006 * warranty. In no event will the authors be held liable for any damages 00007 * arising from the use of this software. 00008 * 00009 * Permission is granted to anyone to use this software for any purpose, 00010 * including commercial applications, and to alter it and redistribute it 00011 * freely, subject to the following restrictions: 00012 * 00013 * 1. The origin of this software must not be misrepresented; you must not 00014 * claim that you wrote the original software. If you use this software 00015 * in a product, an acknowledgment in the product documentation would be 00016 * appreciated but is not required. 00017 * 00018 * 2. Altered source versions must be plainly marked as such, and must not be 00019 * misrepresented as being the original software. 00020 * 00021 * 3. This notice may not be removed or altered from any source distribution. 00022 */ 00023 #define NKF_VERSION "2.1.2" 00024 #define NKF_RELEASE_DATE "2011-09-08" 00025 #define COPY_RIGHT \ 00026 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \ 00027 "Copyright (C) 1996-2011, The nkf Project." 00028 00029 #include "config.h" 00030 #include "nkf.h" 00031 #include "utf8tbl.h" 00032 #ifdef __WIN32__ 00033 #include <windows.h> 00034 #include <locale.h> 00035 #endif 00036 #if defined(__OS2__) 00037 # define INCL_DOS 00038 # define INCL_DOSERRORS 00039 # include <os2.h> 00040 #endif 00041 #include <assert.h> 00042 00043 00044 /* state of output_mode and input_mode 00045 00046 c2 0 means ASCII 00047 JIS_X_0201_1976_K 00048 ISO_8859_1 00049 JIS_X_0208 00050 EOF all termination 00051 c1 32bit data 00052 00053 */ 00054 00055 /* MIME ENCODE */ 00056 00057 #define FIXED_MIME 7 00058 #define STRICT_MIME 8 00059 00060 /* byte order */ 00061 enum byte_order { 00062 ENDIAN_BIG = 1, 00063 ENDIAN_LITTLE = 2, 00064 ENDIAN_2143 = 3, 00065 ENDIAN_3412 = 4 00066 }; 00067 00068 /* ASCII CODE */ 00069 00070 #define BS 0x08 00071 #define TAB 0x09 00072 #define LF 0x0a 00073 #define CR 0x0d 00074 #define ESC 0x1b 00075 #define SP 0x20 00076 #define DEL 0x7f 00077 #define SI 0x0f 00078 #define SO 0x0e 00079 #define SS2 0x8e 00080 #define SS3 0x8f 00081 #define CRLF 0x0D0A 00082 00083 00084 /* encodings */ 00085 00086 enum nkf_encodings { 00087 ASCII, 00088 ISO_8859_1, 00089 ISO_2022_JP, 00090 CP50220, 00091 CP50221, 00092 CP50222, 00093 ISO_2022_JP_1, 00094 ISO_2022_JP_3, 00095 ISO_2022_JP_2004, 00096 SHIFT_JIS, 00097 WINDOWS_31J, 00098 CP10001, 00099 EUC_JP, 00100 EUCJP_NKF, 00101 CP51932, 00102 EUCJP_MS, 00103 EUCJP_ASCII, 00104 SHIFT_JISX0213, 00105 SHIFT_JIS_2004, 00106 EUC_JISX0213, 00107 EUC_JIS_2004, 00108 UTF_8, 00109 UTF_8N, 00110 UTF_8_BOM, 00111 UTF8_MAC, 00112 UTF_16, 00113 UTF_16BE, 00114 UTF_16BE_BOM, 00115 UTF_16LE, 00116 UTF_16LE_BOM, 00117 UTF_32, 00118 UTF_32BE, 00119 UTF_32BE_BOM, 00120 UTF_32LE, 00121 UTF_32LE_BOM, 00122 BINARY, 00123 NKF_ENCODING_TABLE_SIZE, 00124 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */ 00125 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */ 00126 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */ 00127 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */ 00128 JIS_X_0208 = 0x1168, /* @B */ 00129 JIS_X_0212 = 0x1159, /* D */ 00130 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */ 00131 JIS_X_0213_2 = 0x1229, /* P */ 00132 JIS_X_0213_1 = 0x1233 /* Q */ 00133 }; 00134 00135 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0); 00136 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0); 00137 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0); 00138 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0); 00139 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0); 00140 static void j_oconv(nkf_char c2, nkf_char c1); 00141 static void s_oconv(nkf_char c2, nkf_char c1); 00142 static void e_oconv(nkf_char c2, nkf_char c1); 00143 static void w_oconv(nkf_char c2, nkf_char c1); 00144 static void w_oconv16(nkf_char c2, nkf_char c1); 00145 static void w_oconv32(nkf_char c2, nkf_char c1); 00146 00147 typedef struct { 00148 const char *name; 00149 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0); 00150 void (*oconv)(nkf_char c2, nkf_char c1); 00151 } nkf_native_encoding; 00152 00153 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv }; 00154 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv }; 00155 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv }; 00156 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv }; 00157 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv }; 00158 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 }; 00159 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 }; 00160 00161 typedef struct { 00162 const int id; 00163 const char *name; 00164 const nkf_native_encoding *base_encoding; 00165 } nkf_encoding; 00166 00167 nkf_encoding nkf_encoding_table[] = { 00168 {ASCII, "US-ASCII", &NkfEncodingASCII}, 00169 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII}, 00170 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP}, 00171 {CP50220, "CP50220", &NkfEncodingISO_2022_JP}, 00172 {CP50221, "CP50221", &NkfEncodingISO_2022_JP}, 00173 {CP50222, "CP50222", &NkfEncodingISO_2022_JP}, 00174 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP}, 00175 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP}, 00176 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP}, 00177 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS}, 00178 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS}, 00179 {CP10001, "CP10001", &NkfEncodingShift_JIS}, 00180 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP}, 00181 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP}, 00182 {CP51932, "CP51932", &NkfEncodingEUC_JP}, 00183 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP}, 00184 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP}, 00185 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS}, 00186 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS}, 00187 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP}, 00188 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP}, 00189 {UTF_8, "UTF-8", &NkfEncodingUTF_8}, 00190 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8}, 00191 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8}, 00192 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8}, 00193 {UTF_16, "UTF-16", &NkfEncodingUTF_16}, 00194 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16}, 00195 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16}, 00196 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16}, 00197 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16}, 00198 {UTF_32, "UTF-32", &NkfEncodingUTF_32}, 00199 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32}, 00200 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32}, 00201 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32}, 00202 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32}, 00203 {BINARY, "BINARY", &NkfEncodingASCII}, 00204 {-1, NULL, NULL} 00205 }; 00206 00207 struct { 00208 const char *name; 00209 const int id; 00210 } encoding_name_to_id_table[] = { 00211 {"US-ASCII", ASCII}, 00212 {"ASCII", ASCII}, 00213 {"646", ASCII}, 00214 {"ROMAN8", ASCII}, 00215 {"ISO-2022-JP", ISO_2022_JP}, 00216 {"ISO2022JP-CP932", CP50220}, 00217 {"CP50220", CP50220}, 00218 {"CP50221", CP50221}, 00219 {"CSISO2022JP", CP50221}, 00220 {"CP50222", CP50222}, 00221 {"ISO-2022-JP-1", ISO_2022_JP_1}, 00222 {"ISO-2022-JP-3", ISO_2022_JP_3}, 00223 {"ISO-2022-JP-2004", ISO_2022_JP_2004}, 00224 {"SHIFT_JIS", SHIFT_JIS}, 00225 {"SJIS", SHIFT_JIS}, 00226 {"MS_Kanji", SHIFT_JIS}, 00227 {"PCK", SHIFT_JIS}, 00228 {"WINDOWS-31J", WINDOWS_31J}, 00229 {"CSWINDOWS31J", WINDOWS_31J}, 00230 {"CP932", WINDOWS_31J}, 00231 {"MS932", WINDOWS_31J}, 00232 {"CP10001", CP10001}, 00233 {"EUCJP", EUC_JP}, 00234 {"EUC-JP", EUC_JP}, 00235 {"EUCJP-NKF", EUCJP_NKF}, 00236 {"CP51932", CP51932}, 00237 {"EUC-JP-MS", EUCJP_MS}, 00238 {"EUCJP-MS", EUCJP_MS}, 00239 {"EUCJPMS", EUCJP_MS}, 00240 {"EUC-JP-ASCII", EUCJP_ASCII}, 00241 {"EUCJP-ASCII", EUCJP_ASCII}, 00242 {"SHIFT_JISX0213", SHIFT_JISX0213}, 00243 {"SHIFT_JIS-2004", SHIFT_JIS_2004}, 00244 {"EUC-JISX0213", EUC_JISX0213}, 00245 {"EUC-JIS-2004", EUC_JIS_2004}, 00246 {"UTF-8", UTF_8}, 00247 {"UTF-8N", UTF_8N}, 00248 {"UTF-8-BOM", UTF_8_BOM}, 00249 {"UTF8-MAC", UTF8_MAC}, 00250 {"UTF-8-MAC", UTF8_MAC}, 00251 {"UTF-16", UTF_16}, 00252 {"UTF-16BE", UTF_16BE}, 00253 {"UTF-16BE-BOM", UTF_16BE_BOM}, 00254 {"UTF-16LE", UTF_16LE}, 00255 {"UTF-16LE-BOM", UTF_16LE_BOM}, 00256 {"UTF-32", UTF_32}, 00257 {"UTF-32BE", UTF_32BE}, 00258 {"UTF-32BE-BOM", UTF_32BE_BOM}, 00259 {"UTF-32LE", UTF_32LE}, 00260 {"UTF-32LE-BOM", UTF_32LE_BOM}, 00261 {"BINARY", BINARY}, 00262 {NULL, -1} 00263 }; 00264 00265 #if defined(DEFAULT_CODE_JIS) 00266 #define DEFAULT_ENCIDX ISO_2022_JP 00267 #elif defined(DEFAULT_CODE_SJIS) 00268 #define DEFAULT_ENCIDX SHIFT_JIS 00269 #elif defined(DEFAULT_CODE_WINDOWS_31J) 00270 #define DEFAULT_ENCIDX WINDOWS_31J 00271 #elif defined(DEFAULT_CODE_EUC) 00272 #define DEFAULT_ENCIDX EUC_JP 00273 #elif defined(DEFAULT_CODE_UTF8) 00274 #define DEFAULT_ENCIDX UTF_8 00275 #endif 00276 00277 00278 #define is_alnum(c) \ 00279 (('a'<=c && c<='z')||('A'<= c && c<='Z')||('0'<=c && c<='9')) 00280 00281 /* I don't trust portablity of toupper */ 00282 #define nkf_toupper(c) (('a'<=c && c<='z')?(c-('a'-'A')):c) 00283 #define nkf_isoctal(c) ('0'<=c && c<='7') 00284 #define nkf_isdigit(c) ('0'<=c && c<='9') 00285 #define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=c && c<='f') || ('A'<=c && c <= 'F')) 00286 #define nkf_isblank(c) (c == SP || c == TAB) 00287 #define nkf_isspace(c) (nkf_isblank(c) || c == CR || c == LF) 00288 #define nkf_isalpha(c) (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) 00289 #define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c)) 00290 #define nkf_isprint(c) (SP<=c && c<='~') 00291 #define nkf_isgraph(c) ('!'<=c && c<='~') 00292 #define hex2bin(c) (('0'<=c&&c<='9') ? (c-'0') : \ 00293 ('A'<=c&&c<='F') ? (c-'A'+10) : \ 00294 ('a'<=c&&c<='f') ? (c-'a'+10) : 0) 00295 #define bin2hex(c) ("0123456789ABCDEF"[c&15]) 00296 #define is_eucg3(c2) (((unsigned short)c2 >> 8) == SS3) 00297 #define nkf_noescape_mime(c) ((c == CR) || (c == LF) || \ 00298 ((c > SP) && (c < DEL) && (c != '?') && (c != '=') && (c != '_') \ 00299 && (c != '(') && (c != ')') && (c != '.') && (c != 0x22))) 00300 00301 #define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END) 00302 #define nkf_byte_jisx0201_katakana_p(c) (SP <= c && c <= 0x5F) 00303 00304 #define HOLD_SIZE 1024 00305 #if defined(INT_IS_SHORT) 00306 #define IOBUF_SIZE 2048 00307 #else 00308 #define IOBUF_SIZE 16384 00309 #endif 00310 00311 #define DEFAULT_J 'B' 00312 #define DEFAULT_R 'B' 00313 00314 00315 #define GETA1 0x22 00316 #define GETA2 0x2e 00317 00318 00319 /* MIME preprocessor */ 00320 00321 #ifdef EASYWIN /*Easy Win */ 00322 extern POINT _BufferSize; 00323 #endif 00324 00325 struct input_code{ 00326 const char *name; 00327 nkf_char stat; 00328 nkf_char score; 00329 nkf_char index; 00330 nkf_char buf[3]; 00331 void (*status_func)(struct input_code *, nkf_char); 00332 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0); 00333 int _file_stat; 00334 }; 00335 00336 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */ 00337 static nkf_encoding *input_encoding = NULL; 00338 static nkf_encoding *output_encoding = NULL; 00339 00340 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE) 00341 /* UCS Mapping 00342 * 0: Shift_JIS, eucJP-ascii 00343 * 1: eucJP-ms 00344 * 2: CP932, CP51932 00345 * 3: CP10001 00346 */ 00347 #define UCS_MAP_ASCII 0 00348 #define UCS_MAP_MS 1 00349 #define UCS_MAP_CP932 2 00350 #define UCS_MAP_CP10001 3 00351 static int ms_ucs_map_f = UCS_MAP_ASCII; 00352 #endif 00353 #ifdef UTF8_INPUT_ENABLE 00354 /* no NEC special, NEC-selected IBM extended and IBM extended characters */ 00355 static int no_cp932ext_f = FALSE; 00356 /* ignore ZERO WIDTH NO-BREAK SPACE */ 00357 static int no_best_fit_chars_f = FALSE; 00358 static int input_endian = ENDIAN_BIG; 00359 static nkf_char unicode_subchar = '?'; /* the regular substitution character */ 00360 static void (*encode_fallback)(nkf_char c) = NULL; 00361 static void w_status(struct input_code *, nkf_char); 00362 #endif 00363 #ifdef UTF8_OUTPUT_ENABLE 00364 static int output_bom_f = FALSE; 00365 static int output_endian = ENDIAN_BIG; 00366 #endif 00367 00368 static void std_putc(nkf_char c); 00369 static nkf_char std_getc(FILE *f); 00370 static nkf_char std_ungetc(nkf_char c,FILE *f); 00371 00372 static nkf_char broken_getc(FILE *f); 00373 static nkf_char broken_ungetc(nkf_char c,FILE *f); 00374 00375 static nkf_char mime_getc(FILE *f); 00376 00377 static void mime_putc(nkf_char c); 00378 00379 /* buffers */ 00380 00381 #if !defined(PERL_XS) && !defined(WIN32DLL) 00382 static unsigned char stdibuf[IOBUF_SIZE]; 00383 static unsigned char stdobuf[IOBUF_SIZE]; 00384 #endif 00385 00386 #define NKF_UNSPECIFIED (-TRUE) 00387 00388 /* flags */ 00389 static int unbuf_f = FALSE; 00390 static int estab_f = FALSE; 00391 static int nop_f = FALSE; 00392 static int binmode_f = TRUE; /* binary mode */ 00393 static int rot_f = FALSE; /* rot14/43 mode */ 00394 static int hira_f = FALSE; /* hira/kata henkan */ 00395 static int alpha_f = FALSE; /* convert JIx0208 alphbet to ASCII */ 00396 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */ 00397 static int mime_decode_f = FALSE; /* mime decode is explicitly on */ 00398 static int mimebuf_f = FALSE; /* MIME buffered input */ 00399 static int broken_f = FALSE; /* convert ESC-less broken JIS */ 00400 static int iso8859_f = FALSE; /* ISO8859 through */ 00401 static int mimeout_f = FALSE; /* base64 mode */ 00402 static int x0201_f = NKF_UNSPECIFIED; /* convert JIS X 0201 */ 00403 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */ 00404 00405 #ifdef UNICODE_NORMALIZATION 00406 static int nfc_f = FALSE; 00407 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */ 00408 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc; 00409 #endif 00410 00411 #ifdef INPUT_OPTION 00412 static int cap_f = FALSE; 00413 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */ 00414 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc; 00415 00416 static int url_f = FALSE; 00417 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */ 00418 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc; 00419 #endif 00420 00421 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00) 00422 #define CLASS_MASK NKF_INT32_C(0xFF000000) 00423 #define CLASS_UNICODE NKF_INT32_C(0x01000000) 00424 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF) 00425 #define UNICODE_BMP_MAX NKF_INT32_C(0x0000FFFF) 00426 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF) 00427 #define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3) 00428 #define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE) 00429 #define nkf_char_unicode_p(c) ((c & CLASS_MASK) == CLASS_UNICODE) 00430 #define nkf_char_unicode_bmp_p(c) ((c & VALUE_MASK) <= UNICODE_BMP_MAX) 00431 #define nkf_char_unicode_value_p(c) ((c & VALUE_MASK) <= UNICODE_MAX) 00432 00433 #ifdef NUMCHAR_OPTION 00434 static int numchar_f = FALSE; 00435 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */ 00436 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc; 00437 #endif 00438 00439 #ifdef CHECK_OPTION 00440 static int noout_f = FALSE; 00441 static void no_putc(nkf_char c); 00442 static int debug_f = FALSE; 00443 static void debug(const char *str); 00444 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0; 00445 #endif 00446 00447 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */ 00448 static void set_input_codename(const char *codename); 00449 00450 #ifdef EXEC_IO 00451 static int exec_f = 0; 00452 #endif 00453 00454 #ifdef SHIFTJIS_CP932 00455 /* invert IBM extended characters to others */ 00456 static int cp51932_f = FALSE; 00457 00458 /* invert NEC-selected IBM extended characters to IBM extended characters */ 00459 static int cp932inv_f = TRUE; 00460 00461 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */ 00462 #endif /* SHIFTJIS_CP932 */ 00463 00464 static int x0212_f = FALSE; 00465 static int x0213_f = FALSE; 00466 00467 static unsigned char prefix_table[256]; 00468 00469 static void e_status(struct input_code *, nkf_char); 00470 static void s_status(struct input_code *, nkf_char); 00471 00472 struct input_code input_code_list[] = { 00473 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0}, 00474 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0}, 00475 #ifdef UTF8_INPUT_ENABLE 00476 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0}, 00477 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0}, 00478 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0}, 00479 #endif 00480 {NULL, 0, 0, 0, {0, 0, 0}, NULL, NULL, 0} 00481 }; 00482 00483 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */ 00484 static int base64_count = 0; 00485 00486 /* X0208 -> ASCII converter */ 00487 00488 /* fold parameter */ 00489 static int f_line = 0; /* chars in line */ 00490 static int f_prev = 0; 00491 static int fold_preserve_f = FALSE; /* preserve new lines */ 00492 static int fold_f = FALSE; 00493 static int fold_len = 0; 00494 00495 /* options */ 00496 static unsigned char kanji_intro = DEFAULT_J; 00497 static unsigned char ascii_intro = DEFAULT_R; 00498 00499 /* Folding */ 00500 00501 #define FOLD_MARGIN 10 00502 #define DEFAULT_FOLD 60 00503 00504 static int fold_margin = FOLD_MARGIN; 00505 00506 /* process default */ 00507 00508 static nkf_char 00509 no_connection2(nkf_char c2, nkf_char c1, nkf_char c0) 00510 { 00511 fprintf(stderr,"nkf internal module connection failure.\n"); 00512 exit(EXIT_FAILURE); 00513 return 0; /* LINT */ 00514 } 00515 00516 static void 00517 no_connection(nkf_char c2, nkf_char c1) 00518 { 00519 no_connection2(c2,c1,0); 00520 } 00521 00522 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2; 00523 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection; 00524 00525 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection; 00526 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection; 00527 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection; 00528 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection; 00529 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection; 00530 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection; 00531 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection; 00532 00533 /* static redirections */ 00534 00535 static void (*o_putc)(nkf_char c) = std_putc; 00536 00537 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */ 00538 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc; 00539 00540 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */ 00541 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc; 00542 00543 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */ 00544 00545 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */ 00546 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc; 00547 00548 /* for strict mime */ 00549 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */ 00550 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc; 00551 00552 /* Global states */ 00553 static int output_mode = ASCII; /* output kanji mode */ 00554 static int input_mode = ASCII; /* input kanji mode */ 00555 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */ 00556 00557 /* X0201 / X0208 conversion tables */ 00558 00559 /* X0201 kana conversion table */ 00560 /* 90-9F A0-DF */ 00561 static const unsigned char cv[]= { 00562 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57, 00563 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21, 00564 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29, 00565 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43, 00566 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26, 00567 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d, 00568 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35, 00569 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d, 00570 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46, 00571 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c, 00572 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52, 00573 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e, 00574 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62, 00575 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69, 00576 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d, 00577 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c, 00578 0x00,0x00}; 00579 00580 00581 /* X0201 kana conversion table for daguten */ 00582 /* 90-9F A0-DF */ 00583 static const unsigned char dv[]= { 00584 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00585 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00586 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00587 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00588 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74, 00589 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e, 00590 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36, 00591 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e, 00592 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47, 00593 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00, 00594 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53, 00595 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00, 00596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00600 0x00,0x00}; 00601 00602 /* X0201 kana conversion table for han-daguten */ 00603 /* 90-9F A0-DF */ 00604 static const unsigned char ev[]= { 00605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00615 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54, 00616 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00, 00617 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00618 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00619 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00621 0x00,0x00}; 00622 00623 00624 /* X0208 kigou conversion table */ 00625 /* 0x8140 - 0x819e */ 00626 static const unsigned char fv[] = { 00627 00628 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a, 00629 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00, 00630 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00, 00631 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f, 00632 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27, 00633 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d, 00634 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00, 00635 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00, 00636 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00, 00637 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00638 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40, 00639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 00640 } ; 00641 00642 00643 00644 static int option_mode = 0; 00645 static int file_out_f = FALSE; 00646 #ifdef OVERWRITE 00647 static int overwrite_f = FALSE; 00648 static int preserve_time_f = FALSE; 00649 static int backup_f = FALSE; 00650 static char *backup_suffix = ""; 00651 #endif 00652 00653 static int eolmode_f = 0; /* CR, LF, CRLF */ 00654 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */ 00655 static nkf_char prev_cr = 0; /* CR or 0 */ 00656 #ifdef EASYWIN /*Easy Win */ 00657 static int end_check; 00658 #endif /*Easy Win */ 00659 00660 static void * 00661 nkf_xmalloc(size_t size) 00662 { 00663 void *ptr; 00664 00665 if (size == 0) size = 1; 00666 00667 ptr = malloc(size); 00668 if (ptr == NULL) { 00669 perror("can't malloc"); 00670 exit(EXIT_FAILURE); 00671 } 00672 00673 return ptr; 00674 } 00675 00676 static void * 00677 nkf_xrealloc(void *ptr, size_t size) 00678 { 00679 if (size == 0) size = 1; 00680 00681 ptr = realloc(ptr, size); 00682 if (ptr == NULL) { 00683 perror("can't realloc"); 00684 exit(EXIT_FAILURE); 00685 } 00686 00687 return ptr; 00688 } 00689 00690 #define nkf_xfree(ptr) free(ptr) 00691 00692 static int 00693 nkf_str_caseeql(const char *src, const char *target) 00694 { 00695 int i; 00696 for (i = 0; src[i] && target[i]; i++) { 00697 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE; 00698 } 00699 if (src[i] || target[i]) return FALSE; 00700 else return TRUE; 00701 } 00702 00703 static nkf_encoding* 00704 nkf_enc_from_index(int idx) 00705 { 00706 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) { 00707 return 0; 00708 } 00709 return &nkf_encoding_table[idx]; 00710 } 00711 00712 static int 00713 nkf_enc_find_index(const char *name) 00714 { 00715 int i; 00716 if (name[0] == 'X' && *(name+1) == '-') name += 2; 00717 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) { 00718 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) { 00719 return encoding_name_to_id_table[i].id; 00720 } 00721 } 00722 return -1; 00723 } 00724 00725 static nkf_encoding* 00726 nkf_enc_find(const char *name) 00727 { 00728 int idx = -1; 00729 idx = nkf_enc_find_index(name); 00730 if (idx < 0) return 0; 00731 return nkf_enc_from_index(idx); 00732 } 00733 00734 #define nkf_enc_name(enc) (enc)->name 00735 #define nkf_enc_to_index(enc) (enc)->id 00736 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding 00737 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv 00738 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv 00739 #define nkf_enc_asciicompat(enc) (\ 00740 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\ 00741 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP) 00742 #define nkf_enc_unicode_p(enc) (\ 00743 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\ 00744 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\ 00745 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32) 00746 #define nkf_enc_cp5022x_p(enc) (\ 00747 nkf_enc_to_index(enc) == CP50220 ||\ 00748 nkf_enc_to_index(enc) == CP50221 ||\ 00749 nkf_enc_to_index(enc) == CP50222) 00750 00751 #ifdef DEFAULT_CODE_LOCALE 00752 static const char* 00753 nkf_locale_charmap() 00754 { 00755 #ifdef HAVE_LANGINFO_H 00756 return nl_langinfo(CODESET); 00757 #elif defined(__WIN32__) 00758 static char buf[16]; 00759 sprintf(buf, "CP%d", GetACP()); 00760 return buf; 00761 #elif defined(__OS2__) 00762 # if defined(INT_IS_SHORT) 00763 /* OS/2 1.x */ 00764 return NULL; 00765 # else 00766 /* OS/2 32bit */ 00767 static char buf[16]; 00768 ULONG ulCP[1], ulncp; 00769 DosQueryCp(sizeof(ulCP), ulCP, &ulncp); 00770 if (ulCP[0] == 932 || ulCP[0] == 943) 00771 strcpy(buf, "Shift_JIS"); 00772 else 00773 sprintf(buf, "CP%lu", ulCP[0]); 00774 return buf; 00775 # endif 00776 #endif 00777 return NULL; 00778 } 00779 00780 static nkf_encoding* 00781 nkf_locale_encoding() 00782 { 00783 nkf_encoding *enc = 0; 00784 const char *encname = nkf_locale_charmap(); 00785 if (encname) 00786 enc = nkf_enc_find(encname); 00787 return enc; 00788 } 00789 #endif /* DEFAULT_CODE_LOCALE */ 00790 00791 static nkf_encoding* 00792 nkf_utf8_encoding() 00793 { 00794 return &nkf_encoding_table[UTF_8]; 00795 } 00796 00797 static nkf_encoding* 00798 nkf_default_encoding() 00799 { 00800 nkf_encoding *enc = 0; 00801 #ifdef DEFAULT_CODE_LOCALE 00802 enc = nkf_locale_encoding(); 00803 #elif defined(DEFAULT_ENCIDX) 00804 enc = nkf_enc_from_index(DEFAULT_ENCIDX); 00805 #endif 00806 if (!enc) enc = nkf_utf8_encoding(); 00807 return enc; 00808 } 00809 00810 typedef struct { 00811 long capa; 00812 long len; 00813 nkf_char *ptr; 00814 } nkf_buf_t; 00815 00816 static nkf_buf_t * 00817 nkf_buf_new(int length) 00818 { 00819 nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t)); 00820 buf->ptr = nkf_xmalloc(sizeof(nkf_char) * length); 00821 buf->capa = length; 00822 buf->len = 0; 00823 return buf; 00824 } 00825 00826 #if 0 00827 static void 00828 nkf_buf_dispose(nkf_buf_t *buf) 00829 { 00830 nkf_xfree(buf->ptr); 00831 nkf_xfree(buf); 00832 } 00833 #endif 00834 00835 #define nkf_buf_length(buf) ((buf)->len) 00836 #define nkf_buf_empty_p(buf) ((buf)->len == 0) 00837 00838 static nkf_char 00839 nkf_buf_at(nkf_buf_t *buf, int index) 00840 { 00841 assert(index <= buf->len); 00842 return buf->ptr[index]; 00843 } 00844 00845 static void 00846 nkf_buf_clear(nkf_buf_t *buf) 00847 { 00848 buf->len = 0; 00849 } 00850 00851 static void 00852 nkf_buf_push(nkf_buf_t *buf, nkf_char c) 00853 { 00854 if (buf->capa <= buf->len) { 00855 exit(EXIT_FAILURE); 00856 } 00857 buf->ptr[buf->len++] = c; 00858 } 00859 00860 static nkf_char 00861 nkf_buf_pop(nkf_buf_t *buf) 00862 { 00863 assert(!nkf_buf_empty_p(buf)); 00864 return buf->ptr[--buf->len]; 00865 } 00866 00867 /* Normalization Form C */ 00868 #ifndef PERL_XS 00869 #ifdef WIN32DLL 00870 #define fprintf dllprintf 00871 #endif 00872 00873 static void 00874 version(void) 00875 { 00876 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n"); 00877 } 00878 00879 static void 00880 usage(void) 00881 { 00882 fprintf(HELP_OUTPUT, 00883 "Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n" 00884 #ifdef UTF8_OUTPUT_ENABLE 00885 " j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" 00886 " UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n" 00887 #else 00888 #endif 00889 #ifdef UTF8_INPUT_ENABLE 00890 " J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" 00891 " UTF option is -W[8,[16,32][B,L]]\n" 00892 #else 00893 " J/S/E Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" 00894 #endif 00895 ); 00896 fprintf(HELP_OUTPUT, 00897 " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n" 00898 " M[BQ] MIME encode [B:base64 Q:quoted]\n" 00899 " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n" 00900 ); 00901 fprintf(HELP_OUTPUT, 00902 " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n" 00903 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n" 00904 " 4: JISX0208 Katakana to JISX0201 Katakana\n" 00905 " X,x Convert Halfwidth Katakana to Fullwidth or preserve it\n" 00906 ); 00907 fprintf(HELP_OUTPUT, 00908 " O Output to File (DEFAULT 'nkf.out')\n" 00909 " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n" 00910 ); 00911 fprintf(HELP_OUTPUT, 00912 " --ic=<encoding> Specify the input encoding\n" 00913 " --oc=<encoding> Specify the output encoding\n" 00914 " --hiragana --katakana Hiragana/Katakana Conversion\n" 00915 " --katakana-hiragana Converts each other\n" 00916 ); 00917 fprintf(HELP_OUTPUT, 00918 #ifdef INPUT_OPTION 00919 " --{cap, url}-input Convert hex after ':' or '%%'\n" 00920 #endif 00921 #ifdef NUMCHAR_OPTION 00922 " --numchar-input Convert Unicode Character Reference\n" 00923 #endif 00924 #ifdef UTF8_INPUT_ENABLE 00925 " --fb-{skip, html, xml, perl, java, subchar}\n" 00926 " Specify unassigned character's replacement\n" 00927 #endif 00928 ); 00929 fprintf(HELP_OUTPUT, 00930 #ifdef OVERWRITE 00931 " --in-place[=SUF] Overwrite original files\n" 00932 " --overwrite[=SUF] Preserve timestamp of original files\n" 00933 #endif 00934 " -g --guess Guess the input code\n" 00935 " -v --version Print the version\n" 00936 " --help/-V Print this help / configuration\n" 00937 ); 00938 version(); 00939 } 00940 00941 static void 00942 show_configuration(void) 00943 { 00944 fprintf(HELP_OUTPUT, 00945 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n" 00946 " Compile-time options:\n" 00947 " Compiled at: " __DATE__ " " __TIME__ "\n" 00948 ); 00949 fprintf(HELP_OUTPUT, 00950 " Default output encoding: " 00951 #ifdef DEFAULT_CODE_LOCALE 00952 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding()) 00953 #elif defined(DEFAULT_ENCIDX) 00954 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding()) 00955 #else 00956 "NONE\n" 00957 #endif 00958 ); 00959 fprintf(HELP_OUTPUT, 00960 " Default output end of line: " 00961 #if DEFAULT_NEWLINE == CR 00962 "CR" 00963 #elif DEFAULT_NEWLINE == CRLF 00964 "CRLF" 00965 #else 00966 "LF" 00967 #endif 00968 "\n" 00969 " Decode MIME encoded string: " 00970 #if MIME_DECODE_DEFAULT 00971 "ON" 00972 #else 00973 "OFF" 00974 #endif 00975 "\n" 00976 " Convert JIS X 0201 Katakana: " 00977 #if X0201_DEFAULT 00978 "ON" 00979 #else 00980 "OFF" 00981 #endif 00982 "\n" 00983 " --help, --version output: " 00984 #if HELP_OUTPUT_HELP_OUTPUT 00985 "HELP_OUTPUT" 00986 #else 00987 "STDOUT" 00988 #endif 00989 "\n"); 00990 } 00991 #endif /*PERL_XS*/ 00992 00993 #ifdef OVERWRITE 00994 static char* 00995 get_backup_filename(const char *suffix, const char *filename) 00996 { 00997 char *backup_filename; 00998 int asterisk_count = 0; 00999 int i, j; 01000 int filename_length = strlen(filename); 01001 01002 for(i = 0; suffix[i]; i++){ 01003 if(suffix[i] == '*') asterisk_count++; 01004 } 01005 01006 if(asterisk_count){ 01007 backup_filename = nkf_xmalloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1); 01008 for(i = 0, j = 0; suffix[i];){ 01009 if(suffix[i] == '*'){ 01010 backup_filename[j] = '\0'; 01011 strncat(backup_filename, filename, filename_length); 01012 i++; 01013 j += filename_length; 01014 }else{ 01015 backup_filename[j++] = suffix[i++]; 01016 } 01017 } 01018 backup_filename[j] = '\0'; 01019 }else{ 01020 j = filename_length + strlen(suffix); 01021 backup_filename = nkf_xmalloc(j + 1); 01022 strcpy(backup_filename, filename); 01023 strcat(backup_filename, suffix); 01024 backup_filename[j] = '\0'; 01025 } 01026 return backup_filename; 01027 } 01028 #endif 01029 01030 #ifdef UTF8_INPUT_ENABLE 01031 static void 01032 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c) 01033 { 01034 int shift = 20; 01035 c &= VALUE_MASK; 01036 while(shift >= 0){ 01037 if(c >= NKF_INT32_C(1)<<shift){ 01038 while(shift >= 0){ 01039 (*f)(0, bin2hex(c>>shift)); 01040 shift -= 4; 01041 } 01042 }else{ 01043 shift -= 4; 01044 } 01045 } 01046 return; 01047 } 01048 01049 static void 01050 encode_fallback_html(nkf_char c) 01051 { 01052 (*oconv)(0, '&'); 01053 (*oconv)(0, '#'); 01054 c &= VALUE_MASK; 01055 if(c >= NKF_INT32_C(1000000)) 01056 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10); 01057 if(c >= NKF_INT32_C(100000)) 01058 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10); 01059 if(c >= 10000) 01060 (*oconv)(0, 0x30+(c/10000 )%10); 01061 if(c >= 1000) 01062 (*oconv)(0, 0x30+(c/1000 )%10); 01063 if(c >= 100) 01064 (*oconv)(0, 0x30+(c/100 )%10); 01065 if(c >= 10) 01066 (*oconv)(0, 0x30+(c/10 )%10); 01067 if(c >= 0) 01068 (*oconv)(0, 0x30+ c %10); 01069 (*oconv)(0, ';'); 01070 return; 01071 } 01072 01073 static void 01074 encode_fallback_xml(nkf_char c) 01075 { 01076 (*oconv)(0, '&'); 01077 (*oconv)(0, '#'); 01078 (*oconv)(0, 'x'); 01079 nkf_each_char_to_hex(oconv, c); 01080 (*oconv)(0, ';'); 01081 return; 01082 } 01083 01084 static void 01085 encode_fallback_java(nkf_char c) 01086 { 01087 (*oconv)(0, '\\'); 01088 c &= VALUE_MASK; 01089 if(!nkf_char_unicode_bmp_p(c)){ 01090 (*oconv)(0, 'U'); 01091 (*oconv)(0, '0'); 01092 (*oconv)(0, '0'); 01093 (*oconv)(0, bin2hex(c>>20)); 01094 (*oconv)(0, bin2hex(c>>16)); 01095 }else{ 01096 (*oconv)(0, 'u'); 01097 } 01098 (*oconv)(0, bin2hex(c>>12)); 01099 (*oconv)(0, bin2hex(c>> 8)); 01100 (*oconv)(0, bin2hex(c>> 4)); 01101 (*oconv)(0, bin2hex(c )); 01102 return; 01103 } 01104 01105 static void 01106 encode_fallback_perl(nkf_char c) 01107 { 01108 (*oconv)(0, '\\'); 01109 (*oconv)(0, 'x'); 01110 (*oconv)(0, '{'); 01111 nkf_each_char_to_hex(oconv, c); 01112 (*oconv)(0, '}'); 01113 return; 01114 } 01115 01116 static void 01117 encode_fallback_subchar(nkf_char c) 01118 { 01119 c = unicode_subchar; 01120 (*oconv)((c>>8)&0xFF, c&0xFF); 01121 return; 01122 } 01123 #endif 01124 01125 static const struct { 01126 const char *name; 01127 const char *alias; 01128 } long_option[] = { 01129 {"ic=", ""}, 01130 {"oc=", ""}, 01131 {"base64","jMB"}, 01132 {"euc","e"}, 01133 {"euc-input","E"}, 01134 {"fj","jm"}, 01135 {"help",""}, 01136 {"jis","j"}, 01137 {"jis-input","J"}, 01138 {"mac","sLm"}, 01139 {"mime","jM"}, 01140 {"mime-input","m"}, 01141 {"msdos","sLw"}, 01142 {"sjis","s"}, 01143 {"sjis-input","S"}, 01144 {"unix","eLu"}, 01145 {"version","v"}, 01146 {"windows","sLw"}, 01147 {"hiragana","h1"}, 01148 {"katakana","h2"}, 01149 {"katakana-hiragana","h3"}, 01150 {"guess=", ""}, 01151 {"guess", "g2"}, 01152 {"cp932", ""}, 01153 {"no-cp932", ""}, 01154 #ifdef X0212_ENABLE 01155 {"x0212", ""}, 01156 #endif 01157 #ifdef UTF8_OUTPUT_ENABLE 01158 {"utf8", "w"}, 01159 {"utf16", "w16"}, 01160 {"ms-ucs-map", ""}, 01161 {"fb-skip", ""}, 01162 {"fb-html", ""}, 01163 {"fb-xml", ""}, 01164 {"fb-perl", ""}, 01165 {"fb-java", ""}, 01166 {"fb-subchar", ""}, 01167 {"fb-subchar=", ""}, 01168 #endif 01169 #ifdef UTF8_INPUT_ENABLE 01170 {"utf8-input", "W"}, 01171 {"utf16-input", "W16"}, 01172 {"no-cp932ext", ""}, 01173 {"no-best-fit-chars",""}, 01174 #endif 01175 #ifdef UNICODE_NORMALIZATION 01176 {"utf8mac-input", ""}, 01177 #endif 01178 #ifdef OVERWRITE 01179 {"overwrite", ""}, 01180 {"overwrite=", ""}, 01181 {"in-place", ""}, 01182 {"in-place=", ""}, 01183 #endif 01184 #ifdef INPUT_OPTION 01185 {"cap-input", ""}, 01186 {"url-input", ""}, 01187 #endif 01188 #ifdef NUMCHAR_OPTION 01189 {"numchar-input", ""}, 01190 #endif 01191 #ifdef CHECK_OPTION 01192 {"no-output", ""}, 01193 {"debug", ""}, 01194 #endif 01195 #ifdef SHIFTJIS_CP932 01196 {"cp932inv", ""}, 01197 #endif 01198 #ifdef EXEC_IO 01199 {"exec-in", ""}, 01200 {"exec-out", ""}, 01201 #endif 01202 {"prefix=", ""}, 01203 }; 01204 01205 static void 01206 set_input_encoding(nkf_encoding *enc) 01207 { 01208 switch (nkf_enc_to_index(enc)) { 01209 case ISO_8859_1: 01210 iso8859_f = TRUE; 01211 break; 01212 case CP50221: 01213 case CP50222: 01214 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01215 case CP50220: 01216 #ifdef SHIFTJIS_CP932 01217 cp51932_f = TRUE; 01218 #endif 01219 #ifdef UTF8_OUTPUT_ENABLE 01220 ms_ucs_map_f = UCS_MAP_CP932; 01221 #endif 01222 break; 01223 case ISO_2022_JP_1: 01224 x0212_f = TRUE; 01225 break; 01226 case ISO_2022_JP_3: 01227 x0212_f = TRUE; 01228 x0213_f = TRUE; 01229 break; 01230 case ISO_2022_JP_2004: 01231 x0212_f = TRUE; 01232 x0213_f = TRUE; 01233 break; 01234 case SHIFT_JIS: 01235 break; 01236 case WINDOWS_31J: 01237 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01238 #ifdef SHIFTJIS_CP932 01239 cp51932_f = TRUE; 01240 #endif 01241 #ifdef UTF8_OUTPUT_ENABLE 01242 ms_ucs_map_f = UCS_MAP_CP932; 01243 #endif 01244 break; 01245 break; 01246 case CP10001: 01247 #ifdef SHIFTJIS_CP932 01248 cp51932_f = TRUE; 01249 #endif 01250 #ifdef UTF8_OUTPUT_ENABLE 01251 ms_ucs_map_f = UCS_MAP_CP10001; 01252 #endif 01253 break; 01254 case EUC_JP: 01255 break; 01256 case EUCJP_NKF: 01257 break; 01258 case CP51932: 01259 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01260 #ifdef SHIFTJIS_CP932 01261 cp51932_f = TRUE; 01262 #endif 01263 #ifdef UTF8_OUTPUT_ENABLE 01264 ms_ucs_map_f = UCS_MAP_CP932; 01265 #endif 01266 break; 01267 case EUCJP_MS: 01268 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01269 #ifdef SHIFTJIS_CP932 01270 cp51932_f = FALSE; 01271 #endif 01272 #ifdef UTF8_OUTPUT_ENABLE 01273 ms_ucs_map_f = UCS_MAP_MS; 01274 #endif 01275 break; 01276 case EUCJP_ASCII: 01277 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01278 #ifdef SHIFTJIS_CP932 01279 cp51932_f = FALSE; 01280 #endif 01281 #ifdef UTF8_OUTPUT_ENABLE 01282 ms_ucs_map_f = UCS_MAP_ASCII; 01283 #endif 01284 break; 01285 case SHIFT_JISX0213: 01286 case SHIFT_JIS_2004: 01287 x0213_f = TRUE; 01288 #ifdef SHIFTJIS_CP932 01289 cp51932_f = FALSE; 01290 #endif 01291 break; 01292 case EUC_JISX0213: 01293 case EUC_JIS_2004: 01294 x0213_f = TRUE; 01295 #ifdef SHIFTJIS_CP932 01296 cp51932_f = FALSE; 01297 #endif 01298 break; 01299 #ifdef UTF8_INPUT_ENABLE 01300 #ifdef UNICODE_NORMALIZATION 01301 case UTF8_MAC: 01302 nfc_f = TRUE; 01303 break; 01304 #endif 01305 case UTF_16: 01306 case UTF_16BE: 01307 case UTF_16BE_BOM: 01308 input_endian = ENDIAN_BIG; 01309 break; 01310 case UTF_16LE: 01311 case UTF_16LE_BOM: 01312 input_endian = ENDIAN_LITTLE; 01313 break; 01314 case UTF_32: 01315 case UTF_32BE: 01316 case UTF_32BE_BOM: 01317 input_endian = ENDIAN_BIG; 01318 break; 01319 case UTF_32LE: 01320 case UTF_32LE_BOM: 01321 input_endian = ENDIAN_LITTLE; 01322 break; 01323 #endif 01324 } 01325 } 01326 01327 static void 01328 set_output_encoding(nkf_encoding *enc) 01329 { 01330 switch (nkf_enc_to_index(enc)) { 01331 case CP50220: 01332 #ifdef SHIFTJIS_CP932 01333 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01334 #endif 01335 #ifdef UTF8_OUTPUT_ENABLE 01336 ms_ucs_map_f = UCS_MAP_CP932; 01337 #endif 01338 break; 01339 case CP50221: 01340 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01341 #ifdef SHIFTJIS_CP932 01342 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01343 #endif 01344 #ifdef UTF8_OUTPUT_ENABLE 01345 ms_ucs_map_f = UCS_MAP_CP932; 01346 #endif 01347 break; 01348 case ISO_2022_JP: 01349 #ifdef SHIFTJIS_CP932 01350 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01351 #endif 01352 break; 01353 case ISO_2022_JP_1: 01354 x0212_f = TRUE; 01355 #ifdef SHIFTJIS_CP932 01356 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01357 #endif 01358 break; 01359 case ISO_2022_JP_3: 01360 x0212_f = TRUE; 01361 x0213_f = TRUE; 01362 #ifdef SHIFTJIS_CP932 01363 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01364 #endif 01365 break; 01366 case SHIFT_JIS: 01367 break; 01368 case WINDOWS_31J: 01369 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01370 #ifdef UTF8_OUTPUT_ENABLE 01371 ms_ucs_map_f = UCS_MAP_CP932; 01372 #endif 01373 break; 01374 case CP10001: 01375 #ifdef UTF8_OUTPUT_ENABLE 01376 ms_ucs_map_f = UCS_MAP_CP10001; 01377 #endif 01378 break; 01379 case EUC_JP: 01380 x0212_f = TRUE; 01381 #ifdef SHIFTJIS_CP932 01382 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01383 #endif 01384 #ifdef UTF8_OUTPUT_ENABLE 01385 ms_ucs_map_f = UCS_MAP_ASCII; 01386 #endif 01387 break; 01388 case EUCJP_NKF: 01389 x0212_f = FALSE; 01390 #ifdef SHIFTJIS_CP932 01391 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01392 #endif 01393 #ifdef UTF8_OUTPUT_ENABLE 01394 ms_ucs_map_f = UCS_MAP_ASCII; 01395 #endif 01396 break; 01397 case CP51932: 01398 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01399 #ifdef SHIFTJIS_CP932 01400 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01401 #endif 01402 #ifdef UTF8_OUTPUT_ENABLE 01403 ms_ucs_map_f = UCS_MAP_CP932; 01404 #endif 01405 break; 01406 case EUCJP_MS: 01407 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01408 x0212_f = TRUE; 01409 #ifdef UTF8_OUTPUT_ENABLE 01410 ms_ucs_map_f = UCS_MAP_MS; 01411 #endif 01412 break; 01413 case EUCJP_ASCII: 01414 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01415 x0212_f = TRUE; 01416 #ifdef UTF8_OUTPUT_ENABLE 01417 ms_ucs_map_f = UCS_MAP_ASCII; 01418 #endif 01419 break; 01420 case SHIFT_JISX0213: 01421 case SHIFT_JIS_2004: 01422 x0213_f = TRUE; 01423 #ifdef SHIFTJIS_CP932 01424 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01425 #endif 01426 break; 01427 case EUC_JISX0213: 01428 case EUC_JIS_2004: 01429 x0212_f = TRUE; 01430 x0213_f = TRUE; 01431 #ifdef SHIFTJIS_CP932 01432 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01433 #endif 01434 break; 01435 #ifdef UTF8_OUTPUT_ENABLE 01436 case UTF_8_BOM: 01437 output_bom_f = TRUE; 01438 break; 01439 case UTF_16: 01440 case UTF_16BE_BOM: 01441 output_bom_f = TRUE; 01442 break; 01443 case UTF_16LE: 01444 output_endian = ENDIAN_LITTLE; 01445 output_bom_f = FALSE; 01446 break; 01447 case UTF_16LE_BOM: 01448 output_endian = ENDIAN_LITTLE; 01449 output_bom_f = TRUE; 01450 break; 01451 case UTF_32: 01452 case UTF_32BE_BOM: 01453 output_bom_f = TRUE; 01454 break; 01455 case UTF_32LE: 01456 output_endian = ENDIAN_LITTLE; 01457 output_bom_f = FALSE; 01458 break; 01459 case UTF_32LE_BOM: 01460 output_endian = ENDIAN_LITTLE; 01461 output_bom_f = TRUE; 01462 break; 01463 #endif 01464 } 01465 } 01466 01467 static struct input_code* 01468 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0)) 01469 { 01470 if (iconv_func){ 01471 struct input_code *p = input_code_list; 01472 while (p->name){ 01473 if (iconv_func == p->iconv_func){ 01474 return p; 01475 } 01476 p++; 01477 } 01478 } 01479 return 0; 01480 } 01481 01482 static void 01483 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0)) 01484 { 01485 #ifdef INPUT_CODE_FIX 01486 if (f || !input_encoding) 01487 #endif 01488 if (estab_f != f){ 01489 estab_f = f; 01490 } 01491 01492 if (iconv_func 01493 #ifdef INPUT_CODE_FIX 01494 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */ 01495 #endif 01496 ){ 01497 iconv = iconv_func; 01498 } 01499 #ifdef CHECK_OPTION 01500 if (estab_f && iconv_for_check != iconv){ 01501 struct input_code *p = find_inputcode_byfunc(iconv); 01502 if (p){ 01503 set_input_codename(p->name); 01504 debug(p->name); 01505 } 01506 iconv_for_check = iconv; 01507 } 01508 #endif 01509 } 01510 01511 #ifdef X0212_ENABLE 01512 static nkf_char 01513 x0212_shift(nkf_char c) 01514 { 01515 nkf_char ret = c; 01516 c &= 0x7f; 01517 if (is_eucg3(ret)){ 01518 if (0x75 <= c && c <= 0x7f){ 01519 ret = c + (0x109 - 0x75); 01520 } 01521 }else{ 01522 if (0x75 <= c && c <= 0x7f){ 01523 ret = c + (0x113 - 0x75); 01524 } 01525 } 01526 return ret; 01527 } 01528 01529 01530 static nkf_char 01531 x0212_unshift(nkf_char c) 01532 { 01533 nkf_char ret = c; 01534 if (0x7f <= c && c <= 0x88){ 01535 ret = c + (0x75 - 0x7f); 01536 }else if (0x89 <= c && c <= 0x92){ 01537 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89)); 01538 } 01539 return ret; 01540 } 01541 #endif /* X0212_ENABLE */ 01542 01543 static nkf_char 01544 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1) 01545 { 01546 nkf_char ndx; 01547 if (is_eucg3(c2)){ 01548 ndx = c2 & 0x7f; 01549 if (x0213_f){ 01550 if((0x21 <= ndx && ndx <= 0x2F)){ 01551 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3; 01552 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e); 01553 return 0; 01554 }else if(0x6E <= ndx && ndx <= 0x7E){ 01555 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe; 01556 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e); 01557 return 0; 01558 } 01559 return 1; 01560 } 01561 #ifdef X0212_ENABLE 01562 else if(nkf_isgraph(ndx)){ 01563 nkf_char val = 0; 01564 const unsigned short *ptr; 01565 ptr = x0212_shiftjis[ndx - 0x21]; 01566 if (ptr){ 01567 val = ptr[(c1 & 0x7f) - 0x21]; 01568 } 01569 if (val){ 01570 c2 = val >> 8; 01571 c1 = val & 0xff; 01572 if (p2) *p2 = c2; 01573 if (p1) *p1 = c1; 01574 return 0; 01575 } 01576 c2 = x0212_shift(c2); 01577 } 01578 #endif /* X0212_ENABLE */ 01579 } 01580 if(0x7F < c2) return 1; 01581 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1); 01582 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e); 01583 return 0; 01584 } 01585 01586 static nkf_char 01587 s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1) 01588 { 01589 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE) 01590 nkf_char val; 01591 #endif 01592 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} }; 01593 if (0xFC < c1) return 1; 01594 #ifdef SHIFTJIS_CP932 01595 if (!cp932inv_f && is_ibmext_in_sjis(c2)){ 01596 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40]; 01597 if (val){ 01598 c2 = val >> 8; 01599 c1 = val & 0xff; 01600 } 01601 } 01602 if (cp932inv_f 01603 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){ 01604 val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40]; 01605 if (val){ 01606 c2 = val >> 8; 01607 c1 = val & 0xff; 01608 } 01609 } 01610 #endif /* SHIFTJIS_CP932 */ 01611 #ifdef X0212_ENABLE 01612 if (!x0213_f && is_ibmext_in_sjis(c2)){ 01613 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40]; 01614 if (val){ 01615 if (val > 0x7FFF){ 01616 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f); 01617 c1 = val & 0xff; 01618 }else{ 01619 c2 = val >> 8; 01620 c1 = val & 0xff; 01621 } 01622 if (p2) *p2 = c2; 01623 if (p1) *p1 = c1; 01624 return 0; 01625 } 01626 } 01627 #endif 01628 if(c2 >= 0x80){ 01629 if(x0213_f && c2 >= 0xF0){ 01630 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */ 01631 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1]; 01632 }else{ /* 78<=k<=94 */ 01633 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B); 01634 if (0x9E < c1) c2++; 01635 } 01636 }else{ 01637 #define SJ0162 0x00e1 /* 01 - 62 ku offset */ 01638 #define SJ6394 0x0161 /* 63 - 94 ku offset */ 01639 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394); 01640 if (0x9E < c1) c2++; 01641 } 01642 if (c1 < 0x9F) 01643 c1 = c1 - ((c1 > DEL) ? SP : 0x1F); 01644 else { 01645 c1 = c1 - 0x7E; 01646 } 01647 } 01648 01649 #ifdef X0212_ENABLE 01650 c2 = x0212_unshift(c2); 01651 #endif 01652 if (p2) *p2 = c2; 01653 if (p1) *p1 = c1; 01654 return 0; 01655 } 01656 01657 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE) 01658 static void 01659 nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4) 01660 { 01661 val &= VALUE_MASK; 01662 if (val < 0x80){ 01663 *p1 = val; 01664 *p2 = 0; 01665 *p3 = 0; 01666 *p4 = 0; 01667 }else if (val < 0x800){ 01668 *p1 = 0xc0 | (val >> 6); 01669 *p2 = 0x80 | (val & 0x3f); 01670 *p3 = 0; 01671 *p4 = 0; 01672 } else if (nkf_char_unicode_bmp_p(val)) { 01673 *p1 = 0xe0 | (val >> 12); 01674 *p2 = 0x80 | ((val >> 6) & 0x3f); 01675 *p3 = 0x80 | ( val & 0x3f); 01676 *p4 = 0; 01677 } else if (nkf_char_unicode_value_p(val)) { 01678 *p1 = 0xf0 | (val >> 18); 01679 *p2 = 0x80 | ((val >> 12) & 0x3f); 01680 *p3 = 0x80 | ((val >> 6) & 0x3f); 01681 *p4 = 0x80 | ( val & 0x3f); 01682 } else { 01683 *p1 = 0; 01684 *p2 = 0; 01685 *p3 = 0; 01686 *p4 = 0; 01687 } 01688 } 01689 01690 static nkf_char 01691 nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) 01692 { 01693 nkf_char wc; 01694 if (c1 <= 0x7F) { 01695 /* single byte */ 01696 wc = c1; 01697 } 01698 else if (c1 <= 0xC3) { 01699 /* trail byte or invalid */ 01700 return -1; 01701 } 01702 else if (c1 <= 0xDF) { 01703 /* 2 bytes */ 01704 wc = (c1 & 0x1F) << 6; 01705 wc |= (c2 & 0x3F); 01706 } 01707 else if (c1 <= 0xEF) { 01708 /* 3 bytes */ 01709 wc = (c1 & 0x0F) << 12; 01710 wc |= (c2 & 0x3F) << 6; 01711 wc |= (c3 & 0x3F); 01712 } 01713 else if (c2 <= 0xF4) { 01714 /* 4 bytes */ 01715 wc = (c1 & 0x0F) << 18; 01716 wc |= (c2 & 0x3F) << 12; 01717 wc |= (c3 & 0x3F) << 6; 01718 wc |= (c4 & 0x3F); 01719 } 01720 else { 01721 return -1; 01722 } 01723 return wc; 01724 } 01725 #endif 01726 01727 #ifdef UTF8_INPUT_ENABLE 01728 static int 01729 unicode_to_jis_common2(nkf_char c1, nkf_char c0, 01730 const unsigned short *const *pp, nkf_char psize, 01731 nkf_char *p2, nkf_char *p1) 01732 { 01733 nkf_char c2; 01734 const unsigned short *p; 01735 unsigned short val; 01736 01737 if (pp == 0) return 1; 01738 01739 c1 -= 0x80; 01740 if (c1 < 0 || psize <= c1) return 1; 01741 p = pp[c1]; 01742 if (p == 0) return 1; 01743 01744 c0 -= 0x80; 01745 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1; 01746 val = p[c0]; 01747 if (val == 0) return 1; 01748 if (no_cp932ext_f && ( 01749 (val>>8) == 0x2D || /* NEC special characters */ 01750 val > NKF_INT32_C(0xF300) /* IBM extended characters */ 01751 )) return 1; 01752 01753 c2 = val >> 8; 01754 if (val > 0x7FFF){ 01755 c2 &= 0x7f; 01756 c2 |= PREFIX_EUCG3; 01757 } 01758 if (c2 == SO) c2 = JIS_X_0201_1976_K; 01759 c1 = val & 0xFF; 01760 if (p2) *p2 = c2; 01761 if (p1) *p1 = c1; 01762 return 0; 01763 } 01764 01765 static int 01766 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1) 01767 { 01768 const unsigned short *const *pp; 01769 const unsigned short *const *const *ppp; 01770 static const char no_best_fit_chars_table_C2[] = 01771 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 01772 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 01773 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2, 01774 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1}; 01775 static const char no_best_fit_chars_table_C2_ms[] = 01776 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 01777 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 01778 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 01779 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0}; 01780 static const char no_best_fit_chars_table_932_C2[] = 01781 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 01782 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 01783 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 01784 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0}; 01785 static const char no_best_fit_chars_table_932_C3[] = 01786 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 01787 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 01788 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 01789 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1}; 01790 nkf_char ret = 0; 01791 01792 if(c2 < 0x80){ 01793 *p2 = 0; 01794 *p1 = c2; 01795 }else if(c2 < 0xe0){ 01796 if(no_best_fit_chars_f){ 01797 if(ms_ucs_map_f == UCS_MAP_CP932){ 01798 switch(c2){ 01799 case 0xC2: 01800 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1; 01801 break; 01802 case 0xC3: 01803 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1; 01804 break; 01805 } 01806 }else if(!cp932inv_f){ 01807 switch(c2){ 01808 case 0xC2: 01809 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1; 01810 break; 01811 case 0xC3: 01812 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1; 01813 break; 01814 } 01815 }else if(ms_ucs_map_f == UCS_MAP_MS){ 01816 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1; 01817 }else if(ms_ucs_map_f == UCS_MAP_CP10001){ 01818 switch(c2){ 01819 case 0xC2: 01820 switch(c1){ 01821 case 0xA2: 01822 case 0xA3: 01823 case 0xA5: 01824 case 0xA6: 01825 case 0xAC: 01826 case 0xAF: 01827 case 0xB8: 01828 return 1; 01829 } 01830 break; 01831 } 01832 } 01833 } 01834 pp = 01835 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 : 01836 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms : 01837 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac : 01838 utf8_to_euc_2bytes; 01839 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1); 01840 }else if(c0 < 0xF0){ 01841 if(no_best_fit_chars_f){ 01842 if(ms_ucs_map_f == UCS_MAP_CP932){ 01843 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1; 01844 }else if(ms_ucs_map_f == UCS_MAP_MS){ 01845 switch(c2){ 01846 case 0xE2: 01847 switch(c1){ 01848 case 0x80: 01849 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1; 01850 break; 01851 case 0x88: 01852 if(c0 == 0x92) return 1; 01853 break; 01854 } 01855 break; 01856 case 0xE3: 01857 if(c1 == 0x80 || c0 == 0x9C) return 1; 01858 break; 01859 } 01860 }else if(ms_ucs_map_f == UCS_MAP_CP10001){ 01861 switch(c2){ 01862 case 0xE3: 01863 switch(c1){ 01864 case 0x82: 01865 if(c0 == 0x94) return 1; 01866 break; 01867 case 0x83: 01868 if(c0 == 0xBB) return 1; 01869 break; 01870 } 01871 break; 01872 } 01873 }else{ 01874 switch(c2){ 01875 case 0xE2: 01876 switch(c1){ 01877 case 0x80: 01878 if(c0 == 0x95) return 1; 01879 break; 01880 case 0x88: 01881 if(c0 == 0xA5) return 1; 01882 break; 01883 } 01884 break; 01885 case 0xEF: 01886 switch(c1){ 01887 case 0xBC: 01888 if(c0 == 0x8D) return 1; 01889 break; 01890 case 0xBD: 01891 if(c0 == 0x9E && !cp932inv_f) return 1; 01892 break; 01893 case 0xBF: 01894 if(0xA0 <= c0 && c0 <= 0xA5) return 1; 01895 break; 01896 } 01897 break; 01898 } 01899 } 01900 } 01901 ppp = 01902 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 : 01903 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms : 01904 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac : 01905 utf8_to_euc_3bytes; 01906 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1); 01907 }else return -1; 01908 #ifdef SHIFTJIS_CP932 01909 if (!ret && !cp932inv_f && is_eucg3(*p2)) { 01910 nkf_char s2, s1; 01911 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) { 01912 s2e_conv(s2, s1, p2, p1); 01913 }else{ 01914 ret = 1; 01915 } 01916 } 01917 #endif 01918 return ret; 01919 } 01920 01921 #ifdef UTF8_OUTPUT_ENABLE 01922 static nkf_char 01923 e2w_conv(nkf_char c2, nkf_char c1) 01924 { 01925 const unsigned short *p; 01926 01927 if (c2 == JIS_X_0201_1976_K) { 01928 if (ms_ucs_map_f == UCS_MAP_CP10001) { 01929 switch (c1) { 01930 case 0x20: 01931 return 0xA0; 01932 case 0x7D: 01933 return 0xA9; 01934 } 01935 } 01936 p = euc_to_utf8_1byte; 01937 #ifdef X0212_ENABLE 01938 } else if (is_eucg3(c2)){ 01939 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){ 01940 return 0xA6; 01941 } 01942 c2 = (c2&0x7f) - 0x21; 01943 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes) 01944 p = x0212_to_utf8_2bytes[c2]; 01945 else 01946 return 0; 01947 #endif 01948 } else { 01949 c2 &= 0x7f; 01950 c2 = (c2&0x7f) - 0x21; 01951 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes) 01952 p = 01953 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] : 01954 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] : 01955 euc_to_utf8_2bytes_ms[c2]; 01956 else 01957 return 0; 01958 } 01959 if (!p) return 0; 01960 c1 = (c1 & 0x7f) - 0x21; 01961 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte) 01962 return p[c1]; 01963 return 0; 01964 } 01965 #endif 01966 01967 static nkf_char 01968 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1) 01969 { 01970 nkf_char ret = 0; 01971 01972 if (!c1){ 01973 *p2 = 0; 01974 *p1 = c2; 01975 }else if (0xc0 <= c2 && c2 <= 0xef) { 01976 ret = unicode_to_jis_common(c2, c1, c0, p2, p1); 01977 #ifdef NUMCHAR_OPTION 01978 if (ret > 0){ 01979 if (p2) *p2 = 0; 01980 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0)); 01981 ret = 0; 01982 } 01983 #endif 01984 } 01985 return ret; 01986 } 01987 01988 #ifdef UTF8_INPUT_ENABLE 01989 static nkf_char 01990 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1) 01991 { 01992 nkf_char c1, c2, c3, c4; 01993 nkf_char ret = 0; 01994 val &= VALUE_MASK; 01995 if (val < 0x80) { 01996 *p2 = 0; 01997 *p1 = val; 01998 } 01999 else if (nkf_char_unicode_bmp_p(val)){ 02000 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4); 02001 ret = unicode_to_jis_common(c1, c2, c3, p2, p1); 02002 if (ret > 0){ 02003 *p2 = 0; 02004 *p1 = nkf_char_unicode_new(val); 02005 ret = 0; 02006 } 02007 } 02008 else { 02009 *p2 = 0; 02010 *p1 = nkf_char_unicode_new(val); 02011 } 02012 return ret; 02013 } 02014 #endif 02015 02016 static nkf_char 02017 e_iconv(nkf_char c2, nkf_char c1, nkf_char c0) 02018 { 02019 if (c2 == JIS_X_0201_1976_K || c2 == SS2){ 02020 if (iso2022jp_f && !x0201_f) { 02021 c2 = GETA1; c1 = GETA2; 02022 } else { 02023 c2 = JIS_X_0201_1976_K; 02024 c1 &= 0x7f; 02025 } 02026 #ifdef X0212_ENABLE 02027 }else if (c2 == 0x8f){ 02028 if (c0 == 0){ 02029 return -1; 02030 } 02031 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) { 02032 /* encoding is eucJP-ms, so invert to Unicode Private User Area */ 02033 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC); 02034 c2 = 0; 02035 } else { 02036 c2 = (c2 << 8) | (c1 & 0x7f); 02037 c1 = c0 & 0x7f; 02038 #ifdef SHIFTJIS_CP932 02039 if (cp51932_f){ 02040 nkf_char s2, s1; 02041 if (e2s_conv(c2, c1, &s2, &s1) == 0){ 02042 s2e_conv(s2, s1, &c2, &c1); 02043 if (c2 < 0x100){ 02044 c1 &= 0x7f; 02045 c2 &= 0x7f; 02046 } 02047 } 02048 } 02049 #endif /* SHIFTJIS_CP932 */ 02050 } 02051 #endif /* X0212_ENABLE */ 02052 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) { 02053 /* NOP */ 02054 } else { 02055 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) { 02056 /* encoding is eucJP-ms, so invert to Unicode Private User Area */ 02057 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000); 02058 c2 = 0; 02059 } else { 02060 c1 &= 0x7f; 02061 c2 &= 0x7f; 02062 #ifdef SHIFTJIS_CP932 02063 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){ 02064 nkf_char s2, s1; 02065 if (e2s_conv(c2, c1, &s2, &s1) == 0){ 02066 s2e_conv(s2, s1, &c2, &c1); 02067 if (c2 < 0x100){ 02068 c1 &= 0x7f; 02069 c2 &= 0x7f; 02070 } 02071 } 02072 } 02073 #endif /* SHIFTJIS_CP932 */ 02074 } 02075 } 02076 (*oconv)(c2, c1); 02077 return 0; 02078 } 02079 02080 static nkf_char 02081 s_iconv(nkf_char c2, nkf_char c1, nkf_char c0) 02082 { 02083 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) { 02084 if (iso2022jp_f && !x0201_f) { 02085 c2 = GETA1; c1 = GETA2; 02086 } else { 02087 c1 &= 0x7f; 02088 } 02089 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) { 02090 /* NOP */ 02091 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) { 02092 /* CP932 UDC */ 02093 if(c1 == 0x7F) return 0; 02094 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000); 02095 c2 = 0; 02096 } else { 02097 nkf_char ret = s2e_conv(c2, c1, &c2, &c1); 02098 if (ret) return ret; 02099 } 02100 (*oconv)(c2, c1); 02101 return 0; 02102 } 02103 02104 static nkf_char 02105 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3) 02106 { 02107 nkf_char ret = 0, c4 = 0; 02108 static const char w_iconv_utf8_1st_byte[] = 02109 { /* 0xC0 - 0xFF */ 02110 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 02111 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 02112 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 02113 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70}; 02114 02115 if (c3 > 0xFF) { 02116 c4 = c3 & 0xFF; 02117 c3 >>= 8; 02118 } 02119 02120 if (c1 < 0 || 0xff < c1) { 02121 }else if (c1 == 0) { /* 0 : 1 byte*/ 02122 c3 = 0; 02123 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */ 02124 return 0; 02125 } else{ 02126 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) { 02127 case 21: 02128 if (c2 < 0x80 || 0xBF < c2) return 0; 02129 break; 02130 case 30: 02131 if (c3 == 0) return -1; 02132 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80) 02133 return 0; 02134 break; 02135 case 31: 02136 case 33: 02137 if (c3 == 0) return -1; 02138 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80) 02139 return 0; 02140 break; 02141 case 32: 02142 if (c3 == 0) return -1; 02143 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80) 02144 return 0; 02145 break; 02146 case 40: 02147 if (c3 == 0) return -2; 02148 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80) 02149 return 0; 02150 break; 02151 case 41: 02152 if (c3 == 0) return -2; 02153 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80) 02154 return 0; 02155 break; 02156 case 42: 02157 if (c3 == 0) return -2; 02158 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80) 02159 return 0; 02160 break; 02161 default: 02162 return 0; 02163 break; 02164 } 02165 } 02166 if (c1 == 0 || c1 == EOF){ 02167 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */ 02168 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4)); 02169 c1 = 0; 02170 } else { 02171 ret = w2e_conv(c1, c2, c3, &c1, &c2); 02172 } 02173 if (ret == 0){ 02174 (*oconv)(c1, c2); 02175 } 02176 return ret; 02177 } 02178 02179 #define NKF_ICONV_INVALID_CODE_RANGE -13 02180 static size_t 02181 unicode_iconv(nkf_char wc) 02182 { 02183 nkf_char c1, c2; 02184 int ret = 0; 02185 02186 if (wc < 0x80) { 02187 c2 = 0; 02188 c1 = wc; 02189 }else if ((wc>>11) == 27) { 02190 /* unpaired surrogate */ 02191 return NKF_ICONV_INVALID_CODE_RANGE; 02192 }else if (wc < 0xFFFF) { 02193 ret = w16e_conv(wc, &c2, &c1); 02194 if (ret) return ret; 02195 }else if (wc < 0x10FFFF) { 02196 c2 = 0; 02197 c1 = nkf_char_unicode_new(wc); 02198 } else { 02199 return NKF_ICONV_INVALID_CODE_RANGE; 02200 } 02201 (*oconv)(c2, c1); 02202 return 0; 02203 } 02204 02205 #define NKF_ICONV_NEED_ONE_MORE_BYTE (size_t)-1 02206 #define NKF_ICONV_NEED_TWO_MORE_BYTES (size_t)-2 02207 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00)) 02208 static size_t 02209 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) 02210 { 02211 nkf_char wc; 02212 02213 if (c1 == EOF) { 02214 (*oconv)(EOF, 0); 02215 return 0; 02216 } 02217 02218 if (input_endian == ENDIAN_BIG) { 02219 if (0xD8 <= c1 && c1 <= 0xDB) { 02220 if (0xDC <= c3 && c3 <= 0xDF) { 02221 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4); 02222 } else return NKF_ICONV_NEED_TWO_MORE_BYTES; 02223 } else { 02224 wc = c1 << 8 | c2; 02225 } 02226 } else { 02227 if (0xD8 <= c2 && c2 <= 0xDB) { 02228 if (0xDC <= c4 && c4 <= 0xDF) { 02229 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3); 02230 } else return NKF_ICONV_NEED_TWO_MORE_BYTES; 02231 } else { 02232 wc = c2 << 8 | c1; 02233 } 02234 } 02235 02236 return (*unicode_iconv)(wc); 02237 } 02238 02239 static nkf_char 02240 w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0) 02241 { 02242 (*oconv)(c2, c1); 02243 return 16; /* different from w_iconv32 */ 02244 } 02245 02246 static nkf_char 02247 w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0) 02248 { 02249 (*oconv)(c2, c1); 02250 return 32; /* different from w_iconv16 */ 02251 } 02252 02253 static size_t 02254 nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) 02255 { 02256 nkf_char wc; 02257 02258 if (c1 == EOF) { 02259 (*oconv)(EOF, 0); 02260 return 0; 02261 } 02262 02263 switch(input_endian){ 02264 case ENDIAN_BIG: 02265 wc = c2 << 16 | c3 << 8 | c4; 02266 break; 02267 case ENDIAN_LITTLE: 02268 wc = c3 << 16 | c2 << 8 | c1; 02269 break; 02270 case ENDIAN_2143: 02271 wc = c1 << 16 | c4 << 8 | c3; 02272 break; 02273 case ENDIAN_3412: 02274 wc = c4 << 16 | c1 << 8 | c2; 02275 break; 02276 default: 02277 return NKF_ICONV_INVALID_CODE_RANGE; 02278 } 02279 02280 return (*unicode_iconv)(wc); 02281 } 02282 #endif 02283 02284 #define output_ascii_escape_sequence(mode) do { \ 02285 if (output_mode != ASCII && output_mode != ISO_8859_1) { \ 02286 (*o_putc)(ESC); \ 02287 (*o_putc)('('); \ 02288 (*o_putc)(ascii_intro); \ 02289 output_mode = mode; \ 02290 } \ 02291 } while (0) 02292 02293 static void 02294 output_escape_sequence(int mode) 02295 { 02296 if (output_mode == mode) 02297 return; 02298 switch(mode) { 02299 case ISO_8859_1: 02300 (*o_putc)(ESC); 02301 (*o_putc)('.'); 02302 (*o_putc)('A'); 02303 break; 02304 case JIS_X_0201_1976_K: 02305 (*o_putc)(ESC); 02306 (*o_putc)('('); 02307 (*o_putc)('I'); 02308 break; 02309 case JIS_X_0208: 02310 (*o_putc)(ESC); 02311 (*o_putc)('$'); 02312 (*o_putc)(kanji_intro); 02313 break; 02314 case JIS_X_0212: 02315 (*o_putc)(ESC); 02316 (*o_putc)('$'); 02317 (*o_putc)('('); 02318 (*o_putc)('D'); 02319 break; 02320 case JIS_X_0213_1: 02321 (*o_putc)(ESC); 02322 (*o_putc)('$'); 02323 (*o_putc)('('); 02324 (*o_putc)('Q'); 02325 break; 02326 case JIS_X_0213_2: 02327 (*o_putc)(ESC); 02328 (*o_putc)('$'); 02329 (*o_putc)('('); 02330 (*o_putc)('P'); 02331 break; 02332 } 02333 output_mode = mode; 02334 } 02335 02336 static void 02337 j_oconv(nkf_char c2, nkf_char c1) 02338 { 02339 #ifdef NUMCHAR_OPTION 02340 if (c2 == 0 && nkf_char_unicode_p(c1)){ 02341 w16e_conv(c1, &c2, &c1); 02342 if (c2 == 0 && nkf_char_unicode_p(c1)){ 02343 c2 = c1 & VALUE_MASK; 02344 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) { 02345 /* CP5022x UDC */ 02346 c1 &= 0xFFF; 02347 c2 = 0x7F + c1 / 94; 02348 c1 = 0x21 + c1 % 94; 02349 } else { 02350 if (encode_fallback) (*encode_fallback)(c1); 02351 return; 02352 } 02353 } 02354 } 02355 #endif 02356 if (c2 == 0) { 02357 output_ascii_escape_sequence(ASCII); 02358 (*o_putc)(c1); 02359 } 02360 else if (c2 == EOF) { 02361 output_ascii_escape_sequence(ASCII); 02362 (*o_putc)(EOF); 02363 } 02364 else if (c2 == ISO_8859_1) { 02365 output_ascii_escape_sequence(ISO_8859_1); 02366 (*o_putc)(c1|0x80); 02367 } 02368 else if (c2 == JIS_X_0201_1976_K) { 02369 output_escape_sequence(JIS_X_0201_1976_K); 02370 (*o_putc)(c1); 02371 #ifdef X0212_ENABLE 02372 } else if (is_eucg3(c2)){ 02373 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212); 02374 (*o_putc)(c2 & 0x7f); 02375 (*o_putc)(c1); 02376 #endif 02377 } else { 02378 if(ms_ucs_map_f 02379 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1 02380 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return; 02381 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208); 02382 (*o_putc)(c2); 02383 (*o_putc)(c1); 02384 } 02385 } 02386 02387 static void 02388 e_oconv(nkf_char c2, nkf_char c1) 02389 { 02390 if (c2 == 0 && nkf_char_unicode_p(c1)){ 02391 w16e_conv(c1, &c2, &c1); 02392 if (c2 == 0 && nkf_char_unicode_p(c1)){ 02393 c2 = c1 & VALUE_MASK; 02394 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) { 02395 /* eucJP-ms UDC */ 02396 c1 &= 0xFFF; 02397 c2 = c1 / 94; 02398 c2 += c2 < 10 ? 0x75 : 0x8FEB; 02399 c1 = 0x21 + c1 % 94; 02400 if (is_eucg3(c2)){ 02401 (*o_putc)(0x8f); 02402 (*o_putc)((c2 & 0x7f) | 0x080); 02403 (*o_putc)(c1 | 0x080); 02404 }else{ 02405 (*o_putc)((c2 & 0x7f) | 0x080); 02406 (*o_putc)(c1 | 0x080); 02407 } 02408 return; 02409 } else { 02410 if (encode_fallback) (*encode_fallback)(c1); 02411 return; 02412 } 02413 } 02414 } 02415 02416 if (c2 == EOF) { 02417 (*o_putc)(EOF); 02418 } else if (c2 == 0) { 02419 output_mode = ASCII; 02420 (*o_putc)(c1); 02421 } else if (c2 == JIS_X_0201_1976_K) { 02422 output_mode = EUC_JP; 02423 (*o_putc)(SS2); (*o_putc)(c1|0x80); 02424 } else if (c2 == ISO_8859_1) { 02425 output_mode = ISO_8859_1; 02426 (*o_putc)(c1 | 0x080); 02427 #ifdef X0212_ENABLE 02428 } else if (is_eucg3(c2)){ 02429 output_mode = EUC_JP; 02430 #ifdef SHIFTJIS_CP932 02431 if (!cp932inv_f){ 02432 nkf_char s2, s1; 02433 if (e2s_conv(c2, c1, &s2, &s1) == 0){ 02434 s2e_conv(s2, s1, &c2, &c1); 02435 } 02436 } 02437 #endif 02438 if (c2 == 0) { 02439 output_mode = ASCII; 02440 (*o_putc)(c1); 02441 }else if (is_eucg3(c2)){ 02442 if (x0212_f){ 02443 (*o_putc)(0x8f); 02444 (*o_putc)((c2 & 0x7f) | 0x080); 02445 (*o_putc)(c1 | 0x080); 02446 } 02447 }else{ 02448 (*o_putc)((c2 & 0x7f) | 0x080); 02449 (*o_putc)(c1 | 0x080); 02450 } 02451 #endif 02452 } else { 02453 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) { 02454 set_iconv(FALSE, 0); 02455 return; /* too late to rescue this char */ 02456 } 02457 output_mode = EUC_JP; 02458 (*o_putc)(c2 | 0x080); 02459 (*o_putc)(c1 | 0x080); 02460 } 02461 } 02462 02463 static void 02464 s_oconv(nkf_char c2, nkf_char c1) 02465 { 02466 #ifdef NUMCHAR_OPTION 02467 if (c2 == 0 && nkf_char_unicode_p(c1)){ 02468 w16e_conv(c1, &c2, &c1); 02469 if (c2 == 0 && nkf_char_unicode_p(c1)){ 02470 c2 = c1 & VALUE_MASK; 02471 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) { 02472 /* CP932 UDC */ 02473 c1 &= 0xFFF; 02474 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB); 02475 c1 = c1 % 188; 02476 c1 += 0x40 + (c1 > 0x3e); 02477 (*o_putc)(c2); 02478 (*o_putc)(c1); 02479 return; 02480 } else { 02481 if(encode_fallback)(*encode_fallback)(c1); 02482 return; 02483 } 02484 } 02485 } 02486 #endif 02487 if (c2 == EOF) { 02488 (*o_putc)(EOF); 02489 return; 02490 } else if (c2 == 0) { 02491 output_mode = ASCII; 02492 (*o_putc)(c1); 02493 } else if (c2 == JIS_X_0201_1976_K) { 02494 output_mode = SHIFT_JIS; 02495 (*o_putc)(c1|0x80); 02496 } else if (c2 == ISO_8859_1) { 02497 output_mode = ISO_8859_1; 02498 (*o_putc)(c1 | 0x080); 02499 #ifdef X0212_ENABLE 02500 } else if (is_eucg3(c2)){ 02501 output_mode = SHIFT_JIS; 02502 if (e2s_conv(c2, c1, &c2, &c1) == 0){ 02503 (*o_putc)(c2); 02504 (*o_putc)(c1); 02505 } 02506 #endif 02507 } else { 02508 if (!nkf_isprint(c1) || !nkf_isprint(c2)) { 02509 set_iconv(FALSE, 0); 02510 return; /* too late to rescue this char */ 02511 } 02512 output_mode = SHIFT_JIS; 02513 e2s_conv(c2, c1, &c2, &c1); 02514 02515 #ifdef SHIFTJIS_CP932 02516 if (cp932inv_f 02517 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){ 02518 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40]; 02519 if (c){ 02520 c2 = c >> 8; 02521 c1 = c & 0xff; 02522 } 02523 } 02524 #endif /* SHIFTJIS_CP932 */ 02525 02526 (*o_putc)(c2); 02527 if (prefix_table[(unsigned char)c1]){ 02528 (*o_putc)(prefix_table[(unsigned char)c1]); 02529 } 02530 (*o_putc)(c1); 02531 } 02532 } 02533 02534 #ifdef UTF8_OUTPUT_ENABLE 02535 static void 02536 w_oconv(nkf_char c2, nkf_char c1) 02537 { 02538 nkf_char c3, c4; 02539 nkf_char val; 02540 02541 if (output_bom_f) { 02542 output_bom_f = FALSE; 02543 (*o_putc)('\357'); 02544 (*o_putc)('\273'); 02545 (*o_putc)('\277'); 02546 } 02547 02548 if (c2 == EOF) { 02549 (*o_putc)(EOF); 02550 return; 02551 } 02552 02553 if (c2 == 0 && nkf_char_unicode_p(c1)){ 02554 val = c1 & VALUE_MASK; 02555 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4); 02556 (*o_putc)(c1); 02557 if (c2) (*o_putc)(c2); 02558 if (c3) (*o_putc)(c3); 02559 if (c4) (*o_putc)(c4); 02560 return; 02561 } 02562 02563 if (c2 == 0) { 02564 (*o_putc)(c1); 02565 } else { 02566 val = e2w_conv(c2, c1); 02567 if (val){ 02568 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4); 02569 (*o_putc)(c1); 02570 if (c2) (*o_putc)(c2); 02571 if (c3) (*o_putc)(c3); 02572 if (c4) (*o_putc)(c4); 02573 } 02574 } 02575 } 02576 02577 static void 02578 w_oconv16(nkf_char c2, nkf_char c1) 02579 { 02580 if (output_bom_f) { 02581 output_bom_f = FALSE; 02582 if (output_endian == ENDIAN_LITTLE){ 02583 (*o_putc)(0xFF); 02584 (*o_putc)(0xFE); 02585 }else{ 02586 (*o_putc)(0xFE); 02587 (*o_putc)(0xFF); 02588 } 02589 } 02590 02591 if (c2 == EOF) { 02592 (*o_putc)(EOF); 02593 return; 02594 } 02595 02596 if (c2 == 0 && nkf_char_unicode_p(c1)) { 02597 if (nkf_char_unicode_bmp_p(c1)) { 02598 c2 = (c1 >> 8) & 0xff; 02599 c1 &= 0xff; 02600 } else { 02601 c1 &= VALUE_MASK; 02602 if (c1 <= UNICODE_MAX) { 02603 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */ 02604 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */ 02605 if (output_endian == ENDIAN_LITTLE){ 02606 (*o_putc)(c2 & 0xff); 02607 (*o_putc)((c2 >> 8) & 0xff); 02608 (*o_putc)(c1 & 0xff); 02609 (*o_putc)((c1 >> 8) & 0xff); 02610 }else{ 02611 (*o_putc)((c2 >> 8) & 0xff); 02612 (*o_putc)(c2 & 0xff); 02613 (*o_putc)((c1 >> 8) & 0xff); 02614 (*o_putc)(c1 & 0xff); 02615 } 02616 } 02617 return; 02618 } 02619 } else if (c2) { 02620 nkf_char val = e2w_conv(c2, c1); 02621 c2 = (val >> 8) & 0xff; 02622 c1 = val & 0xff; 02623 if (!val) return; 02624 } 02625 02626 if (output_endian == ENDIAN_LITTLE){ 02627 (*o_putc)(c1); 02628 (*o_putc)(c2); 02629 }else{ 02630 (*o_putc)(c2); 02631 (*o_putc)(c1); 02632 } 02633 } 02634 02635 static void 02636 w_oconv32(nkf_char c2, nkf_char c1) 02637 { 02638 if (output_bom_f) { 02639 output_bom_f = FALSE; 02640 if (output_endian == ENDIAN_LITTLE){ 02641 (*o_putc)(0xFF); 02642 (*o_putc)(0xFE); 02643 (*o_putc)(0); 02644 (*o_putc)(0); 02645 }else{ 02646 (*o_putc)(0); 02647 (*o_putc)(0); 02648 (*o_putc)(0xFE); 02649 (*o_putc)(0xFF); 02650 } 02651 } 02652 02653 if (c2 == EOF) { 02654 (*o_putc)(EOF); 02655 return; 02656 } 02657 02658 if (c2 == ISO_8859_1) { 02659 c1 |= 0x80; 02660 } else if (c2 == 0 && nkf_char_unicode_p(c1)) { 02661 c1 &= VALUE_MASK; 02662 } else if (c2) { 02663 c1 = e2w_conv(c2, c1); 02664 if (!c1) return; 02665 } 02666 if (output_endian == ENDIAN_LITTLE){ 02667 (*o_putc)( c1 & 0xFF); 02668 (*o_putc)((c1 >> 8) & 0xFF); 02669 (*o_putc)((c1 >> 16) & 0xFF); 02670 (*o_putc)(0); 02671 }else{ 02672 (*o_putc)(0); 02673 (*o_putc)((c1 >> 16) & 0xFF); 02674 (*o_putc)((c1 >> 8) & 0xFF); 02675 (*o_putc)( c1 & 0xFF); 02676 } 02677 } 02678 #endif 02679 02680 #define SCORE_L2 (1) /* Kanji Level 2 */ 02681 #define SCORE_KANA (SCORE_L2 << 1) /* Halfwidth Katakana */ 02682 #define SCORE_DEPEND (SCORE_KANA << 1) /* MD Characters */ 02683 #define SCORE_CP932 (SCORE_DEPEND << 1) /* IBM extended characters */ 02684 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */ 02685 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* Undefined Characters */ 02686 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME selected */ 02687 #define SCORE_ERROR (SCORE_iMIME << 1) /* Error */ 02688 02689 #define SCORE_INIT (SCORE_iMIME) 02690 02691 static const nkf_char score_table_A0[] = { 02692 0, 0, 0, 0, 02693 0, 0, 0, 0, 02694 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, 02695 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST, 02696 }; 02697 02698 static const nkf_char score_table_F0[] = { 02699 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2, 02700 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, 02701 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932, 02702 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR, 02703 }; 02704 02705 static void 02706 set_code_score(struct input_code *ptr, nkf_char score) 02707 { 02708 if (ptr){ 02709 ptr->score |= score; 02710 } 02711 } 02712 02713 static void 02714 clr_code_score(struct input_code *ptr, nkf_char score) 02715 { 02716 if (ptr){ 02717 ptr->score &= ~score; 02718 } 02719 } 02720 02721 static void 02722 code_score(struct input_code *ptr) 02723 { 02724 nkf_char c2 = ptr->buf[0]; 02725 #ifdef UTF8_OUTPUT_ENABLE 02726 nkf_char c1 = ptr->buf[1]; 02727 #endif 02728 if (c2 < 0){ 02729 set_code_score(ptr, SCORE_ERROR); 02730 }else if (c2 == SS2){ 02731 set_code_score(ptr, SCORE_KANA); 02732 }else if (c2 == 0x8f){ 02733 set_code_score(ptr, SCORE_X0212); 02734 #ifdef UTF8_OUTPUT_ENABLE 02735 }else if (!e2w_conv(c2, c1)){ 02736 set_code_score(ptr, SCORE_NO_EXIST); 02737 #endif 02738 }else if ((c2 & 0x70) == 0x20){ 02739 set_code_score(ptr, score_table_A0[c2 & 0x0f]); 02740 }else if ((c2 & 0x70) == 0x70){ 02741 set_code_score(ptr, score_table_F0[c2 & 0x0f]); 02742 }else if ((c2 & 0x70) >= 0x50){ 02743 set_code_score(ptr, SCORE_L2); 02744 } 02745 } 02746 02747 static void 02748 status_disable(struct input_code *ptr) 02749 { 02750 ptr->stat = -1; 02751 ptr->buf[0] = -1; 02752 code_score(ptr); 02753 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0); 02754 } 02755 02756 static void 02757 status_push_ch(struct input_code *ptr, nkf_char c) 02758 { 02759 ptr->buf[ptr->index++] = c; 02760 } 02761 02762 static void 02763 status_clear(struct input_code *ptr) 02764 { 02765 ptr->stat = 0; 02766 ptr->index = 0; 02767 } 02768 02769 static void 02770 status_reset(struct input_code *ptr) 02771 { 02772 status_clear(ptr); 02773 ptr->score = SCORE_INIT; 02774 } 02775 02776 static void 02777 status_reinit(struct input_code *ptr) 02778 { 02779 status_reset(ptr); 02780 ptr->_file_stat = 0; 02781 } 02782 02783 static void 02784 status_check(struct input_code *ptr, nkf_char c) 02785 { 02786 if (c <= DEL && estab_f){ 02787 status_reset(ptr); 02788 } 02789 } 02790 02791 static void 02792 s_status(struct input_code *ptr, nkf_char c) 02793 { 02794 switch(ptr->stat){ 02795 case -1: 02796 status_check(ptr, c); 02797 break; 02798 case 0: 02799 if (c <= DEL){ 02800 break; 02801 }else if (nkf_char_unicode_p(c)){ 02802 break; 02803 }else if (0xa1 <= c && c <= 0xdf){ 02804 status_push_ch(ptr, SS2); 02805 status_push_ch(ptr, c); 02806 code_score(ptr); 02807 status_clear(ptr); 02808 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){ 02809 ptr->stat = 1; 02810 status_push_ch(ptr, c); 02811 }else if (0xed <= c && c <= 0xee){ 02812 ptr->stat = 3; 02813 status_push_ch(ptr, c); 02814 #ifdef SHIFTJIS_CP932 02815 }else if (is_ibmext_in_sjis(c)){ 02816 ptr->stat = 2; 02817 status_push_ch(ptr, c); 02818 #endif /* SHIFTJIS_CP932 */ 02819 #ifdef X0212_ENABLE 02820 }else if (0xf0 <= c && c <= 0xfc){ 02821 ptr->stat = 1; 02822 status_push_ch(ptr, c); 02823 #endif /* X0212_ENABLE */ 02824 }else{ 02825 status_disable(ptr); 02826 } 02827 break; 02828 case 1: 02829 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){ 02830 status_push_ch(ptr, c); 02831 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]); 02832 code_score(ptr); 02833 status_clear(ptr); 02834 }else{ 02835 status_disable(ptr); 02836 } 02837 break; 02838 case 2: 02839 #ifdef SHIFTJIS_CP932 02840 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) { 02841 status_push_ch(ptr, c); 02842 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) { 02843 set_code_score(ptr, SCORE_CP932); 02844 status_clear(ptr); 02845 break; 02846 } 02847 } 02848 #endif /* SHIFTJIS_CP932 */ 02849 status_disable(ptr); 02850 break; 02851 case 3: 02852 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){ 02853 status_push_ch(ptr, c); 02854 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]); 02855 set_code_score(ptr, SCORE_CP932); 02856 status_clear(ptr); 02857 }else{ 02858 status_disable(ptr); 02859 } 02860 break; 02861 } 02862 } 02863 02864 static void 02865 e_status(struct input_code *ptr, nkf_char c) 02866 { 02867 switch (ptr->stat){ 02868 case -1: 02869 status_check(ptr, c); 02870 break; 02871 case 0: 02872 if (c <= DEL){ 02873 break; 02874 }else if (nkf_char_unicode_p(c)){ 02875 break; 02876 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){ 02877 ptr->stat = 1; 02878 status_push_ch(ptr, c); 02879 #ifdef X0212_ENABLE 02880 }else if (0x8f == c){ 02881 ptr->stat = 2; 02882 status_push_ch(ptr, c); 02883 #endif /* X0212_ENABLE */ 02884 }else{ 02885 status_disable(ptr); 02886 } 02887 break; 02888 case 1: 02889 if (0xa1 <= c && c <= 0xfe){ 02890 status_push_ch(ptr, c); 02891 code_score(ptr); 02892 status_clear(ptr); 02893 }else{ 02894 status_disable(ptr); 02895 } 02896 break; 02897 #ifdef X0212_ENABLE 02898 case 2: 02899 if (0xa1 <= c && c <= 0xfe){ 02900 ptr->stat = 1; 02901 status_push_ch(ptr, c); 02902 }else{ 02903 status_disable(ptr); 02904 } 02905 #endif /* X0212_ENABLE */ 02906 } 02907 } 02908 02909 #ifdef UTF8_INPUT_ENABLE 02910 static void 02911 w_status(struct input_code *ptr, nkf_char c) 02912 { 02913 switch (ptr->stat){ 02914 case -1: 02915 status_check(ptr, c); 02916 break; 02917 case 0: 02918 if (c <= DEL){ 02919 break; 02920 }else if (nkf_char_unicode_p(c)){ 02921 break; 02922 }else if (0xc0 <= c && c <= 0xdf){ 02923 ptr->stat = 1; 02924 status_push_ch(ptr, c); 02925 }else if (0xe0 <= c && c <= 0xef){ 02926 ptr->stat = 2; 02927 status_push_ch(ptr, c); 02928 }else if (0xf0 <= c && c <= 0xf4){ 02929 ptr->stat = 3; 02930 status_push_ch(ptr, c); 02931 }else{ 02932 status_disable(ptr); 02933 } 02934 break; 02935 case 1: 02936 case 2: 02937 if (0x80 <= c && c <= 0xbf){ 02938 status_push_ch(ptr, c); 02939 if (ptr->index > ptr->stat){ 02940 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb 02941 && ptr->buf[2] == 0xbf); 02942 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2], 02943 &ptr->buf[0], &ptr->buf[1]); 02944 if (!bom){ 02945 code_score(ptr); 02946 } 02947 status_clear(ptr); 02948 } 02949 }else{ 02950 status_disable(ptr); 02951 } 02952 break; 02953 case 3: 02954 if (0x80 <= c && c <= 0xbf){ 02955 if (ptr->index < ptr->stat){ 02956 status_push_ch(ptr, c); 02957 } else { 02958 status_clear(ptr); 02959 } 02960 }else{ 02961 status_disable(ptr); 02962 } 02963 break; 02964 } 02965 } 02966 #endif 02967 02968 static void 02969 code_status(nkf_char c) 02970 { 02971 int action_flag = 1; 02972 struct input_code *result = 0; 02973 struct input_code *p = input_code_list; 02974 while (p->name){ 02975 if (!p->status_func) { 02976 ++p; 02977 continue; 02978 } 02979 if (!p->status_func) 02980 continue; 02981 (p->status_func)(p, c); 02982 if (p->stat > 0){ 02983 action_flag = 0; 02984 }else if(p->stat == 0){ 02985 if (result){ 02986 action_flag = 0; 02987 }else{ 02988 result = p; 02989 } 02990 } 02991 ++p; 02992 } 02993 02994 if (action_flag){ 02995 if (result && !estab_f){ 02996 set_iconv(TRUE, result->iconv_func); 02997 }else if (c <= DEL){ 02998 struct input_code *ptr = input_code_list; 02999 while (ptr->name){ 03000 status_reset(ptr); 03001 ++ptr; 03002 } 03003 } 03004 } 03005 } 03006 03007 typedef struct { 03008 nkf_buf_t *std_gc_buf; 03009 nkf_char broken_state; 03010 nkf_buf_t *broken_buf; 03011 nkf_char mimeout_state; 03012 nkf_buf_t *nfc_buf; 03013 } nkf_state_t; 03014 03015 static nkf_state_t *nkf_state = NULL; 03016 03017 #define STD_GC_BUFSIZE (256) 03018 03019 static void 03020 nkf_state_init(void) 03021 { 03022 if (nkf_state) { 03023 nkf_buf_clear(nkf_state->std_gc_buf); 03024 nkf_buf_clear(nkf_state->broken_buf); 03025 nkf_buf_clear(nkf_state->nfc_buf); 03026 } 03027 else { 03028 nkf_state = nkf_xmalloc(sizeof(nkf_state_t)); 03029 nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE); 03030 nkf_state->broken_buf = nkf_buf_new(3); 03031 nkf_state->nfc_buf = nkf_buf_new(9); 03032 } 03033 nkf_state->broken_state = 0; 03034 nkf_state->mimeout_state = 0; 03035 } 03036 03037 #ifndef WIN32DLL 03038 static nkf_char 03039 std_getc(FILE *f) 03040 { 03041 if (!nkf_buf_empty_p(nkf_state->std_gc_buf)){ 03042 return nkf_buf_pop(nkf_state->std_gc_buf); 03043 } 03044 return getc(f); 03045 } 03046 #endif /*WIN32DLL*/ 03047 03048 static nkf_char 03049 std_ungetc(nkf_char c, FILE *f) 03050 { 03051 nkf_buf_push(nkf_state->std_gc_buf, c); 03052 return c; 03053 } 03054 03055 #ifndef WIN32DLL 03056 static void 03057 std_putc(nkf_char c) 03058 { 03059 if(c!=EOF) 03060 putchar(c); 03061 } 03062 #endif /*WIN32DLL*/ 03063 03064 static nkf_char hold_buf[HOLD_SIZE*2]; 03065 static int hold_count = 0; 03066 static nkf_char 03067 push_hold_buf(nkf_char c2) 03068 { 03069 if (hold_count >= HOLD_SIZE*2) 03070 return (EOF); 03071 hold_buf[hold_count++] = c2; 03072 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count); 03073 } 03074 03075 static int 03076 h_conv(FILE *f, nkf_char c1, nkf_char c2) 03077 { 03078 int ret; 03079 int hold_index; 03080 nkf_char c3, c4; 03081 03086 hold_count = 0; 03087 push_hold_buf(c1); 03088 push_hold_buf(c2); 03089 03090 while ((c2 = (*i_getc)(f)) != EOF) { 03091 if (c2 == ESC){ 03092 (*i_ungetc)(c2,f); 03093 break; 03094 } 03095 code_status(c2); 03096 if (push_hold_buf(c2) == EOF || estab_f) { 03097 break; 03098 } 03099 } 03100 03101 if (!estab_f) { 03102 struct input_code *p = input_code_list; 03103 struct input_code *result = p; 03104 if (c2 == EOF) { 03105 code_status(c2); 03106 } 03107 while (p->name) { 03108 if (p->status_func && p->score < result->score) { 03109 result = p; 03110 } 03111 p++; 03112 } 03113 set_iconv(TRUE, result->iconv_func); 03114 } 03115 03116 03126 ret = c2; 03127 hold_index = 0; 03128 while (hold_index < hold_count){ 03129 c1 = hold_buf[hold_index++]; 03130 if (nkf_char_unicode_p(c1)) { 03131 (*oconv)(0, c1); 03132 continue; 03133 } 03134 else if (c1 <= DEL){ 03135 (*iconv)(0, c1, 0); 03136 continue; 03137 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){ 03138 (*iconv)(JIS_X_0201_1976_K, c1, 0); 03139 continue; 03140 } 03141 if (hold_index < hold_count){ 03142 c2 = hold_buf[hold_index++]; 03143 }else{ 03144 c2 = (*i_getc)(f); 03145 if (c2 == EOF){ 03146 c4 = EOF; 03147 break; 03148 } 03149 code_status(c2); 03150 } 03151 c3 = 0; 03152 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */ 03153 case -2: 03154 /* 4 bytes UTF-8 */ 03155 if (hold_index < hold_count){ 03156 c3 = hold_buf[hold_index++]; 03157 } else if ((c3 = (*i_getc)(f)) == EOF) { 03158 ret = EOF; 03159 break; 03160 } 03161 code_status(c3); 03162 if (hold_index < hold_count){ 03163 c4 = hold_buf[hold_index++]; 03164 } else if ((c4 = (*i_getc)(f)) == EOF) { 03165 c3 = ret = EOF; 03166 break; 03167 } 03168 code_status(c4); 03169 (*iconv)(c1, c2, (c3<<8)|c4); 03170 break; 03171 case -1: 03172 /* 3 bytes EUC or UTF-8 */ 03173 if (hold_index < hold_count){ 03174 c3 = hold_buf[hold_index++]; 03175 } else if ((c3 = (*i_getc)(f)) == EOF) { 03176 ret = EOF; 03177 break; 03178 } else { 03179 code_status(c3); 03180 } 03181 (*iconv)(c1, c2, c3); 03182 break; 03183 } 03184 if (c3 == EOF) break; 03185 } 03186 return ret; 03187 } 03188 03189 /* 03190 * Check and Ignore BOM 03191 */ 03192 static void 03193 check_bom(FILE *f) 03194 { 03195 int c2; 03196 switch(c2 = (*i_getc)(f)){ 03197 case 0x00: 03198 if((c2 = (*i_getc)(f)) == 0x00){ 03199 if((c2 = (*i_getc)(f)) == 0xFE){ 03200 if((c2 = (*i_getc)(f)) == 0xFF){ 03201 if(!input_encoding){ 03202 set_iconv(TRUE, w_iconv32); 03203 } 03204 if (iconv == w_iconv32) { 03205 input_endian = ENDIAN_BIG; 03206 return; 03207 } 03208 (*i_ungetc)(0xFF,f); 03209 }else (*i_ungetc)(c2,f); 03210 (*i_ungetc)(0xFE,f); 03211 }else if(c2 == 0xFF){ 03212 if((c2 = (*i_getc)(f)) == 0xFE){ 03213 if(!input_encoding){ 03214 set_iconv(TRUE, w_iconv32); 03215 } 03216 if (iconv == w_iconv32) { 03217 input_endian = ENDIAN_2143; 03218 return; 03219 } 03220 (*i_ungetc)(0xFF,f); 03221 }else (*i_ungetc)(c2,f); 03222 (*i_ungetc)(0xFF,f); 03223 }else (*i_ungetc)(c2,f); 03224 (*i_ungetc)(0x00,f); 03225 }else (*i_ungetc)(c2,f); 03226 (*i_ungetc)(0x00,f); 03227 break; 03228 case 0xEF: 03229 if((c2 = (*i_getc)(f)) == 0xBB){ 03230 if((c2 = (*i_getc)(f)) == 0xBF){ 03231 if(!input_encoding){ 03232 set_iconv(TRUE, w_iconv); 03233 } 03234 if (iconv == w_iconv) { 03235 return; 03236 } 03237 (*i_ungetc)(0xBF,f); 03238 }else (*i_ungetc)(c2,f); 03239 (*i_ungetc)(0xBB,f); 03240 }else (*i_ungetc)(c2,f); 03241 (*i_ungetc)(0xEF,f); 03242 break; 03243 case 0xFE: 03244 if((c2 = (*i_getc)(f)) == 0xFF){ 03245 if((c2 = (*i_getc)(f)) == 0x00){ 03246 if((c2 = (*i_getc)(f)) == 0x00){ 03247 if(!input_encoding){ 03248 set_iconv(TRUE, w_iconv32); 03249 } 03250 if (iconv == w_iconv32) { 03251 input_endian = ENDIAN_3412; 03252 return; 03253 } 03254 (*i_ungetc)(0x00,f); 03255 }else (*i_ungetc)(c2,f); 03256 (*i_ungetc)(0x00,f); 03257 }else (*i_ungetc)(c2,f); 03258 if(!input_encoding){ 03259 set_iconv(TRUE, w_iconv16); 03260 } 03261 if (iconv == w_iconv16) { 03262 input_endian = ENDIAN_BIG; 03263 return; 03264 } 03265 (*i_ungetc)(0xFF,f); 03266 }else (*i_ungetc)(c2,f); 03267 (*i_ungetc)(0xFE,f); 03268 break; 03269 case 0xFF: 03270 if((c2 = (*i_getc)(f)) == 0xFE){ 03271 if((c2 = (*i_getc)(f)) == 0x00){ 03272 if((c2 = (*i_getc)(f)) == 0x00){ 03273 if(!input_encoding){ 03274 set_iconv(TRUE, w_iconv32); 03275 } 03276 if (iconv == w_iconv32) { 03277 input_endian = ENDIAN_LITTLE; 03278 return; 03279 } 03280 (*i_ungetc)(0x00,f); 03281 }else (*i_ungetc)(c2,f); 03282 (*i_ungetc)(0x00,f); 03283 }else (*i_ungetc)(c2,f); 03284 if(!input_encoding){ 03285 set_iconv(TRUE, w_iconv16); 03286 } 03287 if (iconv == w_iconv16) { 03288 input_endian = ENDIAN_LITTLE; 03289 return; 03290 } 03291 (*i_ungetc)(0xFE,f); 03292 }else (*i_ungetc)(c2,f); 03293 (*i_ungetc)(0xFF,f); 03294 break; 03295 default: 03296 (*i_ungetc)(c2,f); 03297 break; 03298 } 03299 } 03300 03301 static nkf_char 03302 broken_getc(FILE *f) 03303 { 03304 nkf_char c, c1; 03305 03306 if (!nkf_buf_empty_p(nkf_state->broken_buf)) { 03307 return nkf_buf_pop(nkf_state->broken_buf); 03308 } 03309 c = (*i_bgetc)(f); 03310 if (c=='$' && nkf_state->broken_state != ESC 03311 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) { 03312 c1= (*i_bgetc)(f); 03313 nkf_state->broken_state = 0; 03314 if (c1=='@'|| c1=='B') { 03315 nkf_buf_push(nkf_state->broken_buf, c1); 03316 nkf_buf_push(nkf_state->broken_buf, c); 03317 return ESC; 03318 } else { 03319 (*i_bungetc)(c1,f); 03320 return c; 03321 } 03322 } else if (c=='(' && nkf_state->broken_state != ESC 03323 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) { 03324 c1= (*i_bgetc)(f); 03325 nkf_state->broken_state = 0; 03326 if (c1=='J'|| c1=='B') { 03327 nkf_buf_push(nkf_state->broken_buf, c1); 03328 nkf_buf_push(nkf_state->broken_buf, c); 03329 return ESC; 03330 } else { 03331 (*i_bungetc)(c1,f); 03332 return c; 03333 } 03334 } else { 03335 nkf_state->broken_state = c; 03336 return c; 03337 } 03338 } 03339 03340 static nkf_char 03341 broken_ungetc(nkf_char c, FILE *f) 03342 { 03343 if (nkf_buf_length(nkf_state->broken_buf) < 2) 03344 nkf_buf_push(nkf_state->broken_buf, c); 03345 return c; 03346 } 03347 03348 static void 03349 eol_conv(nkf_char c2, nkf_char c1) 03350 { 03351 if (guess_f && input_eol != EOF) { 03352 if (c2 == 0 && c1 == LF) { 03353 if (!input_eol) input_eol = prev_cr ? CRLF : LF; 03354 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF; 03355 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF; 03356 else if (!prev_cr); 03357 else if (!input_eol) input_eol = CR; 03358 else if (input_eol != CR) input_eol = EOF; 03359 } 03360 if (prev_cr || (c2 == 0 && c1 == LF)) { 03361 prev_cr = 0; 03362 if (eolmode_f != LF) (*o_eol_conv)(0, CR); 03363 if (eolmode_f != CR) (*o_eol_conv)(0, LF); 03364 } 03365 if (c2 == 0 && c1 == CR) prev_cr = CR; 03366 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1); 03367 } 03368 03369 static void 03370 put_newline(void (*func)(nkf_char)) 03371 { 03372 switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) { 03373 case CRLF: 03374 (*func)(0x0D); 03375 (*func)(0x0A); 03376 break; 03377 case CR: 03378 (*func)(0x0D); 03379 break; 03380 case LF: 03381 (*func)(0x0A); 03382 break; 03383 } 03384 } 03385 03386 static void 03387 oconv_newline(void (*func)(nkf_char, nkf_char)) 03388 { 03389 switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) { 03390 case CRLF: 03391 (*func)(0, 0x0D); 03392 (*func)(0, 0x0A); 03393 break; 03394 case CR: 03395 (*func)(0, 0x0D); 03396 break; 03397 case LF: 03398 (*func)(0, 0x0A); 03399 break; 03400 } 03401 } 03402 03403 /* 03404 Return value of fold_conv() 03405 03406 LF add newline and output char 03407 CR add newline and output nothing 03408 SP space 03409 0 skip 03410 1 (or else) normal output 03411 03412 fold state in prev (previous character) 03413 03414 >0x80 Japanese (X0208/X0201) 03415 <0x80 ASCII 03416 LF new line 03417 SP space 03418 03419 This fold algorthm does not preserve heading space in a line. 03420 This is the main difference from fmt. 03421 */ 03422 03423 #define char_size(c2,c1) (c2?2:1) 03424 03425 static void 03426 fold_conv(nkf_char c2, nkf_char c1) 03427 { 03428 nkf_char prev0; 03429 nkf_char fold_state; 03430 03431 if (c1== CR && !fold_preserve_f) { 03432 fold_state=0; /* ignore cr */ 03433 }else if (c1== LF&&f_prev==CR && fold_preserve_f) { 03434 f_prev = LF; 03435 fold_state=0; /* ignore cr */ 03436 } else if (c1== BS) { 03437 if (f_line>0) f_line--; 03438 fold_state = 1; 03439 } else if (c2==EOF && f_line != 0) { /* close open last line */ 03440 fold_state = LF; 03441 } else if ((c1==LF && !fold_preserve_f) 03442 || ((c1==CR||(c1==LF&&f_prev!=CR)) 03443 && fold_preserve_f)) { 03444 /* new line */ 03445 if (fold_preserve_f) { 03446 f_prev = c1; 03447 f_line = 0; 03448 fold_state = CR; 03449 } else if ((f_prev == c1 && !fold_preserve_f) 03450 || (f_prev == LF && fold_preserve_f) 03451 ) { /* duplicate newline */ 03452 if (f_line) { 03453 f_line = 0; 03454 fold_state = LF; /* output two newline */ 03455 } else { 03456 f_line = 0; 03457 fold_state = 1; 03458 } 03459 } else { 03460 if (f_prev&0x80) { /* Japanese? */ 03461 f_prev = c1; 03462 fold_state = 0; /* ignore given single newline */ 03463 } else if (f_prev==SP) { 03464 fold_state = 0; 03465 } else { 03466 f_prev = c1; 03467 if (++f_line<=fold_len) 03468 fold_state = SP; 03469 else { 03470 f_line = 0; 03471 fold_state = CR; /* fold and output nothing */ 03472 } 03473 } 03474 } 03475 } else if (c1=='\f') { 03476 f_prev = LF; 03477 f_line = 0; 03478 fold_state = LF; /* output newline and clear */ 03479 } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' && c1 == '!')) { 03480 /* X0208 kankaku or ascii space */ 03481 if (f_prev == SP) { 03482 fold_state = 0; /* remove duplicate spaces */ 03483 } else { 03484 f_prev = SP; 03485 if (++f_line<=fold_len) 03486 fold_state = SP; /* output ASCII space only */ 03487 else { 03488 f_prev = SP; f_line = 0; 03489 fold_state = CR; /* fold and output nothing */ 03490 } 03491 } 03492 } else { 03493 prev0 = f_prev; /* we still need this one... , but almost done */ 03494 f_prev = c1; 03495 if (c2 || c2 == JIS_X_0201_1976_K) 03496 f_prev |= 0x80; /* this is Japanese */ 03497 f_line += char_size(c2,c1); 03498 if (f_line<=fold_len) { /* normal case */ 03499 fold_state = 1; 03500 } else { 03501 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */ 03502 f_line = char_size(c2,c1); 03503 fold_state = LF; /* We can't wait, do fold now */ 03504 } else if (c2 == JIS_X_0201_1976_K) { 03505 /* simple kinsoku rules return 1 means no folding */ 03506 if (c1==(0xde&0x7f)) fold_state = 1; /* $B!+(B*/ 03507 else if (c1==(0xdf&0x7f)) fold_state = 1; /* $B!,(B*/ 03508 else if (c1==(0xa4&0x7f)) fold_state = 1; /* $B!#(B*/ 03509 else if (c1==(0xa3&0x7f)) fold_state = 1; /* $B!$(B*/ 03510 else if (c1==(0xa1&0x7f)) fold_state = 1; /* $B!W(B*/ 03511 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */ 03512 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */ 03513 f_line = 1; 03514 fold_state = LF;/* add one new f_line before this character */ 03515 } else { 03516 f_line = 1; 03517 fold_state = LF;/* add one new f_line before this character */ 03518 } 03519 } else if (c2==0) { 03520 /* kinsoku point in ASCII */ 03521 if ( c1==')'|| /* { [ ( */ 03522 c1==']'|| 03523 c1=='}'|| 03524 c1=='.'|| 03525 c1==','|| 03526 c1=='!'|| 03527 c1=='?'|| 03528 c1=='/'|| 03529 c1==':'|| 03530 c1==';') { 03531 fold_state = 1; 03532 /* just after special */ 03533 } else if (!is_alnum(prev0)) { 03534 f_line = char_size(c2,c1); 03535 fold_state = LF; 03536 } else if ((prev0==SP) || /* ignored new f_line */ 03537 (prev0==LF)|| /* ignored new f_line */ 03538 (prev0&0x80)) { /* X0208 - ASCII */ 03539 f_line = char_size(c2,c1); 03540 fold_state = LF;/* add one new f_line before this character */ 03541 } else { 03542 fold_state = 1; /* default no fold in ASCII */ 03543 } 03544 } else { 03545 if (c2=='!') { 03546 if (c1=='"') fold_state = 1; /* $B!"(B */ 03547 else if (c1=='#') fold_state = 1; /* $B!#(B */ 03548 else if (c1=='W') fold_state = 1; /* $B!W(B */ 03549 else if (c1=='K') fold_state = 1; /* $B!K(B */ 03550 else if (c1=='$') fold_state = 1; /* $B!$(B */ 03551 else if (c1=='%') fold_state = 1; /* $B!%(B */ 03552 else if (c1=='\'') fold_state = 1; /* $B!\(B */ 03553 else if (c1=='(') fold_state = 1; /* $B!((B */ 03554 else if (c1==')') fold_state = 1; /* $B!)(B */ 03555 else if (c1=='*') fold_state = 1; /* $B!*(B */ 03556 else if (c1=='+') fold_state = 1; /* $B!+(B */ 03557 else if (c1==',') fold_state = 1; /* $B!,(B */ 03558 /* default no fold in kinsoku */ 03559 else { 03560 fold_state = LF; 03561 f_line = char_size(c2,c1); 03562 /* add one new f_line before this character */ 03563 } 03564 } else { 03565 f_line = char_size(c2,c1); 03566 fold_state = LF; 03567 /* add one new f_line before this character */ 03568 } 03569 } 03570 } 03571 } 03572 /* terminator process */ 03573 switch(fold_state) { 03574 case LF: 03575 oconv_newline(o_fconv); 03576 (*o_fconv)(c2,c1); 03577 break; 03578 case 0: 03579 return; 03580 case CR: 03581 oconv_newline(o_fconv); 03582 break; 03583 case TAB: 03584 case SP: 03585 (*o_fconv)(0,SP); 03586 break; 03587 default: 03588 (*o_fconv)(c2,c1); 03589 } 03590 } 03591 03592 static nkf_char z_prev2=0,z_prev1=0; 03593 03594 static void 03595 z_conv(nkf_char c2, nkf_char c1) 03596 { 03597 03598 /* if (c2) c1 &= 0x7f; assertion */ 03599 03600 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) { 03601 (*o_zconv)(c2,c1); 03602 return; 03603 } 03604 03605 if (x0201_f) { 03606 if (z_prev2 == JIS_X_0201_1976_K) { 03607 if (c2 == JIS_X_0201_1976_K) { 03608 if (c1 == (0xde&0x7f)) { /* $BByE@(B */ 03609 z_prev2 = 0; 03610 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]); 03611 return; 03612 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* $BH>ByE@(B */ 03613 z_prev2 = 0; 03614 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]); 03615 return; 03616 } 03617 } 03618 z_prev2 = 0; 03619 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]); 03620 } 03621 if (c2 == JIS_X_0201_1976_K) { 03622 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) { 03623 /* wait for $BByE@(B or $BH>ByE@(B */ 03624 z_prev1 = c1; 03625 z_prev2 = c2; 03626 return; 03627 } else { 03628 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]); 03629 return; 03630 } 03631 } 03632 } 03633 03634 if (c2 == EOF) { 03635 (*o_zconv)(c2, c1); 03636 return; 03637 } 03638 03639 if (alpha_f&1 && c2 == 0x23) { 03640 /* JISX0208 Alphabet */ 03641 c2 = 0; 03642 } else if (c2 == 0x21) { 03643 /* JISX0208 Kigou */ 03644 if (0x21==c1) { 03645 if (alpha_f&2) { 03646 c2 = 0; 03647 c1 = SP; 03648 } else if (alpha_f&4) { 03649 (*o_zconv)(0, SP); 03650 (*o_zconv)(0, SP); 03651 return; 03652 } 03653 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) { 03654 c2 = 0; 03655 c1 = fv[c1-0x20]; 03656 } 03657 } 03658 03659 if (alpha_f&8 && c2 == 0) { 03660 /* HTML Entity */ 03661 const char *entity = 0; 03662 switch (c1){ 03663 case '>': entity = ">"; break; 03664 case '<': entity = "<"; break; 03665 case '\"': entity = """; break; 03666 case '&': entity = "&"; break; 03667 } 03668 if (entity){ 03669 while (*entity) (*o_zconv)(0, *entity++); 03670 return; 03671 } 03672 } 03673 03674 if (alpha_f & 16) { 03675 /* JIS X 0208 Katakana to JIS X 0201 Katakana */ 03676 if (c2 == 0x21) { 03677 nkf_char c = 0; 03678 switch (c1) { 03679 case 0x23: 03680 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */ 03681 c = 0xA1; 03682 break; 03683 case 0x56: 03684 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */ 03685 c = 0xA2; 03686 break; 03687 case 0x57: 03688 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */ 03689 c = 0xA3; 03690 break; 03691 case 0x22: 03692 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */ 03693 c = 0xA4; 03694 break; 03695 case 0x26: 03696 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */ 03697 c = 0xA5; 03698 break; 03699 case 0x3C: 03700 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */ 03701 c = 0xB0; 03702 break; 03703 case 0x2B: 03704 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */ 03705 c = 0xDE; 03706 break; 03707 case 0x2C: 03708 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */ 03709 c = 0xDF; 03710 break; 03711 } 03712 if (c) { 03713 (*o_zconv)(JIS_X_0201_1976_K, c); 03714 return; 03715 } 03716 } else if (c2 == 0x25) { 03717 /* JISX0208 Katakana */ 03718 static const int fullwidth_to_halfwidth[] = 03719 { 03720 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00, 03721 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800, 03722 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00, 03723 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000, 03724 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E, 03725 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00, 03726 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F, 03727 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000, 03728 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00, 03729 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00, 03730 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000, 03731 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 03732 }; 03733 if (fullwidth_to_halfwidth[c1-0x20]){ 03734 c2 = fullwidth_to_halfwidth[c1-0x20]; 03735 (*o_zconv)(JIS_X_0201_1976_K, c2>>8); 03736 if (c2 & 0xFF) { 03737 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF); 03738 } 03739 return; 03740 } 03741 } 03742 } 03743 (*o_zconv)(c2,c1); 03744 } 03745 03746 03747 #define rot13(c) ( \ 03748 ( c < 'A') ? c: \ 03749 (c <= 'M') ? (c + 13): \ 03750 (c <= 'Z') ? (c - 13): \ 03751 (c < 'a') ? (c): \ 03752 (c <= 'm') ? (c + 13): \ 03753 (c <= 'z') ? (c - 13): \ 03754 (c) \ 03755 ) 03756 03757 #define rot47(c) ( \ 03758 ( c < '!') ? c: \ 03759 ( c <= 'O') ? (c + 47) : \ 03760 ( c <= '~') ? (c - 47) : \ 03761 c \ 03762 ) 03763 03764 static void 03765 rot_conv(nkf_char c2, nkf_char c1) 03766 { 03767 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) { 03768 c1 = rot13(c1); 03769 } else if (c2) { 03770 c1 = rot47(c1); 03771 c2 = rot47(c2); 03772 } 03773 (*o_rot_conv)(c2,c1); 03774 } 03775 03776 static void 03777 hira_conv(nkf_char c2, nkf_char c1) 03778 { 03779 if (hira_f & 1) { 03780 if (c2 == 0x25) { 03781 if (0x20 < c1 && c1 < 0x74) { 03782 c2 = 0x24; 03783 (*o_hira_conv)(c2,c1); 03784 return; 03785 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) { 03786 c2 = 0; 03787 c1 = nkf_char_unicode_new(0x3094); 03788 (*o_hira_conv)(c2,c1); 03789 return; 03790 } 03791 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) { 03792 c1 += 2; 03793 (*o_hira_conv)(c2,c1); 03794 return; 03795 } 03796 } 03797 if (hira_f & 2) { 03798 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) { 03799 c2 = 0x25; 03800 c1 = 0x74; 03801 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) { 03802 c2 = 0x25; 03803 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) { 03804 c1 -= 2; 03805 } 03806 } 03807 (*o_hira_conv)(c2,c1); 03808 } 03809 03810 03811 static void 03812 iso2022jp_check_conv(nkf_char c2, nkf_char c1) 03813 { 03814 #define RANGE_NUM_MAX 18 03815 static const nkf_char range[RANGE_NUM_MAX][2] = { 03816 {0x222f, 0x2239,}, 03817 {0x2242, 0x2249,}, 03818 {0x2251, 0x225b,}, 03819 {0x226b, 0x2271,}, 03820 {0x227a, 0x227d,}, 03821 {0x2321, 0x232f,}, 03822 {0x233a, 0x2340,}, 03823 {0x235b, 0x2360,}, 03824 {0x237b, 0x237e,}, 03825 {0x2474, 0x247e,}, 03826 {0x2577, 0x257e,}, 03827 {0x2639, 0x2640,}, 03828 {0x2659, 0x267e,}, 03829 {0x2742, 0x2750,}, 03830 {0x2772, 0x277e,}, 03831 {0x2841, 0x287e,}, 03832 {0x4f54, 0x4f7e,}, 03833 {0x7425, 0x747e}, 03834 }; 03835 nkf_char i; 03836 nkf_char start, end, c; 03837 03838 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) { 03839 c2 = GETA1; 03840 c1 = GETA2; 03841 } 03842 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) { 03843 c2 = GETA1; 03844 c1 = GETA2; 03845 } 03846 03847 for (i = 0; i < RANGE_NUM_MAX; i++) { 03848 start = range[i][0]; 03849 end = range[i][1]; 03850 c = (c2 << 8) + c1; 03851 if (c >= start && c <= end) { 03852 c2 = GETA1; 03853 c1 = GETA2; 03854 } 03855 } 03856 (*o_iso2022jp_check_conv)(c2,c1); 03857 } 03858 03859 03860 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */ 03861 03862 static const unsigned char *mime_pattern[] = { 03863 (const unsigned char *)"\075?EUC-JP?B?", 03864 (const unsigned char *)"\075?SHIFT_JIS?B?", 03865 (const unsigned char *)"\075?ISO-8859-1?Q?", 03866 (const unsigned char *)"\075?ISO-8859-1?B?", 03867 (const unsigned char *)"\075?ISO-2022-JP?B?", 03868 (const unsigned char *)"\075?ISO-2022-JP?B?", 03869 (const unsigned char *)"\075?ISO-2022-JP?Q?", 03870 #if defined(UTF8_INPUT_ENABLE) 03871 (const unsigned char *)"\075?UTF-8?B?", 03872 (const unsigned char *)"\075?UTF-8?Q?", 03873 #endif 03874 (const unsigned char *)"\075?US-ASCII?Q?", 03875 NULL 03876 }; 03877 03878 03879 /* $B3:Ev$9$k%3!<%I$NM%@hEY$r>e$2$k$?$a$NL\0u(B */ 03880 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = { 03881 e_iconv, s_iconv, 0, 0, 0, 0, 0, 03882 #if defined(UTF8_INPUT_ENABLE) 03883 w_iconv, w_iconv, 03884 #endif 03885 0, 03886 }; 03887 03888 static const nkf_char mime_encode[] = { 03889 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K, JIS_X_0201_1976_K, 03890 #if defined(UTF8_INPUT_ENABLE) 03891 UTF_8, UTF_8, 03892 #endif 03893 ASCII, 03894 0 03895 }; 03896 03897 static const nkf_char mime_encode_method[] = { 03898 'B', 'B','Q', 'B', 'B', 'B', 'Q', 03899 #if defined(UTF8_INPUT_ENABLE) 03900 'B', 'Q', 03901 #endif 03902 'Q', 03903 0 03904 }; 03905 03906 03907 /* MIME preprocessor fifo */ 03908 03909 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */ 03910 #define MIME_BUF_MASK (MIME_BUF_SIZE-1) 03911 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK] 03912 static struct { 03913 unsigned char buf[MIME_BUF_SIZE]; 03914 unsigned int top; 03915 unsigned int last; /* decoded */ 03916 unsigned int input; /* undecoded */ 03917 } mime_input_state; 03918 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL; 03919 03920 #define MAXRECOVER 20 03921 03922 static void 03923 mime_input_buf_unshift(nkf_char c) 03924 { 03925 mime_input_buf(--mime_input_state.top) = (unsigned char)c; 03926 } 03927 03928 static nkf_char 03929 mime_ungetc(nkf_char c, FILE *f) 03930 { 03931 mime_input_buf_unshift(c); 03932 return c; 03933 } 03934 03935 static nkf_char 03936 mime_ungetc_buf(nkf_char c, FILE *f) 03937 { 03938 if (mimebuf_f) 03939 (*i_mungetc_buf)(c,f); 03940 else 03941 mime_input_buf(--mime_input_state.input) = (unsigned char)c; 03942 return c; 03943 } 03944 03945 static nkf_char 03946 mime_getc_buf(FILE *f) 03947 { 03948 /* we don't keep eof of mime_input_buf, becase it contains ?= as 03949 a terminator. It was checked in mime_integrity. */ 03950 return ((mimebuf_f)? 03951 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++)); 03952 } 03953 03954 static void 03955 switch_mime_getc(void) 03956 { 03957 if (i_getc!=mime_getc) { 03958 i_mgetc = i_getc; i_getc = mime_getc; 03959 i_mungetc = i_ungetc; i_ungetc = mime_ungetc; 03960 if(mime_f==STRICT_MIME) { 03961 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf; 03962 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf; 03963 } 03964 } 03965 } 03966 03967 static void 03968 unswitch_mime_getc(void) 03969 { 03970 if(mime_f==STRICT_MIME) { 03971 i_mgetc = i_mgetc_buf; 03972 i_mungetc = i_mungetc_buf; 03973 } 03974 i_getc = i_mgetc; 03975 i_ungetc = i_mungetc; 03976 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back); 03977 mime_iconv_back = NULL; 03978 } 03979 03980 static nkf_char 03981 mime_integrity(FILE *f, const unsigned char *p) 03982 { 03983 nkf_char c,d; 03984 unsigned int q; 03985 /* In buffered mode, read until =? or NL or buffer full 03986 */ 03987 mime_input_state.input = mime_input_state.top; 03988 mime_input_state.last = mime_input_state.top; 03989 03990 while(*p) mime_input_buf(mime_input_state.input++) = *p++; 03991 d = 0; 03992 q = mime_input_state.input; 03993 while((c=(*i_getc)(f))!=EOF) { 03994 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) { 03995 break; /* buffer full */ 03996 } 03997 if (c=='=' && d=='?') { 03998 /* checked. skip header, start decode */ 03999 mime_input_buf(mime_input_state.input++) = (unsigned char)c; 04000 /* mime_last_input = mime_input_state.input; */ 04001 mime_input_state.input = q; 04002 switch_mime_getc(); 04003 return 1; 04004 } 04005 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c)))) 04006 break; 04007 /* Should we check length mod 4? */ 04008 mime_input_buf(mime_input_state.input++) = (unsigned char)c; 04009 d=c; 04010 } 04011 /* In case of Incomplete MIME, no MIME decode */ 04012 mime_input_buf(mime_input_state.input++) = (unsigned char)c; 04013 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */ 04014 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */ 04015 switch_mime_getc(); /* anyway we need buffered getc */ 04016 return 1; 04017 } 04018 04019 static nkf_char 04020 mime_begin_strict(FILE *f) 04021 { 04022 nkf_char c1 = 0; 04023 int i,j,k; 04024 const unsigned char *p,*q; 04025 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */ 04026 04027 mime_decode_mode = FALSE; 04028 /* =? has been checked */ 04029 j = 0; 04030 p = mime_pattern[j]; 04031 r[0]='='; r[1]='?'; 04032 04033 for(i=2;p[i]>SP;i++) { /* start at =? */ 04034 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) { 04035 /* pattern fails, try next one */ 04036 q = p; 04037 while (mime_pattern[++j]) { 04038 p = mime_pattern[j]; 04039 for(k=2;k<i;k++) /* assume length(p) > i */ 04040 if (p[k]!=q[k]) break; 04041 if (k==i && nkf_toupper(c1)==p[k]) break; 04042 } 04043 p = mime_pattern[j]; 04044 if (p) continue; /* found next one, continue */ 04045 /* all fails, output from recovery buffer */ 04046 (*i_ungetc)(c1,f); 04047 for(j=0;j<i;j++) { 04048 (*oconv)(0,r[j]); 04049 } 04050 return c1; 04051 } 04052 } 04053 mime_decode_mode = p[i-2]; 04054 04055 mime_iconv_back = iconv; 04056 set_iconv(FALSE, mime_priority_func[j]); 04057 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME); 04058 04059 if (mime_decode_mode=='B') { 04060 mimebuf_f = unbuf_f; 04061 if (!unbuf_f) { 04062 /* do MIME integrity check */ 04063 return mime_integrity(f,mime_pattern[j]); 04064 } 04065 } 04066 switch_mime_getc(); 04067 mimebuf_f = TRUE; 04068 return c1; 04069 } 04070 04071 static nkf_char 04072 mime_begin(FILE *f) 04073 { 04074 nkf_char c1; 04075 int i,k; 04076 04077 /* In NONSTRICT mode, only =? is checked. In case of failure, we */ 04078 /* re-read and convert again from mime_buffer. */ 04079 04080 /* =? has been checked */ 04081 k = mime_input_state.last; 04082 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?'; 04083 for(i=2;i<MAXRECOVER;i++) { /* start at =? */ 04084 /* We accept any character type even if it is breaked by new lines */ 04085 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1; 04086 if (c1==LF||c1==SP||c1==CR|| 04087 c1=='-'||c1=='_'||is_alnum(c1)) continue; 04088 if (c1=='=') { 04089 /* Failed. But this could be another MIME preemble */ 04090 (*i_ungetc)(c1,f); 04091 mime_input_state.last--; 04092 break; 04093 } 04094 if (c1!='?') break; 04095 else { 04096 /* c1=='?' */ 04097 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1; 04098 if (!(++i<MAXRECOVER) || c1==EOF) break; 04099 if (c1=='b'||c1=='B') { 04100 mime_decode_mode = 'B'; 04101 } else if (c1=='q'||c1=='Q') { 04102 mime_decode_mode = 'Q'; 04103 } else { 04104 break; 04105 } 04106 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1; 04107 if (!(++i<MAXRECOVER) || c1==EOF) break; 04108 if (c1!='?') { 04109 mime_decode_mode = FALSE; 04110 } 04111 break; 04112 } 04113 } 04114 switch_mime_getc(); 04115 if (!mime_decode_mode) { 04116 /* false MIME premble, restart from mime_buffer */ 04117 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */ 04118 /* Since we are in MIME mode until buffer becomes empty, */ 04119 /* we never go into mime_begin again for a while. */ 04120 return c1; 04121 } 04122 /* discard mime preemble, and goto MIME mode */ 04123 mime_input_state.last = k; 04124 /* do no MIME integrity check */ 04125 return c1; /* used only for checking EOF */ 04126 } 04127 04128 #ifdef CHECK_OPTION 04129 static void 04130 no_putc(nkf_char c) 04131 { 04132 ; 04133 } 04134 04135 static void 04136 debug(const char *str) 04137 { 04138 if (debug_f){ 04139 fprintf(stderr, "%s\n", str ? str : "NULL"); 04140 } 04141 } 04142 #endif 04143 04144 static void 04145 set_input_codename(const char *codename) 04146 { 04147 if (!input_codename) { 04148 input_codename = codename; 04149 } else if (strcmp(codename, input_codename) != 0) { 04150 input_codename = ""; 04151 } 04152 } 04153 04154 static const char* 04155 get_guessed_code(void) 04156 { 04157 if (input_codename && !*input_codename) { 04158 input_codename = "BINARY"; 04159 } else { 04160 struct input_code *p = find_inputcode_byfunc(iconv); 04161 if (!input_codename) { 04162 input_codename = "ASCII"; 04163 } else if (strcmp(input_codename, "Shift_JIS") == 0) { 04164 if (p->score & (SCORE_DEPEND|SCORE_CP932)) 04165 input_codename = "CP932"; 04166 } else if (strcmp(input_codename, "EUC-JP") == 0) { 04167 if (p->score & (SCORE_X0212)) 04168 input_codename = "EUCJP-MS"; 04169 else if (p->score & (SCORE_DEPEND|SCORE_CP932)) 04170 input_codename = "CP51932"; 04171 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) { 04172 if (p->score & (SCORE_KANA)) 04173 input_codename = "CP50221"; 04174 else if (p->score & (SCORE_DEPEND|SCORE_CP932)) 04175 input_codename = "CP50220"; 04176 } 04177 } 04178 return input_codename; 04179 } 04180 04181 #if !defined(PERL_XS) && !defined(WIN32DLL) 04182 static void 04183 print_guessed_code(char *filename) 04184 { 04185 if (filename != NULL) printf("%s: ", filename); 04186 if (input_codename && !*input_codename) { 04187 printf("BINARY\n"); 04188 } else { 04189 input_codename = get_guessed_code(); 04190 if (guess_f == 1) { 04191 printf("%s\n", input_codename); 04192 } else { 04193 printf("%s%s\n", 04194 input_codename, 04195 input_eol == CR ? " (CR)" : 04196 input_eol == LF ? " (LF)" : 04197 input_eol == CRLF ? " (CRLF)" : 04198 input_eol == EOF ? " (MIXED NL)" : 04199 ""); 04200 } 04201 } 04202 } 04203 #endif /*WIN32DLL*/ 04204 04205 #ifdef INPUT_OPTION 04206 04207 static nkf_char 04208 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f)) 04209 { 04210 nkf_char c1, c2, c3; 04211 c1 = (*g)(f); 04212 if (c1 != ch){ 04213 return c1; 04214 } 04215 c2 = (*g)(f); 04216 if (!nkf_isxdigit(c2)){ 04217 (*u)(c2, f); 04218 return c1; 04219 } 04220 c3 = (*g)(f); 04221 if (!nkf_isxdigit(c3)){ 04222 (*u)(c2, f); 04223 (*u)(c3, f); 04224 return c1; 04225 } 04226 return (hex2bin(c2) << 4) | hex2bin(c3); 04227 } 04228 04229 static nkf_char 04230 cap_getc(FILE *f) 04231 { 04232 return hex_getc(':', f, i_cgetc, i_cungetc); 04233 } 04234 04235 static nkf_char 04236 cap_ungetc(nkf_char c, FILE *f) 04237 { 04238 return (*i_cungetc)(c, f); 04239 } 04240 04241 static nkf_char 04242 url_getc(FILE *f) 04243 { 04244 return hex_getc('%', f, i_ugetc, i_uungetc); 04245 } 04246 04247 static nkf_char 04248 url_ungetc(nkf_char c, FILE *f) 04249 { 04250 return (*i_uungetc)(c, f); 04251 } 04252 #endif 04253 04254 #ifdef NUMCHAR_OPTION 04255 static nkf_char 04256 numchar_getc(FILE *f) 04257 { 04258 nkf_char (*g)(FILE *) = i_ngetc; 04259 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc; 04260 int i = 0, j; 04261 nkf_char buf[12]; 04262 nkf_char c = -1; 04263 04264 buf[i] = (*g)(f); 04265 if (buf[i] == '&'){ 04266 buf[++i] = (*g)(f); 04267 if (buf[i] == '#'){ 04268 c = 0; 04269 buf[++i] = (*g)(f); 04270 if (buf[i] == 'x' || buf[i] == 'X'){ 04271 for (j = 0; j < 7; j++){ 04272 buf[++i] = (*g)(f); 04273 if (!nkf_isxdigit(buf[i])){ 04274 if (buf[i] != ';'){ 04275 c = -1; 04276 } 04277 break; 04278 } 04279 c <<= 4; 04280 c |= hex2bin(buf[i]); 04281 } 04282 }else{ 04283 for (j = 0; j < 8; j++){ 04284 if (j){ 04285 buf[++i] = (*g)(f); 04286 } 04287 if (!nkf_isdigit(buf[i])){ 04288 if (buf[i] != ';'){ 04289 c = -1; 04290 } 04291 break; 04292 } 04293 c *= 10; 04294 c += hex2bin(buf[i]); 04295 } 04296 } 04297 } 04298 } 04299 if (c != -1){ 04300 return nkf_char_unicode_new(c); 04301 } 04302 while (i > 0){ 04303 (*u)(buf[i], f); 04304 --i; 04305 } 04306 return buf[0]; 04307 } 04308 04309 static nkf_char 04310 numchar_ungetc(nkf_char c, FILE *f) 04311 { 04312 return (*i_nungetc)(c, f); 04313 } 04314 #endif 04315 04316 #ifdef UNICODE_NORMALIZATION 04317 04318 static nkf_char 04319 nfc_getc(FILE *f) 04320 { 04321 nkf_char (*g)(FILE *f) = i_nfc_getc; 04322 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc; 04323 nkf_buf_t *buf = nkf_state->nfc_buf; 04324 const unsigned char *array; 04325 int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1; 04326 nkf_char c = (*g)(f); 04327 04328 if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c; 04329 04330 nkf_buf_push(buf, c); 04331 do { 04332 while (lower <= upper) { 04333 int mid = (lower+upper) / 2; 04334 int len; 04335 array = normalization_table[mid].nfd; 04336 for (len=0; len < NORMALIZATION_TABLE_NFD_LENGTH && array[len]; len++) { 04337 if (len >= nkf_buf_length(buf)) { 04338 c = (*g)(f); 04339 if (c == EOF) { 04340 len = 0; 04341 lower = 1, upper = 0; 04342 break; 04343 } 04344 nkf_buf_push(buf, c); 04345 } 04346 if (array[len] != nkf_buf_at(buf, len)) { 04347 if (array[len] < nkf_buf_at(buf, len)) lower = mid + 1; 04348 else upper = mid - 1; 04349 len = 0; 04350 break; 04351 } 04352 } 04353 if (len > 0) { 04354 int i; 04355 array = normalization_table[mid].nfc; 04356 nkf_buf_clear(buf); 04357 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++) 04358 nkf_buf_push(buf, array[i]); 04359 break; 04360 } 04361 } 04362 } while (lower <= upper); 04363 04364 while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f); 04365 c = nkf_buf_pop(buf); 04366 04367 return c; 04368 } 04369 04370 static nkf_char 04371 nfc_ungetc(nkf_char c, FILE *f) 04372 { 04373 return (*i_nfc_ungetc)(c, f); 04374 } 04375 #endif /* UNICODE_NORMALIZATION */ 04376 04377 04378 static nkf_char 04379 base64decode(nkf_char c) 04380 { 04381 int i; 04382 if (c > '@') { 04383 if (c < '[') { 04384 i = c - 'A'; /* A..Z 0-25 */ 04385 } else if (c == '_') { 04386 i = '?' /* 63 */ ; /* _ 63 */ 04387 } else { 04388 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */ 04389 } 04390 } else if (c > '/') { 04391 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */ 04392 } else if (c == '+' || c == '-') { 04393 i = '>' /* 62 */ ; /* + and - 62 */ 04394 } else { 04395 i = '?' /* 63 */ ; /* / 63 */ 04396 } 04397 return (i); 04398 } 04399 04400 static nkf_char 04401 mime_getc(FILE *f) 04402 { 04403 nkf_char c1, c2, c3, c4, cc; 04404 nkf_char t1, t2, t3, t4, mode, exit_mode; 04405 nkf_char lwsp_count; 04406 char *lwsp_buf; 04407 char *lwsp_buf_new; 04408 nkf_char lwsp_size = 128; 04409 04410 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */ 04411 return mime_input_buf(mime_input_state.top++); 04412 } 04413 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) { 04414 mime_decode_mode=FALSE; 04415 unswitch_mime_getc(); 04416 return (*i_getc)(f); 04417 } 04418 04419 if (mimebuf_f == FIXED_MIME) 04420 exit_mode = mime_decode_mode; 04421 else 04422 exit_mode = FALSE; 04423 if (mime_decode_mode == 'Q') { 04424 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF); 04425 restart_mime_q: 04426 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP; 04427 if (c1<=SP || DEL<=c1) { 04428 mime_decode_mode = exit_mode; /* prepare for quit */ 04429 return c1; 04430 } 04431 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) { 04432 return c1; 04433 } 04434 04435 mime_decode_mode = exit_mode; /* prepare for quit */ 04436 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF); 04437 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) { 04438 /* end Q encoding */ 04439 input_mode = exit_mode; 04440 lwsp_count = 0; 04441 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char)); 04442 while ((c1=(*i_getc)(f))!=EOF) { 04443 switch (c1) { 04444 case LF: 04445 case CR: 04446 if (c1==LF) { 04447 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) { 04448 i_ungetc(SP,f); 04449 continue; 04450 } else { 04451 i_ungetc(c1,f); 04452 } 04453 c1 = LF; 04454 } else { 04455 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) { 04456 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) { 04457 i_ungetc(SP,f); 04458 continue; 04459 } else { 04460 i_ungetc(c1,f); 04461 } 04462 i_ungetc(LF,f); 04463 } else { 04464 i_ungetc(c1,f); 04465 } 04466 c1 = CR; 04467 } 04468 break; 04469 case SP: 04470 case TAB: 04471 lwsp_buf[lwsp_count] = (unsigned char)c1; 04472 if (lwsp_count++>lwsp_size){ 04473 lwsp_size <<= 1; 04474 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char)); 04475 lwsp_buf = lwsp_buf_new; 04476 } 04477 continue; 04478 } 04479 break; 04480 } 04481 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) { 04482 i_ungetc(c1,f); 04483 for(lwsp_count--;lwsp_count>0;lwsp_count--) 04484 i_ungetc(lwsp_buf[lwsp_count],f); 04485 c1 = lwsp_buf[0]; 04486 } 04487 nkf_xfree(lwsp_buf); 04488 return c1; 04489 } 04490 if (c1=='='&&c2<SP) { /* this is soft wrap */ 04491 while((c1 = (*i_mgetc)(f)) <=SP) { 04492 if (c1 == EOF) return (EOF); 04493 } 04494 mime_decode_mode = 'Q'; /* still in MIME */ 04495 goto restart_mime_q; 04496 } 04497 if (c1=='?') { 04498 mime_decode_mode = 'Q'; /* still in MIME */ 04499 (*i_mungetc)(c2,f); 04500 return c1; 04501 } 04502 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF); 04503 if (c2<=SP) return c2; 04504 mime_decode_mode = 'Q'; /* still in MIME */ 04505 return ((hex2bin(c2)<<4) + hex2bin(c3)); 04506 } 04507 04508 if (mime_decode_mode != 'B') { 04509 mime_decode_mode = FALSE; 04510 return (*i_mgetc)(f); 04511 } 04512 04513 04514 /* Base64 encoding */ 04515 /* 04516 MIME allows line break in the middle of 04517 Base64, but we are very pessimistic in decoding 04518 in unbuf mode because MIME encoded code may broken by 04519 less or editor's control sequence (such as ESC-[-K in unbuffered 04520 mode. ignore incomplete MIME. 04521 */ 04522 mode = mime_decode_mode; 04523 mime_decode_mode = exit_mode; /* prepare for quit */ 04524 04525 while ((c1 = (*i_mgetc)(f))<=SP) { 04526 if (c1==EOF) 04527 return (EOF); 04528 } 04529 mime_c2_retry: 04530 if ((c2 = (*i_mgetc)(f))<=SP) { 04531 if (c2==EOF) 04532 return (EOF); 04533 if (mime_f != STRICT_MIME) goto mime_c2_retry; 04534 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII; 04535 return c2; 04536 } 04537 if ((c1 == '?') && (c2 == '=')) { 04538 input_mode = ASCII; 04539 lwsp_count = 0; 04540 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char)); 04541 while ((c1=(*i_getc)(f))!=EOF) { 04542 switch (c1) { 04543 case LF: 04544 case CR: 04545 if (c1==LF) { 04546 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) { 04547 i_ungetc(SP,f); 04548 continue; 04549 } else { 04550 i_ungetc(c1,f); 04551 } 04552 c1 = LF; 04553 } else { 04554 if ((c1=(*i_getc)(f))!=EOF) { 04555 if (c1==SP) { 04556 i_ungetc(SP,f); 04557 continue; 04558 } else if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) { 04559 i_ungetc(SP,f); 04560 continue; 04561 } else { 04562 i_ungetc(c1,f); 04563 } 04564 i_ungetc(LF,f); 04565 } else { 04566 i_ungetc(c1,f); 04567 } 04568 c1 = CR; 04569 } 04570 break; 04571 case SP: 04572 case TAB: 04573 lwsp_buf[lwsp_count] = (unsigned char)c1; 04574 if (lwsp_count++>lwsp_size){ 04575 lwsp_size <<= 1; 04576 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char)); 04577 lwsp_buf = lwsp_buf_new; 04578 } 04579 continue; 04580 } 04581 break; 04582 } 04583 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) { 04584 i_ungetc(c1,f); 04585 for(lwsp_count--;lwsp_count>0;lwsp_count--) 04586 i_ungetc(lwsp_buf[lwsp_count],f); 04587 c1 = lwsp_buf[0]; 04588 } 04589 nkf_xfree(lwsp_buf); 04590 return c1; 04591 } 04592 mime_c3_retry: 04593 if ((c3 = (*i_mgetc)(f))<=SP) { 04594 if (c3==EOF) 04595 return (EOF); 04596 if (mime_f != STRICT_MIME) goto mime_c3_retry; 04597 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII; 04598 return c3; 04599 } 04600 mime_c4_retry: 04601 if ((c4 = (*i_mgetc)(f))<=SP) { 04602 if (c4==EOF) 04603 return (EOF); 04604 if (mime_f != STRICT_MIME) goto mime_c4_retry; 04605 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII; 04606 return c4; 04607 } 04608 04609 mime_decode_mode = mode; /* still in MIME sigh... */ 04610 04611 /* BASE 64 decoding */ 04612 04613 t1 = 0x3f & base64decode(c1); 04614 t2 = 0x3f & base64decode(c2); 04615 t3 = 0x3f & base64decode(c3); 04616 t4 = 0x3f & base64decode(c4); 04617 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03); 04618 if (c2 != '=') { 04619 mime_input_buf(mime_input_state.last++) = (unsigned char)cc; 04620 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f); 04621 if (c3 != '=') { 04622 mime_input_buf(mime_input_state.last++) = (unsigned char)cc; 04623 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f); 04624 if (c4 != '=') 04625 mime_input_buf(mime_input_state.last++) = (unsigned char)cc; 04626 } 04627 } else { 04628 return c1; 04629 } 04630 return mime_input_buf(mime_input_state.top++); 04631 } 04632 04633 static const char basis_64[] = 04634 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; 04635 04636 #define MIMEOUT_BUF_LENGTH 74 04637 static struct { 04638 unsigned char buf[MIMEOUT_BUF_LENGTH+1]; 04639 int count; 04640 } mimeout_state; 04641 04642 /*nkf_char mime_lastchar2, mime_lastchar1;*/ 04643 04644 static void 04645 open_mime(nkf_char mode) 04646 { 04647 const unsigned char *p; 04648 int i; 04649 int j; 04650 p = mime_pattern[0]; 04651 for(i=0;mime_pattern[i];i++) { 04652 if (mode == mime_encode[i]) { 04653 p = mime_pattern[i]; 04654 break; 04655 } 04656 } 04657 mimeout_mode = mime_encode_method[i]; 04658 i = 0; 04659 if (base64_count>45) { 04660 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){ 04661 (*o_mputc)(mimeout_state.buf[i]); 04662 i++; 04663 } 04664 put_newline(o_mputc); 04665 (*o_mputc)(SP); 04666 base64_count = 1; 04667 if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) { 04668 i++; 04669 } 04670 } 04671 for (;i<mimeout_state.count;i++) { 04672 if (nkf_isspace(mimeout_state.buf[i])) { 04673 (*o_mputc)(mimeout_state.buf[i]); 04674 base64_count ++; 04675 } else { 04676 break; 04677 } 04678 } 04679 while(*p) { 04680 (*o_mputc)(*p++); 04681 base64_count ++; 04682 } 04683 j = mimeout_state.count; 04684 mimeout_state.count = 0; 04685 for (;i<j;i++) { 04686 mime_putc(mimeout_state.buf[i]); 04687 } 04688 } 04689 04690 static void 04691 mime_prechar(nkf_char c2, nkf_char c1) 04692 { 04693 if (mimeout_mode > 0){ 04694 if (c2 == EOF){ 04695 if (base64_count + mimeout_state.count/3*4> 73){ 04696 (*o_base64conv)(EOF,0); 04697 oconv_newline(o_base64conv); 04698 (*o_base64conv)(0,SP); 04699 base64_count = 1; 04700 } 04701 } else { 04702 if ((c2 != 0 || c1 > DEL) && base64_count + mimeout_state.count/3*4> 66) { 04703 (*o_base64conv)(EOF,0); 04704 oconv_newline(o_base64conv); 04705 (*o_base64conv)(0,SP); 04706 base64_count = 1; 04707 mimeout_mode = -1; 04708 } 04709 } 04710 } else if (c2) { 04711 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) { 04712 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B'; 04713 open_mime(output_mode); 04714 (*o_base64conv)(EOF,0); 04715 oconv_newline(o_base64conv); 04716 (*o_base64conv)(0,SP); 04717 base64_count = 1; 04718 mimeout_mode = -1; 04719 } 04720 } 04721 } 04722 04723 static void 04724 close_mime(void) 04725 { 04726 (*o_mputc)('?'); 04727 (*o_mputc)('='); 04728 base64_count += 2; 04729 mimeout_mode = 0; 04730 } 04731 04732 static void 04733 eof_mime(void) 04734 { 04735 switch(mimeout_mode) { 04736 case 'Q': 04737 case 'B': 04738 break; 04739 case 2: 04740 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4)]); 04741 (*o_mputc)('='); 04742 (*o_mputc)('='); 04743 base64_count += 3; 04744 break; 04745 case 1: 04746 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2)]); 04747 (*o_mputc)('='); 04748 base64_count += 2; 04749 break; 04750 } 04751 if (mimeout_mode > 0) { 04752 if (mimeout_f!=FIXED_MIME) { 04753 close_mime(); 04754 } else if (mimeout_mode != 'Q') 04755 mimeout_mode = 'B'; 04756 } 04757 } 04758 04759 static void 04760 mimeout_addchar(nkf_char c) 04761 { 04762 switch(mimeout_mode) { 04763 case 'Q': 04764 if (c==CR||c==LF) { 04765 (*o_mputc)(c); 04766 base64_count = 0; 04767 } else if(!nkf_isalnum(c)) { 04768 (*o_mputc)('='); 04769 (*o_mputc)(bin2hex(((c>>4)&0xf))); 04770 (*o_mputc)(bin2hex((c&0xf))); 04771 base64_count += 3; 04772 } else { 04773 (*o_mputc)(c); 04774 base64_count++; 04775 } 04776 break; 04777 case 'B': 04778 nkf_state->mimeout_state=c; 04779 (*o_mputc)(basis_64[c>>2]); 04780 mimeout_mode=2; 04781 base64_count ++; 04782 break; 04783 case 2: 04784 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4) | ((c & 0xF0) >> 4)]); 04785 nkf_state->mimeout_state=c; 04786 mimeout_mode=1; 04787 base64_count ++; 04788 break; 04789 case 1: 04790 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2) | ((c & 0xC0) >>6)]); 04791 (*o_mputc)(basis_64[c & 0x3F]); 04792 mimeout_mode='B'; 04793 base64_count += 2; 04794 break; 04795 default: 04796 (*o_mputc)(c); 04797 base64_count++; 04798 break; 04799 } 04800 } 04801 04802 static void 04803 mime_putc(nkf_char c) 04804 { 04805 int i, j; 04806 nkf_char lastchar; 04807 04808 if (mimeout_f == FIXED_MIME){ 04809 if (mimeout_mode == 'Q'){ 04810 if (base64_count > 71){ 04811 if (c!=CR && c!=LF) { 04812 (*o_mputc)('='); 04813 put_newline(o_mputc); 04814 } 04815 base64_count = 0; 04816 } 04817 }else{ 04818 if (base64_count > 71){ 04819 eof_mime(); 04820 put_newline(o_mputc); 04821 base64_count = 0; 04822 } 04823 if (c == EOF) { /* c==EOF */ 04824 eof_mime(); 04825 } 04826 } 04827 if (c != EOF) { /* c==EOF */ 04828 mimeout_addchar(c); 04829 } 04830 return; 04831 } 04832 04833 /* mimeout_f != FIXED_MIME */ 04834 04835 if (c == EOF) { /* c==EOF */ 04836 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode); 04837 j = mimeout_state.count; 04838 mimeout_state.count = 0; 04839 i = 0; 04840 if (mimeout_mode > 0) { 04841 if (!nkf_isblank(mimeout_state.buf[j-1])) { 04842 for (;i<j;i++) { 04843 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){ 04844 break; 04845 } 04846 mimeout_addchar(mimeout_state.buf[i]); 04847 } 04848 eof_mime(); 04849 for (;i<j;i++) { 04850 mimeout_addchar(mimeout_state.buf[i]); 04851 } 04852 } else { 04853 for (;i<j;i++) { 04854 mimeout_addchar(mimeout_state.buf[i]); 04855 } 04856 eof_mime(); 04857 } 04858 } else { 04859 for (;i<j;i++) { 04860 mimeout_addchar(mimeout_state.buf[i]); 04861 } 04862 } 04863 return; 04864 } 04865 04866 if (mimeout_state.count > 0){ 04867 lastchar = mimeout_state.buf[mimeout_state.count - 1]; 04868 }else{ 04869 lastchar = -1; 04870 } 04871 04872 if (mimeout_mode=='Q') { 04873 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) { 04874 if (c == CR || c == LF) { 04875 close_mime(); 04876 (*o_mputc)(c); 04877 base64_count = 0; 04878 return; 04879 } else if (c <= SP) { 04880 close_mime(); 04881 if (base64_count > 70) { 04882 put_newline(o_mputc); 04883 base64_count = 0; 04884 } 04885 if (!nkf_isblank(c)) { 04886 (*o_mputc)(SP); 04887 base64_count++; 04888 } 04889 } else { 04890 if (base64_count > 70) { 04891 close_mime(); 04892 put_newline(o_mputc); 04893 (*o_mputc)(SP); 04894 base64_count = 1; 04895 open_mime(output_mode); 04896 } 04897 if (!nkf_noescape_mime(c)) { 04898 mimeout_addchar(c); 04899 return; 04900 } 04901 } 04902 if (c != 0x1B) { 04903 (*o_mputc)(c); 04904 base64_count++; 04905 return; 04906 } 04907 } 04908 } 04909 04910 if (mimeout_mode <= 0) { 04911 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 || 04912 output_mode == UTF_8)) { 04913 if (nkf_isspace(c)) { 04914 int flag = 0; 04915 if (mimeout_mode == -1) { 04916 flag = 1; 04917 } 04918 if (c==CR || c==LF) { 04919 if (flag) { 04920 open_mime(output_mode); 04921 output_mode = 0; 04922 } else { 04923 base64_count = 0; 04924 } 04925 } 04926 for (i=0;i<mimeout_state.count;i++) { 04927 (*o_mputc)(mimeout_state.buf[i]); 04928 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){ 04929 base64_count = 0; 04930 }else{ 04931 base64_count++; 04932 } 04933 } 04934 if (flag) { 04935 eof_mime(); 04936 base64_count = 0; 04937 mimeout_mode = 0; 04938 } 04939 mimeout_state.buf[0] = (char)c; 04940 mimeout_state.count = 1; 04941 }else{ 04942 if (base64_count > 1 04943 && base64_count + mimeout_state.count > 76 04944 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){ 04945 static const char *str = "boundary=\""; 04946 static int len = 10; 04947 i = 0; 04948 04949 for (; i < mimeout_state.count - len; ++i) { 04950 if (!strncmp((char *)(mimeout_state.buf+i), str, len)) { 04951 i += len - 2; 04952 break; 04953 } 04954 } 04955 04956 if (i == 0 || i == mimeout_state.count - len) { 04957 put_newline(o_mputc); 04958 base64_count = 0; 04959 if (!nkf_isspace(mimeout_state.buf[0])){ 04960 (*o_mputc)(SP); 04961 base64_count++; 04962 } 04963 } 04964 else { 04965 int j; 04966 for (j = 0; j <= i; ++j) { 04967 (*o_mputc)(mimeout_state.buf[j]); 04968 } 04969 put_newline(o_mputc); 04970 base64_count = 1; 04971 for (; j <= mimeout_state.count; ++j) { 04972 mimeout_state.buf[j - i] = mimeout_state.buf[j]; 04973 } 04974 mimeout_state.count -= i; 04975 } 04976 } 04977 mimeout_state.buf[mimeout_state.count++] = (char)c; 04978 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) { 04979 open_mime(output_mode); 04980 } 04981 } 04982 return; 04983 }else{ 04984 if (lastchar==CR || lastchar == LF){ 04985 for (i=0;i<mimeout_state.count;i++) { 04986 (*o_mputc)(mimeout_state.buf[i]); 04987 } 04988 base64_count = 0; 04989 mimeout_state.count = 0; 04990 } 04991 if (lastchar==SP) { 04992 for (i=0;i<mimeout_state.count-1;i++) { 04993 (*o_mputc)(mimeout_state.buf[i]); 04994 base64_count++; 04995 } 04996 mimeout_state.buf[0] = SP; 04997 mimeout_state.count = 1; 04998 } 04999 open_mime(output_mode); 05000 } 05001 }else{ 05002 /* mimeout_mode == 'B', 1, 2 */ 05003 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 || 05004 output_mode == UTF_8)) { 05005 if (lastchar == CR || lastchar == LF){ 05006 if (nkf_isblank(c)) { 05007 for (i=0;i<mimeout_state.count;i++) { 05008 mimeout_addchar(mimeout_state.buf[i]); 05009 } 05010 mimeout_state.count = 0; 05011 } else { 05012 eof_mime(); 05013 for (i=0;i<mimeout_state.count;i++) { 05014 (*o_mputc)(mimeout_state.buf[i]); 05015 } 05016 base64_count = 0; 05017 mimeout_state.count = 0; 05018 } 05019 mimeout_state.buf[mimeout_state.count++] = (char)c; 05020 return; 05021 } 05022 if (nkf_isspace(c)) { 05023 for (i=0;i<mimeout_state.count;i++) { 05024 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) { 05025 eof_mime(); 05026 for (i=0;i<mimeout_state.count;i++) { 05027 (*o_mputc)(mimeout_state.buf[i]); 05028 base64_count++; 05029 } 05030 mimeout_state.count = 0; 05031 } 05032 } 05033 mimeout_state.buf[mimeout_state.count++] = (char)c; 05034 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) { 05035 eof_mime(); 05036 for (i=0;i<mimeout_state.count;i++) { 05037 (*o_mputc)(mimeout_state.buf[i]); 05038 base64_count++; 05039 } 05040 mimeout_state.count = 0; 05041 } 05042 return; 05043 } 05044 if (mimeout_state.count>0 && SP<c && c!='=') { 05045 mimeout_state.buf[mimeout_state.count++] = (char)c; 05046 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) { 05047 j = mimeout_state.count; 05048 mimeout_state.count = 0; 05049 for (i=0;i<j;i++) { 05050 mimeout_addchar(mimeout_state.buf[i]); 05051 } 05052 } 05053 return; 05054 } 05055 } 05056 } 05057 if (mimeout_state.count>0) { 05058 j = mimeout_state.count; 05059 mimeout_state.count = 0; 05060 for (i=0;i<j;i++) { 05061 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF) 05062 break; 05063 mimeout_addchar(mimeout_state.buf[i]); 05064 } 05065 if (i<j) { 05066 eof_mime(); 05067 base64_count=0; 05068 for (;i<j;i++) { 05069 (*o_mputc)(mimeout_state.buf[i]); 05070 } 05071 open_mime(output_mode); 05072 } 05073 } 05074 mimeout_addchar(c); 05075 } 05076 05077 static void 05078 base64_conv(nkf_char c2, nkf_char c1) 05079 { 05080 mime_prechar(c2, c1); 05081 (*o_base64conv)(c2,c1); 05082 } 05083 05084 #ifdef HAVE_ICONV_H 05085 typedef struct nkf_iconv_t { 05086 iconv_t cd; 05087 char *input_buffer; 05088 size_t input_buffer_size; 05089 char *output_buffer; 05090 size_t output_buffer_size; 05091 } 05092 05093 static nkf_iconv_t 05094 nkf_iconv_new(char *tocode, char *fromcode) 05095 { 05096 nkf_iconv_t converter; 05097 05098 converter->input_buffer_size = IOBUF_SIZE; 05099 converter->input_buffer = nkf_xmalloc(converter->input_buffer_size); 05100 converter->output_buffer_size = IOBUF_SIZE * 2; 05101 converter->output_buffer = nkf_xmalloc(converter->output_buffer_size); 05102 converter->cd = iconv_open(tocode, fromcode); 05103 if (converter->cd == (iconv_t)-1) 05104 { 05105 switch (errno) { 05106 case EINVAL: 05107 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode)); 05108 return -1; 05109 default: 05110 perror("can't iconv_open"); 05111 } 05112 } 05113 } 05114 05115 static size_t 05116 nkf_iconv_convert(nkf_iconv_t *converter, FILE *input) 05117 { 05118 size_t invalid = (size_t)0; 05119 char *input_buffer = converter->input_buffer; 05120 size_t input_length = (size_t)0; 05121 char *output_buffer = converter->output_buffer; 05122 size_t output_length = converter->output_buffer_size; 05123 int c; 05124 05125 do { 05126 if (c != EOF) { 05127 while ((c = (*i_getc)(f)) != EOF) { 05128 input_buffer[input_length++] = c; 05129 if (input_length < converter->input_buffer_size) break; 05130 } 05131 } 05132 05133 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length); 05134 while (output_length-- > 0) { 05135 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]); 05136 } 05137 if (ret == (size_t) - 1) { 05138 switch (errno) { 05139 case EINVAL: 05140 if (input_buffer != converter->input_buffer) 05141 memmove(converter->input_buffer, input_buffer, input_length); 05142 break; 05143 case E2BIG: 05144 converter->output_buffer_size *= 2; 05145 output_buffer = realloc(converter->outbuf, converter->output_buffer_size); 05146 if (output_buffer == NULL) { 05147 perror("can't realloc"); 05148 return -1; 05149 } 05150 converter->output_buffer = output_buffer; 05151 break; 05152 default: 05153 perror("can't iconv"); 05154 return -1; 05155 } 05156 } else { 05157 invalid += ret; 05158 } 05159 } while (1); 05160 05161 return invalid; 05162 } 05163 05164 05165 static void 05166 nkf_iconv_close(nkf_iconv_t *convert) 05167 { 05168 nkf_xfree(converter->inbuf); 05169 nkf_xfree(converter->outbuf); 05170 iconv_close(converter->cd); 05171 } 05172 #endif 05173 05174 05175 static void 05176 reinit(void) 05177 { 05178 { 05179 struct input_code *p = input_code_list; 05180 while (p->name){ 05181 status_reinit(p++); 05182 } 05183 } 05184 unbuf_f = FALSE; 05185 estab_f = FALSE; 05186 nop_f = FALSE; 05187 binmode_f = TRUE; 05188 rot_f = FALSE; 05189 hira_f = FALSE; 05190 alpha_f = FALSE; 05191 mime_f = MIME_DECODE_DEFAULT; 05192 mime_decode_f = FALSE; 05193 mimebuf_f = FALSE; 05194 broken_f = FALSE; 05195 iso8859_f = FALSE; 05196 mimeout_f = FALSE; 05197 x0201_f = NKF_UNSPECIFIED; 05198 iso2022jp_f = FALSE; 05199 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE) 05200 ms_ucs_map_f = UCS_MAP_ASCII; 05201 #endif 05202 #ifdef UTF8_INPUT_ENABLE 05203 no_cp932ext_f = FALSE; 05204 no_best_fit_chars_f = FALSE; 05205 encode_fallback = NULL; 05206 unicode_subchar = '?'; 05207 input_endian = ENDIAN_BIG; 05208 #endif 05209 #ifdef UTF8_OUTPUT_ENABLE 05210 output_bom_f = FALSE; 05211 output_endian = ENDIAN_BIG; 05212 #endif 05213 #ifdef UNICODE_NORMALIZATION 05214 nfc_f = FALSE; 05215 #endif 05216 #ifdef INPUT_OPTION 05217 cap_f = FALSE; 05218 url_f = FALSE; 05219 numchar_f = FALSE; 05220 #endif 05221 #ifdef CHECK_OPTION 05222 noout_f = FALSE; 05223 debug_f = FALSE; 05224 #endif 05225 guess_f = 0; 05226 #ifdef EXEC_IO 05227 exec_f = 0; 05228 #endif 05229 #ifdef SHIFTJIS_CP932 05230 cp51932_f = TRUE; 05231 cp932inv_f = TRUE; 05232 #endif 05233 #ifdef X0212_ENABLE 05234 x0212_f = FALSE; 05235 x0213_f = FALSE; 05236 #endif 05237 { 05238 int i; 05239 for (i = 0; i < 256; i++){ 05240 prefix_table[i] = 0; 05241 } 05242 } 05243 hold_count = 0; 05244 mimeout_state.count = 0; 05245 mimeout_mode = 0; 05246 base64_count = 0; 05247 f_line = 0; 05248 f_prev = 0; 05249 fold_preserve_f = FALSE; 05250 fold_f = FALSE; 05251 fold_len = 0; 05252 kanji_intro = DEFAULT_J; 05253 ascii_intro = DEFAULT_R; 05254 fold_margin = FOLD_MARGIN; 05255 o_zconv = no_connection; 05256 o_fconv = no_connection; 05257 o_eol_conv = no_connection; 05258 o_rot_conv = no_connection; 05259 o_hira_conv = no_connection; 05260 o_base64conv = no_connection; 05261 o_iso2022jp_check_conv = no_connection; 05262 o_putc = std_putc; 05263 i_getc = std_getc; 05264 i_ungetc = std_ungetc; 05265 i_bgetc = std_getc; 05266 i_bungetc = std_ungetc; 05267 o_mputc = std_putc; 05268 i_mgetc = std_getc; 05269 i_mungetc = std_ungetc; 05270 i_mgetc_buf = std_getc; 05271 i_mungetc_buf = std_ungetc; 05272 output_mode = ASCII; 05273 input_mode = ASCII; 05274 mime_decode_mode = FALSE; 05275 file_out_f = FALSE; 05276 eolmode_f = 0; 05277 input_eol = 0; 05278 prev_cr = 0; 05279 option_mode = 0; 05280 z_prev2=0,z_prev1=0; 05281 #ifdef CHECK_OPTION 05282 iconv_for_check = 0; 05283 #endif 05284 input_codename = NULL; 05285 input_encoding = NULL; 05286 output_encoding = NULL; 05287 nkf_state_init(); 05288 #ifdef WIN32DLL 05289 reinitdll(); 05290 #endif /*WIN32DLL*/ 05291 } 05292 05293 static int 05294 module_connection(void) 05295 { 05296 if (input_encoding) set_input_encoding(input_encoding); 05297 if (!output_encoding) { 05298 output_encoding = nkf_default_encoding(); 05299 } 05300 if (!output_encoding) { 05301 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP); 05302 else return -1; 05303 } 05304 set_output_encoding(output_encoding); 05305 oconv = nkf_enc_to_oconv(output_encoding); 05306 o_putc = std_putc; 05307 if (nkf_enc_unicode_p(output_encoding)) 05308 output_mode = UTF_8; 05309 05310 if (x0201_f == NKF_UNSPECIFIED) { 05311 x0201_f = X0201_DEFAULT; 05312 } 05313 05314 /* replace continucation module, from output side */ 05315 05316 /* output redicrection */ 05317 #ifdef CHECK_OPTION 05318 if (noout_f || guess_f){ 05319 o_putc = no_putc; 05320 } 05321 #endif 05322 if (mimeout_f) { 05323 o_mputc = o_putc; 05324 o_putc = mime_putc; 05325 if (mimeout_f == TRUE) { 05326 o_base64conv = oconv; oconv = base64_conv; 05327 } 05328 /* base64_count = 0; */ 05329 } 05330 05331 if (eolmode_f || guess_f) { 05332 o_eol_conv = oconv; oconv = eol_conv; 05333 } 05334 if (rot_f) { 05335 o_rot_conv = oconv; oconv = rot_conv; 05336 } 05337 if (iso2022jp_f) { 05338 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv; 05339 } 05340 if (hira_f) { 05341 o_hira_conv = oconv; oconv = hira_conv; 05342 } 05343 if (fold_f) { 05344 o_fconv = oconv; oconv = fold_conv; 05345 f_line = 0; 05346 } 05347 if (alpha_f || x0201_f) { 05348 o_zconv = oconv; oconv = z_conv; 05349 } 05350 05351 i_getc = std_getc; 05352 i_ungetc = std_ungetc; 05353 /* input redicrection */ 05354 #ifdef INPUT_OPTION 05355 if (cap_f){ 05356 i_cgetc = i_getc; i_getc = cap_getc; 05357 i_cungetc = i_ungetc; i_ungetc= cap_ungetc; 05358 } 05359 if (url_f){ 05360 i_ugetc = i_getc; i_getc = url_getc; 05361 i_uungetc = i_ungetc; i_ungetc= url_ungetc; 05362 } 05363 #endif 05364 #ifdef NUMCHAR_OPTION 05365 if (numchar_f){ 05366 i_ngetc = i_getc; i_getc = numchar_getc; 05367 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc; 05368 } 05369 #endif 05370 #ifdef UNICODE_NORMALIZATION 05371 if (nfc_f){ 05372 i_nfc_getc = i_getc; i_getc = nfc_getc; 05373 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc; 05374 } 05375 #endif 05376 if (mime_f && mimebuf_f==FIXED_MIME) { 05377 i_mgetc = i_getc; i_getc = mime_getc; 05378 i_mungetc = i_ungetc; i_ungetc = mime_ungetc; 05379 } 05380 if (broken_f & 1) { 05381 i_bgetc = i_getc; i_getc = broken_getc; 05382 i_bungetc = i_ungetc; i_ungetc = broken_ungetc; 05383 } 05384 if (input_encoding) { 05385 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding)); 05386 } else { 05387 set_iconv(FALSE, e_iconv); 05388 } 05389 05390 { 05391 struct input_code *p = input_code_list; 05392 while (p->name){ 05393 status_reinit(p++); 05394 } 05395 } 05396 return 0; 05397 } 05398 05399 /* 05400 Conversion main loop. Code detection only. 05401 */ 05402 05403 #if !defined(PERL_XS) && !defined(WIN32DLL) 05404 static nkf_char 05405 noconvert(FILE *f) 05406 { 05407 nkf_char c; 05408 05409 if (nop_f == 2) 05410 module_connection(); 05411 while ((c = (*i_getc)(f)) != EOF) 05412 (*o_putc)(c); 05413 (*o_putc)(EOF); 05414 return 1; 05415 } 05416 #endif 05417 05418 #define NEXT continue /* no output, get next */ 05419 #define SKIP c2=0;continue /* no output, get next */ 05420 #define MORE c2=c1;continue /* need one more byte */ 05421 #define SEND (void)0 /* output c1 and c2, get next */ 05422 #define LAST break /* end of loop, go closing */ 05423 #define set_input_mode(mode) do { \ 05424 input_mode = mode; \ 05425 shift_mode = 0; \ 05426 set_input_codename("ISO-2022-JP"); \ 05427 debug("ISO-2022-JP"); \ 05428 } while (0) 05429 05430 static int 05431 kanji_convert(FILE *f) 05432 { 05433 nkf_char c1=0, c2=0, c3=0, c4=0; 05434 int shift_mode = 0; /* 0, 1, 2, 3 */ 05435 int g2 = 0; 05436 int is_8bit = FALSE; 05437 05438 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) { 05439 is_8bit = TRUE; 05440 } 05441 05442 input_mode = ASCII; 05443 output_mode = ASCII; 05444 05445 if (module_connection() < 0) { 05446 #if !defined(PERL_XS) && !defined(WIN32DLL) 05447 fprintf(stderr, "no output encoding given\n"); 05448 #endif 05449 return -1; 05450 } 05451 check_bom(f); 05452 05453 #ifdef UTF8_INPUT_ENABLE 05454 if(iconv == w_iconv32){ 05455 while ((c1 = (*i_getc)(f)) != EOF && 05456 (c2 = (*i_getc)(f)) != EOF && 05457 (c3 = (*i_getc)(f)) != EOF && 05458 (c4 = (*i_getc)(f)) != EOF) { 05459 nkf_iconv_utf_32(c1, c2, c3, c4); 05460 } 05461 goto finished; 05462 } 05463 else if (iconv == w_iconv16) { 05464 while ((c1 = (*i_getc)(f)) != EOF && 05465 (c2 = (*i_getc)(f)) != EOF) { 05466 if (nkf_iconv_utf_16(c1, c2, 0, 0) == NKF_ICONV_NEED_TWO_MORE_BYTES && 05467 (c3 = (*i_getc)(f)) != EOF && 05468 (c4 = (*i_getc)(f)) != EOF) { 05469 nkf_iconv_utf_16(c1, c2, c3, c4); 05470 } 05471 } 05472 goto finished; 05473 } 05474 #endif 05475 05476 while ((c1 = (*i_getc)(f)) != EOF) { 05477 #ifdef INPUT_CODE_FIX 05478 if (!input_encoding) 05479 #endif 05480 code_status(c1); 05481 if (c2) { 05482 /* second byte */ 05483 if (c2 > DEL) { 05484 /* in case of 8th bit is on */ 05485 if (!estab_f&&!mime_decode_mode) { 05486 /* in case of not established yet */ 05487 /* It is still ambiguious */ 05488 if (h_conv(f, c2, c1)==EOF) { 05489 LAST; 05490 } 05491 else { 05492 SKIP; 05493 } 05494 } 05495 else { 05496 /* in case of already established */ 05497 if (c1 < 0x40) { 05498 /* ignore bogus code */ 05499 SKIP; 05500 } else { 05501 SEND; 05502 } 05503 } 05504 } 05505 else { 05506 /* 2nd byte of 7 bit code or SJIS */ 05507 SEND; 05508 } 05509 } 05510 else if (nkf_char_unicode_p(c1)) { 05511 (*oconv)(0, c1); 05512 NEXT; 05513 } 05514 else { 05515 /* first byte */ 05516 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) { 05517 /* CP5022x */ 05518 MORE; 05519 }else if (input_codename && input_codename[0] == 'I' && 05520 0xA1 <= c1 && c1 <= 0xDF) { 05521 /* JIS X 0201 Katakana in 8bit JIS */ 05522 c2 = JIS_X_0201_1976_K; 05523 c1 &= 0x7f; 05524 SEND; 05525 } else if (c1 > DEL) { 05526 /* 8 bit code */ 05527 if (!estab_f && !iso8859_f) { 05528 /* not established yet */ 05529 MORE; 05530 } else { /* estab_f==TRUE */ 05531 if (iso8859_f) { 05532 c2 = ISO_8859_1; 05533 c1 &= 0x7f; 05534 SEND; 05535 } 05536 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) || 05537 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) { 05538 /* JIS X 0201 */ 05539 c2 = JIS_X_0201_1976_K; 05540 c1 &= 0x7f; 05541 SEND; 05542 } 05543 else { 05544 /* already established */ 05545 MORE; 05546 } 05547 } 05548 } else if (SP < c1 && c1 < DEL) { 05549 /* in case of Roman characters */ 05550 if (shift_mode) { 05551 /* output 1 shifted byte */ 05552 if (iso8859_f) { 05553 c2 = ISO_8859_1; 05554 SEND; 05555 } else if (nkf_byte_jisx0201_katakana_p(c1)){ 05556 /* output 1 shifted byte */ 05557 c2 = JIS_X_0201_1976_K; 05558 SEND; 05559 } else { 05560 /* look like bogus code */ 05561 SKIP; 05562 } 05563 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 || 05564 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) { 05565 /* in case of Kanji shifted */ 05566 MORE; 05567 } else if (c1 == '=' && mime_f && !mime_decode_mode) { 05568 /* Check MIME code */ 05569 if ((c1 = (*i_getc)(f)) == EOF) { 05570 (*oconv)(0, '='); 05571 LAST; 05572 } else if (c1 == '?') { 05573 /* =? is mime conversion start sequence */ 05574 if(mime_f == STRICT_MIME) { 05575 /* check in real detail */ 05576 if (mime_begin_strict(f) == EOF) 05577 LAST; 05578 SKIP; 05579 } else if (mime_begin(f) == EOF) 05580 LAST; 05581 SKIP; 05582 } else { 05583 (*oconv)(0, '='); 05584 (*i_ungetc)(c1,f); 05585 SKIP; 05586 } 05587 } else { 05588 /* normal ASCII code */ 05589 SEND; 05590 } 05591 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) { 05592 shift_mode = 0; 05593 SKIP; 05594 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) { 05595 shift_mode = 1; 05596 SKIP; 05597 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) { 05598 if ((c1 = (*i_getc)(f)) == EOF) { 05599 (*oconv)(0, ESC); 05600 LAST; 05601 } 05602 else if (c1 == '&') { 05603 /* IRR */ 05604 if ((c1 = (*i_getc)(f)) == EOF) { 05605 LAST; 05606 } else { 05607 SKIP; 05608 } 05609 } 05610 else if (c1 == '$') { 05611 /* GZDMx */ 05612 if ((c1 = (*i_getc)(f)) == EOF) { 05613 /* don't send bogus code 05614 (*oconv)(0, ESC); 05615 (*oconv)(0, '$'); */ 05616 LAST; 05617 } else if (c1 == '@' || c1 == 'B') { 05618 /* JIS X 0208 */ 05619 set_input_mode(JIS_X_0208); 05620 SKIP; 05621 } else if (c1 == '(') { 05622 /* GZDM4 */ 05623 if ((c1 = (*i_getc)(f)) == EOF) { 05624 /* don't send bogus code 05625 (*oconv)(0, ESC); 05626 (*oconv)(0, '$'); 05627 (*oconv)(0, '('); 05628 */ 05629 LAST; 05630 } else if (c1 == '@'|| c1 == 'B') { 05631 /* JIS X 0208 */ 05632 set_input_mode(JIS_X_0208); 05633 SKIP; 05634 #ifdef X0212_ENABLE 05635 } else if (c1 == 'D'){ 05636 set_input_mode(JIS_X_0212); 05637 SKIP; 05638 #endif /* X0212_ENABLE */ 05639 } else if (c1 == 'O' || c1 == 'Q'){ 05640 set_input_mode(JIS_X_0213_1); 05641 SKIP; 05642 } else if (c1 == 'P'){ 05643 set_input_mode(JIS_X_0213_2); 05644 SKIP; 05645 } else { 05646 /* could be some special code */ 05647 (*oconv)(0, ESC); 05648 (*oconv)(0, '$'); 05649 (*oconv)(0, '('); 05650 (*oconv)(0, c1); 05651 SKIP; 05652 } 05653 } else if (broken_f&0x2) { 05654 /* accept any ESC-(-x as broken code ... */ 05655 input_mode = JIS_X_0208; 05656 shift_mode = 0; 05657 SKIP; 05658 } else { 05659 (*oconv)(0, ESC); 05660 (*oconv)(0, '$'); 05661 (*oconv)(0, c1); 05662 SKIP; 05663 } 05664 } else if (c1 == '(') { 05665 /* GZD4 */ 05666 if ((c1 = (*i_getc)(f)) == EOF) { 05667 /* don't send bogus code 05668 (*oconv)(0, ESC); 05669 (*oconv)(0, '('); */ 05670 LAST; 05671 } 05672 else if (c1 == 'I') { 05673 /* JIS X 0201 Katakana */ 05674 set_input_mode(JIS_X_0201_1976_K); 05675 SKIP; 05676 } 05677 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') { 05678 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */ 05679 set_input_mode(ASCII); 05680 SKIP; 05681 } 05682 else if (broken_f&0x2) { 05683 set_input_mode(ASCII); 05684 SKIP; 05685 } 05686 else { 05687 (*oconv)(0, ESC); 05688 (*oconv)(0, '('); 05689 SEND; 05690 } 05691 } 05692 else if (c1 == '.') { 05693 /* G2D6 */ 05694 if ((c1 = (*i_getc)(f)) == EOF) { 05695 LAST; 05696 } 05697 else if (c1 == 'A') { 05698 /* ISO-8859-1 */ 05699 g2 = ISO_8859_1; 05700 SKIP; 05701 } 05702 else { 05703 (*oconv)(0, ESC); 05704 (*oconv)(0, '.'); 05705 SEND; 05706 } 05707 } 05708 else if (c1 == 'N') { 05709 /* SS2 */ 05710 c1 = (*i_getc)(f); 05711 if (g2 == ISO_8859_1) { 05712 c2 = ISO_8859_1; 05713 SEND; 05714 }else{ 05715 (*i_ungetc)(c1, f); 05716 /* lonely ESC */ 05717 (*oconv)(0, ESC); 05718 SEND; 05719 } 05720 } 05721 else { 05722 /* lonely ESC */ 05723 (*oconv)(0, ESC); 05724 SEND; 05725 } 05726 } else if (c1 == ESC && iconv == s_iconv) { 05727 /* ESC in Shift_JIS */ 05728 if ((c1 = (*i_getc)(f)) == EOF) { 05729 (*oconv)(0, ESC); 05730 LAST; 05731 } else if (c1 == '$') { 05732 /* J-PHONE emoji */ 05733 if ((c1 = (*i_getc)(f)) == EOF) { 05734 LAST; 05735 } else if (('E' <= c1 && c1 <= 'G') || 05736 ('O' <= c1 && c1 <= 'Q')) { 05737 /* 05738 NUM : 0 1 2 3 4 5 05739 BYTE: G E F O P Q 05740 C%7 : 1 6 0 2 3 4 05741 C%7 : 0 1 2 3 4 5 6 05742 NUM : 2 0 3 4 5 X 1 05743 */ 05744 static const nkf_char jphone_emoji_first_table[7] = 05745 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0}; 05746 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]); 05747 if ((c1 = (*i_getc)(f)) == EOF) LAST; 05748 while (SP <= c1 && c1 <= 'z') { 05749 (*oconv)(0, c1 + c3); 05750 if ((c1 = (*i_getc)(f)) == EOF) LAST; 05751 } 05752 SKIP; 05753 } 05754 else { 05755 (*oconv)(0, ESC); 05756 (*oconv)(0, '$'); 05757 SEND; 05758 } 05759 } 05760 else { 05761 /* lonely ESC */ 05762 (*oconv)(0, ESC); 05763 SEND; 05764 } 05765 } else if (c1 == LF || c1 == CR) { 05766 if (broken_f&4) { 05767 input_mode = ASCII; set_iconv(FALSE, 0); 05768 SEND; 05769 } else if (mime_decode_f && !mime_decode_mode){ 05770 if (c1 == LF) { 05771 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) { 05772 i_ungetc(SP,f); 05773 continue; 05774 } else { 05775 i_ungetc(c1,f); 05776 } 05777 c1 = LF; 05778 SEND; 05779 } else { /* if (c1 == CR)*/ 05780 if ((c1=(*i_getc)(f))!=EOF) { 05781 if (c1==SP) { 05782 i_ungetc(SP,f); 05783 continue; 05784 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) { 05785 i_ungetc(SP,f); 05786 continue; 05787 } else { 05788 i_ungetc(c1,f); 05789 } 05790 i_ungetc(LF,f); 05791 } else { 05792 i_ungetc(c1,f); 05793 } 05794 c1 = CR; 05795 SEND; 05796 } 05797 } 05798 } else 05799 SEND; 05800 } 05801 /* send: */ 05802 switch(input_mode){ 05803 case ASCII: 05804 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */ 05805 case -2: 05806 /* 4 bytes UTF-8 */ 05807 if ((c3 = (*i_getc)(f)) != EOF) { 05808 code_status(c3); 05809 c3 <<= 8; 05810 if ((c4 = (*i_getc)(f)) != EOF) { 05811 code_status(c4); 05812 (*iconv)(c2, c1, c3|c4); 05813 } 05814 } 05815 break; 05816 case -1: 05817 /* 3 bytes EUC or UTF-8 */ 05818 if ((c3 = (*i_getc)(f)) != EOF) { 05819 code_status(c3); 05820 (*iconv)(c2, c1, c3); 05821 } 05822 break; 05823 } 05824 break; 05825 case JIS_X_0208: 05826 case JIS_X_0213_1: 05827 if (ms_ucs_map_f && 05828 0x7F <= c2 && c2 <= 0x92 && 05829 0x21 <= c1 && c1 <= 0x7E) { 05830 /* CP932 UDC */ 05831 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000); 05832 c2 = 0; 05833 } 05834 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */ 05835 break; 05836 #ifdef X0212_ENABLE 05837 case JIS_X_0212: 05838 (*oconv)(PREFIX_EUCG3 | c2, c1); 05839 break; 05840 #endif /* X0212_ENABLE */ 05841 case JIS_X_0213_2: 05842 (*oconv)(PREFIX_EUCG3 | c2, c1); 05843 break; 05844 default: 05845 (*oconv)(input_mode, c1); /* other special case */ 05846 } 05847 05848 c2 = 0; 05849 c3 = 0; 05850 continue; 05851 /* goto next_word */ 05852 } 05853 05854 finished: 05855 /* epilogue */ 05856 (*iconv)(EOF, 0, 0); 05857 if (!input_codename) 05858 { 05859 if (is_8bit) { 05860 struct input_code *p = input_code_list; 05861 struct input_code *result = p; 05862 while (p->name){ 05863 if (p->score < result->score) result = p; 05864 ++p; 05865 } 05866 set_input_codename(result->name); 05867 #ifdef CHECK_OPTION 05868 debug(result->name); 05869 #endif 05870 } 05871 } 05872 return 0; 05873 } 05874 05875 /* 05876 * int options(unsigned char *cp) 05877 * 05878 * return values: 05879 * 0: success 05880 * -1: ArgumentError 05881 */ 05882 static int 05883 options(unsigned char *cp) 05884 { 05885 nkf_char i, j; 05886 unsigned char *p; 05887 unsigned char *cp_back = NULL; 05888 nkf_encoding *enc; 05889 05890 if (option_mode==1) 05891 return 0; 05892 while(*cp && *cp++!='-'); 05893 while (*cp || cp_back) { 05894 if(!*cp){ 05895 cp = cp_back; 05896 cp_back = NULL; 05897 continue; 05898 } 05899 p = 0; 05900 switch (*cp++) { 05901 case '-': /* literal options */ 05902 if (!*cp || *cp == SP) { /* ignore the rest of arguments */ 05903 option_mode = 1; 05904 return 0; 05905 } 05906 for (i=0;i<(int)(sizeof(long_option)/sizeof(long_option[0]));i++) { 05907 p = (unsigned char *)long_option[i].name; 05908 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++); 05909 if (*p == cp[j] || cp[j] == SP){ 05910 p = &cp[j] + 1; 05911 break; 05912 } 05913 p = 0; 05914 } 05915 if (p == 0) { 05916 #if !defined(PERL_XS) && !defined(WIN32DLL) 05917 fprintf(stderr, "unknown long option: --%s\n", cp); 05918 #endif 05919 return -1; 05920 } 05921 while(*cp && *cp != SP && cp++); 05922 if (long_option[i].alias[0]){ 05923 cp_back = cp; 05924 cp = (unsigned char *)long_option[i].alias; 05925 }else{ 05926 #ifndef PERL_XS 05927 if (strcmp(long_option[i].name, "help") == 0){ 05928 usage(); 05929 exit(EXIT_SUCCESS); 05930 } 05931 #endif 05932 if (strcmp(long_option[i].name, "ic=") == 0){ 05933 enc = nkf_enc_find((char *)p); 05934 if (!enc) continue; 05935 input_encoding = enc; 05936 continue; 05937 } 05938 if (strcmp(long_option[i].name, "oc=") == 0){ 05939 enc = nkf_enc_find((char *)p); 05940 /* if (enc <= 0) continue; */ 05941 if (!enc) continue; 05942 output_encoding = enc; 05943 continue; 05944 } 05945 if (strcmp(long_option[i].name, "guess=") == 0){ 05946 if (p[0] == '0' || p[0] == '1') { 05947 guess_f = 1; 05948 } else { 05949 guess_f = 2; 05950 } 05951 continue; 05952 } 05953 #ifdef OVERWRITE 05954 if (strcmp(long_option[i].name, "overwrite") == 0){ 05955 file_out_f = TRUE; 05956 overwrite_f = TRUE; 05957 preserve_time_f = TRUE; 05958 continue; 05959 } 05960 if (strcmp(long_option[i].name, "overwrite=") == 0){ 05961 file_out_f = TRUE; 05962 overwrite_f = TRUE; 05963 preserve_time_f = TRUE; 05964 backup_f = TRUE; 05965 backup_suffix = (char *)p; 05966 continue; 05967 } 05968 if (strcmp(long_option[i].name, "in-place") == 0){ 05969 file_out_f = TRUE; 05970 overwrite_f = TRUE; 05971 preserve_time_f = FALSE; 05972 continue; 05973 } 05974 if (strcmp(long_option[i].name, "in-place=") == 0){ 05975 file_out_f = TRUE; 05976 overwrite_f = TRUE; 05977 preserve_time_f = FALSE; 05978 backup_f = TRUE; 05979 backup_suffix = (char *)p; 05980 continue; 05981 } 05982 #endif 05983 #ifdef INPUT_OPTION 05984 if (strcmp(long_option[i].name, "cap-input") == 0){ 05985 cap_f = TRUE; 05986 continue; 05987 } 05988 if (strcmp(long_option[i].name, "url-input") == 0){ 05989 url_f = TRUE; 05990 continue; 05991 } 05992 #endif 05993 #ifdef NUMCHAR_OPTION 05994 if (strcmp(long_option[i].name, "numchar-input") == 0){ 05995 numchar_f = TRUE; 05996 continue; 05997 } 05998 #endif 05999 #ifdef CHECK_OPTION 06000 if (strcmp(long_option[i].name, "no-output") == 0){ 06001 noout_f = TRUE; 06002 continue; 06003 } 06004 if (strcmp(long_option[i].name, "debug") == 0){ 06005 debug_f = TRUE; 06006 continue; 06007 } 06008 #endif 06009 if (strcmp(long_option[i].name, "cp932") == 0){ 06010 #ifdef SHIFTJIS_CP932 06011 cp51932_f = TRUE; 06012 cp932inv_f = -TRUE; 06013 #endif 06014 #ifdef UTF8_OUTPUT_ENABLE 06015 ms_ucs_map_f = UCS_MAP_CP932; 06016 #endif 06017 continue; 06018 } 06019 if (strcmp(long_option[i].name, "no-cp932") == 0){ 06020 #ifdef SHIFTJIS_CP932 06021 cp51932_f = FALSE; 06022 cp932inv_f = FALSE; 06023 #endif 06024 #ifdef UTF8_OUTPUT_ENABLE 06025 ms_ucs_map_f = UCS_MAP_ASCII; 06026 #endif 06027 continue; 06028 } 06029 #ifdef SHIFTJIS_CP932 06030 if (strcmp(long_option[i].name, "cp932inv") == 0){ 06031 cp932inv_f = -TRUE; 06032 continue; 06033 } 06034 #endif 06035 06036 #ifdef X0212_ENABLE 06037 if (strcmp(long_option[i].name, "x0212") == 0){ 06038 x0212_f = TRUE; 06039 continue; 06040 } 06041 #endif 06042 06043 #ifdef EXEC_IO 06044 if (strcmp(long_option[i].name, "exec-in") == 0){ 06045 exec_f = 1; 06046 return 0; 06047 } 06048 if (strcmp(long_option[i].name, "exec-out") == 0){ 06049 exec_f = -1; 06050 return 0; 06051 } 06052 #endif 06053 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE) 06054 if (strcmp(long_option[i].name, "no-cp932ext") == 0){ 06055 no_cp932ext_f = TRUE; 06056 continue; 06057 } 06058 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){ 06059 no_best_fit_chars_f = TRUE; 06060 continue; 06061 } 06062 if (strcmp(long_option[i].name, "fb-skip") == 0){ 06063 encode_fallback = NULL; 06064 continue; 06065 } 06066 if (strcmp(long_option[i].name, "fb-html") == 0){ 06067 encode_fallback = encode_fallback_html; 06068 continue; 06069 } 06070 if (strcmp(long_option[i].name, "fb-xml") == 0){ 06071 encode_fallback = encode_fallback_xml; 06072 continue; 06073 } 06074 if (strcmp(long_option[i].name, "fb-java") == 0){ 06075 encode_fallback = encode_fallback_java; 06076 continue; 06077 } 06078 if (strcmp(long_option[i].name, "fb-perl") == 0){ 06079 encode_fallback = encode_fallback_perl; 06080 continue; 06081 } 06082 if (strcmp(long_option[i].name, "fb-subchar") == 0){ 06083 encode_fallback = encode_fallback_subchar; 06084 continue; 06085 } 06086 if (strcmp(long_option[i].name, "fb-subchar=") == 0){ 06087 encode_fallback = encode_fallback_subchar; 06088 unicode_subchar = 0; 06089 if (p[0] != '0'){ 06090 /* decimal number */ 06091 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){ 06092 unicode_subchar *= 10; 06093 unicode_subchar += hex2bin(p[i]); 06094 } 06095 }else if(p[1] == 'x' || p[1] == 'X'){ 06096 /* hexadecimal number */ 06097 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){ 06098 unicode_subchar <<= 4; 06099 unicode_subchar |= hex2bin(p[i]); 06100 } 06101 }else{ 06102 /* octal number */ 06103 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){ 06104 unicode_subchar *= 8; 06105 unicode_subchar += hex2bin(p[i]); 06106 } 06107 } 06108 w16e_conv(unicode_subchar, &i, &j); 06109 unicode_subchar = i<<8 | j; 06110 continue; 06111 } 06112 #endif 06113 #ifdef UTF8_OUTPUT_ENABLE 06114 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){ 06115 ms_ucs_map_f = UCS_MAP_MS; 06116 continue; 06117 } 06118 #endif 06119 #ifdef UNICODE_NORMALIZATION 06120 if (strcmp(long_option[i].name, "utf8mac-input") == 0){ 06121 nfc_f = TRUE; 06122 continue; 06123 } 06124 #endif 06125 if (strcmp(long_option[i].name, "prefix=") == 0){ 06126 if (nkf_isgraph(p[0])){ 06127 for (i = 1; nkf_isgraph(p[i]); i++){ 06128 prefix_table[p[i]] = p[0]; 06129 } 06130 } 06131 continue; 06132 } 06133 #if !defined(PERL_XS) && !defined(WIN32DLL) 06134 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name); 06135 #endif 06136 return -1; 06137 } 06138 continue; 06139 case 'b': /* buffered mode */ 06140 unbuf_f = FALSE; 06141 continue; 06142 case 'u': /* non bufferd mode */ 06143 unbuf_f = TRUE; 06144 continue; 06145 case 't': /* transparent mode */ 06146 if (*cp=='1') { 06147 /* alias of -t */ 06148 cp++; 06149 nop_f = TRUE; 06150 } else if (*cp=='2') { 06151 /* 06152 * -t with put/get 06153 * 06154 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin 06155 * 06156 */ 06157 cp++; 06158 nop_f = 2; 06159 } else 06160 nop_f = TRUE; 06161 continue; 06162 case 'j': /* JIS output */ 06163 case 'n': 06164 output_encoding = nkf_enc_from_index(ISO_2022_JP); 06165 continue; 06166 case 'e': /* AT&T EUC output */ 06167 output_encoding = nkf_enc_from_index(EUCJP_NKF); 06168 continue; 06169 case 's': /* SJIS output */ 06170 output_encoding = nkf_enc_from_index(SHIFT_JIS); 06171 continue; 06172 case 'l': /* ISO8859 Latin-1 support, no conversion */ 06173 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */ 06174 input_encoding = nkf_enc_from_index(ISO_8859_1); 06175 continue; 06176 case 'i': /* Kanji IN ESC-$-@/B */ 06177 if (*cp=='@'||*cp=='B') 06178 kanji_intro = *cp++; 06179 continue; 06180 case 'o': /* ASCII IN ESC-(-J/B/H */ 06181 /* ESC ( H was used in initial JUNET messages */ 06182 if (*cp=='J'||*cp=='B'||*cp=='H') 06183 ascii_intro = *cp++; 06184 continue; 06185 case 'h': 06186 /* 06187 bit:1 katakana->hiragana 06188 bit:2 hiragana->katakana 06189 */ 06190 if ('9'>= *cp && *cp>='0') 06191 hira_f |= (*cp++ -'0'); 06192 else 06193 hira_f |= 1; 06194 continue; 06195 case 'r': 06196 rot_f = TRUE; 06197 continue; 06198 #if defined(MSDOS) || defined(__OS2__) 06199 case 'T': 06200 binmode_f = FALSE; 06201 continue; 06202 #endif 06203 #ifndef PERL_XS 06204 case 'V': 06205 show_configuration(); 06206 exit(EXIT_SUCCESS); 06207 break; 06208 case 'v': 06209 version(); 06210 exit(EXIT_SUCCESS); 06211 break; 06212 #endif 06213 #ifdef UTF8_OUTPUT_ENABLE 06214 case 'w': /* UTF-{8,16,32} output */ 06215 if (cp[0] == '8') { 06216 cp++; 06217 if (cp[0] == '0'){ 06218 cp++; 06219 output_encoding = nkf_enc_from_index(UTF_8N); 06220 } else { 06221 output_bom_f = TRUE; 06222 output_encoding = nkf_enc_from_index(UTF_8_BOM); 06223 } 06224 } else { 06225 int enc_idx; 06226 if ('1'== cp[0] && '6'==cp[1]) { 06227 cp += 2; 06228 enc_idx = UTF_16; 06229 } else if ('3'== cp[0] && '2'==cp[1]) { 06230 cp += 2; 06231 enc_idx = UTF_32; 06232 } else { 06233 output_encoding = nkf_enc_from_index(UTF_8); 06234 continue; 06235 } 06236 if (cp[0]=='L') { 06237 cp++; 06238 output_endian = ENDIAN_LITTLE; 06239 output_bom_f = TRUE; 06240 } else if (cp[0] == 'B') { 06241 cp++; 06242 output_bom_f = TRUE; 06243 } 06244 if (cp[0] == '0'){ 06245 output_bom_f = FALSE; 06246 cp++; 06247 enc_idx = enc_idx == UTF_16 06248 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE) 06249 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE); 06250 } else { 06251 enc_idx = enc_idx == UTF_16 06252 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM) 06253 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM); 06254 } 06255 output_encoding = nkf_enc_from_index(enc_idx); 06256 } 06257 continue; 06258 #endif 06259 #ifdef UTF8_INPUT_ENABLE 06260 case 'W': /* UTF input */ 06261 if (cp[0] == '8') { 06262 cp++; 06263 input_encoding = nkf_enc_from_index(UTF_8); 06264 }else{ 06265 int enc_idx; 06266 if ('1'== cp[0] && '6'==cp[1]) { 06267 cp += 2; 06268 input_endian = ENDIAN_BIG; 06269 enc_idx = UTF_16; 06270 } else if ('3'== cp[0] && '2'==cp[1]) { 06271 cp += 2; 06272 input_endian = ENDIAN_BIG; 06273 enc_idx = UTF_32; 06274 } else { 06275 input_encoding = nkf_enc_from_index(UTF_8); 06276 continue; 06277 } 06278 if (cp[0]=='L') { 06279 cp++; 06280 input_endian = ENDIAN_LITTLE; 06281 } else if (cp[0] == 'B') { 06282 cp++; 06283 input_endian = ENDIAN_BIG; 06284 } 06285 enc_idx = (enc_idx == UTF_16 06286 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE) 06287 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE)); 06288 input_encoding = nkf_enc_from_index(enc_idx); 06289 } 06290 continue; 06291 #endif 06292 /* Input code assumption */ 06293 case 'J': /* ISO-2022-JP input */ 06294 input_encoding = nkf_enc_from_index(ISO_2022_JP); 06295 continue; 06296 case 'E': /* EUC-JP input */ 06297 input_encoding = nkf_enc_from_index(EUCJP_NKF); 06298 continue; 06299 case 'S': /* Shift_JIS input */ 06300 input_encoding = nkf_enc_from_index(SHIFT_JIS); 06301 continue; 06302 case 'Z': /* Convert X0208 alphabet to asii */ 06303 /* alpha_f 06304 bit:0 Convert JIS X 0208 Alphabet to ASCII 06305 bit:1 Convert Kankaku to one space 06306 bit:2 Convert Kankaku to two spaces 06307 bit:3 Convert HTML Entity 06308 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana 06309 */ 06310 while ('0'<= *cp && *cp <='4') { 06311 alpha_f |= 1 << (*cp++ - '0'); 06312 } 06313 alpha_f |= 1; 06314 continue; 06315 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */ 06316 x0201_f = FALSE; /* No X0201->X0208 conversion */ 06317 /* accept X0201 06318 ESC-(-I in JIS, EUC, MS Kanji 06319 SI/SO in JIS, EUC, MS Kanji 06320 SS2 in EUC, JIS, not in MS Kanji 06321 MS Kanji (0xa0-0xdf) 06322 output X0201 06323 ESC-(-I in JIS (0x20-0x5f) 06324 SS2 in EUC (0xa0-0xdf) 06325 0xa0-0xd in MS Kanji (0xa0-0xdf) 06326 */ 06327 continue; 06328 case 'X': /* Convert X0201 kana to X0208 */ 06329 x0201_f = TRUE; 06330 continue; 06331 case 'F': /* prserve new lines */ 06332 fold_preserve_f = TRUE; 06333 case 'f': /* folding -f60 or -f */ 06334 fold_f = TRUE; 06335 fold_len = 0; 06336 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */ 06337 fold_len *= 10; 06338 fold_len += *cp++ - '0'; 06339 } 06340 if (!(0<fold_len && fold_len<BUFSIZ)) 06341 fold_len = DEFAULT_FOLD; 06342 if (*cp=='-') { 06343 fold_margin = 0; 06344 cp++; 06345 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */ 06346 fold_margin *= 10; 06347 fold_margin += *cp++ - '0'; 06348 } 06349 } 06350 continue; 06351 case 'm': /* MIME support */ 06352 /* mime_decode_f = TRUE; */ /* this has too large side effects... */ 06353 if (*cp=='B'||*cp=='Q') { 06354 mime_decode_mode = *cp++; 06355 mimebuf_f = FIXED_MIME; 06356 } else if (*cp=='N') { 06357 mime_f = TRUE; cp++; 06358 } else if (*cp=='S') { 06359 mime_f = STRICT_MIME; cp++; 06360 } else if (*cp=='0') { 06361 mime_decode_f = FALSE; 06362 mime_f = FALSE; cp++; 06363 } else { 06364 mime_f = STRICT_MIME; 06365 } 06366 continue; 06367 case 'M': /* MIME output */ 06368 if (*cp=='B') { 06369 mimeout_mode = 'B'; 06370 mimeout_f = FIXED_MIME; cp++; 06371 } else if (*cp=='Q') { 06372 mimeout_mode = 'Q'; 06373 mimeout_f = FIXED_MIME; cp++; 06374 } else { 06375 mimeout_f = TRUE; 06376 } 06377 continue; 06378 case 'B': /* Broken JIS support */ 06379 /* bit:0 no ESC JIS 06380 bit:1 allow any x on ESC-(-x or ESC-$-x 06381 bit:2 reset to ascii on NL 06382 */ 06383 if ('9'>= *cp && *cp>='0') 06384 broken_f |= 1<<(*cp++ -'0'); 06385 else 06386 broken_f |= TRUE; 06387 continue; 06388 #ifndef PERL_XS 06389 case 'O':/* for Output file */ 06390 file_out_f = TRUE; 06391 continue; 06392 #endif 06393 case 'c':/* add cr code */ 06394 eolmode_f = CRLF; 06395 continue; 06396 case 'd':/* delete cr code */ 06397 eolmode_f = LF; 06398 continue; 06399 case 'I': /* ISO-2022-JP output */ 06400 iso2022jp_f = TRUE; 06401 continue; 06402 case 'L': /* line mode */ 06403 if (*cp=='u') { /* unix */ 06404 eolmode_f = LF; cp++; 06405 } else if (*cp=='m') { /* mac */ 06406 eolmode_f = CR; cp++; 06407 } else if (*cp=='w') { /* windows */ 06408 eolmode_f = CRLF; cp++; 06409 } else if (*cp=='0') { /* no conversion */ 06410 eolmode_f = 0; cp++; 06411 } 06412 continue; 06413 #ifndef PERL_XS 06414 case 'g': 06415 if ('2' <= *cp && *cp <= '9') { 06416 guess_f = 2; 06417 cp++; 06418 } else if (*cp == '0' || *cp == '1') { 06419 guess_f = 1; 06420 cp++; 06421 } else { 06422 guess_f = 1; 06423 } 06424 continue; 06425 #endif 06426 case SP: 06427 /* module muliple options in a string are allowed for Perl moudle */ 06428 while(*cp && *cp++!='-'); 06429 continue; 06430 default: 06431 #if !defined(PERL_XS) && !defined(WIN32DLL) 06432 fprintf(stderr, "unknown option: -%c\n", *(cp-1)); 06433 #endif 06434 /* bogus option but ignored */ 06435 return -1; 06436 } 06437 } 06438 return 0; 06439 } 06440 06441 #ifdef WIN32DLL 06442 #include "nkf32dll.c" 06443 #elif defined(PERL_XS) 06444 #else /* WIN32DLL */ 06445 int 06446 main(int argc, char **argv) 06447 { 06448 FILE *fin; 06449 unsigned char *cp; 06450 06451 char *outfname = NULL; 06452 char *origfname; 06453 06454 #ifdef EASYWIN /*Easy Win */ 06455 _BufferSize.y = 400;/*Set Scroll Buffer Size*/ 06456 #endif 06457 #ifdef DEFAULT_CODE_LOCALE 06458 setlocale(LC_CTYPE, ""); 06459 #endif 06460 nkf_state_init(); 06461 06462 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) { 06463 cp = (unsigned char *)*argv; 06464 options(cp); 06465 #ifdef EXEC_IO 06466 if (exec_f){ 06467 int fds[2], pid; 06468 if (pipe(fds) < 0 || (pid = fork()) < 0){ 06469 abort(); 06470 } 06471 if (pid == 0){ 06472 if (exec_f > 0){ 06473 close(fds[0]); 06474 dup2(fds[1], 1); 06475 }else{ 06476 close(fds[1]); 06477 dup2(fds[0], 0); 06478 } 06479 execvp(argv[1], &argv[1]); 06480 } 06481 if (exec_f > 0){ 06482 close(fds[1]); 06483 dup2(fds[0], 0); 06484 }else{ 06485 close(fds[0]); 06486 dup2(fds[1], 1); 06487 } 06488 argc = 0; 06489 break; 06490 } 06491 #endif 06492 } 06493 06494 if (guess_f) { 06495 #ifdef CHECK_OPTION 06496 int debug_f_back = debug_f; 06497 #endif 06498 #ifdef EXEC_IO 06499 int exec_f_back = exec_f; 06500 #endif 06501 #ifdef X0212_ENABLE 06502 int x0212_f_back = x0212_f; 06503 #endif 06504 int x0213_f_back = x0213_f; 06505 int guess_f_back = guess_f; 06506 reinit(); 06507 guess_f = guess_f_back; 06508 mime_f = FALSE; 06509 #ifdef CHECK_OPTION 06510 debug_f = debug_f_back; 06511 #endif 06512 #ifdef EXEC_IO 06513 exec_f = exec_f_back; 06514 #endif 06515 x0212_f = x0212_f_back; 06516 x0213_f = x0213_f_back; 06517 } 06518 06519 if (binmode_f == TRUE) 06520 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__)) 06521 if (freopen("","wb",stdout) == NULL) 06522 return (-1); 06523 #else 06524 setbinmode(stdout); 06525 #endif 06526 06527 if (unbuf_f) 06528 setbuf(stdout, (char *) NULL); 06529 else 06530 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE); 06531 06532 if (argc == 0) { 06533 if (binmode_f == TRUE) 06534 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__)) 06535 if (freopen("","rb",stdin) == NULL) return (-1); 06536 #else 06537 setbinmode(stdin); 06538 #endif 06539 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE); 06540 if (nop_f) 06541 noconvert(stdin); 06542 else { 06543 kanji_convert(stdin); 06544 if (guess_f) print_guessed_code(NULL); 06545 } 06546 } else { 06547 int nfiles = argc; 06548 int is_argument_error = FALSE; 06549 while (argc--) { 06550 input_codename = NULL; 06551 input_eol = 0; 06552 #ifdef CHECK_OPTION 06553 iconv_for_check = 0; 06554 #endif 06555 if ((fin = fopen((origfname = *argv++), "r")) == NULL) { 06556 perror(*(argv-1)); 06557 is_argument_error = TRUE; 06558 continue; 06559 } else { 06560 #ifdef OVERWRITE 06561 int fd = 0; 06562 int fd_backup = 0; 06563 #endif 06564 06565 /* reopen file for stdout */ 06566 if (file_out_f == TRUE) { 06567 #ifdef OVERWRITE 06568 if (overwrite_f){ 06569 outfname = nkf_xmalloc(strlen(origfname) 06570 + strlen(".nkftmpXXXXXX") 06571 + 1); 06572 strcpy(outfname, origfname); 06573 #ifdef MSDOS 06574 { 06575 int i; 06576 for (i = strlen(outfname); i; --i){ 06577 if (outfname[i - 1] == '/' 06578 || outfname[i - 1] == '\\'){ 06579 break; 06580 } 06581 } 06582 outfname[i] = '\0'; 06583 } 06584 strcat(outfname, "ntXXXXXX"); 06585 mktemp(outfname); 06586 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 06587 S_IREAD | S_IWRITE); 06588 #else 06589 strcat(outfname, ".nkftmpXXXXXX"); 06590 fd = mkstemp(outfname); 06591 #endif 06592 if (fd < 0 06593 || (fd_backup = dup(fileno(stdout))) < 0 06594 || dup2(fd, fileno(stdout)) < 0 06595 ){ 06596 perror(origfname); 06597 return -1; 06598 } 06599 }else 06600 #endif 06601 if(argc == 1) { 06602 outfname = *argv++; 06603 argc--; 06604 } else { 06605 outfname = "nkf.out"; 06606 } 06607 06608 if(freopen(outfname, "w", stdout) == NULL) { 06609 perror (outfname); 06610 return (-1); 06611 } 06612 if (binmode_f == TRUE) { 06613 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__)) 06614 if (freopen("","wb",stdout) == NULL) 06615 return (-1); 06616 #else 06617 setbinmode(stdout); 06618 #endif 06619 } 06620 } 06621 if (binmode_f == TRUE) 06622 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__)) 06623 if (freopen("","rb",fin) == NULL) 06624 return (-1); 06625 #else 06626 setbinmode(fin); 06627 #endif 06628 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE); 06629 if (nop_f) 06630 noconvert(fin); 06631 else { 06632 char *filename = NULL; 06633 kanji_convert(fin); 06634 if (nfiles > 1) filename = origfname; 06635 if (guess_f) print_guessed_code(filename); 06636 } 06637 fclose(fin); 06638 #ifdef OVERWRITE 06639 if (overwrite_f) { 06640 struct stat sb; 06641 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__) 06642 time_t tb[2]; 06643 #else 06644 struct utimbuf tb; 06645 #endif 06646 06647 fflush(stdout); 06648 close(fd); 06649 if (dup2(fd_backup, fileno(stdout)) < 0){ 06650 perror("dup2"); 06651 } 06652 if (stat(origfname, &sb)) { 06653 fprintf(stderr, "Can't stat %s\n", origfname); 06654 } 06655 /* $B%Q!<%_%C%7%g%s$rI|85(B */ 06656 if (chmod(outfname, sb.st_mode)) { 06657 fprintf(stderr, "Can't set permission %s\n", outfname); 06658 } 06659 06660 /* $B%?%$%`%9%?%s%W$rI|85(B */ 06661 if(preserve_time_f){ 06662 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__) 06663 tb[0] = tb[1] = sb.st_mtime; 06664 if (utime(outfname, tb)) { 06665 fprintf(stderr, "Can't set timestamp %s\n", outfname); 06666 } 06667 #else 06668 tb.actime = sb.st_atime; 06669 tb.modtime = sb.st_mtime; 06670 if (utime(outfname, &tb)) { 06671 fprintf(stderr, "Can't set timestamp %s\n", outfname); 06672 } 06673 #endif 06674 } 06675 if(backup_f){ 06676 char *backup_filename = get_backup_filename(backup_suffix, origfname); 06677 #ifdef MSDOS 06678 unlink(backup_filename); 06679 #endif 06680 if (rename(origfname, backup_filename)) { 06681 perror(backup_filename); 06682 fprintf(stderr, "Can't rename %s to %s\n", 06683 origfname, backup_filename); 06684 } 06685 nkf_xfree(backup_filename); 06686 }else{ 06687 #ifdef MSDOS 06688 if (unlink(origfname)){ 06689 perror(origfname); 06690 } 06691 #endif 06692 } 06693 if (rename(outfname, origfname)) { 06694 perror(origfname); 06695 fprintf(stderr, "Can't rename %s to %s\n", 06696 outfname, origfname); 06697 } 06698 nkf_xfree(outfname); 06699 } 06700 #endif 06701 } 06702 } 06703 if (is_argument_error) 06704 return(-1); 06705 } 06706 #ifdef EASYWIN /*Easy Win */ 06707 if (file_out_f == FALSE) 06708 scanf("%d",&end_check); 06709 else 06710 fclose(stdout); 06711 #else /* for Other OS */ 06712 if (file_out_f == TRUE) 06713 fclose(stdout); 06714 #endif /*Easy Win */ 06715 return (0); 06716 } 06717 #endif /* WIN32DLL */ 06718