Audacious $Id: Doxyfile 4280 2007-03-21 04:39:00Z nenolod $
|
00001 /* Audacious 00002 * Copyright (C) 2005-2007 Audacious development team. 00003 * 00004 * This program is free software; you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation; under version 3 of the License. 00007 * 00008 * This program is distributed in the hope that it will be useful, 00009 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00010 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00011 * GNU General Public License for more details. 00012 * 00013 * You should have received a copy of the GNU General Public License 00014 * along with this program. If not, see <http://www.gnu.org/licenses>. 00015 * 00016 * The Audacious team does not consider modular code linking to 00017 * Audacious or using our public API to be a derived work. 00018 */ 00019 00020 #include <string.h> 00021 #include <libaudcore/audstrings.h> 00022 00023 #include "audconfig.h" 00024 #include "config.h" 00025 #include "i18n.h" 00026 #include "debug.h" 00027 00028 #ifdef USE_CHARDET 00029 # include <libguess.h> 00030 #endif 00031 00032 static gchar * cd_chardet_to_utf8 (const gchar * str, gssize len, 00033 gsize * arg_bytes_read, gsize * arg_bytes_write, GError ** error); 00034 00035 static gchar * str_to_utf8_fallback (const gchar * str) 00036 { 00037 gchar * out = g_strconcat (str, _(" (invalid UTF-8)"), NULL); 00038 00039 for (gchar * c = out; * c; c ++) 00040 { 00041 if (* c & 0x80) 00042 * c = '?'; 00043 } 00044 00045 return out; 00046 } 00047 00048 static gchar * cd_str_to_utf8 (const gchar * str) 00049 { 00050 gchar *out_str; 00051 00052 if (str == NULL) 00053 return NULL; 00054 00055 /* Note: Currently, playlist calls this function repeatedly, even 00056 * if the string is already converted into utf-8. 
00057 * chardet_to_utf8() would convert a valid utf-8 string into a 00058 * different utf-8 string, if fallback encodings were supplied and 00059 * the given string could be treated as a string in one of 00060 * fallback encodings. To avoid this, g_utf8_validate() had been 00061 * used at the top of evaluation. 00062 */ 00063 00064 /* Note 2: g_utf8_validate() has so called encapsulated utf-8 00065 * problem, thus chardet_to_utf8() took the place of that. 00066 */ 00067 00068 /* Note 3: As introducing madplug, the problem of conversion from 00069 * ISO-8859-1 to UTF-8 arose. This may be coped with g_convert() 00070 * located near the end of chardet_to_utf8(), but it requires utf8 00071 * validation guard where g_utf8_validate() was. New 00072 * dfa_validate_utf8() employs libguess' DFA engine to validate 00073 * utf-8 and can properly distinguish examples of encapsulated 00074 * utf-8. It is considered to be safe to use as a guard. 00075 */ 00076 00077 /* Already UTF-8? */ 00078 #ifdef USE_CHARDET 00079 if (libguess_validate_utf8(str, strlen(str))) 00080 return g_strdup(str); 00081 #else 00082 if (g_utf8_validate(str, strlen(str), NULL)) 00083 return g_strdup(str); 00084 #endif 00085 00086 /* chardet encoding detector */ 00087 if ((out_str = cd_chardet_to_utf8(str, strlen(str), NULL, NULL, NULL)) != NULL) 00088 return out_str; 00089 00090 /* all else fails, we mask off character codes >= 128, replace with '?' */ 00091 return str_to_utf8_fallback(str); 00092 } 00093 00094 static gchar * cd_chardet_to_utf8 (const gchar * str, gssize len, 00095 gsize * arg_bytes_read, gsize * arg_bytes_write, GError ** error) 00096 { 00097 if (error) 00098 * error = NULL; 00099 00100 #ifdef USE_CHARDET 00101 gchar *det = NULL, *encoding = NULL; 00102 #endif 00103 gchar *ret = NULL; 00104 gsize *bytes_read, *bytes_write; 00105 gsize my_bytes_read, my_bytes_write; 00106 00107 bytes_read = arg_bytes_read != NULL ? 
arg_bytes_read : &my_bytes_read; 00108 bytes_write = arg_bytes_write != NULL ? arg_bytes_write : &my_bytes_write; 00109 00110 g_return_val_if_fail(str != NULL, NULL); 00111 00112 #ifdef USE_CHARDET 00113 if (libguess_validate_utf8(str, len)) 00114 #else 00115 if (g_utf8_validate(str, len, NULL)) 00116 #endif 00117 { 00118 if (len < 0) 00119 len = strlen (str); 00120 00121 ret = g_malloc (len + 1); 00122 memcpy (ret, str, len); 00123 ret[len] = 0; 00124 00125 if (arg_bytes_read != NULL) 00126 * arg_bytes_read = len; 00127 if (arg_bytes_write != NULL) 00128 * arg_bytes_write = len; 00129 00130 return ret; 00131 } 00132 #ifdef USE_CHARDET 00133 if (cfg.chardet_detector) 00134 det = cfg.chardet_detector; 00135 00136 if (det) 00137 { 00138 AUDDBG("guess encoding (%s) %s\n", det, str); 00139 encoding = (gchar *) libguess_determine_encoding(str, len, det); 00140 AUDDBG("encoding = %s\n", encoding); 00141 if (encoding == NULL) 00142 goto fallback; 00143 00144 ret = g_convert (str, len, "UTF-8", encoding, bytes_read, bytes_write, 00145 (error && * error) ? NULL : error); 00146 } 00147 00148 fallback: 00149 #endif 00150 00151 /* If detection failed or was not enabled, try fallbacks (if there are any) */ 00152 if (ret == NULL && cfg.chardet_fallback_s != NULL) 00153 { 00154 gchar **enc; 00155 for (enc = cfg.chardet_fallback_s; *enc != NULL; enc++) 00156 { 00157 ret = g_convert (str, len, "UTF-8", * enc, bytes_read, bytes_write, 00158 (error && * error) ? NULL : error); 00159 if (len == *bytes_read) 00160 break; 00161 else { 00162 g_free(ret); 00163 ret = NULL; 00164 } 00165 } 00166 } 00167 00168 /* First fallback: locale (duh!) */ 00169 if (ret == NULL) 00170 ret = g_locale_to_utf8 (str, len, bytes_read, bytes_write, 00171 (error && * error) ? 
NULL : error); 00172 00173 /* The final fallback is ISO-8859-1, if no other is specified or conversions fail */ 00174 if (ret == NULL) 00175 ret = g_convert (str, len, "UTF-8", "ISO-8859-1", bytes_read, 00176 bytes_write, (error && * error) ? NULL : error); 00177 00178 if (ret != NULL) 00179 { 00180 if (g_utf8_validate(ret, -1, NULL)) 00181 return ret; 00182 else 00183 { 00184 g_warning("g_utf8_validate() failed for converted string in cd_chardet_to_utf8: '%s'", ret); 00185 g_free(ret); 00186 return NULL; 00187 } 00188 } 00189 00190 return NULL; /* If we have no idea, return NULL. */ 00191 } 00192 00193 void chardet_init (void) 00194 { 00195 str_set_utf8_impl (cd_str_to_utf8, cd_chardet_to_utf8); 00196 }