ispell_checker.cpp
00001 /* vim: set sw=8: -*- Mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ 00002 /* kspell2 - adopted from Enchant 00003 * Copyright (C) 2003 Dom Lachowicz 00004 * Copyright (C) 2004 Zack Rusin <zack@kde.org> 00005 * 00006 * This library is free software; you can redistribute it and/or 00007 * modify it under the terms of the GNU Lesser General Public 00008 * License as published by the Free Software Foundation; either 00009 * version 2.1 of the License, or (at your option) any later version. 00010 * 00011 * This library is distributed in the hope that it will be useful, 00012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00014 * Lesser General Public License for more details. 00015 * 00016 * You should have received a copy of the GNU Lesser General Public 00017 * License along with this library; if not, write to the 00018 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00019 * Boston, MA 02110-1301, USA. 00020 * 00021 * In addition, as a special exception, Dom Lachowicz 00022 * gives permission to link the code of this program with 00023 * non-LGPL Spelling Provider libraries (eg: a MSFT Office 00024 * spell checker backend) and distribute linked combinations including 00025 * the two. You must obey the GNU Lesser General Public License in all 00026 * respects for all of the code used other than said providers. If you modify 00027 * this file, you may extend this exception to your version of the 00028 * file, but you are not obligated to do so. If you do not wish to 00029 * do so, delete this exception statement from your version. 00030 */ 00031 00032 #include <stdio.h> 00033 #include <stdlib.h> 00034 #include <string.h> 00035 00036 #include <string> 00037 #include <vector> 00038 00039 #include "sp_spell.h" 00040 #include "ispell_checker.h" 00041 00042 #include <qmap.h> 00043 #include <qdir.h> 00044 #include <qfileinfo.h> 00045 00046 /***************************************************************************/ 00047 00048 typedef struct str_ispell_map 00049 { 00050 const char * lang; 00051 const char * dict; 00052 const char * enc; 00053 } IspellMap; 00054 00055 static const char *ispell_dirs [] = { 00056 "/usr/lib/ispell", 00057 "/usr/local/lib/ispell", 00058 "/usr/local/share/ispell", 00059 "/usr/share/ispell", 00060 "/usr/pkg/lib", 00061 0 00062 }; 00063 static const IspellMap ispell_map [] = { 00064 {"ca" ,"catala.hash" ,"iso-8859-1" }, 00065 {"ca_ES" ,"catala.hash" ,"iso-8859-1" }, 00066 {"cs" ,"czech.hash" ,"iso-8859-2" }, 00067 {"cs_CZ" ,"czech.hash" ,"iso-8859-2" }, 00068 {"da" ,"dansk.hash" ,"iso-8859-1" }, 00069 {"da_DK" ,"dansk.hash" ,"iso-8859-1" }, 00070 {"de" ,"deutsch.hash" ,"iso-8859-1" }, 00071 {"de_CH" ,"swiss.hash" ,"iso-8859-1" }, 00072 {"de_AT" ,"deutsch.hash" ,"iso-8859-1" }, 00073 {"de_DE" ,"deutsch.hash" ,"iso-8859-1" }, 00074 {"el" ,"ellhnika.hash" ,"iso-8859-7" }, 00075 {"el_GR" ,"ellhnika.hash" ,"iso-8859-7" }, 00076 {"en" ,"british.hash" ,"iso-8859-1" }, 00077 {"en_AU" ,"british.hash" ,"iso-8859-1" }, 00078 {"en_BZ" ,"british.hash" ,"iso-8859-1" }, 00079 {"en_CA" ,"british.hash" ,"iso-8859-1" }, 00080 {"en_GB" ,"british.hash" ,"iso-8859-1" }, 00081 {"en_IE" ,"british.hash" ,"iso-8859-1" }, 00082 {"en_JM" ,"british.hash" ,"iso-8859-1" }, 00083 {"en_NZ" ,"british.hash" ,"iso-8859-1" }, 00084 {"en_TT" ,"british.hash" ,"iso-8859-1" }, 00085 {"en_ZA" ,"british.hash" ,"iso-8859-1" }, 00086 {"en_ZW" ,"british.hash" ,"iso-8859-1" }, 00087 {"en_PH" ,"american.hash" ,"iso-8859-1" }, 00088 {"en_US" ,"american.hash" ,"iso-8859-1" }, 00089 {"eo" ,"esperanto.hash" ,"iso-8859-3" }, 00090 {"es" ,"espanol.hash" ,"iso-8859-1" }, 00091 {"es_AR" ,"espanol.hash" ,"iso-8859-1" }, 00092 {"es_BO" ,"espanol.hash" ,"iso-8859-1" }, 00093 {"es_CL" ,"espanol.hash" ,"iso-8859-1" }, 00094 {"es_CO" ,"espanol.hash" ,"iso-8859-1" }, 00095 {"es_CR" ,"espanol.hash" ,"iso-8859-1" }, 00096 {"es_DO" ,"espanol.hash" ,"iso-8859-1" }, 00097 {"es_EC" ,"espanol.hash" ,"iso-8859-1" }, 00098 {"es_ES" ,"espanol.hash" ,"iso-8859-1" }, 00099 {"es_GT" ,"espanol.hash" ,"iso-8859-1" }, 00100 {"es_HN" ,"espanol.hash" ,"iso-8859-1" }, 00101 {"es_MX" ,"espanol.hash" ,"iso-8859-1" }, 00102 {"es_NI" ,"espanol.hash" ,"iso-8859-1" }, 00103 {"es_PA" ,"espanol.hash" ,"iso-8859-1" }, 00104 {"es_PE" ,"espanol.hash" ,"iso-8859-1" }, 00105 {"es_PR" ,"espanol.hash" ,"iso-8859-1" }, 00106 {"es_PY" ,"espanol.hash" ,"iso-8859-1" }, 00107 {"es_SV" ,"espanol.hash" ,"iso-8859-1" }, 00108 {"es_UY" ,"espanol.hash" ,"iso-8859-1" }, 00109 {"es_VE" ,"espanol.hash" ,"iso-8859-1" }, 00110 {"fi" ,"finnish.hash" ,"iso-8859-1" }, 00111 {"fi_FI" ,"finnish.hash" ,"iso-8859-1" }, 00112 {"fr" ,"francais.hash" ,"iso-8859-1" }, 00113 {"fr_BE" ,"francais.hash" ,"iso-8859-1" }, 00114 {"fr_CA" ,"francais.hash" ,"iso-8859-1" }, 00115 {"fr_CH" ,"francais.hash" ,"iso-8859-1" }, 00116 {"fr_FR" ,"francais.hash" ,"iso-8859-1" }, 00117 {"fr_LU" ,"francais.hash" ,"iso-8859-1" }, 00118 {"fr_MC" ,"francais.hash" ,"iso-8859-1" }, 00119 {"hu" ,"hungarian.hash" ,"iso-8859-2" }, 00120 {"hu_HU" ,"hungarian.hash" ,"iso-8859-2" }, 00121 {"ga" ,"irish.hash" ,"iso-8859-1" }, 00122 {"ga_IE" ,"irish.hash" ,"iso-8859-1" }, 00123 {"gl" ,"galician.hash" ,"iso-8859-1" }, 00124 {"gl_ES" ,"galician.hash" ,"iso-8859-1" }, 00125 {"ia" ,"interlingua.hash" ,"iso-8859-1" }, 00126 {"it" ,"italian.hash" ,"iso-8859-1" }, 00127 {"it_IT" ,"italian.hash" ,"iso-8859-1" }, 00128 {"it_CH" ,"italian.hash" ,"iso-8859-1" }, 00129 {"la" ,"mlatin.hash" ,"iso-8859-1" }, 00130 {"la_IT" ,"mlatin.hash" ,"iso-8859-1" }, 00131 {"lt" ,"lietuviu.hash" ,"iso-8859-13" }, 00132 {"lt_LT" ,"lietuviu.hash" ,"iso-8859-13" }, 00133 {"nl" ,"nederlands.hash" ,"iso-8859-1" }, 00134 {"nl_NL" ,"nederlands.hash" ,"iso-8859-1" }, 00135 {"nl_BE" ,"nederlands.hash" ,"iso-8859-1" }, 00136 {"nb" ,"norsk.hash" ,"iso-8859-1" }, 00137 {"nb_NO" ,"norsk.hash" ,"iso-8859-1" }, 00138 {"nn" ,"nynorsk.hash" ,"iso-8859-1" }, 00139 {"nn_NO" ,"nynorsk.hash" ,"iso-8859-1" }, 00140 {"no" ,"norsk.hash" ,"iso-8859-1" }, 00141 {"no_NO" ,"norsk.hash" ,"iso-8859-1" }, 00142 {"pl" ,"polish.hash" ,"iso-8859-2" }, 00143 {"pl_PL" ,"polish.hash" ,"iso-8859-2" }, 00144 {"pt" ,"brazilian.hash" ,"iso-8859-1" }, 00145 {"pt_BR" ,"brazilian.hash" ,"iso-8859-1" }, 00146 {"pt_PT" ,"portugues.hash" ,"iso-8859-1" }, 00147 {"ru" ,"russian.hash" ,"koi8-r" }, 00148 {"ru_MD" ,"russian.hash" ,"koi8-r" }, 00149 {"ru_RU" ,"russian.hash" ,"koi8-r" }, 00150 {"sc" ,"sardinian.hash" ,"iso-8859-1" }, 00151 {"sc_IT" ,"sardinian.hash" ,"iso-8859-1" }, 00152 {"sk" ,"slovak.hash" ,"iso-8859-2" }, 00153 {"sk_SK" ,"slovak.hash" ,"iso-8859-2" }, 00154 {"sl" ,"slovensko.hash" ,"iso-8859-2" }, 00155 {"sl_SI" ,"slovensko.hash" ,"iso-8859-2" }, 00156 {"sv" ,"svenska.hash" ,"iso-8859-1" }, 00157 {"sv_SE" ,"svenska.hash" ,"iso-8859-1" }, 00158 {"uk" ,"ukrainian.hash" ,"koi8-u" }, 00159 {"uk_UA" ,"ukrainian.hash" ,"koi8-u" }, 00160 {"yi" ,"yiddish-yivo.hash" ,"utf-8" } 00161 }; 00162 00163 static const size_t size_ispell_map = ( sizeof(ispell_map) / sizeof((ispell_map)[0]) ); 00164 static QMap<QString, QString> ispell_dict_map; 00165 00166 00167 void 00168 ISpellChecker::try_autodetect_charset(const char * const inEncoding) 00169 { 00170 if (inEncoding && strlen(inEncoding)) 00171 { 00172 m_translate_in = QTextCodec::codecForName(inEncoding); 00173 } 00174 } 00175 00176 /***************************************************************************/ 00177 /***************************************************************************/ 00178 00179 ISpellChecker::ISpellChecker() 00180 : deftflag(-1), 00181 prefstringchar(-1), 00182 m_bSuccessfulInit(false), 00183 m_BC(NULL), 00184 m_cd(NULL), 00185 m_cl(NULL), 00186 m_cm(NULL), 00187 m_ho(NULL), 00188 m_nd(NULL), 00189 m_so(NULL), 00190 m_se(NULL), 00191 m_ti(NULL), 00192 m_te(NULL), 00193 m_hashstrings(NULL), 00194 m_hashtbl(NULL), 00195 m_pflaglist(NULL), 00196 m_sflaglist(NULL), 00197 m_chartypes(NULL), 00198 m_infile(NULL), 00199 m_outfile(NULL), 00200 m_askfilename(NULL), 00201 m_Trynum(0), 00202 m_translate_in(0) 00203 { 00204 memset(m_sflagindex,0,sizeof(m_sflagindex)); 00205 memset(m_pflagindex,0,sizeof(m_pflagindex)); 00206 } 00207 00208 #ifndef FREEP 00209 #define FREEP(p) do { if (p) free(p); } while (0) 00210 #endif 00211 00212 ISpellChecker::~ISpellChecker() 00213 { 00214 if (m_bSuccessfulInit) { 00215 // only cleanup our mess if we were successfully initialized 00216 00217 clearindex (m_pflagindex); 00218 clearindex (m_sflagindex); 00219 } 00220 00221 FREEP(m_hashtbl); 00222 FREEP(m_hashstrings); 00223 FREEP(m_sflaglist); 00224 FREEP(m_chartypes); 00225 00226 delete m_translate_in; 00227 m_translate_in = 0; 00228 } 00229 00230 bool 00231 ISpellChecker::checkWord( const QString& utf8Word ) 00232 { 00233 ichar_t iWord[INPUTWORDLEN + MAXAFFIXLEN]; 00234 if (!m_bSuccessfulInit) 00235 return false; 00236 00237 if (!utf8Word || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN) || utf8Word.isEmpty()) 00238 return false; 00239 00240 bool retVal = false; 00241 QCString out; 00242 if (!m_translate_in) 00243 return false; 00244 else { 00245 /* convert to 8bit string and null terminate */ 00246 int len_out = utf8Word.length(); 00247 00248 out = m_translate_in->fromUnicode( utf8Word, len_out ); 00249 } 00250 00251 if (!strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0)) 00252 { 00253 if (good(iWord, 0, 0, 1, 0) == 1 || 00254 compoundgood(iWord, 1) == 1) 00255 { 00256 retVal = true; 00257 } 00258 } 00259 00260 return retVal; 00261 } 00262 00263 QStringList 00264 ISpellChecker::suggestWord(const QString& utf8Word) 00265 { 00266 ichar_t iWord[INPUTWORDLEN + MAXAFFIXLEN]; 00267 int c; 00268 00269 if (!m_bSuccessfulInit) 00270 return QStringList(); 00271 00272 if (utf8Word.isEmpty() || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN) || 00273 utf8Word.length() == 0) 00274 return QStringList(); 00275 00276 QCString out; 00277 if (!m_translate_in) 00278 return QStringList(); 00279 else 00280 { 00281 /* convert to 8bit string and null terminate */ 00282 00283 int len_out = utf8Word.length(); 00284 out = m_translate_in->fromUnicode( utf8Word, len_out ); 00285 } 00286 00287 if (!strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0)) 00288 makepossibilities(iWord); 00289 else 00290 return QStringList(); 00291 00292 QStringList sugg_arr; 00293 for (c = 0; c < m_pcount; c++) 00294 { 00295 QString utf8Word; 00296 00297 if (!m_translate_in) 00298 { 00299 /* copy to 8bit string and null terminate */ 00300 utf8Word = QString::fromUtf8( m_possibilities[c] ); 00301 } 00302 else 00303 { 00304 /* convert to 32bit string and null terminate */ 00305 utf8Word = m_translate_in->toUnicode( m_possibilities[c] ); 00306 } 00307 00308 sugg_arr.append( utf8Word ); 00309 } 00310 00311 return sugg_arr; 00312 } 00313 00314 static void 00315 s_buildHashNames (std::vector<std::string> & names, const char * dict) 00316 { 00317 const char * tmp = 0; 00318 int i = 0; 00319 00320 names.clear (); 00321 00322 while ( (tmp = ispell_dirs[i++]) ) { 00323 QCString maybeFile = QCString( tmp ) + '/'; 00324 maybeFile += dict; 00325 names.push_back( maybeFile.data() ); 00326 } 00327 } 00328 00329 static void 00330 s_allDics() 00331 { 00332 const char * tmp = 0; 00333 int i = 0; 00334 00335 while ( (tmp = ispell_dirs[i++]) ) { 00336 QDir dir( tmp ); 00337 QStringList lst = dir.entryList( "*.hash" ); 00338 for ( QStringList::Iterator it = lst.begin(); it != lst.end(); ++it ) { 00339 QFileInfo info( *it ); 00340 for (size_t i = 0; i < size_ispell_map; i++) 00341 { 00342 const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i])); 00343 if (!strcmp (info.fileName().latin1(), mapping->dict)) 00344 { 00345 ispell_dict_map.insert( mapping->lang, *it ); 00346 } 00347 } 00348 } 00349 } 00350 } 00351 00352 QValueList<QString> 00353 ISpellChecker::allDics() 00354 { 00355 if ( ispell_dict_map.empty() ) 00356 s_allDics(); 00357 00358 return ispell_dict_map.keys(); 00359 } 00360 00361 QString 00362 ISpellChecker::loadDictionary (const char * szdict) 00363 { 00364 std::vector<std::string> dict_names; 00365 00366 s_buildHashNames (dict_names, szdict); 00367 00368 for (size_t i = 0; i < dict_names.size(); i++) 00369 { 00370 if (linit(const_cast<char*>(dict_names[i].c_str())) >= 0) 00371 return dict_names[i].c_str(); 00372 } 00373 00374 return QString::null; 00375 } 00376 00383 bool 00384 ISpellChecker::loadDictionaryForLanguage ( const char * szLang ) 00385 { 00386 QString hashname; 00387 00388 const char * encoding = NULL; 00389 const char * szFile = NULL; 00390 00391 for (size_t i = 0; i < size_ispell_map; i++) 00392 { 00393 const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i])); 00394 if (!strcmp (szLang, mapping->lang)) 00395 { 00396 szFile = mapping->dict; 00397 encoding = mapping->enc; 00398 break; 00399 } 00400 } 00401 00402 if (!szFile || !strlen(szFile)) 00403 return false; 00404 00405 alloc_ispell_struct(); 00406 00407 hashname = loadDictionary(szFile); 00408 if (hashname.isEmpty()) 00409 return false; 00410 00411 // one of the two above calls succeeded 00412 setDictionaryEncoding (hashname, encoding); 00413 00414 return true; 00415 } 00416 00417 void 00418 ISpellChecker::setDictionaryEncoding( const QString& hashname, const char * encoding ) 00419 { 00420 /* Get Hash encoding from XML file. This should always work! */ 00421 try_autodetect_charset(encoding); 00422 00423 if (m_translate_in) 00424 { 00425 /* We still have to setup prefstringchar*/ 00426 prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag 00427 : static_cast<int *>(NULL)); 00428 00429 if (prefstringchar < 0) 00430 { 00431 std::string teststring; 00432 for(int n1 = 1; n1 <= 15; n1++) 00433 { 00434 teststring = "latin" + n1; 00435 prefstringchar = findfiletype(teststring.c_str(), 1, 00436 deftflag < 0 ? &deftflag : static_cast<int *>(NULL)); 00437 if (prefstringchar >= 0) 00438 break; 00439 } 00440 } 00441 00442 return; /* success */ 00443 } 00444 00445 /* Test for UTF-8 first */ 00446 prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag : static_cast<int *>(NULL)); 00447 if (prefstringchar >= 0) 00448 { 00449 m_translate_in = QTextCodec::codecForName("utf8"); 00450 } 00451 00452 if (m_translate_in) 00453 return; /* success */ 00454 00455 /* Test for "latinN" */ 00456 if (!m_translate_in) 00457 { 00458 /* Look for "altstringtype" names from latin1 to latin15 */ 00459 for(int n1 = 1; n1 <= 15; n1++) 00460 { 00461 QString teststring = QString("latin%1").arg(n1); 00462 prefstringchar = findfiletype(teststring.latin1(), 1, 00463 deftflag < 0 ? &deftflag : static_cast<int *>(NULL)); 00464 if (prefstringchar >= 0) 00465 { 00466 //FIXME: latin1 might be wrong 00467 m_translate_in = QTextCodec::codecForName( teststring.latin1() ); 00468 break; 00469 } 00470 } 00471 } 00472 00473 /* If nothing found, use latin1 */ 00474 if (!m_translate_in) 00475 { 00476 m_translate_in = QTextCodec::codecForName("latin1"); 00477 } 00478 } 00479 00480 bool 00481 ISpellChecker::requestDictionary(const char *szLang) 00482 { 00483 if (!loadDictionaryForLanguage (szLang)) 00484 { 00485 // handle a shortened version of the language tag: en_US => en 00486 std::string shortened_dict (szLang); 00487 size_t uscore_pos; 00488 00489 if ((uscore_pos = shortened_dict.rfind ('_')) != ((size_t)-1)) { 00490 shortened_dict = shortened_dict.substr(0, uscore_pos); 00491 if (!loadDictionaryForLanguage (shortened_dict.c_str())) 00492 return false; 00493 } else 00494 return false; 00495 } 00496 00497 m_bSuccessfulInit = true; 00498 00499 if (prefstringchar < 0) 00500 m_defdupchar = 0; 00501 else 00502 m_defdupchar = prefstringchar; 00503 00504 return true; 00505 }