UCommon
|
00001 // Copyright (C) 2009-2010 David Sugar, Tycho Softworks. 00002 // 00003 // This file is part of GNU uCommon C++. 00004 // 00005 // GNU uCommon C++ is free software: you can redistribute it and/or modify 00006 // it under the terms of the GNU Lesser General Public License as published 00007 // by the Free Software Foundation, either version 3 of the License, or 00008 // (at your option) any later version. 00009 // 00010 // GNU uCommon C++ is distributed in the hope that it will be useful, 00011 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00012 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00013 // GNU Lesser General Public License for more details. 00014 // 00015 // You should have received a copy of the GNU Lesser General Public License 00016 // along with GNU uCommon C++. If not, see <http://www.gnu.org/licenses/>. 00017 00032 #ifndef _UCOMMON_UNICODE_H_ 00033 #define _UCOMMON_UNICODE_H_ 00034 00035 #ifndef _UCOMMON_STRING_H_ 00036 #include <ucommon/string.h> 00037 #endif 00038 00039 NAMESPACE_UCOMMON 00040 00045 typedef int32_t ucs4_t; 00046 00050 typedef int16_t ucs2_t; 00051 00055 typedef void *unicode_t; 00056 00062 class __EXPORT utf8 00063 { 00064 public: 00068 static const unsigned ucsize; 00069 00073 static const char *nil; 00074 00080 static unsigned size(const char *codepoint); 00081 00087 static size_t count(const char *string); 00088 00095 static char *offset(char *string, ssize_t position); 00096 00102 static ucs4_t codepoint(const char *encoded); 00103 00109 static size_t chars(const unicode_t string); 00110 00116 static size_t chars(ucs4_t character); 00117 00124 static size_t unpack(const unicode_t string, CharacterProtocol& buffer); 00125 00133 static size_t pack(unicode_t unicode, CharacterProtocol& buffer, size_t size); 00134 00142 static const char *find(const char *string, ucs4_t character, size_t start = 0); 00143 00151 static const char *rfind(const char *string, ucs4_t character, size_t end = (size_t)-1l); 00152 00159 static unsigned ccount(const char *string, ucs4_t character); 00160 00166 static ucs4_t get(CharacterProtocol& buffer); 00167 00174 static ucs4_t put(ucs4_t character, CharacterProtocol& buffer); 00175 }; 00176 00183 class __EXPORT UString : public String, public utf8 00184 { 00185 protected: 00189 UString(); 00190 00195 UString(strsize_t size); 00196 00201 UString(const unicode_t text); 00202 00209 UString(const char *text, strsize_t size); 00210 00217 UString(const unicode_t *text, const unicode_t *end); 00218 00224 UString(const UString& existing); 00225 00230 virtual ~UString(); 00231 00238 UString get(strsize_t codepoint, strsize_t size = 0) const; 00239 00246 size_t get(unicode_t unicode, size_t size) const; 00247 00252 void set(const unicode_t unicode); 00253 00258 void add(const unicode_t unicode); 00259 00265 ucs4_t at(int position) const; 00266 00273 inline size_t operator()(unicode_t unicode, size_t size) const 00274 {return get(unicode, size);}; 00275 00282 UString operator()(int codepoint, strsize_t size) const; 00283 00291 const char *operator()(int offset) const; 00292 00298 inline ucs4_t operator[](int position) const 00299 {return UString::at(position);}; 00300 00305 inline strsize_t count(void) const 00306 {return utf8::count(str->text);} 00307 00313 unsigned ccount(ucs4_t character) const; 00314 00321 const char *find(ucs4_t character, strsize_t start = 0) const; 00322 00329 const char *rfind(ucs4_t character, strsize_t end = npos) const; 00330 }; 00331 00337 class __EXPORT utf8_pointer 00338 { 00339 protected: 00340 uint8_t *text; 00341 00342 public: 00346 utf8_pointer(); 00347 00352 utf8_pointer(const char *string); 00353 00358 utf8_pointer(const utf8_pointer& copy); 00359 00364 utf8_pointer& operator ++(); 00365 00370 utf8_pointer& operator --(); 00371 00377 utf8_pointer& operator +=(long offset); 00378 00384 utf8_pointer& operator -=(long offset); 00385 00391 utf8_pointer operator+(long offset) const; 00392 00398 utf8_pointer operator-(long offset) const; 00399 00404 inline operator bool() const 00405 {return text != NULL;}; 00406 00411 inline bool operator!() const 00412 {return text == NULL;}; 00413 00419 ucs4_t operator[](long codepoint) const; 00420 00426 utf8_pointer& operator=(const char *string); 00427 00431 void inc(void); 00432 00436 void dec(void); 00437 00443 inline bool operator==(const char *string) const 00444 {return (const char *)text == string;}; 00445 00451 inline bool operator!=(const char *string) const 00452 {return (const char *)text != string;}; 00453 00458 inline ucs4_t operator*() const 00459 {return utf8::codepoint((const char *)text);}; 00460 00465 inline char *c_str(void) const 00466 {return (char *)text;}; 00467 00472 inline operator char*() const 00473 {return (char *)text;}; 00474 00479 inline size_t len(void) const 00480 {return utf8::count((const char *)text);}; 00481 }; 00482 00486 typedef UString ustring_t; 00487 00491 typedef utf8_pointer utf8_t; 00492 00493 END_NAMESPACE 00494 00495 #endif