FONTAINE 1.0
Utf8String.h
Go to the documentation of this file.
00001 //
00002 // The Fontaine Font Analysis Project 
00003 // 
00004 // Copyright (c) 2009 by Edward H. Trager
00005 // All Rights Reserved
00006 // 
00007 // Released under the GNU GPL version 2.0 or later.
00008 //     
00009 
00010 
00012 //
00013 // This file was originally part of the MADELINE 2 program 
00014 // written by Edward H. Trager and Ritu Khanna
00015 // Copyright (c) 2005 by the
00016 // Regents of the University of Michigan.
00017 // All Rights Reserved.
00018 // Released under the GNU General Public License v. 2.0 or later.
00019 // 
00021 //
00022 // utf8String.h
00023 //
00024 // (c) 2006 by Edward H. Trager
00025 // released under the GNU General Public License
00026 // 
00027 // This file was originally written for inclusion
00028 // in "Font Playground" .
00029 //
00030 // 2006.04.30.et.
00031 // LAST UPDATE: 2007.01.08
00032 // 
00033 
00034 #ifndef UTF8STRING_INCLUDED
00035 #define UTF8STRING_INCLUDED
00036 
00037 #include "ScriptCodes.h"
00038 #include <string>
00039 
//
// Fixed-width code unit types used by the UTF conversion routines.
// The comments state the minimum width guaranteed by the C++ standard
// for each underlying type; the actual width is platform dependent.
//
typedef unsigned long  UTF32; // at least 32 bits
typedef unsigned short UTF16; // at least 16 bits
typedef unsigned char  UTF8;

//
// Each replacement list below is wrapped in an outer pair of parentheses
// so that the macro always expands to a single primary expression. Without
// them the bare cast mis-parses next to higher-precedence operators
// (for example "sizeof UNI_REPLACEMENT_CHAR" does not compile).
//

// U+FFFD REPLACEMENT CHARACTER, as a UTF32 value:
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD)
// Largest value representable in 31 bits, as a UTF32 value:
#define UNI_MAX_UTF32 ((UTF32)0x7FFFFFFF)

//
// The following are needed for UTF-16 conversion:
// (bounds of the high and low surrogate ranges)
//
#define UNI_SUR_HIGH_START  ((UTF32)0xD800)
#define UNI_SUR_HIGH_END    ((UTF32)0xDBFF)
#define UNI_SUR_LOW_START   ((UTF32)0xDC00)
#define UNI_SUR_LOW_END     ((UTF32)0xDFFF)
00054 
00055 class UTF8String : public std::string {
00056         
00057         
00058 private:
00059         
00060         const char *_UTF32ValueToUTF8( UTF32 UTF32Value );
00061         
00062 public:
00063         
00064         // Default constructor just calls base class std::String():
00065         UTF8String();
00066         // Copy Constructors:
00067         UTF8String(const std::string &s);
00068         UTF8String(const UTF8String &s);
00069         // How many Unicode values are stored in the string?:
00070         unsigned int unicodeValueCount() const;
00071         // Get the Unicode substring starting at the "stt" unicode value --
00072         // Note that stt=1 (*not* zero) returns the entire string:
00073         UTF8String unicodeSubString(unsigned int stt,unsigned int howManyCharacters=0) const;
00074         // Read-only bracket operator retrieves the nth unicode character --
00075         // Note that pos=1 (*not* zero) specifies the first character:
00076         UTF8String operator[](unsigned int pos) const;
00077         // Return the Unicode code value of the nth Unicode character:
00078         UTF32 unicodeValueAtPosition(unsigned int pos=0) const;
00079         
00080         //
00081         // Return a substring less than or equal to the howManyCharacters in
00082         // length where the end of the string is on a word boundary.
00083         // 
00084         UTF8String unicodeSubStringOnWordBoundary(unsigned int stt,unsigned int howManyCharacters) const;
00085         
00086         // Returns boolean TRUE if the string begins with a character
00087         // from a right-to-left script:
00088         bool isRTL(void) const;
00089         // Returns a boolean TRUE if the string begins with a character
00090         // from an Indic or Indic-derived script.  Such scripts have
00091         // special complex text layout requirements:
00092         bool isIndic(void) const;
00093         
00094         // Returns a boolean TRUE if the string begins with a character
00095         // from the Arabic script.  This script has
00096         // special complex text layout requirements:
00097         bool isArabic(void) const;
00098         
00099         //
00100         // Returns a script code based on the Unicode range of the first
00101         // character in the string: Currently only handles the Arabic and
00102         // Indic cases relevant for complex text layout
00103         // 
00104         SCRIPTCODE getScriptCode(void);
00105         
00106         // Returns a UTF32 String:
00107         std::basic_string<UTF32> UTF32String() const;
00108         
00109         //
00110         // Append and Derived Overloaded Assignment operators:
00111         //
00112         UTF8String& append( const std::basic_string<UTF32> &UTF32String );
00113         UTF8String& append( const std::basic_string<UTF16> &UTF16String );
00114         
00115         UTF8String& operator+=( const std::basic_string<UTF32> &UTF32String );
00116         UTF8String& operator+=( const std::basic_string<UTF16> &UTF16String );
00117         
00118         UTF8String& operator=( const std::basic_string<UTF32> &UTF32String );
00119         UTF8String& operator=( const std::basic_string<UTF16> &UTF16String );
00120         
00121         // 
00122         // Specialized constructors:
00123         // 
00124         // Construct a UTF8String from a UTF32 or UTF16 string:
00125         // 
00126         // These also ultimately use the append() methods from above:
00127         // 
00128         UTF8String( const std::basic_string<UTF32> &UTF32String );
00129         UTF8String( const std::basic_string<UTF16> &UTF16String );
00130         
00131 };
00132 
00133 #endif
00134