00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00029 #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00030
00031 #include <iterator>
00032
00033 namespace utf8
00034 {
00035
00036
00037
// Integer aliases used throughout the library. They are plain typedefs of
// built-in types, so their real widths are whatever the platform gives
// unsigned char / unsigned short / unsigned int.
typedef unsigned char  uint8_t;   // one UTF-8 octet
typedef unsigned short uint16_t;  // one 16-bit unit (surrogate range constants)
typedef unsigned int   uint32_t;  // one decoded code point
00041
00042
// Implementation details: Unicode constants, octet classification helpers and
// the single-sequence UTF-8 validator that the public functions build on.
namespace internal
{
    // Surrogate ranges: lead (high) units are 0xd800-0xdbff, trail (low)
    // units are 0xdc00-0xdfff.
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    // Offsets precomputed from the surrogate bounds. Nothing in this namespace
    // reads them; presumably the UTF-16 conversion code does - TODO confirm.
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Largest value accepted as a code point.
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    // Returns the low 8 bits of oc. Going through this helper keeps a signed
    // char with the high bit set from being sign-extended in later arithmetic.
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }

    // Returns the low 16 bits of oc.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }

    // True when oc has the bit pattern 10xxxxxx, i.e. is a continuation octet.
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((mask8(oc) >> 6) == 0x2);
    }

    // True when cp lies anywhere in the surrogate range (lead or trail).
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True when cp is at most CODE_POINT_MAX, is not a surrogate and is
    // neither 0xfffe nor 0xffff.
    // NOTE(review): U+FFFE / U+FFFF are noncharacters, not ill-formed values;
    // rejecting them is kept only because existing callers may rely on it.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
    }

    // Returns the number of octets (1-4) announced by the lead octet at
    // lead_it, or 0 when that octet cannot start a sequence.
    // Precondition: lead_it is dereferenceable.
    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        uint8_t lead = mask8(*lead_it);
        if (lead < 0x80)
            return 1;                   // 0xxxxxxx
        else if ((lead >> 5) == 0x6)
            return 2;                   // 110xxxxx
        else if ((lead >> 4) == 0xe)
            return 3;                   // 1110xxxx
        else if ((lead >> 3) == 0x1e)
            return 4;                   // 11110xxx
        else
            return 0;                   // continuation octet or 11111xxx
    }

    // Result codes of validate_next.
    enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    // True when [it, end) holds at least count octets. Inspects at most count
    // positions, so it stays cheap for iterators that are not random access.
    template <typename octet_iterator>
    inline bool has_octets(octet_iterator it, octet_iterator end,
                           typename std::iterator_traits<octet_iterator>::difference_type count)
    {
        for (; count > 0; --count, ++it)
            if (it == end)
                return false;
        return true;
    }

    // Validates and decodes the UTF-8 sequence starting at it.
    //
    //   it         - in/out: start of the sequence. Moved just past the
    //                sequence on OK; left untouched on any error.
    //   end        - one past the last readable octet.
    //   code_point - optional out: receives the decoded value on OK only.
    //                May be null.
    //
    // Returns OK, or:
    //   NOT_ENOUGH_ROOM     - the range is empty or shorter than the lead
    //                         octet announces,
    //   INVALID_LEAD        - the first octet cannot start a sequence,
    //   INCOMPLETE_SEQUENCE - an expected continuation octet is not 10xxxxxx,
    //   INVALID_CODE_POINT  - the decoded value fails is_code_point_valid,
    //   OVERLONG_SEQUENCE   - the value fits in a shorter encoding.
    //
    // Only forward-iterator operations (copy, ++, *, ==) are used.
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;

        // Never dereference the end iterator.
        if (it == end)
            return NOT_ENOUGH_ROOM;

        const octet_difference_type length = sequence_length(it);
        if (length == 0)
            return INVALID_LEAD;

        // Checked before looking at the continuation octets so that a
        // truncated sequence is reported as NOT_ENOUGH_ROOM.
        if (!has_octets(it, end, length))
            return NOT_ENOUGH_ROOM;

        // Decode on a private cursor: every error path can then simply
        // return, leaving the caller's iterator on the lead octet.
        octet_iterator cursor = it;
        uint32_t cp = mask8(*cursor);
        switch (length) {
            case 2:  cp &= 0x1f; break;   // 110xxxxx
            case 3:  cp &= 0x0f; break;   // 1110xxxx
            case 4:  cp &= 0x07; break;   // 11110xxx
            default: break;               // single octet: all 7 bits are payload
        }

        for (octet_difference_type i = 1; i < length; ++i) {
            ++cursor;
            if (!is_trail(*cursor))
                return INCOMPLETE_SEQUENCE;
            cp = (cp << 6) | (mask8(*cursor) & 0x3f);
        }

        // An invalid value takes precedence over an overlong encoding.
        if (!is_code_point_valid(cp))
            return INVALID_CODE_POINT;

        // Each value has exactly one permitted length; anything longer is
        // an overlong encoding.
        bool overlong = false;
        if (cp < 0x80)
            overlong = (length != 1);
        else if (cp < 0x800)
            overlong = (length != 2);
        else if (cp < 0x10000)
            overlong = (length != 3);
        if (overlong)
            return OVERLONG_SEQUENCE;

        // Success: publish the result and step past the last octet.
        if (code_point)
            *code_point = cp;
        it = ++cursor;
        return OK;
    }

    // Same as the three-argument overload, discarding the decoded value.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        return validate_next(it, end, 0);
    }

} // namespace internal
00222
00224
00225
// The three octets that encode the byte order mark (U+FEFF) in UTF-8.
const uint8_t bom[3] = {0xEF, 0xBB, 0xBF};
00227
00228 template <typename octet_iterator>
00229 octet_iterator find_invalid(octet_iterator start, octet_iterator end)
00230 {
00231 octet_iterator result = start;
00232 while (result != end) {
00233 internal::utf_error err_code = internal::validate_next(result, end);
00234 if (err_code != internal::OK)
00235 return result;
00236 }
00237 return result;
00238 }
00239
// True when find_invalid reports no offending position in [start, end),
// i.e. the whole range was accepted. An empty range counts as valid.
template <typename octet_iterator>
inline bool is_valid(octet_iterator start, octet_iterator end)
{
    octet_iterator first_bad = find_invalid(start, end);
    return first_bad == end;
}
00245
00246 template <typename octet_iterator>
00247 inline bool is_bom (octet_iterator it)
00248 {
00249 return (
00250 (internal::mask8(*it++)) == bom[0] &&
00251 (internal::mask8(*it++)) == bom[1] &&
00252 (internal::mask8(*it)) == bom[2]
00253 );
00254 }
00255 }
00256
00257 #endif // header guard
00258
00259