27#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
28#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
// Fixed-width unsigned aliases used throughout this header.
// They are spelled with built-in types, so the widths (8/16/32 bits) hold only on
// platforms where unsigned char/short/int have those sizes.
using uint8_t = unsigned char;
using uint16_t = unsigned short;
using uint32_t = unsigned int;
// Bounds of the UTF-16 surrogate ranges: lead (high) surrogates occupy
// 0xd800..0xdbff and trail (low) surrogates occupy 0xdc00..0xdfff.
const uint16_t LEAD_SURROGATE_MIN = 0xd800U;
const uint16_t LEAD_SURROGATE_MAX = 0xdbffU;
const uint16_t TRAIL_SURROGATE_MIN = 0xdc00U;
const uint16_t TRAIL_SURROGATE_MAX = 0xdfffU;

// Precomputed offsets derived from the surrogate bounds and the 0x10000 base.
// LEAD_OFFSET evaluates to 0xd7c0; SURROGATE_OFFSET is computed in unsigned
// arithmetic and wraps modulo 2^32 to 0xfca02400.
const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10);
const uint32_t SURROGATE_OFFSET = 0x10000U - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

// Largest value accepted as a code point by is_code_point_valid.
const uint32_t CODE_POINT_MAX = 0x0010ffffU;
// Truncates an octet-like value to its low 8 bits.
//
// `oc` may be any integral type, including a signed `char`. The result is always in
// 0x00..0xff, so the sign extension of a negative `char` cannot leak into later
// shifts and comparisons.
template <typename octet_type>
inline uint8_t mask8(octet_type oc)
{
    return static_cast<uint8_t>(0xff & oc);
}
// Truncates a 16-bit-unit-like value to its low 16 bits.
//
// `oc` may be any integral type; the result is always in 0x0000..0xffff regardless of
// the signedness or width of the argument.
template <typename u16_type>
inline uint16_t mask16(u16_type oc)
{
    return static_cast<uint16_t>(0xffff & oc);
}
68 template <
typename octet_type>
69 inline bool is_trail(octet_type oc)
71 return ((utf8::internal::mask8(oc) >> 6) == 0x2);
74 template <
typename u16>
75 inline bool is_lead_surrogate(u16 cp)
77 return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
80 template <
typename u16>
81 inline bool is_trail_surrogate(u16 cp)
83 return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
86 template <
typename u16>
87 inline bool is_surrogate(u16 cp)
89 return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
92 template <
typename u32>
93 inline bool is_code_point_valid(u32 cp)
95 return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
// sequence_length: reads the octet at `lead_it`, masks it to 8 bits with mask8 and
// classifies it by its high bits. The visible tests match the patterns
//   110xxxxx  ((lead >> 5) == 0x6),
//   1110xxxx  ((lead >> 4) == 0xe),
//   11110xxx  ((lead >> 3) == 0x1e).
// The result type is the iterator's difference_type.
// NOTE(review): the first condition of the chain and the value produced by each branch
// are not visible in this excerpt; presumably the branches yield the lengths 2, 3 and 4
// respectively -- confirm against the complete source.
98 template <
typename octet_iterator>
99 inline typename std::iterator_traits<octet_iterator>::difference_type sequence_length(octet_iterator lead_it)
101 uint8_t lead = utf8::internal::mask8(*lead_it);
104 }
else if ((lead >> 5) == 0x6) {
106 }
else if ((lead >> 4) == 0xe) {
108 }
else if ((lead >> 3) == 0x1e) {
// is_overlong_sequence: takes a code point `cp` and a sequence `length` and returns a bool.
// The visible branches partition `cp` at the thresholds 0x800 and 0x10000.
// NOTE(review): the first condition and every branch body are not visible in this excerpt;
// presumably each range of `cp` is checked against `length` to detect an encoding longer
// than necessary -- confirm against the complete source.
115 template <
typename octet_difference_type>
116 inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
122 }
else if (cp < 0x800) {
126 }
else if (cp < 0x10000) {
// increase_safely: takes the iterator `it` by reference together with the range end and
// reports a utf_error status.
//  - NOT_ENOUGH_ROOM is returned on one path whose condition is not visible here
//    (presumably when advancing `it` reaches `end` -- confirm).
//  - INCOMPLETE_SEQUENCE is returned when the octet at `it` is not a trail octet
//    according to is_trail.
// NOTE(review): the statement that advances `it` and the success return are not visible
// in this excerpt.
146 template <
typename octet_iterator>
147 utf_error increase_safely(octet_iterator& it, octet_iterator end)
150 return NOT_ENOUGH_ROOM;
153 if (!utf8::internal::is_trail(*it)) {
154 return INCOMPLETE_SEQUENCE;
// Helper macro used by the get_sequence_N functions: calls increase_safely(IT, END),
// stores the status in a local named `ret` and tests it against UTF8_OK.
// NOTE(review): the statement executed when the status is not UTF8_OK is not visible in
// this excerpt; the macro name suggests it returns `ret` from the enclosing function --
// confirm against the complete source.
160#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) \
162 utf_error ret = increase_safely(IT, END); \
163 if (ret != UTF8_OK) \
// get_sequence_1: handles a one-octet sequence. The octet at `it`, masked to 8 bits,
// becomes `code_point` unchanged.
// NOTE(review): the condition guarding the NOT_ENOUGH_ROOM return (presumably `it == end`)
// and the success return are not visible in this excerpt -- confirm.
168 template <
typename octet_iterator>
169 utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
172 return NOT_ENOUGH_ROOM;
175 code_point = utf8::internal::mask8(*it);
// get_sequence_2: assembles a code point from a two-octet sequence starting at `it`.
// NOTE(review): the condition guarding the NOT_ENOUGH_ROOM return (presumably `it == end`)
// and the success return are not visible in this excerpt -- confirm.
180 template <
typename octet_iterator>
181 utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
184 return NOT_ENOUGH_ROOM;
// Start from the lead octet, masked to 8 bits.
187 code_point = utf8::internal::mask8(*it);
// Step to the second octet via increase_safely.
189 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// (<< 6) & 0x7ff keeps the low 5 bits of the lead octet in bit positions 6..10;
// the low 6 bits of the second octet fill positions 0..5.
191 code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
// get_sequence_3: assembles a code point from a three-octet sequence starting at `it`.
// NOTE(review): the condition guarding the NOT_ENOUGH_ROOM return (presumably `it == end`)
// and the success return are not visible in this excerpt -- confirm.
196 template <
typename octet_iterator>
197 utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
200 return NOT_ENOUGH_ROOM;
// Start from the lead octet, masked to 8 bits.
203 code_point = utf8::internal::mask8(*it);
205 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// (<< 12) & 0xffff keeps the low 4 bits of the lead octet in positions 12..15;
// (<< 6) & 0xfff places the low 6 bits of the second octet in positions 6..11.
207 code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
209 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// The low 6 bits of the third octet fill positions 0..5.
211 code_point += (*it) & 0x3f;
// get_sequence_4: assembles a code point from a four-octet sequence starting at `it`.
// NOTE(review): the condition guarding the NOT_ENOUGH_ROOM return (presumably `it == end`)
// and the success return are not visible in this excerpt -- confirm.
216 template <
typename octet_iterator>
217 utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
220 return NOT_ENOUGH_ROOM;
// Start from the lead octet, masked to 8 bits.
223 code_point = utf8::internal::mask8(*it);
225 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// (<< 18) & 0x1fffff keeps the low 3 bits of the lead octet in positions 18..20;
// (<< 12) & 0x3ffff places the low 6 bits of the second octet in positions 12..17.
227 code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
229 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Low 6 bits of the third octet go to positions 6..11.
231 code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
233 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Low 6 bits of the fourth octet fill positions 0..5.
235 code_point += (*it) & 0x3f;
// The helper macro is only needed by the get_sequence_N functions, so it is removed here.
240#undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
// validate_next (three-argument form): decodes and checks the sequence starting at `it`,
// reporting a utf_error status; `code_point` is an output parameter.
// Visible steps:
//   1. remember the starting position in `original_it`
//      (presumably so `it` can be restored on failure -- confirm);
//   2. obtain the expected sequence length from sequence_length;
//   3. decode into `cp` with one of get_sequence_1..4 (the construct selecting among them
//      and the declaration of `cp` are not visible in this excerpt);
//   4. if decoding reported UTF8_OK, check the code point with is_code_point_valid and
//      is_overlong_sequence.
// NOTE(review): the success path, the else-branches that assign OVERLONG_SEQUENCE and
// INVALID_CODE_POINT, and the final return are only partly visible -- confirm against the
// complete source.
242 template <
typename octet_iterator>
243 utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
247 octet_iterator original_it = it;
251 using octet_difference_type =
typename std::iterator_traits<octet_iterator>::difference_type;
252 octet_difference_type
const length = utf8::internal::sequence_length(it);
255 utf_error err = UTF8_OK;
260 err = utf8::internal::get_sequence_1(it, end, cp);
263 err = utf8::internal::get_sequence_2(it, end, cp);
266 err = utf8::internal::get_sequence_3(it, end, cp);
269 err = utf8::internal::get_sequence_4(it, end, cp);
273 if (err == UTF8_OK) {
275 if (utf8::internal::is_code_point_valid(cp)) {
276 if (!utf8::internal::is_overlong_sequence(cp, length)) {
283 err = OVERLONG_SEQUENCE;
285 err = INVALID_CODE_POINT;
294 template <
typename octet_iterator>
295 inline utf_error validate_next(octet_iterator& it, octet_iterator end)
297 uint32_t ignored = 0;
298 return utf8::internal::validate_next(it, end, ignored);
// The three octets 0xEF 0xBB 0xBF that starts_with_bom and is_bom compare against
// (the UTF-8 encoding of the byte order mark).
const uint8_t bom[] = {0xef, 0xbb, 0xbf};
// find_invalid: scans [start, end) by repeatedly calling validate_next on `result`,
// looping while `result` has not reached `end` and inspecting each returned status.
// NOTE(review): the body of the error branch and the final return are not visible in this
// excerpt; is_valid compares the returned iterator with `end`, so presumably the function
// returns the position of the first invalid sequence, or `end` if there is none -- confirm.
308 template <
typename octet_iterator>
309 octet_iterator find_invalid(octet_iterator start, octet_iterator end)
311 octet_iterator result = start;
312 while (result != end) {
313 utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
314 if (err_code != internal::UTF8_OK) {
321 template <
typename octet_iterator>
322 inline bool is_valid(octet_iterator start, octet_iterator end)
324 return (utf8::find_invalid(start, end) == end);
327 template <
typename octet_iterator>
328 inline bool starts_with_bom(octet_iterator it, octet_iterator end)
331 ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
332 ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
333 ((it != end) && (utf8::internal::mask8(*it)) == bom[2]));
// is_bom: compares up to three successive octets starting at `it` (each masked to 8 bits)
// with bom[0], bom[1] and bom[2]; the && chain stops at the first mismatch.
// The function takes no end iterator and performs no bounds check before dereferencing,
// so the caller has to make sure the octets it reads are dereferenceable; starts_with_bom
// performs the same comparison with `it != end` checks.
// NOTE(review): the opening of the return expression and the end of the definition are
// not visible in this excerpt.
337 template <
typename octet_iterator>
338 inline bool is_bom(octet_iterator it)
341 (utf8::internal::mask8(*it++)) == bom[0] && (utf8::internal::mask8(*it++)) == bom[1] &&
342 (utf8::internal::mask8(*it)) == bom[2]);