27#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
28#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
// Constructor: stores the offending code point in the member `cp`.
47 explicit invalid_code_point(uint32_t cp) : cp(cp) { }
// Overrides what() to return a fixed, static description string.
48 char const * what()
const noexcept override
50 return "Invalid code point";
// Const accessor returning uint32_t. Its body is not visible in this excerpt
// (presumably returns the stored `cp` -- TODO confirm against the full file).
52 uint32_t code_point()
const
// Constructor: stores the offending octet in the member `u8`.
63 explicit invalid_utf8(uint8_t u) : u8(u) { }
// Overrides what() to return a fixed, static description string.
64 char const * what()
const noexcept override
66 return "Invalid UTF-8";
// Const accessor returning uint8_t. Its body is not visible in this excerpt
// (presumably returns the stored `u8` -- TODO confirm against the full file).
68 uint8_t utf8_octet()
const
// Constructor: stores the offending 16-bit unit in the member `u16`.
79 explicit invalid_utf16(uint16_t u) : u16(u) { }
// Overrides what() to return a fixed, static description string.
80 char const * what()
const noexcept override
82 return "Invalid UTF-16";
// Const accessor returning uint16_t. Its body is not visible in this excerpt
// (presumably returns the stored `u16` -- TODO confirm against the full file).
84 uint16_t utf16_word()
const
// Overrides what() to return a fixed, static description string.
// NOTE(review): the enclosing class declaration is not visible in this excerpt.
93 char const * what()
const noexcept override
95 return "Not enough space";
// Encodes the code point `cp` as UTF-8 and writes the resulting octets through
// `result`, post-incrementing `result` after every octet written.
// NOTE(review): several original lines are missing from this excerpt (the
// action taken when the validity check fails, the condition guarding the
// one-octet form, the line introducing the four-octet form, and the return
// statement) -- confirm those against the full file.
101 template <
typename octet_iterator>
102 octet_iterator append(uint32_t cp, octet_iterator result)
// Test `cp` with is_code_point_valid(); what happens on failure is not visible here.
104 if (!utf8::internal::is_code_point_valid(cp)) {
// One-octet form: the value is stored unchanged (narrowed to uint8_t).
109 *(result++) =
static_cast<uint8_t
>(cp);
110 }
else if (cp < 0x800) {
// Two-octet form: lead octet carries the bits above the low six, tagged 0xc0;
// the trailing octet carries the low six bits, tagged 0x80.
111 *(result++) =
static_cast<uint8_t
>((cp >> 6) | 0xc0);
112 *(result++) =
static_cast<uint8_t
>((cp & 0x3f) | 0x80);
113 }
else if (cp < 0x10000) {
// Three-octet form: lead octet tagged 0xe0, followed by two 0x80-tagged
// octets carrying six bits each (high group first).
114 *(result++) =
static_cast<uint8_t
>((cp >> 12) | 0xe0);
115 *(result++) =
static_cast<uint8_t
>(((cp >> 6) & 0x3f) | 0x80);
116 *(result++) =
static_cast<uint8_t
>((cp & 0x3f) | 0x80);
// Four-octet form: lead octet tagged 0xf0, followed by three 0x80-tagged
// octets carrying six bits each. The line that introduces this branch is
// not visible in this excerpt (presumably the final `else`).
118 *(result++) =
static_cast<uint8_t
>((cp >> 18) | 0xf0);
119 *(result++) =
static_cast<uint8_t
>(((cp >> 12) & 0x3f) | 0x80);
120 *(result++) =
static_cast<uint8_t
>(((cp >> 6) & 0x3f) | 0x80);
121 *(result++) =
static_cast<uint8_t
>((cp & 0x3f) | 0x80);
// Walks the octet range [start, end), validating one sequence at a time with
// validate_next(), and appends the code point `replacement` to `out` for the
// error codes handled below.
// NOTE(review): the `switch` line, the bodies of the two inner loops, the
// handling of NOT_ENOUGH_ROOM, any `break` statements and the return statement
// are not visible in this excerpt -- confirm against the full file.
126 template <
typename octet_iterator,
typename output_iterator>
127 output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
129 while (start != end) {
// Remember where the current sequence begins before it is validated.
130 octet_iterator sequence_start = start;
131 internal::utf_error err_code = utf8::internal::validate_next(start, end);
// Valid sequence: iterate over its octets, from sequence_start up to the
// current `start` (which suggests validate_next advanced `start` -- confirm).
// The loop body is not visible; presumably it copies each octet to `out`.
133 case internal::UTF8_OK:
134 for (octet_iterator it = sequence_start; it != start; ++it) {
// Handling for NOT_ENOUGH_ROOM is not visible in this excerpt.
138 case internal::NOT_ENOUGH_ROOM:
// Invalid lead octet: emit the replacement code point.
140 case internal::INVALID_LEAD:
141 out = utf8::append(replacement, out);
// Incomplete, overlong or invalid-code-point sequences: emit the replacement
// code point once ...
144 case internal::INCOMPLETE_SEQUENCE:
145 case internal::OVERLONG_SEQUENCE:
146 case internal::INVALID_CODE_POINT:
147 out = utf8::append(replacement, out);
// ... then loop while the current octet is a trail octet and the range is
// not exhausted (body not visible; presumably advances `start`).
150 while (start != end && utf8::internal::is_trail(*start)) {
// Convenience overload: forwards to the four-argument replace_invalid() using
// 0xfffd (U+FFFD) as the replacement code point.
159 template <
typename octet_iterator,
typename output_iterator>
160 inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
// Function-local static constant, passed through utf8::internal::mask16().
162 static uint32_t
const replacement_marker = utf8::internal::mask16(0xfffd);
163 return utf8::replace_invalid(start, end, out, replacement_marker);
// Validates the sequence at `it` via validate_next(it, end, cp) and returns a
// uint32_t. `it` is taken by reference.
// NOTE(review): the declaration of `cp`, the `switch` line, the action taken
// for each error code and the return statement are not visible in this
// excerpt -- presumably errors are reported by throwing and `cp` is returned
// on success; confirm against the full file.
166 template <
typename octet_iterator>
167 uint32_t next(octet_iterator& it, octet_iterator end)
170 internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
// Success case.
172 case internal::UTF8_OK:
// Error cases; the visible grouping suggests three distinct handlers:
// NOT_ENOUGH_ROOM alone, the three malformed-sequence codes together, and
// INVALID_CODE_POINT alone (handler bodies not visible).
174 case internal::NOT_ENOUGH_ROOM:
176 case internal::INVALID_LEAD:
177 case internal::INCOMPLETE_SEQUENCE:
178 case internal::OVERLONG_SEQUENCE:
180 case internal::INVALID_CODE_POINT:
// Returns the result of utf8::next() for the sequence at `it` without moving
// the caller's iterator: `it` is taken by value, so only the local copy is
// passed to next().
186 template <
typename octet_iterator>
187 uint32_t peek_next(octet_iterator it, octet_iterator end)
189 return utf8::next(it, end);
// Moves `it` backwards past trail octets and returns the code point decoded
// from the position it stops at.
// NOTE(review): the lines before 200, the loop body and any use of `start`
// are not visible in this excerpt -- confirm the boundary checks against the
// full file.
192 template <
typename octet_iterator>
193 uint32_t prior(octet_iterator& it, octet_iterator start)
// The original position becomes the end bound for decoding.
200 octet_iterator end = it;
// Pre-decrement, then test: keeps stepping back while the octet is a trail octet.
202 while (utf8::internal::is_trail(*(--it))) {
// Decode from the new position; peek_next() takes `it` by value, so `it`
// is left where the backward scan stopped.
207 return utf8::peek_next(it, end);
// Moves `it` backwards past trail octets and returns the code point decoded
// from the position it stops at.
// NOTE(review): the body of the `it == pass_start` check is not visible in
// this excerpt -- confirm what it does against the full file.
211 template <
typename octet_iterator>
212 uint32_t previous(octet_iterator& it, octet_iterator pass_start)
// The original position becomes the end bound for decoding.
214 octet_iterator end = it;
// Pre-decrement, then test: keeps stepping back while the octet is a trail octet.
215 while (utf8::internal::is_trail(*(--it))) {
// Reached `pass_start` while still on a trail octet (handling not visible).
216 if (it == pass_start) {
// Decode through a copy so that `it` is not advanced by next().
220 octet_iterator temp = it;
221 return utf8::next(temp, end);
// Runs a loop of `n` iterations over `it`, bounded by `end`.
// NOTE(review): the loop body is not visible in this excerpt; presumably each
// iteration calls utf8::next(it, end) to step over one code point -- confirm.
// As written, the loop does not execute when n <= 0.
224 template <
typename octet_iterator,
typename distance_type>
225 void advance(octet_iterator& it, distance_type n, octet_iterator end)
227 for (distance_type i = 0; i < n; ++i) {
// Counts how many times utf8::next() can be applied starting at `first`
// while `first < last` holds, i.e. the number of code points decoded.
// NOTE(review): the return statement is not visible in this excerpt
// (presumably returns `dist`).
232 template <
typename octet_iterator>
233 typename std::iterator_traits<octet_iterator>::difference_type distance(octet_iterator first, octet_iterator last)
235 typename std::iterator_traits<octet_iterator>::difference_type dist;
// The loop condition uses operator<, so octet_iterator must support it.
236 for (dist = 0; first < last; ++dist) {
// next() advances `first` (it is a by-value local here, passed by reference).
237 utf8::next(first, last);
// Converts the 16-bit units in [start, end) to UTF-8, appending each resulting
// code point to `result` with utf8::append().
// NOTE(review): the lines between the visible ones (error handling for bad
// surrogates, and the return statement) are missing from this excerpt.
242 template <
typename u16bit_iterator,
typename octet_iterator>
243 octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
245 while (start != end) {
// Read one unit (passed through mask16) and advance.
246 uint32_t cp = utf8::internal::mask16(*start++);
248 if (utf8::internal::is_lead_surrogate(cp)) {
// NOTE(review): no `start != end` check is visible before this second read;
// original line 249 is missing from the excerpt -- confirm that the full file
// guards it.
250 uint32_t trail_surrogate = utf8::internal::mask16(*start++);
251 if (utf8::internal::is_trail_surrogate(trail_surrogate)) {
// Combine the surrogate pair into a single code point.
252 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
// A trail surrogate that was not preceded by a lead surrogate (handling not visible).
262 else if (utf8::internal::is_trail_surrogate(cp)) {
266 result = utf8::append(cp, result);
// Decodes code points from the octet range [start, end) with utf8::next() and
// writes them to `result` as 16-bit units.
// NOTE(review): the condition selecting between the two-unit and one-unit
// forms (original line 276) and the return statement are not visible in this
// excerpt -- presumably the two-unit form is used for code points above
// 0xffff; confirm.
271 template <
typename u16bit_iterator,
typename octet_iterator>
272 u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result)
274 while (start != end) {
275 uint32_t cp = utf8::next(start, end);
// Two-unit form: first unit from the bits above the low ten plus LEAD_OFFSET,
// second unit from the low ten bits plus TRAIL_SURROGATE_MIN.
277 *result++ =
static_cast<uint16_t
>((cp >> 10) + internal::LEAD_OFFSET);
278 *result++ =
static_cast<uint16_t
>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
// One-unit form: the code point is stored directly, narrowed to uint16_t.
280 *result++ =
static_cast<uint16_t
>(cp);
// Encodes every element of [start, end) as UTF-8 by passing it to
// utf8::append(), threading the output iterator `result` through each call.
// NOTE(review): the return statement is not visible in this excerpt
// (presumably returns `result`).
286 template <
typename octet_iterator,
typename u32bit_iterator>
287 octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result)
289 while (start != end) {
290 result = utf8::append(*(start++), result);
// Decodes code points from the octet range [start, end) with utf8::next() and
// stores each one through `result`, post-incrementing `result` after each store.
// NOTE(review): the return statement is not visible in this excerpt
// (presumably returns `result`).
296 template <
typename octet_iterator,
typename u32bit_iterator>
297 u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result)
299 while (start != end) {
// next() takes `start` by reference and moves it forward on each call.
300 (*result++) = utf8::next(start, end);
306 template <
typename octet_iterator>
310 using iterator_category = std::bidirectional_iterator_tag;
311 using value_type = uint32_t;
312 using difference_type = std::ptrdiff_t;
313 using pointer = value_type*;
314 using reference = value_type&;
318 octet_iterator range_start;
319 octet_iterator range_end;
322 iterator() =
default;
// Parameter list and body of the range-checked constructor (the line carrying
// the constructor name is not visible in this excerpt).
// Copies the current position and both range bounds into the members `it`,
// `range_start` and `range_end`.
324 octet_iterator
const & octet_it, octet_iterator
const & range_start, octet_iterator
const & range_end) :
325 it(octet_it), range_start(range_start), range_end(range_end)
// Reject a starting position outside [range_start, range_end]. The check uses
// operator< and operator>, so octet_iterator must support them.
327 if (it < range_start || it > range_end) {
328 throw std::out_of_range(
"Invalid utf-8 iterator position");
332 octet_iterator base()
const
336 uint32_t operator*()
const
338 octet_iterator temp = it;
339 return utf8::next(temp, range_end);