mirror of https://github.com/godotengine/godot
2678 lines
95 KiB
C++
2678 lines
95 KiB
C++
// © 2024 and later: Unicode, Inc. and others.
|
||
// License & terms of use: https://www.unicode.org/copyright.html
|
||
|
||
// utfiterator.h
|
||
// created: 2024aug12 Markus W. Scherer
|
||
|
||
#ifndef __UTFITERATOR_H__
|
||
#define __UTFITERATOR_H__
|
||
|
||
#include "unicode/utypes.h"
|
||
|
||
#if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H)
|
||
|
||
#include <iterator>
|
||
#if defined(__cpp_lib_ranges)
|
||
#include <ranges>
|
||
#endif
|
||
#include <string>
|
||
#include <string_view>
|
||
#include <type_traits>
|
||
#include "unicode/utf16.h"
|
||
#include "unicode/utf8.h"
|
||
#include "unicode/uversion.h"
|
||
|
||
/**
|
||
* \file
|
||
* \brief C++ header-only API: C++ iterators over Unicode strings (=UTF-8/16/32 if well-formed).
|
||
*
|
||
* Sample code:
|
||
* \code
|
||
* #include <string_view>
|
||
* #include <iostream>
|
||
* #include "unicode/utypes.h"
|
||
* #include "unicode/utfiterator.h"
|
||
*
|
||
* using icu::header::utfIterator;
|
||
* using icu::header::utfStringCodePoints;
|
||
* using icu::header::unsafeUTFIterator;
|
||
* using icu::header::unsafeUTFStringCodePoints;
|
||
*
|
||
* int32_t rangeLoop16(std::u16string_view s) {
|
||
* // We are just adding up the code points for minimal-code demonstration purposes.
|
||
* int32_t sum = 0;
|
||
* for (auto units : utfStringCodePoints<UChar32, UTF_BEHAVIOR_NEGATIVE>(s)) {
|
||
* sum += units.codePoint(); // < 0 if ill-formed
|
||
* }
|
||
* return sum;
|
||
* }
|
||
*
|
||
* int32_t loopIterPlusPlus16(std::u16string_view s) {
|
||
* auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
|
||
* int32_t sum = 0;
|
||
* for (auto iter = range.begin(), limit = range.end(); iter != limit;) {
|
||
* sum += (*iter++).codePoint(); // U+FFFD if ill-formed
|
||
* }
|
||
* return sum;
|
||
* }
|
||
*
|
||
* int32_t backwardLoop16(std::u16string_view s) {
|
||
* auto range = utfStringCodePoints<UChar32, UTF_BEHAVIOR_SURROGATE>(s);
|
||
* int32_t sum = 0;
|
||
* for (auto start = range.begin(), iter = range.end(); start != iter;) {
|
||
* sum += (*--iter).codePoint(); // surrogate code point if unpaired / ill-formed
|
||
* }
|
||
* return sum;
|
||
* }
|
||
*
|
||
* int32_t reverseLoop8(std::string_view s) {
|
||
* auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
|
||
* int32_t sum = 0;
|
||
* for (auto iter = range.rbegin(), limit = range.rend(); iter != limit; ++iter) {
|
||
* sum += iter->codePoint(); // U+FFFD if ill-formed
|
||
* }
|
||
* return sum;
|
||
* }
|
||
*
|
||
* int32_t countCodePoints16(std::u16string_view s) {
|
||
* auto range = utfStringCodePoints<UChar32, UTF_BEHAVIOR_SURROGATE>(s);
|
||
* return std::distance(range.begin(), range.end());
|
||
* }
|
||
*
|
||
* int32_t unsafeRangeLoop16(std::u16string_view s) {
|
||
* int32_t sum = 0;
|
||
* for (auto units : unsafeUTFStringCodePoints<UChar32>(s)) {
|
||
* sum += units.codePoint();
|
||
* }
|
||
* return sum;
|
||
* }
|
||
*
|
||
* int32_t unsafeReverseLoop8(std::string_view s) {
|
||
* auto range = unsafeUTFStringCodePoints<UChar32>(s);
|
||
* int32_t sum = 0;
|
||
* for (auto iter = range.rbegin(), limit = range.rend(); iter != limit; ++iter) {
|
||
* sum += iter->codePoint();
|
||
* }
|
||
* return sum;
|
||
* }
|
||
*
|
||
* char32_t firstCodePointOrFFFD16(std::u16string_view s) {
|
||
* if (s.empty()) { return 0xfffd; }
|
||
* auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
|
||
* return range.begin()->codePoint();
|
||
* }
|
||
*
|
||
* std::string_view firstSequence8(std::string_view s) {
|
||
* if (s.empty()) { return {}; }
|
||
* auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
|
||
* auto units = *(range.begin());
|
||
* if (units.wellFormed()) {
|
||
* return units.stringView();
|
||
* } else {
|
||
* return {};
|
||
* }
|
||
* }
|
||
*
|
||
* template<typename InputStream> // some istream or streambuf
|
||
* std::u32string cpFromInput(InputStream &in) {
|
||
* // This is a single-pass input_iterator.
|
||
* std::istreambuf_iterator bufIter(in);
|
||
* std::istreambuf_iterator<typename InputStream::char_type> bufLimit;
|
||
* auto iter = utfIterator<char32_t, UTF_BEHAVIOR_FFFD>(bufIter);
|
||
* auto limit = utfIterator<char32_t, UTF_BEHAVIOR_FFFD>(bufLimit);
|
||
* std::u32string s32;
|
||
* for (; iter != limit; ++iter) {
|
||
* s32.push_back(iter->codePoint());
|
||
* }
|
||
* return s32;
|
||
* }
|
||
*
|
||
* std::u32string cpFromStdin() { return cpFromInput(std::cin); }
|
||
* std::u32string cpFromWideStdin() { return cpFromInput(std::wcin); }
|
||
* \endcode
|
||
*/
|
||
|
||
#ifndef U_HIDE_DRAFT_API
|
||
|
||
/**
|
||
* Some defined behaviors for handling ill-formed Unicode strings.
|
||
* This is a template parameter for UTFIterator and related classes.
|
||
*
|
||
* When a validating UTFIterator encounters an ill-formed code unit sequence,
|
||
* then CodeUnits.codePoint() is a value according to this parameter.
|
||
*
|
||
* @draft ICU 78
|
||
* @see CodeUnits
|
||
* @see UTFIterator
|
||
* @see UTFStringCodePoints
|
||
*/
|
||
typedef enum UTFIllFormedBehavior {
|
||
/**
|
||
* Returns a negative value (-1=U_SENTINEL) instead of a code point.
|
||
* If the CP32 template parameter for the relevant classes is an unsigned type,
|
||
* then the negative value becomes 0xffffffff=UINT32_MAX.
|
||
*
|
||
* @draft ICU 78
|
||
*/
|
||
UTF_BEHAVIOR_NEGATIVE,
|
||
/** Returns U+FFFD Replacement Character. @draft ICU 78 */
|
||
UTF_BEHAVIOR_FFFD,
|
||
/**
|
||
* UTF-8: Not allowed;
|
||
* UTF-16: returns the unpaired surrogate;
|
||
* UTF-32: returns the surrogate code point, or U+FFFD if out of range.
|
||
*
|
||
* @draft ICU 78
|
||
*/
|
||
UTF_BEHAVIOR_SURROGATE
|
||
} UTFIllFormedBehavior;
|
||
|
||
namespace U_HEADER_ONLY_NAMESPACE {
|
||
|
||
namespace prv {
|
||
#if U_CPLUSPLUS_VERSION >= 20
|
||
|
||
/** @internal */
|
||
template<typename Iter>
|
||
using iter_value_t = typename std::iter_value_t<Iter>;
|
||
|
||
/** @internal */
|
||
template<typename Iter>
|
||
using iter_difference_t = std::iter_difference_t<Iter>;
|
||
|
||
/** @internal */
|
||
template<typename Iter>
|
||
constexpr bool forward_iterator = std::forward_iterator<Iter>;
|
||
|
||
/** @internal */
|
||
template<typename Iter>
|
||
constexpr bool bidirectional_iterator = std::bidirectional_iterator<Iter>;
|
||
|
||
/** @internal */
|
||
template<typename Range>
|
||
constexpr bool range = std::ranges::range<Range>;
|
||
|
||
#else
|
||
|
||
/** @internal */
|
||
template<typename Iter>
|
||
using iter_value_t = typename std::iterator_traits<Iter>::value_type;
|
||
|
||
/** @internal */
|
||
template<typename Iter>
|
||
using iter_difference_t = typename std::iterator_traits<Iter>::difference_type;
|
||
|
||
/** @internal */
|
||
template<typename Iter>
|
||
constexpr bool forward_iterator =
|
||
std::is_base_of_v<
|
||
std::forward_iterator_tag,
|
||
typename std::iterator_traits<Iter>::iterator_category>;
|
||
|
||
/** @internal */
|
||
template<typename Iter>
|
||
constexpr bool bidirectional_iterator =
|
||
std::is_base_of_v<
|
||
std::bidirectional_iterator_tag,
|
||
typename std::iterator_traits<Iter>::iterator_category>;
|
||
|
||
/** @internal */
|
||
template<typename Range, typename = void>
|
||
struct range_type : std::false_type {};
|
||
|
||
/** @internal */
|
||
template<typename Range>
|
||
struct range_type<
|
||
Range,
|
||
std::void_t<decltype(std::declval<Range>().begin()),
|
||
decltype(std::declval<Range>().end())>> : std::true_type {};
|
||
|
||
/** @internal */
|
||
template<typename Range>
|
||
constexpr bool range = range_type<Range>::value;
|
||
|
||
#endif
|
||
|
||
/** @internal */
|
||
template <typename T> struct is_basic_string_view : std::false_type {};
|
||
|
||
/** @internal */
|
||
template <typename... Args>
|
||
struct is_basic_string_view<std::basic_string_view<Args...>> : std::true_type {};
|
||
|
||
/** @internal */
|
||
template <typename T> constexpr bool is_basic_string_view_v = is_basic_string_view<T>::value;
|
||
|
||
/** @internal */
|
||
template<typename CP32, bool skipSurrogates>
|
||
class CodePointsIterator {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
public:
|
||
/** C++ iterator boilerplate @internal */
|
||
using value_type = CP32;
|
||
/** C++ iterator boilerplate @internal */
|
||
using reference = value_type;
|
||
/** C++ iterator boilerplate @internal */
|
||
using pointer = CP32 *;
|
||
/** C++ iterator boilerplate @internal */
|
||
using difference_type = int32_t;
|
||
/** C++ iterator boilerplate @internal */
|
||
using iterator_category = std::forward_iterator_tag;
|
||
|
||
/** @internal */
|
||
inline CodePointsIterator(CP32 c) : c_(c) {}
|
||
/** @internal */
|
||
inline bool operator==(const CodePointsIterator &other) const { return c_ == other.c_; }
|
||
/** @internal */
|
||
inline bool operator!=(const CodePointsIterator &other) const { return !operator==(other); }
|
||
/** @internal */
|
||
inline CP32 operator*() const { return c_; }
|
||
/** @internal */
|
||
inline CodePointsIterator &operator++() { // pre-increment
|
||
++c_;
|
||
if (skipSurrogates && c_ == 0xd800) {
|
||
c_ = 0xe000;
|
||
}
|
||
return *this;
|
||
}
|
||
/** @internal */
|
||
inline CodePointsIterator operator++(int) { // post-increment
|
||
CodePointsIterator result(*this);
|
||
++(*this);
|
||
return result;
|
||
}
|
||
|
||
private:
|
||
CP32 c_;
|
||
};
|
||
|
||
} // namespace prv
|
||
|
||
/**
|
||
* A C++ "range" over all Unicode code points U+0000..U+10FFFF.
|
||
* https://www.unicode.org/glossary/#code_point
|
||
*
|
||
* Intended for test and builder code.
|
||
*
|
||
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
|
||
* @draft ICU 78
|
||
* @see U_IS_CODE_POINT
|
||
*/
|
||
template<typename CP32>
|
||
class AllCodePoints {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
public:
|
||
/** Constructor. @draft ICU 78 */
|
||
AllCodePoints() {}
|
||
/**
|
||
* @return an iterator over all Unicode code points.
|
||
* The iterator returns CP32 integers.
|
||
* @draft ICU 78
|
||
*/
|
||
auto begin() const { return prv::CodePointsIterator<CP32, false>(0); }
|
||
/**
|
||
* @return an exclusive-end iterator over all Unicode code points.
|
||
* @draft ICU 78
|
||
*/
|
||
auto end() const { return prv::CodePointsIterator<CP32, false>(0x110000); }
|
||
};
|
||
|
||
/**
|
||
* A C++ "range" over all Unicode scalar values U+0000..U+D7FF & U+E000..U+10FFFF.
|
||
* That is, all code points except surrogates.
|
||
* Only scalar values can be represented in well-formed UTF-8/16/32.
|
||
* https://www.unicode.org/glossary/#unicode_scalar_value
|
||
*
|
||
* Intended for test and builder code.
|
||
*
|
||
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
|
||
* @draft ICU 78
|
||
* @see U_IS_SCALAR_VALUE
|
||
*/
|
||
template<typename CP32>
|
||
class AllScalarValues {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
public:
|
||
/** Constructor. @draft ICU 78 */
|
||
AllScalarValues() {}
|
||
/**
|
||
* @return an iterator over all Unicode scalar values.
|
||
* The iterator returns CP32 integers.
|
||
* @draft ICU 78
|
||
*/
|
||
auto begin() const { return prv::CodePointsIterator<CP32, true>(0); }
|
||
/**
|
||
* @return an exclusive-end iterator over all Unicode scalar values.
|
||
* @draft ICU 78
|
||
*/
|
||
auto end() const { return prv::CodePointsIterator<CP32, true>(0x110000); }
|
||
};
|
||
|
||
/**
|
||
* Result of decoding a code unit sequence for one code point.
|
||
* Returned from non-validating Unicode string code point iterators.
|
||
* Base class for class CodeUnits which is returned from validating iterators.
|
||
*
|
||
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
|
||
* should be signed if UTF_BEHAVIOR_NEGATIVE
|
||
* @tparam UnitIter An iterator (often a pointer) that returns a code unit type:
|
||
* UTF-8: char or char8_t or uint8_t;
|
||
* UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
|
||
* UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
|
||
* @see UnsafeUTFIterator
|
||
* @see UnsafeUTFStringCodePoints
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename CP32, typename UnitIter, typename = void>
|
||
class UnsafeCodeUnits {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
using Unit = typename prv::iter_value_t<UnitIter>;
|
||
public:
|
||
/** @internal */
|
||
UnsafeCodeUnits(CP32 codePoint, uint8_t length, UnitIter start, UnitIter limit) :
|
||
c_(codePoint), len_(length), start_(start), limit_(limit) {}
|
||
|
||
/** Copy constructor. @draft ICU 78 */
|
||
UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
|
||
/** Copy assignment operator. @draft ICU 78 */
|
||
UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
|
||
|
||
/**
|
||
* @return the Unicode code point decoded from the code unit sequence.
|
||
* If the sequence is ill-formed and the iterator validates,
|
||
* then this is a replacement value according to the iterator‘s
|
||
* UTFIllFormedBehavior template parameter.
|
||
* @draft ICU 78
|
||
*/
|
||
CP32 codePoint() const { return c_; }
|
||
|
||
/**
|
||
* @return the start of the code unit sequence for one code point.
|
||
* Only enabled if UnitIter is a (multi-pass) forward_iterator or better.
|
||
* @draft ICU 78
|
||
*/
|
||
UnitIter begin() const { return start_; }
|
||
|
||
/**
|
||
* @return the limit (exclusive end) of the code unit sequence for one code point.
|
||
* Only enabled if UnitIter is a (multi-pass) forward_iterator or better.
|
||
* @draft ICU 78
|
||
*/
|
||
UnitIter end() const { return limit_; }
|
||
|
||
/**
|
||
* @return the length of the code unit sequence for one code point.
|
||
* @draft ICU 78
|
||
*/
|
||
uint8_t length() const { return len_; }
|
||
|
||
#if U_CPLUSPLUS_VERSION >= 20
|
||
/**
|
||
* @return a string_view of the code unit sequence for one code point.
|
||
* Only works if UnitIter is a pointer or a contiguous_iterator.
|
||
* @draft ICU 78
|
||
*/
|
||
template<std::contiguous_iterator Iter = UnitIter>
|
||
std::basic_string_view<Unit> stringView() const {
|
||
return std::basic_string_view<Unit>(begin(), end());
|
||
}
|
||
#else
|
||
/**
|
||
* @return a string_view of the code unit sequence for one code point.
|
||
* Only works if UnitIter is a pointer or a contiguous_iterator.
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename Iter = UnitIter, typename Unit = typename std::iterator_traits<Iter>::value_type>
|
||
std::enable_if_t<std::is_pointer_v<Iter> ||
|
||
std::is_same_v<Iter, typename std::basic_string<Unit>::iterator> ||
|
||
std::is_same_v<Iter, typename std::basic_string<Unit>::const_iterator> ||
|
||
std::is_same_v<Iter, typename std::basic_string_view<Unit>::iterator> ||
|
||
std::is_same_v<Iter, typename std::basic_string_view<Unit>::const_iterator>,
|
||
std::basic_string_view<Unit>>
|
||
stringView() const {
|
||
return std::basic_string_view<Unit>(&*start_, len_);
|
||
}
|
||
#endif
|
||
|
||
private:
|
||
// Order of fields with padding and access frequency in mind.
|
||
CP32 c_;
|
||
uint8_t len_;
|
||
UnitIter start_;
|
||
UnitIter limit_;
|
||
};
|
||
|
||
#ifndef U_IN_DOXYGEN
|
||
// Partial template specialization for single-pass input iterator.
|
||
// No UnitIter field, no getter for it, no stringView().
|
||
template<typename CP32, typename UnitIter>
|
||
class UnsafeCodeUnits<
|
||
CP32,
|
||
UnitIter,
|
||
std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
public:
|
||
UnsafeCodeUnits(CP32 codePoint, uint8_t length) : c_(codePoint), len_(length) {}
|
||
|
||
UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
|
||
UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
|
||
|
||
CP32 codePoint() const { return c_; }
|
||
|
||
uint8_t length() const { return len_; }
|
||
|
||
private:
|
||
// Order of fields with padding and access frequency in mind.
|
||
CP32 c_;
|
||
uint8_t len_;
|
||
};
|
||
#endif // U_IN_DOXYGEN
|
||
|
||
/**
|
||
* Result of validating and decoding a code unit sequence for one code point.
|
||
* Returned from validating Unicode string code point iterators.
|
||
* Adds function wellFormed() to base class UnsafeCodeUnits.
|
||
*
|
||
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
|
||
* should be signed if UTF_BEHAVIOR_NEGATIVE
|
||
* @tparam UnitIter An iterator (often a pointer) that returns a code unit type:
|
||
* UTF-8: char or char8_t or uint8_t;
|
||
* UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
|
||
* UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
|
||
* @see UTFIterator
|
||
* @see UTFStringCodePoints
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename CP32, typename UnitIter, typename = void>
|
||
class CodeUnits : public UnsafeCodeUnits<CP32, UnitIter> {
|
||
public:
|
||
/** @internal */
|
||
CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter start, UnitIter limit) :
|
||
UnsafeCodeUnits<CP32, UnitIter>(codePoint, length, start, limit), ok_(wellFormed) {}
|
||
|
||
/** Copy constructor. @draft ICU 78 */
|
||
CodeUnits(const CodeUnits &other) = default;
|
||
/** Copy assignment operator. @draft ICU 78 */
|
||
CodeUnits &operator=(const CodeUnits &other) = default;
|
||
|
||
/**
|
||
* @return true if the decoded code unit sequence is well-formed.
|
||
* @draft ICU 78
|
||
*/
|
||
bool wellFormed() const { return ok_; }
|
||
|
||
private:
|
||
bool ok_;
|
||
};
|
||
|
||
#ifndef U_IN_DOXYGEN
|
||
// Partial template specialization for single-pass input iterator.
|
||
// No UnitIter field, no getter for it, no stringView().
|
||
template<typename CP32, typename UnitIter>
|
||
class CodeUnits<
|
||
CP32,
|
||
UnitIter,
|
||
std::enable_if_t<!prv::forward_iterator<UnitIter>>> :
|
||
public UnsafeCodeUnits<CP32, UnitIter> {
|
||
public:
|
||
CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed) :
|
||
UnsafeCodeUnits<CP32, UnitIter>(codePoint, length), ok_(wellFormed) {}
|
||
|
||
CodeUnits(const CodeUnits &other) = default;
|
||
CodeUnits &operator=(const CodeUnits &other) = default;
|
||
|
||
bool wellFormed() const { return ok_; }
|
||
|
||
private:
|
||
bool ok_;
|
||
};
|
||
#endif // U_IN_DOXYGEN
|
||
|
||
// Validating implementations ---------------------------------------------- ***
|
||
|
||
#ifndef U_IN_DOXYGEN
|
||
template<typename CP32, UTFIllFormedBehavior behavior,
|
||
typename UnitIter, typename LimitIter = UnitIter, typename = void>
|
||
class UTFImpl;
|
||
|
||
// Note: readAndInc() functions take both a p0 and a p iterator.
|
||
// They must have the same value.
|
||
// For a multi-pass UnitIter, the caller must copy its p into a local variable p0,
|
||
// and readAndInc() copies p0 and the incremented p into the CodeUnits.
|
||
// For a single-pass UnitIter, which may not be default-constructible nor coypable,
|
||
// the caller can pass p into both references, and readAndInc() does not use p0
|
||
// and constructs CodeUnits without them.
|
||
// Moving the p0 variable into the call site avoids having to declare it inside readAndInc()
|
||
// which may not be possible for a single-pass iterator.
|
||
|
||
// UTF-8
|
||
template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
|
||
class UTFImpl<
|
||
CP32, behavior,
|
||
UnitIter, LimitIter,
|
||
std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
static_assert(behavior != UTF_BEHAVIOR_SURROGATE,
|
||
"For 8-bit strings, the SURROGATE option does not have an equivalent.");
|
||
public:
|
||
// Handle ill-formed UTF-8
|
||
U_FORCE_INLINE static CP32 sub() {
|
||
switch (behavior) {
|
||
case UTF_BEHAVIOR_NEGATIVE: return U_SENTINEL;
|
||
case UTF_BEHAVIOR_FFFD: return 0xfffd;
|
||
}
|
||
}
|
||
|
||
U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
|
||
// Very similar to U8_FWD_1().
|
||
uint8_t b = *p;
|
||
++p;
|
||
if (U8_IS_LEAD(b) && p != limit) {
|
||
uint8_t t1 = *p;
|
||
if ((0xe0 <= b && b < 0xf0)) {
|
||
if (U8_IS_VALID_LEAD3_AND_T1(b, t1) &&
|
||
++p != limit && U8_IS_TRAIL(*p)) {
|
||
++p;
|
||
}
|
||
} else if (b < 0xe0) {
|
||
if (U8_IS_TRAIL(t1)) {
|
||
++p;
|
||
}
|
||
} else /* b >= 0xf0 */ {
|
||
if (U8_IS_VALID_LEAD4_AND_T1(b, t1) &&
|
||
++p != limit && U8_IS_TRAIL(*p) &&
|
||
++p != limit && U8_IS_TRAIL(*p)) {
|
||
++p;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
|
||
// Very similar to U8_BACK_1().
|
||
uint8_t c = *--p;
|
||
if (U8_IS_TRAIL(c) && p != start) {
|
||
UnitIter p1 = p;
|
||
uint8_t b1 = *--p1;
|
||
if (U8_IS_LEAD(b1)) {
|
||
if (b1 < 0xe0 ||
|
||
(b1 < 0xf0 ?
|
||
U8_IS_VALID_LEAD3_AND_T1(b1, c) :
|
||
U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
|
||
p = p1;
|
||
return;
|
||
}
|
||
} else if (U8_IS_TRAIL(b1) && p1 != start) {
|
||
uint8_t b2 = *--p1;
|
||
if (0xe0 <= b2 && b2 <= 0xf4) {
|
||
if (b2 < 0xf0 ?
|
||
U8_IS_VALID_LEAD3_AND_T1(b2, b1) :
|
||
U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
|
||
p = p1;
|
||
return;
|
||
}
|
||
} else if (U8_IS_TRAIL(b2) && p1 != start) {
|
||
uint8_t b3 = *--p1;
|
||
if (0xf0 <= b3 && b3 <= 0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
|
||
p = p1;
|
||
return;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
|
||
UnitIter &p0, UnitIter &p, const LimitIter &limit) {
|
||
constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
|
||
// Very similar to U8_NEXT_OR_FFFD().
|
||
CP32 c = uint8_t(*p);
|
||
++p;
|
||
if (U8_IS_SINGLE(c)) {
|
||
if constexpr (isMultiPass) {
|
||
return {c, 1, true, p0, p};
|
||
} else {
|
||
return {c, 1, true};
|
||
}
|
||
}
|
||
uint8_t length = 1;
|
||
uint8_t t = 0;
|
||
if (p != limit &&
|
||
// fetch/validate/assemble all but last trail byte
|
||
(c >= 0xe0 ?
|
||
(c < 0xf0 ? // U+0800..U+FFFF except surrogates
|
||
U8_LEAD3_T1_BITS[c &= 0xf] & (1 << ((t = *p) >> 5)) &&
|
||
(t &= 0x3f, 1)
|
||
: // U+10000..U+10FFFF
|
||
(c -= 0xf0) <= 4 &&
|
||
U8_LEAD4_T1_BITS[(t = *p) >> 4] & (1 << c) &&
|
||
(c = (c << 6) | (t & 0x3f), ++length, ++p != limit) &&
|
||
(t = *p - 0x80) <= 0x3f) &&
|
||
// valid second-to-last trail byte
|
||
(c = (c << 6) | t, ++length, ++p != limit)
|
||
: // U+0080..U+07FF
|
||
c >= 0xc2 && (c &= 0x1f, 1)) &&
|
||
// last trail byte
|
||
(t = *p - 0x80) <= 0x3f) {
|
||
c = (c << 6) | t;
|
||
++length;
|
||
++p;
|
||
if constexpr (isMultiPass) {
|
||
return {c, length, true, p0, p};
|
||
} else {
|
||
return {c, length, true};
|
||
}
|
||
}
|
||
if constexpr (isMultiPass) {
|
||
return {sub(), length, false, p0, p};
|
||
} else {
|
||
return {sub(), length, false};
|
||
}
|
||
}
|
||
|
||
U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
|
||
// Very similar to U8_PREV_OR_FFFD().
|
||
UnitIter p0 = p;
|
||
CP32 c = uint8_t(*--p);
|
||
if (U8_IS_SINGLE(c)) {
|
||
return {c, 1, true, p, p0};
|
||
}
|
||
if (U8_IS_TRAIL(c) && p != start) {
|
||
UnitIter p1 = p;
|
||
uint8_t b1 = *--p1;
|
||
if (U8_IS_LEAD(b1)) {
|
||
if (b1 < 0xe0) {
|
||
p = p1;
|
||
c = ((b1 - 0xc0) << 6) | (c & 0x3f);
|
||
return {c, 2, true, p, p0};
|
||
} else if (b1 < 0xf0 ?
|
||
U8_IS_VALID_LEAD3_AND_T1(b1, c) :
|
||
U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
|
||
// Truncated 3- or 4-byte sequence.
|
||
p = p1;
|
||
return {sub(), 2, false, p, p0};
|
||
}
|
||
} else if (U8_IS_TRAIL(b1) && p1 != start) {
|
||
// Extract the value bits from the last trail byte.
|
||
c &= 0x3f;
|
||
uint8_t b2 = *--p1;
|
||
if (0xe0 <= b2 && b2 <= 0xf4) {
|
||
if (b2 < 0xf0) {
|
||
b2 &= 0xf;
|
||
if (U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
|
||
p = p1;
|
||
c = (b2 << 12) | ((b1 & 0x3f) << 6) | c;
|
||
return {c, 3, true, p, p0};
|
||
}
|
||
} else if (U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
|
||
// Truncated 4-byte sequence.
|
||
p = p1;
|
||
return {sub(), 3, false, p, p0};
|
||
}
|
||
} else if (U8_IS_TRAIL(b2) && p1 != start) {
|
||
uint8_t b3 = *--p1;
|
||
if (0xf0 <= b3 && b3 <= 0xf4) {
|
||
b3 &= 7;
|
||
if (U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
|
||
p = p1;
|
||
c = (b3 << 18) | ((b2 & 0x3f) << 12) | ((b1 & 0x3f) << 6) | c;
|
||
return {c, 4, true, p, p0};
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
return {sub(), 1, false, p, p0};
|
||
}
|
||
};
|
||
|
||
// UTF-16
|
||
template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
|
||
class UTFImpl<
|
||
CP32, behavior,
|
||
UnitIter, LimitIter,
|
||
std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
public:
|
||
// Handle ill-formed UTF-16: One unpaired surrogate.
|
||
U_FORCE_INLINE static CP32 sub(CP32 surrogate) {
|
||
switch (behavior) {
|
||
case UTF_BEHAVIOR_NEGATIVE: return U_SENTINEL;
|
||
case UTF_BEHAVIOR_FFFD: return 0xfffd;
|
||
case UTF_BEHAVIOR_SURROGATE: return surrogate;
|
||
}
|
||
}
|
||
|
||
U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
|
||
// Very similar to U16_FWD_1().
|
||
auto c = *p;
|
||
++p;
|
||
if (U16_IS_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) {
|
||
++p;
|
||
}
|
||
}
|
||
|
||
U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
|
||
// Very similar to U16_BACK_1().
|
||
UnitIter p1;
|
||
if (U16_IS_TRAIL(*--p) && p != start && (p1 = p, U16_IS_LEAD(*--p1))) {
|
||
p = p1;
|
||
}
|
||
}
|
||
|
||
U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
|
||
UnitIter &p0, UnitIter &p, const LimitIter &limit) {
|
||
constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
|
||
// Very similar to U16_NEXT_OR_FFFD().
|
||
CP32 c = static_cast<CP32>(*p);
|
||
++p;
|
||
if (!U16_IS_SURROGATE(c)) {
|
||
if constexpr (isMultiPass) {
|
||
return {c, 1, true, p0, p};
|
||
} else {
|
||
return {c, 1, true};
|
||
}
|
||
} else {
|
||
uint16_t c2;
|
||
if (U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(c2 = *p)) {
|
||
++p;
|
||
c = U16_GET_SUPPLEMENTARY(c, c2);
|
||
if constexpr (isMultiPass) {
|
||
return {c, 2, true, p0, p};
|
||
} else {
|
||
return {c, 2, true};
|
||
}
|
||
} else {
|
||
if constexpr (isMultiPass) {
|
||
return {sub(c), 1, false, p0, p};
|
||
} else {
|
||
return {sub(c), 1, false};
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
|
||
// Very similar to U16_PREV_OR_FFFD().
|
||
UnitIter p0 = p;
|
||
CP32 c = static_cast<CP32>(*--p);
|
||
if (!U16_IS_SURROGATE(c)) {
|
||
return {c, 1, true, p, p0};
|
||
} else {
|
||
UnitIter p1;
|
||
uint16_t c2;
|
||
if (U16_IS_SURROGATE_TRAIL(c) && p != start && (p1 = p, U16_IS_LEAD(c2 = *--p1))) {
|
||
p = p1;
|
||
c = U16_GET_SUPPLEMENTARY(c2, c);
|
||
return {c, 2, true, p, p0};
|
||
} else {
|
||
return {sub(c), 1, false, p, p0};
|
||
}
|
||
}
|
||
}
|
||
};
|
||
|
||
// UTF-32: trivial, but still validating
|
||
template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
|
||
class UTFImpl<
|
||
CP32, behavior,
|
||
UnitIter, LimitIter,
|
||
std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
public:
|
||
// Handle ill-formed UTF-32
|
||
U_FORCE_INLINE static CP32 sub(bool forSurrogate, CP32 surrogate) {
|
||
switch (behavior) {
|
||
case UTF_BEHAVIOR_NEGATIVE: return U_SENTINEL;
|
||
case UTF_BEHAVIOR_FFFD: return 0xfffd;
|
||
case UTF_BEHAVIOR_SURROGATE: return forSurrogate ? surrogate : 0xfffd;
|
||
}
|
||
}
|
||
|
||
U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &/*limit*/) {
|
||
++p;
|
||
}
|
||
|
||
U_FORCE_INLINE static void dec(UnitIter /*start*/, UnitIter &p) {
|
||
--p;
|
||
}
|
||
|
||
U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
|
||
UnitIter &p0, UnitIter &p, const LimitIter &/*limit*/) {
|
||
constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
|
||
uint32_t uc = *p;
|
||
CP32 c = uc;
|
||
++p;
|
||
if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
|
||
if constexpr (isMultiPass) {
|
||
return {c, 1, true, p0, p};
|
||
} else {
|
||
return {c, 1, true};
|
||
}
|
||
} else {
|
||
if constexpr (isMultiPass) {
|
||
return {sub(uc < 0xe000, c), 1, false, p0, p};
|
||
} else {
|
||
return {sub(uc < 0xe000, c), 1, false};
|
||
}
|
||
}
|
||
}
|
||
|
||
U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter /*start*/, UnitIter &p) {
|
||
UnitIter p0 = p;
|
||
uint32_t uc = *--p;
|
||
CP32 c = uc;
|
||
if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
|
||
return {c, 1, true, p, p0};
|
||
} else {
|
||
return {sub(uc < 0xe000, c), 1, false, p, p0};
|
||
}
|
||
}
|
||
};
|
||
|
||
// Non-validating implementations ------------------------------------------ ***
|
||
|
||
template<typename CP32, typename UnitIter, typename = void>
|
||
class UnsafeUTFImpl;
|
||
|
||
// UTF-8
|
||
template<typename CP32, typename UnitIter>
|
||
class UnsafeUTFImpl<
|
||
CP32,
|
||
UnitIter,
|
||
std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
public:
|
||
U_FORCE_INLINE static void inc(UnitIter &p) {
|
||
// Very similar to U8_FWD_1_UNSAFE().
|
||
uint8_t b = *p;
|
||
std::advance(p, 1 + U8_COUNT_TRAIL_BYTES_UNSAFE(b));
|
||
}
|
||
|
||
U_FORCE_INLINE static void dec(UnitIter &p) {
|
||
// Very similar to U8_BACK_1_UNSAFE().
|
||
while (U8_IS_TRAIL(*--p)) {}
|
||
}
|
||
|
||
U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
|
||
constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
|
||
// Very similar to U8_NEXT_UNSAFE().
|
||
CP32 c = uint8_t(*p);
|
||
++p;
|
||
if (U8_IS_SINGLE(c)) {
|
||
if constexpr (isMultiPass) {
|
||
return {c, 1, p0, p};
|
||
} else {
|
||
return {c, 1};
|
||
}
|
||
} else if (c < 0xe0) {
|
||
c = ((c & 0x1f) << 6) | (*p & 0x3f);
|
||
++p;
|
||
if constexpr (isMultiPass) {
|
||
return {c, 2, p0, p};
|
||
} else {
|
||
return {c, 2};
|
||
}
|
||
} else if (c < 0xf0) {
|
||
// No need for (c&0xf) because the upper bits are truncated
|
||
// after <<12 in the cast to uint16_t.
|
||
c = uint16_t(c << 12) | ((*p & 0x3f) << 6);
|
||
++p;
|
||
c |= *p & 0x3f;
|
||
++p;
|
||
if constexpr (isMultiPass) {
|
||
return {c, 3, p0, p};
|
||
} else {
|
||
return {c, 3};
|
||
}
|
||
} else {
|
||
c = ((c & 7) << 18) | ((*p & 0x3f) << 12);
|
||
++p;
|
||
c |= (*p & 0x3f) << 6;
|
||
++p;
|
||
c |= *p & 0x3f;
|
||
++p;
|
||
if constexpr (isMultiPass) {
|
||
return {c, 4, p0, p};
|
||
} else {
|
||
return {c, 4};
|
||
}
|
||
}
|
||
}
|
||
|
||
U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
|
||
// Very similar to U8_PREV_UNSAFE().
|
||
UnitIter p0 = p;
|
||
CP32 c = uint8_t(*--p);
|
||
if (U8_IS_SINGLE(c)) {
|
||
return {c, 1, p, p0};
|
||
}
|
||
// U8_IS_TRAIL(c) if well-formed
|
||
c &= 0x3f;
|
||
uint8_t count = 1;
|
||
for (uint8_t shift = 6;;) {
|
||
uint8_t b = *--p;
|
||
if (b >= 0xc0) {
|
||
U8_MASK_LEAD_BYTE(b, count);
|
||
c |= uint32_t{b} << shift;
|
||
break;
|
||
} else {
|
||
c |= (uint32_t{b} & 0x3f) << shift;
|
||
++count;
|
||
shift += 6;
|
||
}
|
||
}
|
||
++count;
|
||
return {c, count, p, p0};
|
||
}
|
||
};
|
||
|
||
// UTF-16
|
||
template<typename CP32, typename UnitIter>
|
||
class UnsafeUTFImpl<
|
||
CP32,
|
||
UnitIter,
|
||
std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
public:
|
||
U_FORCE_INLINE static void inc(UnitIter &p) {
|
||
// Very similar to U16_FWD_1_UNSAFE().
|
||
auto c = *p;
|
||
++p;
|
||
if (U16_IS_LEAD(c)) {
|
||
++p;
|
||
}
|
||
}
|
||
|
||
U_FORCE_INLINE static void dec(UnitIter &p) {
|
||
// Very similar to U16_BACK_1_UNSAFE().
|
||
if (U16_IS_TRAIL(*--p)) {
|
||
--p;
|
||
}
|
||
}
|
||
|
||
U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
|
||
constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
|
||
// Very similar to U16_NEXT_UNSAFE().
|
||
CP32 c = static_cast<CP32>(*p);
|
||
++p;
|
||
if (!U16_IS_LEAD(c)) {
|
||
if constexpr (isMultiPass) {
|
||
return {c, 1, p0, p};
|
||
} else {
|
||
return {c, 1};
|
||
}
|
||
} else {
|
||
uint16_t c2 = *p;
|
||
++p;
|
||
c = U16_GET_SUPPLEMENTARY(c, c2);
|
||
if constexpr (isMultiPass) {
|
||
return {c, 2, p0, p};
|
||
} else {
|
||
return {c, 2};
|
||
}
|
||
}
|
||
}
|
||
|
||
U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
|
||
// Very similar to U16_PREV_UNSAFE().
|
||
UnitIter p0 = p;
|
||
CP32 c = static_cast<CP32>(*--p);
|
||
if (!U16_IS_TRAIL(c)) {
|
||
return {c, 1, p, p0};
|
||
} else {
|
||
uint16_t c2 = *--p;
|
||
c = U16_GET_SUPPLEMENTARY(c2, c);
|
||
return {c, 2, p, p0};
|
||
}
|
||
}
|
||
};
|
||
|
||
// UTF-32: trivial
|
||
template<typename CP32, typename UnitIter>
|
||
class UnsafeUTFImpl<
|
||
CP32,
|
||
UnitIter,
|
||
std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
public:
|
||
U_FORCE_INLINE static void inc(UnitIter &p) {
|
||
++p;
|
||
}
|
||
|
||
U_FORCE_INLINE static void dec(UnitIter &p) {
|
||
--p;
|
||
}
|
||
|
||
U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
|
||
constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
|
||
CP32 c = *p;
|
||
++p;
|
||
if constexpr (isMultiPass) {
|
||
return {c, 1, p0, p};
|
||
} else {
|
||
return {c, 1};
|
||
}
|
||
}
|
||
|
||
U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
|
||
UnitIter p0 = p;
|
||
CP32 c = *--p;
|
||
return {c, 1, p, p0};
|
||
}
|
||
};
|
||
|
||
#endif
|
||
|
||
// Validating iterators ---------------------------------------------------- ***
|
||
|
||
/**
|
||
* Validating iterator over the code points in a Unicode string.
|
||
*
|
||
* The UnitIter can be
|
||
* an input_iterator, a forward_iterator, or a bidirectional_iterator (including a pointer).
|
||
* The UTFIterator will have the corresponding iterator_category.
|
||
*
|
||
* Call utfIterator() to have the compiler deduce the UnitIter and LimitIter types.
|
||
*
|
||
* For reverse iteration, either use this iterator directly as in <code>*--iter</code>
|
||
* or wrap it using std::make_reverse_iterator(iter).
|
||
*
|
||
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
|
||
* should be signed if UTF_BEHAVIOR_NEGATIVE
|
||
* @tparam behavior How to handle ill-formed Unicode strings
|
||
* @tparam UnitIter An iterator (often a pointer) that returns a code unit type:
|
||
* UTF-8: char or char8_t or uint8_t;
|
||
* UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
|
||
* UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
|
||
* @tparam LimitIter Either the same as UnitIter, or an iterator sentinel type.
|
||
* @draft ICU 78
|
||
* @see utfIterator
|
||
*/
|
||
template<typename CP32, UTFIllFormedBehavior behavior,
|
||
typename UnitIter, typename LimitIter = UnitIter, typename = void>
|
||
class UTFIterator {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
|
||
|
||
// Proxy type for operator->() (required by LegacyInputIterator)
|
||
// so that we don't promise always returning CodeUnits.
|
||
class Proxy {
|
||
public:
|
||
explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
|
||
CodeUnits<CP32, UnitIter> &operator*() { return units_; }
|
||
CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
|
||
private:
|
||
CodeUnits<CP32, UnitIter> units_;
|
||
};
|
||
|
||
public:
|
||
/** C++ iterator boilerplate @internal */
|
||
using value_type = CodeUnits<CP32, UnitIter>;
|
||
/** C++ iterator boilerplate @internal */
|
||
using reference = value_type;
|
||
/** C++ iterator boilerplate @internal */
|
||
using pointer = Proxy;
|
||
/** C++ iterator boilerplate @internal */
|
||
using difference_type = prv::iter_difference_t<UnitIter>;
|
||
/** C++ iterator boilerplate @internal */
|
||
using iterator_category = std::conditional_t<
|
||
prv::bidirectional_iterator<UnitIter>,
|
||
std::bidirectional_iterator_tag,
|
||
std::forward_iterator_tag>;
|
||
|
||
/**
|
||
* Constructor with start <= p < limit.
|
||
* All of these iterators/pointers should be at code point boundaries.
|
||
* Only enabled if UnitIter is a (multi-pass) forward_iterator or better.
|
||
*
|
||
* When using a code unit sentinel (UnitIter≠LimitIter),
|
||
* then that sentinel also works as a sentinel for this code point iterator.
|
||
*
|
||
* @param start Start of the range
|
||
* @param p Initial position inside the range
|
||
* @param limit Limit (exclusive end) of the range
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE UTFIterator(UnitIter start, UnitIter p, LimitIter limit) :
|
||
p_(p), start_(start), limit_(limit), units_(0, 0, false, p, p) {}
|
||
/**
|
||
* Constructor with start == p < limit.
|
||
* All of these iterators/pointers should be at code point boundaries.
|
||
*
|
||
* When using a code unit sentinel (UnitIter≠LimitIter),
|
||
* then that sentinel also works as a sentinel for this code point iterator.
|
||
*
|
||
* @param p Start of the range, and the initial position
|
||
* @param limit Limit (exclusive end) of the range
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) :
|
||
p_(p), start_(p), limit_(limit), units_(0, 0, false, p, p) {}
|
||
/**
|
||
* Constructs an iterator start or limit sentinel.
|
||
* The iterator/pointer should be at a code point boundary.
|
||
* Requires UnitIter to be copyable.
|
||
*
|
||
* When using a code unit sentinel (UnitIter≠LimitIter),
|
||
* then that sentinel also works as a sentinel for this code point iterator.
|
||
*
|
||
* @param p Range start or limit
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(p), start_(p), limit_(p), units_(0, 0, false, p, p) {}
|
||
/**
|
||
* Default constructor. Makes a non-functional iterator.
|
||
*
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE UTFIterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
|
||
|
||
/** Move constructor. @draft ICU 78 */
|
||
U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
|
||
/** Move assignment operator. @draft ICU 78 */
|
||
U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default;
|
||
|
||
/** Copy constructor. @draft ICU 78 */
|
||
U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
|
||
/** Copy assignment operator. @draft ICU 78 */
|
||
U_FORCE_INLINE UTFIterator &operator=(const UTFIterator &other) = default;
|
||
|
||
/**
|
||
* @param other Another iterator
|
||
* @return true if this iterator is at the same position as the other one
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
|
||
return getLogicalPosition() == other.getLogicalPosition();
|
||
}
|
||
/**
|
||
* @param other Another iterator
|
||
* @return true if this iterator is not at the same position as the other one
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
|
||
|
||
// Asymmetric equality & nonequality with a sentinel type.
|
||
|
||
/**
|
||
* @param iter A UTFIterator
|
||
* @param s A unit iterator sentinel
|
||
* @return true if the iterator’s position is equal to the sentinel
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename Sentinel> U_FORCE_INLINE friend
|
||
std::enable_if_t<
|
||
!std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
|
||
bool>
|
||
operator==(const UTFIterator &iter, const Sentinel &s) {
|
||
return iter.getLogicalPosition() == s;
|
||
}
|
||
|
||
#if U_CPLUSPLUS_VERSION < 20
|
||
// C++17: Need to define all four combinations of == / != vs. parameter order.
|
||
// Once we require C++20, we could remove all but the first == because
|
||
// the compiler would generate the rest.
|
||
|
||
/**
|
||
* @param s A unit iterator sentinel
|
||
* @param iter A UTFIterator
|
||
* @return true if the iterator’s position is equal to the sentinel
|
||
* @internal
|
||
*/
|
||
template<typename Sentinel> U_FORCE_INLINE friend
|
||
std::enable_if_t<
|
||
!std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
|
||
bool>
|
||
operator==(const Sentinel &s, const UTFIterator &iter) {
|
||
return iter.getLogicalPosition() == s;
|
||
}
|
||
/**
|
||
* @param iter A UTFIterator
|
||
* @param s A unit iterator sentinel
|
||
* @return true if the iterator’s position is not equal to the sentinel
|
||
* @internal
|
||
*/
|
||
template<typename Sentinel> U_FORCE_INLINE friend
|
||
std::enable_if_t<
|
||
!std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
|
||
bool>
|
||
operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
|
||
/**
|
||
* @param s A unit iterator sentinel
|
||
* @param iter A UTFIterator
|
||
* @return true if the iterator’s position is not equal to the sentinel
|
||
* @internal
|
||
*/
|
||
template<typename Sentinel> U_FORCE_INLINE friend
|
||
std::enable_if_t<
|
||
!std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
|
||
bool>
|
||
operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
|
||
#endif // C++17
|
||
|
||
/**
|
||
* Decodes the code unit sequence at the current position.
|
||
*
|
||
* @return CodeUnits with the decoded code point etc.
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE CodeUnits<CP32, UnitIter> operator*() const {
|
||
if (state_ == 0) {
|
||
UnitIter p0 = p_;
|
||
units_ = Impl::readAndInc(p0, p_, limit_);
|
||
state_ = 1;
|
||
}
|
||
return units_;
|
||
}
|
||
|
||
/**
|
||
* Decodes the code unit sequence at the current position.
|
||
* Used like <code>iter->codePoint()</code> or <code>iter->stringView()</code> etc.
|
||
*
|
||
* @return CodeUnits with the decoded code point etc., wrapped into
|
||
* an opaque proxy object so that <code>iter->codePoint()</code> etc. works.
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE Proxy operator->() const {
|
||
if (state_ == 0) {
|
||
UnitIter p0 = p_;
|
||
units_ = Impl::readAndInc(p0, p_, limit_);
|
||
state_ = 1;
|
||
}
|
||
return Proxy(units_);
|
||
}
|
||
|
||
/**
|
||
* Pre-increment operator.
|
||
*
|
||
* @return this iterator
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE UTFIterator &operator++() { // pre-increment
|
||
if (state_ > 0) {
|
||
// operator*() called readAndInc() so p_ is already ahead.
|
||
state_ = 0;
|
||
} else if (state_ == 0) {
|
||
Impl::inc(p_, limit_);
|
||
} else /* state_ < 0 */ {
|
||
// operator--() called decAndRead() so we know how far to skip.
|
||
p_ = units_.end();
|
||
state_ = 0;
|
||
}
|
||
return *this;
|
||
}
|
||
|
||
/**
|
||
* Post-increment operator.
|
||
*
|
||
* @return a copy of this iterator from before the increment.
|
||
* If UnitIter is a single-pass input_iterator, then this function
|
||
* returns an opaque proxy object so that <code>*iter++</code> still works.
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE UTFIterator operator++(int) { // post-increment
|
||
if (state_ > 0) {
|
||
// operator*() called readAndInc() so p_ is already ahead.
|
||
UTFIterator result(*this);
|
||
state_ = 0;
|
||
return result;
|
||
} else if (state_ == 0) {
|
||
UnitIter p0 = p_;
|
||
units_ = Impl::readAndInc(p0, p_, limit_);
|
||
UTFIterator result(*this);
|
||
result.state_ = 1;
|
||
// keep this->state_ == 0
|
||
return result;
|
||
} else /* state_ < 0 */ {
|
||
UTFIterator result(*this);
|
||
// operator--() called decAndRead() so we know how far to skip.
|
||
p_ = units_.end();
|
||
state_ = 0;
|
||
return result;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Pre-decrement operator.
|
||
* Only enabled if UnitIter is a bidirectional_iterator (including a pointer).
|
||
*
|
||
* @return this iterator
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename Iter = UnitIter>
|
||
U_FORCE_INLINE
|
||
std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator &>
|
||
operator--() { // pre-decrement
|
||
if (state_ > 0) {
|
||
// operator*() called readAndInc() so p_ is ahead of the logical position.
|
||
p_ = units_.begin();
|
||
}
|
||
units_ = Impl::decAndRead(start_, p_);
|
||
state_ = -1;
|
||
return *this;
|
||
}
|
||
|
||
/**
|
||
* Post-decrement operator.
|
||
* Only enabled if UnitIter is a bidirectional_iterator (including a pointer).
|
||
*
|
||
* @return a copy of this iterator from before the decrement.
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename Iter = UnitIter>
|
||
U_FORCE_INLINE
|
||
std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator>
|
||
operator--(int) { // post-decrement
|
||
UTFIterator result(*this);
|
||
operator--();
|
||
return result;
|
||
}
|
||
|
||
private:
|
||
friend class std::reverse_iterator<UTFIterator<CP32, behavior, UnitIter>>;
|
||
|
||
U_FORCE_INLINE UnitIter getLogicalPosition() const {
|
||
return state_ <= 0 ? p_ : units_.begin();
|
||
}
|
||
|
||
// operator*() etc. are logically const.
|
||
mutable UnitIter p_;
|
||
// In a validating iterator, we need start_ & limit_ so that when we read a code point
|
||
// (forward or backward) we can test if there are enough code units.
|
||
UnitIter start_;
|
||
LimitIter limit_;
|
||
// Keep state so that we call readAndInc() only once for both operator*() and ++
|
||
// to make it easy for the compiler to optimize.
|
||
mutable CodeUnits<CP32, UnitIter> units_;
|
||
// >0: units_ = readAndInc(), p_ = units limit
|
||
// which means that p_ is ahead of its logical position
|
||
// 0: initial state
|
||
// <0: units_ = decAndRead(), p_ = units start
|
||
mutable int8_t state_ = 0;
|
||
};
|
||
|
||
#ifndef U_IN_DOXYGEN
|
||
// Partial template specialization for single-pass input iterator.
|
||
template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
|
||
class UTFIterator<
|
||
CP32, behavior,
|
||
UnitIter, LimitIter,
|
||
std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
|
||
|
||
// Proxy type for post-increment return value, to make *iter++ work.
|
||
// Also for operator->() (required by LegacyInputIterator)
|
||
// so that we don't promise always returning CodeUnits.
|
||
class Proxy {
|
||
public:
|
||
explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
|
||
CodeUnits<CP32, UnitIter> &operator*() { return units_; }
|
||
CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
|
||
private:
|
||
CodeUnits<CP32, UnitIter> units_;
|
||
};
|
||
|
||
public:
|
||
using value_type = CodeUnits<CP32, UnitIter>;
|
||
using reference = value_type;
|
||
using pointer = Proxy;
|
||
using difference_type = prv::iter_difference_t<UnitIter>;
|
||
using iterator_category = std::input_iterator_tag;
|
||
|
||
U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) : p_(std::move(p)), limit_(std::move(limit)) {}
|
||
|
||
// Constructs an iterator start or limit sentinel.
|
||
// Requires p to be copyable.
|
||
U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(std::move(p)), limit_(p_) {}
|
||
|
||
U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
|
||
U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default;
|
||
|
||
U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
|
||
U_FORCE_INLINE UTFIterator &operator=(const UTFIterator &other) = default;
|
||
|
||
U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
|
||
return p_ == other.p_ && ahead_ == other.ahead_;
|
||
// Strictly speaking, we should check if the logical position is the same.
|
||
// However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
|
||
}
|
||
U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
|
||
|
||
template<typename Sentinel> U_FORCE_INLINE friend
|
||
std::enable_if_t<
|
||
!std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
|
||
bool>
|
||
operator==(const UTFIterator &iter, const Sentinel &s) {
|
||
return !iter.ahead_ && iter.p_ == s;
|
||
}
|
||
|
||
#if U_CPLUSPLUS_VERSION < 20
|
||
template<typename Sentinel> U_FORCE_INLINE friend
|
||
std::enable_if_t<
|
||
!std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
|
||
bool>
|
||
operator==(const Sentinel &s, const UTFIterator &iter) {
|
||
return !iter.ahead_ && iter.p_ == s;
|
||
}
|
||
|
||
template<typename Sentinel> U_FORCE_INLINE friend
|
||
std::enable_if_t<
|
||
!std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
|
||
bool>
|
||
operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
|
||
|
||
template<typename Sentinel> U_FORCE_INLINE friend
|
||
std::enable_if_t<
|
||
!std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
|
||
bool>
|
||
operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
|
||
#endif // C++17
|
||
|
||
U_FORCE_INLINE CodeUnits<CP32, UnitIter> operator*() const {
|
||
if (!ahead_) {
|
||
units_ = Impl::readAndInc(p_, p_, limit_);
|
||
ahead_ = true;
|
||
}
|
||
return units_;
|
||
}
|
||
|
||
U_FORCE_INLINE Proxy operator->() const {
|
||
if (!ahead_) {
|
||
units_ = Impl::readAndInc(p_, p_, limit_);
|
||
ahead_ = true;
|
||
}
|
||
return Proxy(units_);
|
||
}
|
||
|
||
U_FORCE_INLINE UTFIterator &operator++() { // pre-increment
|
||
if (ahead_) {
|
||
// operator*() called readAndInc() so p_ is already ahead.
|
||
ahead_ = false;
|
||
} else {
|
||
Impl::inc(p_, limit_);
|
||
}
|
||
return *this;
|
||
}
|
||
|
||
U_FORCE_INLINE Proxy operator++(int) { // post-increment
|
||
if (ahead_) {
|
||
// operator*() called readAndInc() so p_ is already ahead.
|
||
ahead_ = false;
|
||
} else {
|
||
units_ = Impl::readAndInc(p_, p_, limit_);
|
||
// keep this->ahead_ == false
|
||
}
|
||
return Proxy(units_);
|
||
}
|
||
|
||
private:
|
||
// operator*() etc. are logically const.
|
||
mutable UnitIter p_;
|
||
// In a validating iterator, we need limit_ so that when we read a code point
|
||
// we can test if there are enough code units.
|
||
LimitIter limit_;
|
||
// Keep state so that we call readAndInc() only once for both operator*() and ++
|
||
// so that we can use a single-pass input iterator for UnitIter.
|
||
mutable CodeUnits<CP32, UnitIter> units_ = {0, 0, false};
|
||
// true: units_ = readAndInc(), p_ = units limit
|
||
// which means that p_ is ahead of its logical position
|
||
// false: initial state
|
||
mutable bool ahead_ = false;
|
||
};
|
||
#endif // U_IN_DOXYGEN
|
||
|
||
} // namespace U_HEADER_ONLY_NAMESPACE
|
||
|
||
#ifndef U_IN_DOXYGEN
|
||
// Bespoke specialization of reverse_iterator.
|
||
// The default implementation implements reverse operator*() and ++ in a way
|
||
// that does most of the same work twice for reading variable-length sequences.
|
||
template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
|
||
class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter>> {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
using Impl = U_HEADER_ONLY_NAMESPACE::UTFImpl<CP32, behavior, UnitIter>;
|
||
using CodeUnits_ = U_HEADER_ONLY_NAMESPACE::CodeUnits<CP32, UnitIter>;
|
||
|
||
// Proxy type for operator->() (required by LegacyInputIterator)
|
||
// so that we don't promise always returning CodeUnits.
|
||
class Proxy {
|
||
public:
|
||
explicit Proxy(CodeUnits_ units) : units_(units) {}
|
||
CodeUnits_ &operator*() { return units_; }
|
||
CodeUnits_ *operator->() { return &units_; }
|
||
private:
|
||
CodeUnits_ units_;
|
||
};
|
||
|
||
public:
|
||
using value_type = CodeUnits_;
|
||
using reference = value_type;
|
||
using pointer = Proxy;
|
||
using difference_type = U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t<UnitIter>;
|
||
using iterator_category = std::bidirectional_iterator_tag;
|
||
|
||
U_FORCE_INLINE explicit reverse_iterator(U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter> iter) :
|
||
p_(iter.getLogicalPosition()), start_(iter.start_), limit_(iter.limit_),
|
||
units_(0, 0, false, p_, p_) {}
|
||
U_FORCE_INLINE reverse_iterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
|
||
|
||
U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
|
||
U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
|
||
|
||
U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
|
||
U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
|
||
|
||
U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
|
||
return getLogicalPosition() == other.getLogicalPosition();
|
||
}
|
||
U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
|
||
|
||
U_FORCE_INLINE CodeUnits_ operator*() const {
|
||
if (state_ == 0) {
|
||
units_ = Impl::decAndRead(start_, p_);
|
||
state_ = -1;
|
||
}
|
||
return units_;
|
||
}
|
||
|
||
U_FORCE_INLINE Proxy operator->() const {
|
||
if (state_ == 0) {
|
||
units_ = Impl::decAndRead(start_, p_);
|
||
state_ = -1;
|
||
}
|
||
return Proxy(units_);
|
||
}
|
||
|
||
U_FORCE_INLINE reverse_iterator &operator++() { // pre-increment
|
||
if (state_ < 0) {
|
||
// operator*() called decAndRead() so p_ is already behind.
|
||
state_ = 0;
|
||
} else if (state_ == 0) {
|
||
Impl::dec(start_, p_);
|
||
} else /* state_ > 0 */ {
|
||
// operator--() called readAndInc() so we know how far to skip.
|
||
p_ = units_.begin();
|
||
state_ = 0;
|
||
}
|
||
return *this;
|
||
}
|
||
|
||
U_FORCE_INLINE reverse_iterator operator++(int) { // post-increment
|
||
if (state_ < 0) {
|
||
// operator*() called decAndRead() so p_ is already behind.
|
||
reverse_iterator result(*this);
|
||
state_ = 0;
|
||
return result;
|
||
} else if (state_ == 0) {
|
||
units_ = Impl::decAndRead(start_, p_);
|
||
reverse_iterator result(*this);
|
||
result.state_ = -1;
|
||
// keep this->state_ == 0
|
||
return result;
|
||
} else /* state_ > 0 */ {
|
||
reverse_iterator result(*this);
|
||
// operator--() called readAndInc() so we know how far to skip.
|
||
p_ = units_.begin();
|
||
state_ = 0;
|
||
return result;
|
||
}
|
||
}
|
||
|
||
U_FORCE_INLINE reverse_iterator &operator--() { // pre-decrement
|
||
if (state_ < 0) {
|
||
// operator*() called decAndRead() so p_ is behind the logical position.
|
||
p_ = units_.end();
|
||
}
|
||
UnitIter p0 = p_;
|
||
units_ = Impl::readAndInc(p0, p_, limit_);
|
||
state_ = 1;
|
||
return *this;
|
||
}
|
||
|
||
U_FORCE_INLINE reverse_iterator operator--(int) { // post-decrement
|
||
reverse_iterator result(*this);
|
||
operator--();
|
||
return result;
|
||
}
|
||
|
||
private:
|
||
U_FORCE_INLINE UnitIter getLogicalPosition() const {
|
||
return state_ >= 0 ? p_ : units_.end();
|
||
}
|
||
|
||
// operator*() etc. are logically const.
|
||
mutable UnitIter p_;
|
||
// In a validating iterator, we need start_ & limit_ so that when we read a code point
|
||
// (forward or backward) we can test if there are enough code units.
|
||
UnitIter start_;
|
||
UnitIter limit_;
|
||
// Keep state so that we call decAndRead() only once for both operator*() and ++
|
||
// to make it easy for the compiler to optimize.
|
||
mutable CodeUnits_ units_;
|
||
// >0: units_ = readAndInc(), p_ = units limit
|
||
// 0: initial state
|
||
// <0: units_ = decAndRead(), p_ = units start
|
||
// which means that p_ is behind its logical position
|
||
mutable int8_t state_ = 0;
|
||
};
|
||
#endif // U_IN_DOXYGEN
|
||
|
||
namespace U_HEADER_ONLY_NAMESPACE {
|
||
|
||
/**
|
||
* UTFIterator factory function for start <= p < limit.
|
||
* Deduces the UnitIter and LimitIter template parameters from the inputs.
|
||
* Only enabled if UnitIter is a (multi-pass) forward_iterator or better.
|
||
*
|
||
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
|
||
* @tparam behavior How to handle ill-formed Unicode strings
|
||
* @tparam UnitIter Can usually be omitted/deduced:
|
||
* An iterator (often a pointer) that returns a code unit type:
|
||
* UTF-8: char or char8_t or uint8_t;
|
||
* UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
|
||
* UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
|
||
* @tparam LimitIter Either the same as UnitIter, or an iterator sentinel type.
|
||
* @param start start code unit iterator
|
||
* @param p current-position code unit iterator
|
||
* @param limit limit (exclusive-end) code unit iterator.
|
||
* When using a code unit sentinel (UnitIter≠LimitIter),
|
||
* then that sentinel also works as a sentinel for the code point iterator.
|
||
* @return a UTFIterator<CP32, behavior, UnitIter>
|
||
* for the given code unit iterators or character pointers
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename CP32, UTFIllFormedBehavior behavior,
|
||
typename UnitIter, typename LimitIter = UnitIter>
|
||
auto utfIterator(UnitIter start, UnitIter p, LimitIter limit) {
|
||
return UTFIterator<CP32, behavior, UnitIter, LimitIter>(
|
||
std::move(start), std::move(p), std::move(limit));
|
||
}
|
||
|
||
/**
|
||
* UTFIterator factory function for start = p < limit.
|
||
* Deduces the UnitIter and LimitIter template parameters from the inputs.
|
||
*
|
||
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
|
||
* @tparam behavior How to handle ill-formed Unicode strings
|
||
* @tparam UnitIter Can usually be omitted/deduced:
|
||
* An iterator (often a pointer) that returns a code unit type:
|
||
* UTF-8: char or char8_t or uint8_t;
|
||
* UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
|
||
* UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
|
||
* @tparam LimitIter Either the same as UnitIter, or an iterator sentinel type.
|
||
* @param p start and current-position code unit iterator
|
||
* @param limit limit (exclusive-end) code unit iterator.
|
||
* When using a code unit sentinel (UnitIter≠LimitIter),
|
||
* then that sentinel also works as a sentinel for the code point iterator.
|
||
* @return a UTFIterator<CP32, behavior, UnitIter>
|
||
* for the given code unit iterators or character pointers
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename CP32, UTFIllFormedBehavior behavior,
|
||
typename UnitIter, typename LimitIter = UnitIter>
|
||
auto utfIterator(UnitIter p, LimitIter limit) {
|
||
return UTFIterator<CP32, behavior, UnitIter, LimitIter>(
|
||
std::move(p), std::move(limit));
|
||
}
|
||
|
||
// Note: We should only enable the following factory function for a copyable UnitIter.
|
||
// In C++17, we would have to partially specialize with enable_if_t testing for forward_iterator,
|
||
// but a function template partial specialization is not allowed.
|
||
// In C++20, we might be able to require the std::copyable concept.
|
||
|
||
/**
|
||
* UTFIterator factory function for a start or limit sentinel.
|
||
* Deduces the UnitIter template parameter from the input.
|
||
* Requires UnitIter to be copyable.
|
||
*
|
||
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
|
||
* @tparam behavior How to handle ill-formed Unicode strings
|
||
* @tparam UnitIter Can usually be omitted/deduced:
|
||
* An iterator (often a pointer) that returns a code unit type:
|
||
* UTF-8: char or char8_t or uint8_t;
|
||
* UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
|
||
* UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
|
||
* @param p code unit iterator.
|
||
* When using a code unit sentinel,
|
||
* then that sentinel also works as a sentinel for the code point iterator.
|
||
* @return a UTFIterator<CP32, behavior, UnitIter>
|
||
* for the given code unit iterator or character pointer
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
|
||
auto utfIterator(UnitIter p) {
|
||
return UTFIterator<CP32, behavior, UnitIter>(std::move(p));
|
||
}
|
||
|
||
/**
|
||
* A C++ "range" for validating iteration over all of the code points of a code unit range.
|
||
*
|
||
* Call utfStringCodePoints() to have the compiler deduce the Range type.
|
||
*
|
||
* UTFStringCodePoints is conditionally borrowed; that is, if Range is a borrowed range
|
||
* so is UTFStringCodePoints<CP32, behavior, Range>.
|
||
* Note that when given a range r that is an lvalue and is not a view, utfStringCodePoints(r) uses a
|
||
* ref_view of r as the Range type, which is a borrowed range.
|
||
* In practice, this means that given a container variable r, the iterators of utfStringCodePoints(r) can
|
||
* be used as long as iterators on r are valid, without having to keep utfStringCodePoints(r) around.
|
||
* For instance:
|
||
* \code
|
||
* std::u8string s = "𒇧𒇧";
|
||
* // it outlives utfStringCodePoints<char32_t>(s).
|
||
* auto it = utfStringCodePoints<char32_t>(s).begin();
|
||
* ++it;
|
||
* char32_t second_code_point = it->codePoint(); // OK.
|
||
* \endcode
|
||
*
|
||
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
|
||
* should be signed if UTF_BEHAVIOR_NEGATIVE
|
||
* @tparam behavior How to handle ill-formed Unicode strings
|
||
* @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units
|
||
* @draft ICU 78
|
||
* @see utfStringCodePoints
|
||
*/
|
||
template<typename CP32, UTFIllFormedBehavior behavior, typename Range>
|
||
class UTFStringCodePoints {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
public:
|
||
/**
|
||
* Constructs an empty C++ "range" object.
|
||
* @draft ICU 78
|
||
*/
|
||
UTFStringCodePoints() = default;
|
||
|
||
/**
|
||
* Constructs a C++ "range" object over the code points in the string.
|
||
* @param unitRange input range
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
|
||
explicit UTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
|
||
/**
|
||
* Constructs a C++ "range" object over the code points in the string,
|
||
* keeping a reference to the code unit range. This overload is used by
|
||
* utfStringCodePoints in C++17; in C+20, a ref_view is used instead (via
|
||
* views::all).
|
||
* @param unitRange input range
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
|
||
explicit UTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
|
||
|
||
/** Copy constructor. @draft ICU 78 */
|
||
UTFStringCodePoints(const UTFStringCodePoints &other) = default;
|
||
|
||
/** Copy assignment operator. @draft ICU 78 */
|
||
UTFStringCodePoints &operator=(const UTFStringCodePoints &other) = default;
|
||
|
||
/**
|
||
* @return the range start iterator
|
||
* @draft ICU 78
|
||
*/
|
||
auto begin() {
|
||
return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
|
||
}
|
||
|
||
/**
|
||
* @return the range start iterator
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
|
||
auto begin() const {
|
||
return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
|
||
}
|
||
|
||
/**
|
||
* @return the range limit (exclusive end) iterator
|
||
* @draft ICU 78
|
||
*/
|
||
auto end() {
|
||
using UnitIter = decltype(unitRange.begin());
|
||
using LimitIter = decltype(unitRange.end());
|
||
if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
|
||
// Return the code unit sentinel.
|
||
return unitRange.end();
|
||
} else if constexpr (prv::bidirectional_iterator<UnitIter>) {
|
||
return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
|
||
} else {
|
||
// The input iterator specialization has no three-argument constructor.
|
||
return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
|
||
}
|
||
}
|
||
|
||
/**
|
||
* @return the range limit (exclusive end) iterator
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
|
||
auto end() const {
|
||
using UnitIter = decltype(unitRange.begin());
|
||
using LimitIter = decltype(unitRange.end());
|
||
if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
|
||
// Return the code unit sentinel.
|
||
return unitRange.end();
|
||
} else if constexpr (prv::bidirectional_iterator<UnitIter>) {
|
||
return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
|
||
} else {
|
||
// The input iterator specialization has no three-argument constructor.
|
||
return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
|
||
}
|
||
}
|
||
|
||
/**
|
||
* @return std::reverse_iterator(end())
|
||
* @draft ICU 78
|
||
*/
|
||
auto rbegin() const {
|
||
return std::make_reverse_iterator(end());
|
||
}
|
||
|
||
/**
|
||
* @return std::reverse_iterator(begin())
|
||
* @draft ICU 78
|
||
*/
|
||
auto rend() const {
|
||
return std::make_reverse_iterator(begin());
|
||
}
|
||
|
||
private:
|
||
Range unitRange;
|
||
};
|
||
|
||
/** @internal */
|
||
template<typename CP32, UTFIllFormedBehavior behavior>
|
||
struct UTFStringCodePointsAdaptor
|
||
#if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \
|
||
__cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
|
||
: std::ranges::range_adaptor_closure<UTFStringCodePointsAdaptor<CP32, behavior>>
|
||
#endif
|
||
{
|
||
/** @internal */
|
||
template<typename Range>
|
||
auto operator()(Range &&unitRange) const {
|
||
#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10 // We need https://wg21.link/P2415R2.
|
||
return UTFStringCodePoints<CP32, behavior, std::ranges::views::all_t<Range>>(
|
||
std::forward<Range>(unitRange));
|
||
#else
|
||
if constexpr (prv::is_basic_string_view_v<std::decay_t<Range>>) {
|
||
// Take basic_string_view by copy, not by reference. In C++20 this is handled by
|
||
// all_t<Range>, which is Range if Range is a view.
|
||
return UTFStringCodePoints<CP32, behavior, std::decay_t<Range>>(
|
||
std::forward<Range>(unitRange));
|
||
} else {
|
||
return UTFStringCodePoints<CP32, behavior, Range>(std::forward<Range>(unitRange));
|
||
}
|
||
#endif
|
||
}
|
||
};
|
||
|
||
/**
|
||
* Range adaptor function object returning a UTFStringCodePoints object that represents a "range" of code
|
||
* points in a code unit range, which validates while decoding.
|
||
* Deduces the Range template parameter from the input, taking into account the value category: the
|
||
* code units will be referenced if possible, and moved if necessary.
|
||
*
|
||
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
|
||
* should be signed if UTF_BEHAVIOR_NEGATIVE
|
||
* @tparam behavior How to handle ill-formed Unicode strings
|
||
* @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units
|
||
* @param unitRange input range
|
||
* @return a UTFStringCodePoints<CP32, behavior, Range> for the given unitRange
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename CP32, UTFIllFormedBehavior behavior>
|
||
constexpr UTFStringCodePointsAdaptor<CP32, behavior> utfStringCodePoints;
|
||
|
||
// Non-validating iterators ------------------------------------------------ ***
|
||
|
||
/**
|
||
* Non-validating iterator over the code points in a Unicode string.
|
||
* The string must be well-formed.
|
||
*
|
||
* The UnitIter can be
|
||
* an input_iterator, a forward_iterator, or a bidirectional_iterator (including a pointer).
|
||
* The UTFIterator will have the corresponding iterator_category.
|
||
*
|
||
* Call unsafeUTFIterator() to have the compiler deduce the UnitIter type.
|
||
*
|
||
* For reverse iteration, either use this iterator directly as in <code>*--iter</code>
|
||
* or wrap it using std::make_reverse_iterator(iter).
|
||
*
|
||
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
|
||
* @tparam UnitIter An iterator (often a pointer) that returns a code unit type:
|
||
* UTF-8: char or char8_t or uint8_t;
|
||
* UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
|
||
* UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
|
||
* @draft ICU 78
|
||
* @see unsafeUTFIterator
|
||
*/
|
||
template<typename CP32, typename UnitIter, typename = void>
|
||
class UnsafeUTFIterator {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
using Impl = UnsafeUTFImpl<CP32, UnitIter>;
|
||
|
||
// Proxy type for operator->() (required by LegacyInputIterator)
|
||
// so that we don't promise always returning UnsafeCodeUnits.
|
||
class Proxy {
|
||
public:
|
||
explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
|
||
UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; }
|
||
UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; }
|
||
private:
|
||
UnsafeCodeUnits<CP32, UnitIter> units_;
|
||
};
|
||
|
||
public:
|
||
/** C++ iterator boilerplate @internal */
|
||
using value_type = UnsafeCodeUnits<CP32, UnitIter>;
|
||
/** C++ iterator boilerplate @internal */
|
||
using reference = value_type;
|
||
/** C++ iterator boilerplate @internal */
|
||
using pointer = Proxy;
|
||
/** C++ iterator boilerplate @internal */
|
||
using difference_type = prv::iter_difference_t<UnitIter>;
|
||
/** C++ iterator boilerplate @internal */
|
||
using iterator_category = std::conditional_t<
|
||
prv::bidirectional_iterator<UnitIter>,
|
||
std::bidirectional_iterator_tag,
|
||
std::forward_iterator_tag>;
|
||
|
||
/**
|
||
* Constructor; the iterator/pointer should be at a code point boundary.
|
||
*
|
||
* When using a code unit sentinel,
|
||
* then that sentinel also works as a sentinel for this code point iterator.
|
||
*
|
||
* @param p Initial position inside the range, or a range sentinel
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(p), units_(0, 0, p, p) {}
|
||
/**
|
||
* Default constructor. Makes a non-functional iterator.
|
||
*
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE UnsafeUTFIterator() : p_{}, units_(0, 0, p_, p_) {}
|
||
|
||
/** Move constructor. @draft ICU 78 */
|
||
U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept = default;
|
||
/** Move assignment operator. @draft ICU 78 */
|
||
U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src) noexcept = default;
|
||
|
||
/** Copy constructor. @draft ICU 78 */
|
||
U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other) = default;
|
||
/** Copy assignment operator. @draft ICU 78 */
|
||
U_FORCE_INLINE UnsafeUTFIterator &operator=(const UnsafeUTFIterator &other) = default;
|
||
|
||
/**
|
||
* @param other Another iterator
|
||
* @return true if this iterator is at the same position as the other one
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const {
|
||
return getLogicalPosition() == other.getLogicalPosition();
|
||
}
|
||
/**
|
||
* @param other Another iterator
|
||
* @return true if this iterator is not at the same position as the other one
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
|
||
|
||
/**
|
||
* @param iter An UnsafeUTFIterator
|
||
* @param s A unit iterator sentinel
|
||
* @return true if the iterator’s position is equal to the sentinel
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename Sentinel> U_FORCE_INLINE friend
|
||
std::enable_if_t<
|
||
!std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
|
||
bool>
|
||
operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
|
||
return iter.getLogicalPosition() == s;
|
||
}
|
||
|
||
#if U_CPLUSPLUS_VERSION < 20
|
||
/**
|
||
* @param s A unit iterator sentinel
|
||
* @param iter An UnsafeUTFIterator
|
||
* @return true if the iterator’s position is equal to the sentinel
|
||
* @internal
|
||
*/
|
||
template<typename Sentinel> U_FORCE_INLINE friend
|
||
std::enable_if_t<
|
||
!std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
|
||
bool>
|
||
operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
|
||
return iter.getLogicalPosition() == s;
|
||
}
|
||
/**
|
||
* @param iter An UnsafeUTFIterator
|
||
* @param s A unit iterator sentinel
|
||
* @return true if the iterator’s position is not equal to the sentinel
|
||
* @internal
|
||
*/
|
||
template<typename Sentinel> U_FORCE_INLINE friend
|
||
std::enable_if_t<
|
||
!std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
|
||
bool>
|
||
operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
|
||
/**
|
||
* @param s A unit iterator sentinel
|
||
* @param iter An UnsafeUTFIterator
|
||
* @return true if the iterator’s position is not equal to the sentinel
|
||
* @internal
|
||
*/
|
||
template<typename Sentinel> U_FORCE_INLINE friend
|
||
std::enable_if_t<
|
||
!std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
|
||
bool>
|
||
operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
|
||
#endif // C++17
|
||
|
||
/**
|
||
* Decodes the code unit sequence at the current position.
|
||
*
|
||
* @return CodeUnits with the decoded code point etc.
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*() const {
|
||
if (state_ == 0) {
|
||
UnitIter p0 = p_;
|
||
units_ = Impl::readAndInc(p0, p_);
|
||
state_ = 1;
|
||
}
|
||
return units_;
|
||
}
|
||
|
||
/**
|
||
* Decodes the code unit sequence at the current position.
|
||
* Used like <code>iter->codePoint()</code> or <code>iter->stringView()</code> etc.
|
||
*
|
||
* @return CodeUnits with the decoded code point etc., wrapped into
|
||
* an opaque proxy object so that <code>iter->codePoint()</code> etc. works.
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE Proxy operator->() const {
|
||
if (state_ == 0) {
|
||
UnitIter p0 = p_;
|
||
units_ = Impl::readAndInc(p0, p_);
|
||
state_ = 1;
|
||
}
|
||
return Proxy(units_);
|
||
}
|
||
|
||
/**
|
||
* Pre-increment operator.
|
||
*
|
||
* @return this iterator
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE UnsafeUTFIterator &operator++() { // pre-increment
|
||
if (state_ > 0) {
|
||
// operator*() called readAndInc() so p_ is already ahead.
|
||
state_ = 0;
|
||
} else if (state_ == 0) {
|
||
Impl::inc(p_);
|
||
} else /* state_ < 0 */ {
|
||
// operator--() called decAndRead() so we know how far to skip.
|
||
p_ = units_.end();
|
||
state_ = 0;
|
||
}
|
||
return *this;
|
||
}
|
||
|
||
/**
|
||
* Post-increment operator.
|
||
*
|
||
* @return a copy of this iterator from before the increment.
|
||
* If UnitIter is a single-pass input_iterator, then this function
|
||
* returns an opaque proxy object so that <code>*iter++</code> still works.
|
||
* @draft ICU 78
|
||
*/
|
||
U_FORCE_INLINE UnsafeUTFIterator operator++(int) { // post-increment
|
||
if (state_ > 0) {
|
||
// operator*() called readAndInc() so p_ is already ahead.
|
||
UnsafeUTFIterator result(*this);
|
||
state_ = 0;
|
||
return result;
|
||
} else if (state_ == 0) {
|
||
UnitIter p0 = p_;
|
||
units_ = Impl::readAndInc(p0, p_);
|
||
UnsafeUTFIterator result(*this);
|
||
result.state_ = 1;
|
||
// keep this->state_ == 0
|
||
return result;
|
||
} else /* state_ < 0 */ {
|
||
UnsafeUTFIterator result(*this);
|
||
// operator--() called decAndRead() so we know how far to skip.
|
||
p_ = units_.end();
|
||
state_ = 0;
|
||
return result;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Pre-decrement operator.
|
||
* Only enabled if UnitIter is a bidirectional_iterator (including a pointer).
|
||
*
|
||
* @return this iterator
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename Iter = UnitIter>
|
||
U_FORCE_INLINE
|
||
std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator &>
|
||
operator--() { // pre-decrement
|
||
if (state_ > 0) {
|
||
// operator*() called readAndInc() so p_ is ahead of the logical position.
|
||
p_ = units_.begin();
|
||
}
|
||
units_ = Impl::decAndRead(p_);
|
||
state_ = -1;
|
||
return *this;
|
||
}
|
||
|
||
/**
|
||
* Post-decrement operator.
|
||
* Only enabled if UnitIter is a bidirectional_iterator (including a pointer).
|
||
*
|
||
* @return a copy of this iterator from before the decrement.
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename Iter = UnitIter>
|
||
U_FORCE_INLINE
|
||
std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator>
|
||
operator--(int) { // post-decrement
|
||
UnsafeUTFIterator result(*this);
|
||
operator--();
|
||
return result;
|
||
}
|
||
|
||
private:
|
||
friend class std::reverse_iterator<UnsafeUTFIterator<CP32, UnitIter>>;
|
||
|
||
U_FORCE_INLINE UnitIter getLogicalPosition() const {
|
||
return state_ <= 0 ? p_ : units_.begin();
|
||
}
|
||
|
||
// operator*() etc. are logically const.
|
||
mutable UnitIter p_;
|
||
// Keep state so that we call readAndInc() only once for both operator*() and ++
|
||
// to make it easy for the compiler to optimize.
|
||
mutable UnsafeCodeUnits<CP32, UnitIter> units_;
|
||
// >0: units_ = readAndInc(), p_ = units limit
|
||
// which means that p_ is ahead of its logical position
|
||
// 0: initial state
|
||
// <0: units_ = decAndRead(), p_ = units start
|
||
mutable int8_t state_ = 0;
|
||
};
|
||
|
||
#ifndef U_IN_DOXYGEN
|
||
// Partial template specialization for single-pass input iterator.
|
||
template<typename CP32, typename UnitIter>
|
||
class UnsafeUTFIterator<
|
||
CP32,
|
||
UnitIter,
|
||
std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
using Impl = UnsafeUTFImpl<CP32, UnitIter>;
|
||
|
||
// Proxy type for post-increment return value, to make *iter++ work.
|
||
// Also for operator->() (required by LegacyInputIterator)
|
||
// so that we don't promise always returning UnsafeCodeUnits.
|
||
class Proxy {
|
||
public:
|
||
explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
|
||
UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; }
|
||
UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; }
|
||
private:
|
||
UnsafeCodeUnits<CP32, UnitIter> units_;
|
||
};
|
||
|
||
public:
|
||
using value_type = UnsafeCodeUnits<CP32, UnitIter>;
|
||
using reference = value_type;
|
||
using pointer = Proxy;
|
||
using difference_type = prv::iter_difference_t<UnitIter>;
|
||
using iterator_category = std::input_iterator_tag;
|
||
|
||
U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(std::move(p)) {}
|
||
|
||
U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept = default;
|
||
U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src) noexcept = default;
|
||
|
||
U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other) = default;
|
||
U_FORCE_INLINE UnsafeUTFIterator &operator=(const UnsafeUTFIterator &other) = default;
|
||
|
||
U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const {
|
||
return p_ == other.p_ && ahead_ == other.ahead_;
|
||
// Strictly speaking, we should check if the logical position is the same.
|
||
// However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
|
||
}
|
||
U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
|
||
|
||
template<typename Sentinel> U_FORCE_INLINE friend
|
||
std::enable_if_t<
|
||
!std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
|
||
bool>
|
||
operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
|
||
return !iter.ahead_ && iter.p_ == s;
|
||
}
|
||
|
||
#if U_CPLUSPLUS_VERSION < 20
|
||
template<typename Sentinel> U_FORCE_INLINE friend
|
||
std::enable_if_t<
|
||
!std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
|
||
bool>
|
||
operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
|
||
return !iter.ahead_ && iter.p_ == s;
|
||
}
|
||
|
||
template<typename Sentinel> U_FORCE_INLINE friend
|
||
std::enable_if_t<
|
||
!std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
|
||
bool>
|
||
operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
|
||
|
||
template<typename Sentinel> U_FORCE_INLINE friend
|
||
std::enable_if_t<
|
||
!std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
|
||
bool>
|
||
operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
|
||
#endif // C++17
|
||
|
||
U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*() const {
|
||
if (!ahead_) {
|
||
units_ = Impl::readAndInc(p_, p_);
|
||
ahead_ = true;
|
||
}
|
||
return units_;
|
||
}
|
||
|
||
U_FORCE_INLINE Proxy operator->() const {
|
||
if (!ahead_) {
|
||
units_ = Impl::readAndInc(p_, p_);
|
||
ahead_ = true;
|
||
}
|
||
return Proxy(units_);
|
||
}
|
||
|
||
U_FORCE_INLINE UnsafeUTFIterator &operator++() { // pre-increment
|
||
if (ahead_) {
|
||
// operator*() called readAndInc() so p_ is already ahead.
|
||
ahead_ = false;
|
||
} else {
|
||
Impl::inc(p_);
|
||
}
|
||
return *this;
|
||
}
|
||
|
||
U_FORCE_INLINE Proxy operator++(int) { // post-increment
|
||
if (ahead_) {
|
||
// operator*() called readAndInc() so p_ is already ahead.
|
||
ahead_ = false;
|
||
} else {
|
||
units_ = Impl::readAndInc(p_, p_);
|
||
// keep this->ahead_ == false
|
||
}
|
||
return Proxy(units_);
|
||
}
|
||
|
||
private:
|
||
// operator*() etc. are logically const.
|
||
mutable UnitIter p_;
|
||
// Keep state so that we call readAndInc() only once for both operator*() and ++
|
||
// so that we can use a single-pass input iterator for UnitIter.
|
||
mutable UnsafeCodeUnits<CP32, UnitIter> units_ = {0, 0};
|
||
// true: units_ = readAndInc(), p_ = units limit
|
||
// which means that p_ is ahead of its logical position
|
||
// false: initial state
|
||
mutable bool ahead_ = false;
|
||
};
|
||
#endif // U_IN_DOXYGEN
|
||
|
||
} // namespace U_HEADER_ONLY_NAMESPACE
|
||
|
||
#ifndef U_IN_DOXYGEN
|
||
// Bespoke specialization of reverse_iterator.
|
||
// The default implementation implements reverse operator*() and ++ in a way
|
||
// that does most of the same work twice for reading variable-length sequences.
|
||
template<typename CP32, typename UnitIter>
|
||
class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter>> {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
using Impl = U_HEADER_ONLY_NAMESPACE::UnsafeUTFImpl<CP32, UnitIter>;
|
||
using UnsafeCodeUnits_ = U_HEADER_ONLY_NAMESPACE::UnsafeCodeUnits<CP32, UnitIter>;
|
||
|
||
// Proxy type for operator->() (required by LegacyInputIterator)
|
||
// so that we don't promise always returning UnsafeCodeUnits.
|
||
class Proxy {
|
||
public:
|
||
explicit Proxy(UnsafeCodeUnits_ units) : units_(units) {}
|
||
UnsafeCodeUnits_ &operator*() { return units_; }
|
||
UnsafeCodeUnits_ *operator->() { return &units_; }
|
||
private:
|
||
UnsafeCodeUnits_ units_;
|
||
};
|
||
|
||
public:
|
||
using value_type = UnsafeCodeUnits_;
|
||
using reference = value_type;
|
||
using pointer = Proxy;
|
||
using difference_type = U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t<UnitIter>;
|
||
using iterator_category = std::bidirectional_iterator_tag;
|
||
|
||
U_FORCE_INLINE explicit reverse_iterator(U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter> iter) :
|
||
p_(iter.getLogicalPosition()), units_(0, 0, p_, p_) {}
|
||
U_FORCE_INLINE reverse_iterator() : p_{}, units_(0, 0, p_, p_) {}
|
||
|
||
U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
|
||
U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
|
||
|
||
U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
|
||
U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
|
||
|
||
U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
|
||
return getLogicalPosition() == other.getLogicalPosition();
|
||
}
|
||
U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
|
||
|
||
U_FORCE_INLINE UnsafeCodeUnits_ operator*() const {
|
||
if (state_ == 0) {
|
||
units_ = Impl::decAndRead(p_);
|
||
state_ = -1;
|
||
}
|
||
return units_;
|
||
}
|
||
|
||
U_FORCE_INLINE Proxy operator->() const {
|
||
if (state_ == 0) {
|
||
units_ = Impl::decAndRead(p_);
|
||
state_ = -1;
|
||
}
|
||
return Proxy(units_);
|
||
}
|
||
|
||
U_FORCE_INLINE reverse_iterator &operator++() { // pre-increment
|
||
if (state_ < 0) {
|
||
// operator*() called decAndRead() so p_ is already behind.
|
||
state_ = 0;
|
||
} else if (state_ == 0) {
|
||
Impl::dec(p_);
|
||
} else /* state_ > 0 */ {
|
||
// operator--() called readAndInc() so we know how far to skip.
|
||
p_ = units_.begin();
|
||
state_ = 0;
|
||
}
|
||
return *this;
|
||
}
|
||
|
||
U_FORCE_INLINE reverse_iterator operator++(int) { // post-increment
|
||
if (state_ < 0) {
|
||
// operator*() called decAndRead() so p_ is already behind.
|
||
reverse_iterator result(*this);
|
||
state_ = 0;
|
||
return result;
|
||
} else if (state_ == 0) {
|
||
units_ = Impl::decAndRead(p_);
|
||
reverse_iterator result(*this);
|
||
result.state_ = -1;
|
||
// keep this->state_ == 0
|
||
return result;
|
||
} else /* state_ > 0 */ {
|
||
reverse_iterator result(*this);
|
||
// operator--() called readAndInc() so we know how far to skip.
|
||
p_ = units_.begin();
|
||
state_ = 0;
|
||
return result;
|
||
}
|
||
}
|
||
|
||
U_FORCE_INLINE reverse_iterator &operator--() { // pre-decrement
|
||
if (state_ < 0) {
|
||
// operator*() called decAndRead() so p_ is behind the logical position.
|
||
p_ = units_.end();
|
||
}
|
||
UnitIter p0 = p_;
|
||
units_ = Impl::readAndInc(p0, p_);
|
||
state_ = 1;
|
||
return *this;
|
||
}
|
||
|
||
U_FORCE_INLINE reverse_iterator operator--(int) { // post-decrement
|
||
reverse_iterator result(*this);
|
||
operator--();
|
||
return result;
|
||
}
|
||
|
||
private:
|
||
U_FORCE_INLINE UnitIter getLogicalPosition() const {
|
||
return state_ >= 0 ? p_ : units_.end();
|
||
}
|
||
|
||
// operator*() etc. are logically const.
|
||
mutable UnitIter p_;
|
||
// Keep state so that we call decAndRead() only once for both operator*() and ++
|
||
// to make it easy for the compiler to optimize.
|
||
mutable UnsafeCodeUnits_ units_;
|
||
// >0: units_ = readAndInc(), p_ = units limit
|
||
// 0: initial state
|
||
// <0: units_ = decAndRead(), p_ = units start
|
||
// which means that p_ is behind its logical position
|
||
mutable int8_t state_ = 0;
|
||
};
|
||
#endif // U_IN_DOXYGEN
|
||
|
||
namespace U_HEADER_ONLY_NAMESPACE {
|
||
|
||
/**
|
||
* UnsafeUTFIterator factory function.
|
||
* Deduces the UnitIter template parameter from the input.
|
||
*
|
||
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
|
||
* @tparam UnitIter Can usually be omitted/deduced:
|
||
* An iterator (often a pointer) that returns a code unit type:
|
||
* UTF-8: char or char8_t or uint8_t;
|
||
* UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
|
||
* UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
|
||
* @param iter code unit iterator
|
||
* @return an UnsafeUTFIterator<CP32, UnitIter>
|
||
* for the given code unit iterator or character pointer
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename CP32, typename UnitIter>
|
||
auto unsafeUTFIterator(UnitIter iter) {
|
||
return UnsafeUTFIterator<CP32, UnitIter>(std::move(iter));
|
||
}
|
||
|
||
/**
|
||
* A C++ "range" for non-validating iteration over all of the code points of a code unit range.
|
||
* The string must be well-formed.
|
||
*
|
||
* Call unsafeUTFStringCodePoints() to have the compiler deduce the Range type.
|
||
*
|
||
* UnsafeUTFStringCodePoints is conditionally borrowed; that is, if Range is a borrowed range
|
||
* so is UnsafeUTFStringCodePoints<CP32, behavior, Range>.
|
||
* Note that when given a range r that is an lvalue and is not a view, unsafeUTFStringCodePoints(r) uses
|
||
* a ref_view of r as the Range type, which is a borrowed range.
|
||
* In practice, this means that given a container variable r, the iterators of
|
||
* unsafeUTFStringCodePoints(r) can be used as long as iterators on r are valid, without having to keep
|
||
* unsafeUTFStringCodePoints(r) around.
|
||
* For instance:
|
||
* \code
|
||
* std::u8string s = "𒇧𒇧";
|
||
* // it outlives unsafeUTFStringCodePoints<char32_t>(s).
|
||
* auto it = unsafeUTFStringCodePoints<char32_t>(s).begin();
|
||
* ++it;
|
||
* char32_t second_code_point = it->codePoint(); // OK.
|
||
* \endcode
|
||
*
|
||
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
|
||
* @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units
|
||
* @draft ICU 78
|
||
* @see unsafeUTFStringCodePoints
|
||
*/
|
||
template<typename CP32, typename Range>
|
||
class UnsafeUTFStringCodePoints {
|
||
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
|
||
public:
|
||
/**
|
||
* Constructs an empty C++ "range" object.
|
||
* @draft ICU 78
|
||
*/
|
||
UnsafeUTFStringCodePoints() = default;
|
||
|
||
/**
|
||
* Constructs a C++ "range" object over the code points in the string.
|
||
* @param unitRange input range
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
|
||
explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
|
||
/**
|
||
* Constructs a C++ "range" object over the code points in the string,
|
||
* keeping a reference to the code unit range. This overload is used by
|
||
* utfStringCodePoints in C++17; in C++20, a ref_view is used instead (via
|
||
* views::all).
|
||
* @param unitRange input range
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
|
||
explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
|
||
|
||
/** Copy constructor. @draft ICU 78 */
|
||
UnsafeUTFStringCodePoints(const UnsafeUTFStringCodePoints &other) = default;
|
||
|
||
/** Copy assignment operator. @draft ICU 78 */
|
||
UnsafeUTFStringCodePoints &operator=(const UnsafeUTFStringCodePoints &other) = default;
|
||
|
||
/**
|
||
* @return the range start iterator
|
||
* @draft ICU 78
|
||
*/
|
||
auto begin() {
|
||
return unsafeUTFIterator<CP32>(unitRange.begin());
|
||
}
|
||
|
||
/**
|
||
* @return the range start iterator
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
|
||
auto begin() const {
|
||
return unsafeUTFIterator<CP32>(unitRange.begin());
|
||
}
|
||
|
||
/**
|
||
* @return the range limit (exclusive end) iterator
|
||
* @draft ICU 78
|
||
*/
|
||
auto end() {
|
||
using UnitIter = decltype(unitRange.begin());
|
||
using LimitIter = decltype(unitRange.end());
|
||
if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
|
||
// Return the code unit sentinel.
|
||
return unitRange.end();
|
||
} else {
|
||
return unsafeUTFIterator<CP32>(unitRange.end());
|
||
}
|
||
}
|
||
|
||
/**
|
||
* @return the range limit (exclusive end) iterator
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
|
||
auto end() const {
|
||
using UnitIter = decltype(unitRange.begin());
|
||
using LimitIter = decltype(unitRange.end());
|
||
if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
|
||
// Return the code unit sentinel.
|
||
return unitRange.end();
|
||
} else {
|
||
return unsafeUTFIterator<CP32>(unitRange.end());
|
||
}
|
||
}
|
||
|
||
/**
|
||
* @return std::reverse_iterator(end())
|
||
* @draft ICU 78
|
||
*/
|
||
auto rbegin() const {
|
||
return std::make_reverse_iterator(end());
|
||
}
|
||
|
||
/**
|
||
* @return std::reverse_iterator(begin())
|
||
* @draft ICU 78
|
||
*/
|
||
auto rend() const {
|
||
return std::make_reverse_iterator(begin());
|
||
}
|
||
|
||
private:
|
||
Range unitRange;
|
||
};
|
||
|
||
/** @internal */
|
||
template<typename CP32>
|
||
struct UnsafeUTFStringCodePointsAdaptor
|
||
#if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \
|
||
__cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
|
||
: std::ranges::range_adaptor_closure<UnsafeUTFStringCodePointsAdaptor<CP32>>
|
||
#endif
|
||
{
|
||
/** @internal */
|
||
template<typename Range>
|
||
auto operator()(Range &&unitRange) const {
|
||
#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10 // We need https://wg21.link/P2415R2.
|
||
return UnsafeUTFStringCodePoints<CP32, std::ranges::views::all_t<Range>>(std::forward<Range>(unitRange));
|
||
#else
|
||
if constexpr (prv::is_basic_string_view_v<std::decay_t<Range>>) {
|
||
// Take basic_string_view by copy, not by reference. In C++20 this is handled by
|
||
// all_t<Range>, which is Range if Range is a view.
|
||
return UnsafeUTFStringCodePoints<CP32, std::decay_t<Range>>(std::forward<Range>(unitRange));
|
||
} else {
|
||
return UnsafeUTFStringCodePoints<CP32, Range>(std::forward<Range>(unitRange));
|
||
}
|
||
#endif
|
||
}
|
||
};
|
||
|
||
|
||
/**
|
||
* Range adaptor function object returning an UnsafeUTFStringCodePoints object that represents a
|
||
* "range" of code points in a code unit range. The string must be well-formed.
|
||
* Deduces the Range template parameter from the input, taking into account the value category: the
|
||
* code units will be referenced if possible, and moved if necessary.
|
||
*
|
||
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
|
||
* @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units
|
||
* @param unitRange input range
|
||
* @return an UnsafeUTFStringCodePoints<CP32, Range> for the given unitRange
|
||
* @draft ICU 78
|
||
*/
|
||
template<typename CP32>
|
||
constexpr UnsafeUTFStringCodePointsAdaptor<CP32> unsafeUTFStringCodePoints;
|
||
|
||
} // namespace U_HEADER_ONLY_NAMESPACE
|
||
|
||
|
||
#if defined(__cpp_lib_ranges)
|
||
template <typename CP32, UTFIllFormedBehavior behavior, typename Range>
|
||
constexpr bool std::ranges::enable_borrowed_range<
|
||
U_HEADER_ONLY_NAMESPACE::UTFStringCodePoints<CP32, behavior, Range>> =
|
||
std::ranges::enable_borrowed_range<Range>;
|
||
|
||
template <typename CP32, typename Range>
|
||
constexpr bool std::ranges::enable_borrowed_range<
|
||
U_HEADER_ONLY_NAMESPACE::UnsafeUTFStringCodePoints<CP32, Range>> =
|
||
std::ranges::enable_borrowed_range<Range>;
|
||
#endif
|
||
|
||
#endif // U_HIDE_DRAFT_API
|
||
#endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
|
||
#endif // __UTFITERATOR_H__
|