#ifndef MONOFORMAT_UTF8_HPP
#define MONOFORMAT_UTF8_HPP

#include <cstddef>
#include <cstdint>
#include <string_view>
#include <utility>

// <span> only provides std::span from C++20 onwards. Include it when the
// header exists and let __cpp_lib_span decide whether the span overloads
// below are compiled, so the rest of this header stays usable in C++17.
#if defined(__has_include)
#  if __has_include(<span>)
#    include <span>
#  endif
#endif

namespace monoformat {

/// Inclusive [first, second] code point range that may legally be encoded
/// by a UTF-8 sequence of N bytes, stored at index N - 1. A decoded value
/// outside the range for its sequence length is rejected (this catches
/// over-long encodings and values above U+10FFFF).
inline constexpr std::pair<std::uint32_t, std::uint32_t> UTF8CodePointRanges[] = {
    {0x000000u, 0x00007fu},
    {0x000080u, 0x0007ffu},
    {0x000800u, 0x00ffffu},
    {0x010000u, 0x10ffffu},
};

/// Forward iterator that decodes a UTF-8 byte buffer one code point at a time.
///
/// The iterator does not own the buffer; the bytes must outlive it.
/// Dereferencing yields the code point at the current position, or
/// `~std::uint32_t{0}` when the bytes at that position are not a valid
/// encoding (bad lead byte, missing or wrong continuation byte, over-long
/// form, surrogate, or value above U+10FFFF).
///
/// A default-constructed iterator acts as the end sentinel: it compares equal
/// to any iterator that has reached the end of its buffer.
class UTF8Iterator
{
public:
    /// Creates the end sentinel.
    UTF8Iterator() = default;

    /// Iterates over the bytes viewed by @p buffer.
    explicit UTF8Iterator(std::string_view buffer) noexcept
        : UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
    {
    }

#if defined(__cpp_lib_span)
    /// Iterates over the bytes viewed by @p buffer.
    explicit UTF8Iterator(std::span<char const> buffer) noexcept
        : UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
    {
    }

    /// Iterates over the bytes viewed by @p buffer.
    explicit UTF8Iterator(std::span<unsigned char const> buffer) noexcept
        : UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
    {
    }

    /// Iterates over the bytes viewed by @p buffer.
    explicit UTF8Iterator(std::span<signed char const> buffer) noexcept
        : UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
    {
    }

    /// Iterates over the bytes viewed by @p buffer.
    explicit UTF8Iterator(std::span<std::byte const> buffer) noexcept
        : UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
    {
    }
#endif // __cpp_lib_span

    /// Iterates over the half-open byte range [@p ptr, @p end) and decodes
    /// the first code point immediately.
    explicit UTF8Iterator(char const* ptr, char const* end) noexcept
        : m_ptr{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_next{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_end{reinterpret_cast<std::uint8_t const*>(end)}
    {
        findNext();
    }

    /// @copydoc UTF8Iterator(char const*, char const*)
    explicit UTF8Iterator(unsigned char const* ptr, unsigned char const* end) noexcept
        : m_ptr{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_next{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_end{reinterpret_cast<std::uint8_t const*>(end)}
    {
        findNext();
    }

    /// @copydoc UTF8Iterator(char const*, char const*)
    explicit UTF8Iterator(signed char const* ptr, signed char const* end) noexcept
        : m_ptr{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_next{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_end{reinterpret_cast<std::uint8_t const*>(end)}
    {
        findNext();
    }

    /// @copydoc UTF8Iterator(char const*, char const*)
    explicit UTF8Iterator(std::byte const* ptr, std::byte const* end) noexcept
        : m_ptr{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_next{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_end{reinterpret_cast<std::uint8_t const*>(end)}
    {
        findNext();
    }

    /// Post-increment: advances to the next code point and returns a copy of
    /// the iterator as it was before advancing.
    UTF8Iterator operator++(int) noexcept
    {
        UTF8Iterator previous{*this};
        operator++();
        return previous;
    }

    /// Pre-increment: moves to the byte following the current sequence and
    /// decodes the code point found there.
    UTF8Iterator& operator++() noexcept
    {
        m_ptr = m_next;
        findNext();
        return *this;
    }

    /// Two iterators are equal when they point at the same byte, or when one
    /// has reached the end of its buffer and the other is the default
    /// constructed sentinel.
    friend inline bool operator==(UTF8Iterator const& a, UTF8Iterator const& b) noexcept
    {
        if (a.m_ptr == b.m_ptr) {
            return true;
        }
        bool const aAtEndVsSentinel = (a.m_ptr == a.m_end && b.m_ptr == nullptr);
        bool const bAtEndVsSentinel = (b.m_ptr == b.m_end && a.m_ptr == nullptr);
        return aAtEndVsSentinel || bAtEndVsSentinel;
    }

    /// Negation of operator==. Spelled out explicitly so that range-based
    /// for loops also work on compilers that do not synthesise it (pre-C++20).
    friend inline bool operator!=(UTF8Iterator const& a, UTF8Iterator const& b) noexcept
    {
        return !(a == b);
    }

    /// Orders iterators by position; an exhausted or sentinel iterator sorts
    /// after every iterator that still has bytes to decode.
    friend inline bool operator<(UTF8Iterator const& a, UTF8Iterator const& b) noexcept
    {
        if (a == b) {
            return false;
        }
        if (a.m_ptr == a.m_end || !a.m_ptr) {
            return false; // a is at the end: nothing comes after it
        }
        if (b.m_ptr == b.m_end || !b.m_ptr) {
            return true; // b is at the end and a is not
        }
        return a.m_ptr < b.m_ptr;
    }

    /// Returns the code point at the current position, or `~std::uint32_t{0}`
    /// if the bytes there are not valid UTF-8 or the iterator is at the end
    /// of its buffer.
    inline std::uint32_t operator*() const noexcept
    {
        return m_codePoint;
    }

private:
    /// Marker stored in m_codePoint for "no valid code point here".
    static constexpr std::uint32_t kInvalidCodePoint = ~std::uint32_t{0};

    /// Decodes the sequence starting at m_ptr into m_codePoint and leaves
    /// m_next pointing at the first byte after it.
    ///
    /// On a malformed sequence m_codePoint becomes kInvalidCodePoint. A byte
    /// that should have been a continuation byte but is not is left
    /// unconsumed, so the following increment decodes it as the start of a
    /// new sequence instead of silently dropping it.
    void findNext() noexcept
    {
        // Assume failure; only a fully validated sequence overwrites this.
        m_codePoint = kInvalidCodePoint;

        if (!m_ptr || m_ptr == m_end) {
            return;
        }

        m_next = m_ptr;
        std::uint8_t const lead = *m_next++;

        int length = 0;            // total bytes in the sequence
        std::uint32_t decoded = 0; // payload bits gathered so far

        if ((lead & 0b1000'0000u) == 0b0000'0000u) {
            length = 1;
            decoded = lead;
        } else if ((lead & 0b1110'0000u) == 0b1100'0000u) {
            length = 2;
            decoded = lead & 0b0001'1111u;
        } else if ((lead & 0b1111'0000u) == 0b1110'0000u) {
            length = 3;
            decoded = lead & 0b0000'1111u;
        } else if ((lead & 0b1111'1000u) == 0b1111'0000u) {
            length = 4;
            decoded = lead & 0b0000'0111u;
        } else {
            // Stray continuation byte or a lead byte of 0xF8..0xFF:
            // consume just this one byte and report it as invalid.
            return;
        }

        for (int remaining = length - 1; remaining > 0; --remaining) {
            if (m_next == m_end) {
                // The buffer ended in the middle of a sequence.
                return;
            }
            std::uint8_t const value = *m_next;
            if ((value & 0b1100'0000u) != 0b1000'0000u) {
                // Not a continuation byte. Stop without consuming it so it
                // is examined as a lead byte on the next increment.
                return;
            }
            ++m_next;
            decoded = (decoded << 6) | (value & 0b0011'1111u);
        }

        // Reject over-long encodings and values above U+10FFFF.
        auto const& range = UTF8CodePointRanges[length - 1];
        if (decoded < range.first || decoded > range.second) {
            return;
        }

        // Surrogates are illegal in UTF-8.
        if (decoded >= 0xd800u && decoded <= 0xdfffu) {
            return;
        }

        m_codePoint = decoded;
    }

    std::uint8_t const* m_ptr{nullptr};  ///< First byte of the current sequence.
    std::uint8_t const* m_next{nullptr}; ///< First byte after the current sequence.
    std::uint8_t const* m_end{nullptr};  ///< One past the last byte of the buffer.
    std::uint32_t m_codePoint{};         ///< Decoded value at m_ptr.
};

/// Range-for support: the iterator is its own range, starting where it is now.
inline UTF8Iterator begin(UTF8Iterator const& a) noexcept
{
    return a;
}

/// Range-for support: the default-constructed iterator is the end sentinel.
inline UTF8Iterator end(UTF8Iterator const&) noexcept
{
    return UTF8Iterator{};
}

} // monoformat

#endif // MONOFORMAT_UTF8_HPP