You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
209 lines
6.0 KiB
209 lines
6.0 KiB
#ifndef MONOFORMAT_UTF8_HPP
|
|
#define MONOFORMAT_UTF8_HPP
|
|
|
|
#include <cstdint>
|
|
#include <cstddef>
|
|
#include <string_view>
|
|
#include <span>
|
|
#include <utility>
|
|
|
|
namespace monoformat {
|
|
|
|
// Inclusive [first, second] code point ranges, indexed by encoded length:
// entry N - 1 holds the smallest and largest code point that may be encoded
// as an N-byte UTF-8 sequence.  A value decoded from N bytes that falls
// outside entry N - 1 is either an overlong encoding or beyond U+10FFFF.
//
// constexpr so the table is guaranteed to be constant-initialized and can be
// used in constant expressions; the element type is unchanged.
inline constexpr std::pair<std::uint32_t, std::uint32_t> UTF8CodePointRanges[] = {
    {0x000000u, 0x00007fu},  // 1-byte sequences
    {0x000080u, 0x0007ffu},  // 2-byte sequences
    {0x000800u, 0x00ffffu},  // 3-byte sequences
    {0x010000u, 0x10ffffu},  // 4-byte sequences
};
|
|
|
|
class UTF8Iterator {
|
|
public:
|
|
UTF8Iterator() = default;
|
|
|
|
explicit UTF8Iterator(std::string_view buffer) noexcept
|
|
: UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
|
|
{
|
|
}
|
|
|
|
explicit UTF8Iterator(std::span<char const> buffer) noexcept
|
|
: UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
|
|
{
|
|
}
|
|
|
|
explicit UTF8Iterator(std::span<unsigned char const> buffer) noexcept
|
|
: UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
|
|
{
|
|
}
|
|
|
|
explicit UTF8Iterator(std::span<signed char const> buffer) noexcept
|
|
: UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
|
|
{
|
|
}
|
|
|
|
explicit UTF8Iterator(std::span<std::byte const> buffer) noexcept
|
|
: UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
|
|
{
|
|
}
|
|
|
|
explicit UTF8Iterator(char const* ptr, char const* end) noexcept
|
|
: m_ptr{reinterpret_cast<std::uint8_t const*>(ptr)}
|
|
, m_next{reinterpret_cast<std::uint8_t const*>(ptr)}
|
|
, m_end{reinterpret_cast<std::uint8_t const*>(end)}
|
|
{
|
|
findNext();
|
|
}
|
|
|
|
explicit UTF8Iterator(unsigned char const* ptr, unsigned char const* end) noexcept
|
|
: m_ptr{reinterpret_cast<std::uint8_t const*>(ptr)}
|
|
, m_next{reinterpret_cast<std::uint8_t const*>(ptr)}
|
|
, m_end{reinterpret_cast<std::uint8_t const*>(end)}
|
|
{
|
|
findNext();
|
|
}
|
|
|
|
explicit UTF8Iterator(signed char const* ptr, signed char const* end) noexcept
|
|
: m_ptr{reinterpret_cast<std::uint8_t const*>(ptr)}
|
|
, m_next{reinterpret_cast<std::uint8_t const*>(ptr)}
|
|
, m_end{reinterpret_cast<std::uint8_t const*>(end)}
|
|
{
|
|
findNext();
|
|
}
|
|
|
|
explicit UTF8Iterator(std::byte const* ptr, std::byte const* end) noexcept
|
|
: m_ptr{reinterpret_cast<std::uint8_t const*>(ptr)}
|
|
, m_next{reinterpret_cast<std::uint8_t const*>(ptr)}
|
|
, m_end{reinterpret_cast<std::uint8_t const*>(end)}
|
|
{
|
|
findNext();
|
|
}
|
|
|
|
UTF8Iterator operator++(int) noexcept {
|
|
UTF8Iterator result{*this};
|
|
operator++();
|
|
return result;
|
|
}
|
|
|
|
UTF8Iterator& operator++() noexcept {
|
|
m_ptr = m_next;
|
|
findNext();
|
|
return *this;
|
|
}
|
|
|
|
friend inline bool operator==(UTF8Iterator const& a, UTF8Iterator const& b) noexcept {
|
|
return a.m_ptr == b.m_ptr
|
|
|| (a.m_ptr == a.m_end && b.m_ptr == nullptr)
|
|
|| (b.m_ptr == b.m_end && a.m_ptr == nullptr);
|
|
}
|
|
|
|
friend inline bool operator<(UTF8Iterator const& a, UTF8Iterator const& b) noexcept {
|
|
if (a == b) {
|
|
return false;
|
|
}
|
|
if (a.m_ptr == a.m_end || !a.m_ptr) {
|
|
return false;
|
|
}
|
|
if (b.m_ptr == b.m_end || !b.m_ptr) {
|
|
return true;
|
|
}
|
|
return a.m_ptr < b.m_ptr;
|
|
}
|
|
|
|
inline std::uint32_t operator*() const noexcept {
|
|
return m_codePoint;
|
|
}
|
|
|
|
private:
|
|
void findNext() noexcept {
|
|
if (!m_ptr || m_ptr == m_end) {
|
|
m_codePoint = ~std::uint32_t{0};
|
|
return;
|
|
}
|
|
m_ptr = m_next;
|
|
if (!m_ptr || m_ptr == m_end) {
|
|
m_codePoint = ~std::uint32_t{0};
|
|
return;
|
|
}
|
|
|
|
int t = 0;
|
|
int n = -1;
|
|
m_codePoint = 0;
|
|
m_next = m_ptr;
|
|
while (n != 0 && m_next < m_end) {
|
|
std::uint8_t value = *m_next++;
|
|
if (n == -1) {
|
|
if ((value & 0b1111'1000u) == 0b1111'0000u) {
|
|
t = 4;
|
|
n = 3;
|
|
m_codePoint = (value & 0b0000'0111u);
|
|
} else if ((value & 0b1111'0000u) == 0b1110'0000u) {
|
|
t = 3;
|
|
n = 2;
|
|
m_codePoint = (value & 0b0000'1111u);
|
|
} else if ((value & 0b1110'0000u) == 0b1100'0000u) {
|
|
t = 2;
|
|
n = 1;
|
|
m_codePoint = (value & 0b0001'1111u);
|
|
} else if ((value & 0b1000'0000u) == 0b0000'0000u) {
|
|
t = 1;
|
|
n = 0;
|
|
m_codePoint = value;
|
|
} else {
|
|
// Illegal sequence, stop here
|
|
t = 1;
|
|
n = 0;
|
|
m_codePoint = ~std::uint32_t{0};
|
|
}
|
|
} else {
|
|
if ((value & 0b1100'0000u) != 0b1000'0000u) {
|
|
// Illegal sequence, stop here
|
|
t = 1;
|
|
n = 0;
|
|
m_codePoint = ~std::uint32_t{0};
|
|
}
|
|
|
|
m_codePoint <<= 6;
|
|
m_codePoint |= value & 0b0011'1111u;
|
|
--n;
|
|
}
|
|
}
|
|
|
|
if (n == 0) {
|
|
if (m_codePoint < UTF8CodePointRanges[t - 1].first || m_codePoint > UTF8CodePointRanges[t - 1].second) {
|
|
m_codePoint = ~std::uint32_t{0};
|
|
return;
|
|
}
|
|
|
|
// Surrogates are illegal
|
|
if (m_codePoint >= 0xd800u && m_codePoint <= 0xdfffu) {
|
|
m_codePoint = ~std::uint32_t{0};
|
|
return;
|
|
}
|
|
|
|
// Code point is OK
|
|
return;
|
|
} else {
|
|
/* We've reached the end of the string inside a sequence, so make
|
|
* this an illegal character
|
|
*/
|
|
m_codePoint = ~std::uint32_t{0};
|
|
return;
|
|
}
|
|
}
|
|
|
|
std::uint8_t const* m_ptr{nullptr};
|
|
std::uint8_t const* m_next{nullptr};
|
|
std::uint8_t const* m_end{nullptr};
|
|
std::uint32_t m_codePoint{};
|
|
};
|
|
|
|
// Free begin(): a UTF8Iterator already designates the start of its own
// sequence, so the start of the range is simply a copy of the iterator.
inline UTF8Iterator begin(UTF8Iterator const& it) noexcept {
    UTF8Iterator first{it};
    return first;
}
|
|
|
|
// Free end(): the end of any sequence is represented by a default-constructed
// iterator, so the argument is not inspected.
inline UTF8Iterator end(UTF8Iterator const&) noexcept {
    UTF8Iterator sentinel;
    return sentinel;
}
|
|
|
|
} // monoformat
|
|
|
|
#endif // MONOFORMAT_UTF8_HPP
|
|
|