Abfahrtsanzeiger Display Basic Library
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
libmonoformat/cpp/src/monoformat_utf8.hpp

209 lines
6.0 KiB

#ifndef MONOFORMAT_UTF8_HPP
#define MONOFORMAT_UTF8_HPP
#include <cstdint>
#include <cstddef>
#include <string_view>
#include <span>
#include <utility>
namespace monoformat {
// Inclusive (first, last) code point bounds, one entry per UTF-8 sequence
// length: entry 0 is for one-byte sequences, entry 3 for four-byte sequences.
inline const std::pair<std::uint32_t, std::uint32_t> UTF8CodePointRanges[] = {
    std::pair<std::uint32_t, std::uint32_t>{0x0u,     0x7Fu},      // 1 byte
    std::pair<std::uint32_t, std::uint32_t>{0x80u,    0x7FFu},     // 2 bytes
    std::pair<std::uint32_t, std::uint32_t>{0x800u,   0xFFFFu},    // 3 bytes
    std::pair<std::uint32_t, std::uint32_t>{0x10000u, 0x10FFFFu},  // 4 bytes
};
/**
 * Iterator that decodes a UTF-8 encoded byte range one code point at a time.
 *
 * The iterator does not own the bytes it walks over; it only stores pointers
 * into the buffer handed to the constructor.
 *
 * Dereferencing yields the decoded code point. The value ~std::uint32_t{0}
 * (all bits set) is yielded instead whenever the bytes at the current
 * position do not form a valid encoding: an illegal lead byte, a missing or
 * malformed continuation byte, an overlong encoding, a surrogate
 * (U+D800..U+DFFF), a value above U+10FFFF, or a sequence cut off by the end
 * of the buffer. The same value is stored once the end has been reached.
 *
 * A default-constructed iterator acts as the end sentinel: it compares equal
 * to any iterator that has reached the end of its buffer.
 */
class UTF8Iterator {
public:
    /// Creates the end sentinel (all pointers are null).
    UTF8Iterator() = default;

    /// Iterates over the bytes of @p buffer.
    explicit UTF8Iterator(std::string_view buffer) noexcept
        : UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
    {
    }

    /// Iterates over the bytes of @p buffer.
    explicit UTF8Iterator(std::span<char const> buffer) noexcept
        : UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
    {
    }

    /// Iterates over the bytes of @p buffer.
    explicit UTF8Iterator(std::span<unsigned char const> buffer) noexcept
        : UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
    {
    }

    /// Iterates over the bytes of @p buffer.
    explicit UTF8Iterator(std::span<signed char const> buffer) noexcept
        : UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
    {
    }

    /// Iterates over the bytes of @p buffer.
    explicit UTF8Iterator(std::span<std::byte const> buffer) noexcept
        : UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
    {
    }

    /// Iterates over the half-open byte range [@p ptr, @p end) and decodes
    /// the first code point immediately.
    explicit UTF8Iterator(char const* ptr, char const* end) noexcept
        : m_ptr{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_next{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_end{reinterpret_cast<std::uint8_t const*>(end)}
    {
        findNext();
    }

    /// Iterates over the half-open byte range [@p ptr, @p end) and decodes
    /// the first code point immediately.
    explicit UTF8Iterator(unsigned char const* ptr, unsigned char const* end) noexcept
        : m_ptr{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_next{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_end{reinterpret_cast<std::uint8_t const*>(end)}
    {
        findNext();
    }

    /// Iterates over the half-open byte range [@p ptr, @p end) and decodes
    /// the first code point immediately.
    explicit UTF8Iterator(signed char const* ptr, signed char const* end) noexcept
        : m_ptr{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_next{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_end{reinterpret_cast<std::uint8_t const*>(end)}
    {
        findNext();
    }

    /// Iterates over the half-open byte range [@p ptr, @p end) and decodes
    /// the first code point immediately.
    explicit UTF8Iterator(std::byte const* ptr, std::byte const* end) noexcept
        : m_ptr{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_next{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_end{reinterpret_cast<std::uint8_t const*>(end)}
    {
        findNext();
    }

    /// Post-increment: advances to the next code point and returns a copy of
    /// the iterator as it was before advancing.
    UTF8Iterator operator++(int) noexcept {
        UTF8Iterator result{*this};
        operator++();
        return result;
    }

    /// Pre-increment: moves past the bytes consumed for the current code
    /// point and decodes the one that follows.
    UTF8Iterator& operator++() noexcept {
        m_ptr = m_next;
        findNext();
        return *this;
    }

    /// Two iterators are equal when they point at the same byte, or when one
    /// has reached the end of its buffer and the other is the null sentinel.
    friend inline bool operator==(UTF8Iterator const& a, UTF8Iterator const& b) noexcept {
        return a.m_ptr == b.m_ptr
            || (a.m_ptr == a.m_end && b.m_ptr == nullptr)
            || (b.m_ptr == b.m_end && a.m_ptr == nullptr);
    }

    /// Orders iterators by position; an exhausted or null iterator is never
    /// less than anything, and every other iterator is less than it.
    friend inline bool operator<(UTF8Iterator const& a, UTF8Iterator const& b) noexcept {
        if (a == b) {
            return false;
        }
        if (a.m_ptr == a.m_end || !a.m_ptr) {
            return false;
        }
        if (b.m_ptr == b.m_end || !b.m_ptr) {
            return true;
        }
        return a.m_ptr < b.m_ptr;
    }

    /// Returns the code point decoded at the current position, or
    /// ~std::uint32_t{0} if the bytes there are not a valid encoding.
    inline std::uint32_t operator*() const noexcept {
        return m_codePoint;
    }

private:
    /// Marker stored in m_codePoint for "no valid code point here".
    static constexpr std::uint32_t InvalidCodePoint = ~std::uint32_t{0};

    /**
     * Decodes the sequence starting at m_ptr into m_codePoint and leaves
     * m_next pointing just past the bytes that were consumed.
     *
     * Callers always arrive here with m_ptr == m_next (the constructors
     * initialise both to the same address and operator++ assigns m_next to
     * m_ptr before calling).
     */
    void findNext() noexcept {
        if (!m_ptr || m_ptr == m_end) {
            m_codePoint = InvalidCodePoint;
            return;
        }

        int t = 0;   // total sequence length announced by the lead byte
        int n = -1;  // continuation bytes still expected; -1 = lead byte not read yet
        m_codePoint = 0;
        m_next = m_ptr;
        while (n != 0 && m_next < m_end) {
            std::uint8_t const value = *m_next++;
            if (n == -1) {
                if ((value & 0b1111'1000u) == 0b1111'0000u) {
                    t = 4;
                    n = 3;
                    m_codePoint = (value & 0b0000'0111u);
                } else if ((value & 0b1111'0000u) == 0b1110'0000u) {
                    t = 3;
                    n = 2;
                    m_codePoint = (value & 0b0000'1111u);
                } else if ((value & 0b1110'0000u) == 0b1100'0000u) {
                    t = 2;
                    n = 1;
                    m_codePoint = (value & 0b0001'1111u);
                } else if ((value & 0b1000'0000u) == 0b0000'0000u) {
                    t = 1;
                    n = 0;
                    m_codePoint = value;
                } else {
                    // Not a legal lead byte (stray continuation byte or
                    // 0xF8..0xFF): consume just this byte and report it.
                    m_codePoint = InvalidCodePoint;
                    return;
                }
            } else {
                if ((value & 0b1100'0000u) != 0b1000'0000u) {
                    // Expected a continuation byte but found something else.
                    // Report the broken sequence right away and give the
                    // offending byte back, so that it is decoded on the next
                    // step instead of being swallowed. Previously control
                    // fell through to the code below, which pushed n to -1
                    // and let the following byte overwrite the error.
                    --m_next;
                    m_codePoint = InvalidCodePoint;
                    return;
                }
                m_codePoint <<= 6;
                m_codePoint |= value & 0b0011'1111u;
                --n;
            }
        }

        if (n != 0) {
            // The buffer ended in the middle of a multi-byte sequence.
            m_codePoint = InvalidCodePoint;
            return;
        }

        // Reject overlong encodings and values above U+10FFFF: the result
        // must lie in the range that belongs to a sequence of t bytes.
        auto const& range = UTF8CodePointRanges[t - 1];
        if (m_codePoint < range.first || m_codePoint > range.second) {
            m_codePoint = InvalidCodePoint;
            return;
        }

        // Surrogates are illegal
        if (m_codePoint >= 0xd800u && m_codePoint <= 0xdfffu) {
            m_codePoint = InvalidCodePoint;
            return;
        }
        // Code point is OK
    }

    std::uint8_t const* m_ptr{nullptr};   // first byte of the current sequence
    std::uint8_t const* m_next{nullptr};  // first byte after the current sequence
    std::uint8_t const* m_end{nullptr};   // one past the last byte of the buffer
    std::uint32_t m_codePoint{};          // value returned by operator*
};
/// Returns a copy of @p a, so that an iterator can itself be used as the
/// range expression where a free begin() is looked up.
inline UTF8Iterator begin(UTF8Iterator const& a) noexcept {
    UTF8Iterator start{a};
    return start;
}
/// Returns a default-constructed iterator, regardless of the argument.
inline UTF8Iterator end(UTF8Iterator const&) noexcept {
    UTF8Iterator sentinel;
    return sentinel;
}
} // monoformat
#endif // MONOFORMAT_UTF8_HPP