// libmonoformat/cpp/src/monoformat_utf8.hpp

#ifndef MONOFORMAT_UTF8_HPP
#define MONOFORMAT_UTF8_HPP

#include <cstdint>
#include <cstddef>
#include <string_view>
#include <span>
#include <utility>

namespace monoformat {

/**
 * Inclusive [first, second] range of code points that may legally be encoded
 * with a given UTF-8 sequence length.  Entry `i` describes sequences that are
 * `i + 1` bytes long, so a decoded value that falls outside the range for its
 * length is either an overlong encoding or beyond U+10FFFF.
 *
 * Declared `constexpr` so the table is usable in constant expressions and is
 * always statically initialised.
 */
inline constexpr std::pair<std::uint32_t, std::uint32_t> UTF8CodePointRanges[] = {
    {0x000000u, 0x00007fu}, // 1 byte
    {0x000080u, 0x0007ffu}, // 2 bytes
    {0x000800u, 0x00ffffu}, // 3 bytes
    {0x010000u, 0x10ffffu}, // 4 bytes
};

/**
 * Forward iterator that decodes a byte buffer as UTF-8, yielding one code
 * point per step.
 *
 * The iterator stores only pointers into the buffer it was constructed from;
 * it does not copy or own the bytes, so the buffer must outlive the iterator.
 *
 * Dereferencing yields the decoded code point, or 0xFFFFFFFF when the bytes at
 * the current position do not form a valid UTF-8 sequence (bad lead byte, bad
 * or missing continuation byte, overlong encoding, surrogate, or a value above
 * U+10FFFF) and also when the iterator is at the end of the buffer.
 *
 * A default-constructed iterator acts as the end sentinel: it compares equal
 * to any iterator that has reached the end of its buffer.
 */
class UTF8Iterator {
public:
    /// Constructs the end sentinel.
    UTF8Iterator() = default;

    /// Iterates over the bytes viewed by `buffer`.
    explicit UTF8Iterator(std::string_view buffer) noexcept
        : UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
    {
    }

    /// Iterates over the bytes viewed by `buffer`.
    explicit UTF8Iterator(std::span<char const> buffer) noexcept
        : UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
    {
    }

    /// Iterates over the bytes viewed by `buffer`.
    explicit UTF8Iterator(std::span<unsigned char const> buffer) noexcept
        : UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
    {
    }

    /// Iterates over the bytes viewed by `buffer`.
    explicit UTF8Iterator(std::span<signed char const> buffer) noexcept
        : UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
    {
    }

    /// Iterates over the bytes viewed by `buffer`.
    explicit UTF8Iterator(std::span<std::byte const> buffer) noexcept
        : UTF8Iterator{buffer.data(), buffer.data() + buffer.size()}
    {
    }

    /// Iterates over the half-open byte range [ptr, end).
    explicit UTF8Iterator(char const* ptr, char const* end) noexcept
        : UTF8Iterator{reinterpret_cast<unsigned char const*>(ptr),
                       reinterpret_cast<unsigned char const*>(end)}
    {
    }

    /// Iterates over the half-open byte range [ptr, end).  Every other
    /// constructor that takes a buffer ends up delegating to this one.
    explicit UTF8Iterator(unsigned char const* ptr, unsigned char const* end) noexcept
        : m_ptr{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_next{reinterpret_cast<std::uint8_t const*>(ptr)}
        , m_end{reinterpret_cast<std::uint8_t const*>(end)}
    {
        // Decode the first code point so operator*() is valid immediately.
        findNext();
    }

    /// Iterates over the half-open byte range [ptr, end).
    explicit UTF8Iterator(signed char const* ptr, signed char const* end) noexcept
        : UTF8Iterator{reinterpret_cast<unsigned char const*>(ptr),
                       reinterpret_cast<unsigned char const*>(end)}
    {
    }

    /// Iterates over the half-open byte range [ptr, end).
    explicit UTF8Iterator(std::byte const* ptr, std::byte const* end) noexcept
        : UTF8Iterator{reinterpret_cast<unsigned char const*>(ptr),
                       reinterpret_cast<unsigned char const*>(end)}
    {
    }

    /// Post-increment: advances to the next code point and returns a copy of
    /// the iterator as it was before advancing.
    UTF8Iterator operator++(int) noexcept {
        UTF8Iterator result{*this};
        operator++();
        return result;
    }

    /// Pre-increment: advances to the next code point (or to the end).
    UTF8Iterator& operator++() noexcept {
        m_ptr = m_next;
        findNext();
        return *this;
    }

    /// Two iterators are equal when they point at the same byte, or when one
    /// has reached the end of its buffer and the other has a null position
    /// (i.e. is the default-constructed sentinel).
    friend inline bool operator==(UTF8Iterator const& a, UTF8Iterator const& b) noexcept {
        return a.m_ptr == b.m_ptr
            || (a.m_ptr == a.m_end && b.m_ptr == nullptr)
            || (b.m_ptr == b.m_end && a.m_ptr == nullptr);
    }

    /// Orders iterators by position; an iterator at the end (or the sentinel)
    /// is never less than anything, and everything else is less than it.
    friend inline bool operator<(UTF8Iterator const& a, UTF8Iterator const& b) noexcept {
        if (a == b) {
            return false;
        }
        if (a.m_ptr == a.m_end || !a.m_ptr) {
            return false;
        }
        if (b.m_ptr == b.m_end || !b.m_ptr) {
            return true;
        }
        return a.m_ptr < b.m_ptr;
    }

    /// Returns the code point at the current position, or 0xFFFFFFFF if the
    /// bytes there are not valid UTF-8 or the iterator is at the end.
    inline std::uint32_t operator*() const noexcept {
        return m_codePoint;
    }

private:
    /// Marker stored in m_codePoint for malformed input and for end-of-buffer.
    static constexpr std::uint32_t InvalidCodePoint = ~std::uint32_t{0};

    /**
     * Decodes the sequence starting at m_ptr into m_codePoint and sets m_next
     * to the first byte after it.
     *
     * On a malformed sequence m_codePoint becomes InvalidCodePoint.  Decoding
     * stops at the first byte that is not a valid continuation and that byte
     * is left unconsumed, so it is decoded as the start of the next sequence
     * rather than being swallowed together with the error.
     */
    void findNext() noexcept {
        // Assume failure; only a fully validated sequence overwrites this.
        m_codePoint = InvalidCodePoint;

        if (!m_ptr || m_ptr == m_end) {
            return;
        }

        std::uint8_t const lead = *m_ptr;
        m_next = m_ptr + 1;

        // Classify the lead byte: sequence length and the payload bits it
        // contributes.
        int length = 0;
        std::uint32_t decoded = 0;
        if ((lead & 0b1000'0000u) == 0b0000'0000u) {
            length = 1;
            decoded = lead;
        } else if ((lead & 0b1110'0000u) == 0b1100'0000u) {
            length = 2;
            decoded = lead & 0b0001'1111u;
        } else if ((lead & 0b1111'0000u) == 0b1110'0000u) {
            length = 3;
            decoded = lead & 0b0000'1111u;
        } else if ((lead & 0b1111'1000u) == 0b1111'0000u) {
            length = 4;
            decoded = lead & 0b0000'0111u;
        } else {
            // Stray continuation byte or 0xF8..0xFF: a one-byte error.
            return;
        }

        for (int remaining = length - 1; remaining > 0; --remaining) {
            if (m_next == m_end) {
                // Buffer ended in the middle of a sequence.
                return;
            }
            std::uint8_t const value = *m_next;
            if ((value & 0b1100'0000u) != 0b1000'0000u) {
                // Not a continuation byte: report the error and do NOT consume
                // this byte, so the next step decodes it on its own.
                return;
            }
            ++m_next;
            decoded = (decoded << 6) | (value & 0b0011'1111u);
        }

        // Reject overlong encodings and values above U+10FFFF.
        auto const& range = UTF8CodePointRanges[length - 1];
        if (decoded < range.first || decoded > range.second) {
            return;
        }

        // Surrogates are illegal
        if (decoded >= 0xd800u && decoded <= 0xdfffu) {
            return;
        }

        m_codePoint = decoded;
    }

    std::uint8_t const* m_ptr{nullptr};   ///< Start of the current sequence.
    std::uint8_t const* m_next{nullptr};  ///< First byte after the current sequence.
    std::uint8_t const* m_end{nullptr};   ///< One past the last byte of the buffer.
    std::uint32_t m_codePoint{};          ///< Decoded value at m_ptr.
};

/// Range support: the iterator is its own range, so the beginning of the
/// range is simply a copy of the iterator at its current position.
inline UTF8Iterator begin(UTF8Iterator const& a) noexcept {
    UTF8Iterator first{a};
    return first;
}

/// Range support: the end of any range is the default-constructed iterator,
/// which compares equal to an iterator that has reached the end of its buffer.
inline UTF8Iterator end(UTF8Iterator const&) noexcept {
    UTF8Iterator sentinel;
    return sentinel;
}

} // monoformat

#endif // MONOFORMAT_UTF8_HPP