// AyCode.Core/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs

using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace AyCode.Core.Serializers.Binaries;

/// <summary>
/// In-house UTF-8 ↔ UTF-16 transcoder used by the binary serializer hot path. Bypasses
/// <see cref="System.Text.Encoding.UTF8"/> virtual-dispatch + EncoderFallback / DecoderFallback
/// overhead.
///
/// <para><b>Input handling.</b></para>
/// <list type="bullet">
///   <item>Encode side (<see cref="EncodeUtf8SinglePass"/> / <see cref="GetUtf8ByteCount"/>):
///   a .NET string may legally contain unpaired surrogates. Each unpaired surrogate is encoded as
///   U+FFFD (<c>EF BF BD</c>), the same replacement <see cref="System.Text.Encoding.UTF8"/> applies
///   by default, so the two methods agree with each other for every input.</item>
///   <item>Decode side (<see cref="DecodeUtf8SinglePass"/> / <see cref="CountUtf8Chars"/>):
///   input is expected to be well-formed UTF-8 and is NOT validated (overlong forms, out-of-range
///   code points and wrong continuation bytes are decoded as-is). Truncated sequences and impossible
///   lead bytes are nevertheless handled without reading past the end of the source, and the decoder
///   never produces more chars than <see cref="CountUtf8Chars"/> reports.</item>
/// </list>
///
/// <para><b>SIMD path hierarchy</b> (cascading tail-handler):</para>
/// <list type="bullet">
///   <item>Vector512 / AVX-512BW — 64 byte/iter (Intel server, Intel 11th gen client, AMD Zen 4+)</item>
///   <item>Vector256 / AVX2 — 32 byte/iter (Intel 12-14th gen client, AMD Zen 3 and earlier)</item>
///   <item>Vector128 / SSE-NEON-WASM — 16 byte/iter (Apple Silicon NEON, WASM SIMD, legacy SSE2)</item>
///   <item>Scalar — final tail and no-SIMD fall-back</item>
/// </list>
/// <para><see cref="DecodeUtf8SinglePass"/> currently implements the Vector256 and Vector128 tiers only.</para>
///
/// <para>JIT/AOT path-selection: <c>Avx512BW.IsSupported</c> / <c>Vector256.IsHardwareAccelerated</c> /
/// <c>Vector128.IsHardwareAccelerated</c> are <c>[Intrinsic]</c> static booleans — the compiler
/// constant-folds the dead branches per host. Non-supported tiers eliminate to zero generated code.</para>
///
/// <para>Algorithm reference: see <c>BINARY_TODO.md#accore-bin-t-v4n2</c> for the multi-tier SIMD
/// transcoder design and per-tier acceptance criteria.</para>
/// </summary>
internal static class Utf8Transcoder
{
    // A UTF-16 code unit is a surrogate when (unit & 0xF800) == 0xD800; the 0xFC00 mask further
    // separates high (0xD800–0xDBFF) from low (0xDC00–0xDFFF) surrogates.
    private const int SurrogateTagMask = 0xFC00;
    private const int HighSurrogateTag = 0xD800;
    private const int LowSurrogateTag = 0xDC00;

    // U+FFFD REPLACEMENT CHARACTER — emitted for input that cannot be transcoded.
    private const ushort ReplacementChar = 0xFFFD;

    /// <summary>Returns true when <paramref name="codeUnit"/> is a UTF-16 low surrogate (0xDC00–0xDFFF).</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static bool IsLowSurrogate(int codeUnit) => (codeUnit & SurrogateTagMask) == LowSurrogateTag;

    /// <summary>
    /// Custom UTF-16 → UTF-8 single-pass encoder. Symmetric with <see cref="DecodeUtf8SinglePass"/>.
    /// </summary>
    /// <remarks>
    /// Bypasses <see cref="System.Text.Encoding.UTF8"/>.GetBytes virtual-dispatch + encoder-fallback
    /// overhead. Well-formed surrogate pairs become one 4-byte sequence; an unpaired surrogate
    /// (high without a following low, or a low on its own) becomes U+FFFD (<c>EF BF BD</c>).
    ///
    /// Layered for max throughput on mixed content:
    /// • <b>Phase 1a — Vector512 ASCII narrow:</b> 32 chars/iter on AVX-512BW hosts.
    /// • <b>Phase 1b — Vector256 ASCII narrow:</b> 16 chars/iter on AVX2 hosts (also handles a
    ///   16..31 char remainder after the AVX-512 path).
    /// • <b>Phase 1c — Vector128 ASCII narrow:</b> 16 chars/iter (two 8-char loads) on hosts that
    ///   have 128-bit SIMD but no 256-bit tier. Skipped when Phase 1b is available because both
    ///   work on 16-char blocks, so 1c could only re-test a block 1b already rejected.
    /// • <b>Phase 2 — DWORD ASCII batch:</b> 4 chars/iter. OR-mask test
    ///   <c>(c0 | c1 | c2 | c3) &amp; 0xFF80 == 0</c>; on hit, 4 byte writes per iter.
    /// • <b>Phase 3 — Scalar multi-byte encode:</b> 1-byte (ASCII), 2-byte (U+0080–U+07FF),
    ///   3-byte (rest of the BMP), 4-byte (supplementary plane via UTF-16 surrogate pair).
    ///
    /// Once a SIMD phase meets a non-ASCII char the rest of the input is handled by Phase 2/3.
    ///
    /// Writes to <paramref name="dst"/> are NOT bounds-checked. The caller must provide at least
    /// <see cref="GetUtf8ByteCount"/>(<paramref name="src"/>) bytes; <c>src.Length * 3</c> is always
    /// sufficient (a UTF-16 code unit never needs more than 3 bytes; a pair needs 4 for 2 units).
    /// </remarks>
    /// <param name="src">UTF-16 input; may be empty and may contain unpaired surrogates.</param>
    /// <param name="dst">Destination buffer, sized by the caller as described in the remarks.</param>
    /// <returns>The number of bytes written to <paramref name="dst"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static int EncodeUtf8SinglePass(ReadOnlySpan<char> src, Span<byte> dst)
    {
        int srcIdx = 0, dstIdx = 0;
        ref char srcRefChar = ref MemoryMarshal.GetReference(src);
        ref ushort srcRefU16 = ref Unsafe.As<char, ushort>(ref srcRefChar);
        ref byte dstRef = ref MemoryMarshal.GetReference(dst);

        // Phase 1a — Vector512 ASCII narrow (32 chars/iter on AVX-512BW hosts).
        if (Avx512BW.IsSupported)
        {
            var nonAscii512 = Vector512.Create((ushort)0xFF80);
            while (src.Length - srcIdx >= Vector512<ushort>.Count)
            {
                var v = Vector512.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
                // Any bit above the low 7 set → at least one char >= 0x80 → leave the fast path.
                if ((v & nonAscii512) != Vector512<ushort>.Zero) break;
                // 32 ushorts → 32 bytes; all values are < 0x80 so narrowing is lossless.
                var packed = Vector256.Narrow(v.GetLower(), v.GetUpper());
                packed.StoreUnsafe(ref dstRef, (uint)dstIdx);
                srcIdx += Vector512<ushort>.Count;
                dstIdx += Vector512<ushort>.Count;  // ASCII is 1 char → 1 byte
            }
        }

        // Phase 1b — Vector256 ASCII narrow (16 chars/iter).
        if (Vector256.IsHardwareAccelerated)
        {
            var nonAscii256 = Vector256.Create((ushort)0xFF80);
            while (src.Length - srcIdx >= Vector256<ushort>.Count)
            {
                var v = Vector256.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
                if ((v & nonAscii256) != Vector256<ushort>.Zero) break;
                var packed = Vector128.Narrow(v.GetLower(), v.GetUpper());
                packed.StoreUnsafe(ref dstRef, (uint)dstIdx);
                srcIdx += Vector256<ushort>.Count;
                dstIdx += Vector256<ushort>.Count;
            }
        }

        // Phase 1c — Vector128 ASCII narrow, 2 × 8 chars → 16 bytes. Only for hosts without a
        // 256-bit tier (see remarks): with Phase 1b present this loop could never make progress.
        if (!Vector256.IsHardwareAccelerated && Vector128.IsHardwareAccelerated)
        {
            var nonAscii128 = Vector128.Create((ushort)0xFF80);
            while (src.Length - srcIdx >= 2 * Vector128<ushort>.Count)
            {
                var lo = Vector128.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
                var hi = Vector128.LoadUnsafe(ref srcRefU16, (uint)(srcIdx + Vector128<ushort>.Count));
                if (((lo | hi) & nonAscii128) != Vector128<ushort>.Zero) break;
                var packed = Vector128.Narrow(lo, hi);
                packed.StoreUnsafe(ref dstRef, (uint)dstIdx);
                srcIdx += 2 * Vector128<ushort>.Count;
                dstIdx += 2 * Vector128<ushort>.Count;
            }
        }

        // Phase 2/3 — scalar with DWORD ASCII batch.
        while (srcIdx < src.Length)
        {
            // Phase 2 — 4 chars → 4 bytes when all four are ASCII.
            if (src.Length - srcIdx >= 4)
            {
                var c0 = Unsafe.Add(ref srcRefChar, srcIdx);
                var c1 = Unsafe.Add(ref srcRefChar, srcIdx + 1);
                var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 2);
                var c3 = Unsafe.Add(ref srcRefChar, srcIdx + 3);
                if (((c0 | c1 | c2 | c3) & 0xFF80) == 0)
                {
                    Unsafe.Add(ref dstRef, dstIdx)     = (byte)c0;
                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)c1;
                    Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)c2;
                    Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)c3;
                    srcIdx += 4;
                    dstIdx += 4;
                    continue;
                }
            }

            // Phase 3 — one code point per iteration.
            var c = Unsafe.Add(ref srcRefChar, srcIdx);
            if (c < 0x80)
            {
                // 1-byte ASCII (U+0000–U+007F)
                Unsafe.Add(ref dstRef, dstIdx++) = (byte)c;
                srcIdx += 1;
            }
            else if (c < 0x800)
            {
                // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
                Unsafe.Add(ref dstRef, dstIdx)     = (byte)(0xC0 | (c >> 6));
                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | (c & 0x3F));
                dstIdx += 2;
                srcIdx += 1;
            }
            else if ((c & 0xF800) != 0xD800)
            {
                // 3-byte: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF (excluding surrogates)
                Unsafe.Add(ref dstRef, dstIdx)     = (byte)(0xE0 | (c >> 12));
                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((c >> 6) & 0x3F));
                Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | (c & 0x3F));
                dstIdx += 3;
                srcIdx += 1;
            }
            else
            {
                // Surrogate code unit. It is only a valid pair when THIS unit is a high surrogate,
                // a next unit exists (no read past the end of src), and that unit is a low surrogate.
                if (c < LowSurrogateTag && src.Length - srcIdx >= 2)
                {
                    var low = Unsafe.Add(ref srcRefChar, srcIdx + 1);
                    if (IsLowSurrogate(low))
                    {
                        // 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx → U+10000–U+10FFFF
                        var codepoint = 0x10000 + ((c - HighSurrogateTag) << 10) + (low - LowSurrogateTag);
                        Unsafe.Add(ref dstRef, dstIdx)     = (byte)(0xF0 | (codepoint >> 18));
                        Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((codepoint >> 12) & 0x3F));
                        Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | ((codepoint >> 6) & 0x3F));
                        Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(0x80 | (codepoint & 0x3F));
                        dstIdx += 4;
                        srcIdx += 2;  // consumed both halves of the pair
                        continue;
                    }
                }

                // Unpaired surrogate → U+FFFD (EF BF BD); consume only this unit so the next
                // char is still encoded on its own.
                Unsafe.Add(ref dstRef, dstIdx)     = 0xEF;
                Unsafe.Add(ref dstRef, dstIdx + 1) = 0xBF;
                Unsafe.Add(ref dstRef, dstIdx + 2) = 0xBD;
                dstIdx += 3;
                srcIdx += 1;
            }
        }

        return dstIdx;
    }

    /// <summary>
    /// Counts the UTF-8 byte length produced by encoding the given UTF-16 char span.
    /// Symmetric encode-side helper to <see cref="CountUtf8Chars"/>; the value returned equals
    /// the byte count that <see cref="EncodeUtf8SinglePass"/> writes for the same input,
    /// including inputs that contain unpaired surrogates.
    /// </summary>
    /// <remarks>
    /// Bypasses <see cref="System.Text.Encoding.UTF8"/>.GetByteCount virtual-dispatch +
    /// encoder-fallback overhead.
    ///
    /// <para>Layered SIMD: Vector512 (32 chars/iter) on AVX-512BW hosts → Vector256 (16 chars/iter)
    /// → Vector128 (8 chars/iter) → scalar tail.</para>
    ///
    /// <para>Per code unit <c>c</c>:</para>
    /// <list type="bullet">
    ///   <item><c>c &lt; 0x80</c> → 1 byte</item>
    ///   <item><c>0x80 ≤ c &lt; 0x800</c> → 2 bytes</item>
    ///   <item>everything else, surrogates included → 3 bytes (an unpaired surrogate is encoded as
    ///   U+FFFD, which is 3 bytes)</item>
    ///   <item>a high surrogate immediately followed by a low surrogate → 4 bytes for the two
    ///   units, i.e. 2 less than the 3 + 3 counted above</item>
    /// </list>
    ///
    /// <para>SIMD per-block closed form: <c>bytes = 3*N - ascii - lt800 - 2*pairs</c> where
    /// <c>ascii = popcount(c &lt; 0x80)</c>, <c>lt800 = popcount(c &lt; 0x800)</c> (ASCII chars
    /// are in both sets, hence 3 - 1 - 1 = 1) and <c>pairs</c> is the number of lanes holding a high
    /// surrogate whose successor is a low surrogate. A pair is attributed to the block containing
    /// its HIGH surrogate; when that is the block's last lane the successor is peeked from the
    /// next position (if any). This keeps the total correct wherever a block boundary falls: the
    /// low surrogate is later counted as 3 by whichever path owns it, and the 2 was already
    /// subtracted.</para>
    ///
    /// <para>The result is accumulated in an <see cref="int"/> without overflow detection.</para>
    /// </remarks>
    /// <param name="src">UTF-16 input; may be empty and may contain unpaired surrogates.</param>
    /// <returns>The UTF-8 byte length of <paramref name="src"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static int GetUtf8ByteCount(ReadOnlySpan<char> src)
    {
        var byteCount = 0;
        var i = 0;
        var n = src.Length;
        ref ushort srcRef = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(src));

        // SIMD path 1: Vector512 (32 chars/iter) on AVX-512BW hosts.
        if (Avx512BW.IsSupported && n >= Vector512<ushort>.Count)
        {
            var below80 = Vector512.Create((ushort)0x80);
            var below800 = Vector512.Create((ushort)0x800);
            var tagMask = Vector512.Create((ushort)SurrogateTagMask);
            var highTag = Vector512.Create((ushort)HighSurrogateTag);
            var lowTag = Vector512.Create((ushort)LowSurrogateTag);

            do
            {
                var v = Vector512.LoadUnsafe(ref srcRef, (uint)i);

                var ascii = BitOperations.PopCount(Vector512.LessThan(v, below80).ExtractMostSignificantBits());
                var lt800 = BitOperations.PopCount(Vector512.LessThan(v, below800).ExtractMostSignificantBits());

                var tags = v & tagMask;
                var highBits = Vector512.Equals(tags, highTag).ExtractMostSignificantBits();
                var lowBits = Vector512.Equals(tags, lowTag).ExtractMostSignificantBits();
                // Bit k of (lowBits >> 1) says "lane k+1 is a low surrogate".
                var pairs = BitOperations.PopCount(highBits & (lowBits >> 1));

                i += Vector512<ushort>.Count;

                // High surrogate in the last lane: its partner, if any, is the next unit.
                if ((highBits >> (Vector512<ushort>.Count - 1)) != 0 && i < n && IsLowSurrogate(Unsafe.Add(ref srcRef, i)))
                {
                    pairs++;
                }

                byteCount += 3 * Vector512<ushort>.Count - ascii - lt800 - 2 * pairs;
            } while (n - i >= Vector512<ushort>.Count);
        }

        // SIMD path 2: Vector256 (16 chars/iter); also handles a 16..31 char AVX-512 remainder.
        if (Vector256.IsHardwareAccelerated && n - i >= Vector256<ushort>.Count)
        {
            var below80 = Vector256.Create((ushort)0x80);
            var below800 = Vector256.Create((ushort)0x800);
            var tagMask = Vector256.Create((ushort)SurrogateTagMask);
            var highTag = Vector256.Create((ushort)HighSurrogateTag);
            var lowTag = Vector256.Create((ushort)LowSurrogateTag);

            do
            {
                var v = Vector256.LoadUnsafe(ref srcRef, (uint)i);

                var ascii = BitOperations.PopCount(Vector256.LessThan(v, below80).ExtractMostSignificantBits());
                var lt800 = BitOperations.PopCount(Vector256.LessThan(v, below800).ExtractMostSignificantBits());

                var tags = v & tagMask;
                var highBits = Vector256.Equals(tags, highTag).ExtractMostSignificantBits();
                var lowBits = Vector256.Equals(tags, lowTag).ExtractMostSignificantBits();
                var pairs = BitOperations.PopCount(highBits & (lowBits >> 1));

                i += Vector256<ushort>.Count;

                if ((highBits >> (Vector256<ushort>.Count - 1)) != 0 && i < n && IsLowSurrogate(Unsafe.Add(ref srcRef, i)))
                {
                    pairs++;
                }

                byteCount += 3 * Vector256<ushort>.Count - ascii - lt800 - 2 * pairs;
            } while (n - i >= Vector256<ushort>.Count);
        }

        // SIMD path 3: Vector128 (8 chars/iter); also handles an 8..15 char remainder.
        if (Vector128.IsHardwareAccelerated && n - i >= Vector128<ushort>.Count)
        {
            var below80 = Vector128.Create((ushort)0x80);
            var below800 = Vector128.Create((ushort)0x800);
            var tagMask = Vector128.Create((ushort)SurrogateTagMask);
            var highTag = Vector128.Create((ushort)HighSurrogateTag);
            var lowTag = Vector128.Create((ushort)LowSurrogateTag);

            do
            {
                var v = Vector128.LoadUnsafe(ref srcRef, (uint)i);

                var ascii = BitOperations.PopCount(Vector128.LessThan(v, below80).ExtractMostSignificantBits());
                var lt800 = BitOperations.PopCount(Vector128.LessThan(v, below800).ExtractMostSignificantBits());

                var tags = v & tagMask;
                var highBits = Vector128.Equals(tags, highTag).ExtractMostSignificantBits();
                var lowBits = Vector128.Equals(tags, lowTag).ExtractMostSignificantBits();
                var pairs = BitOperations.PopCount(highBits & (lowBits >> 1));

                i += Vector128<ushort>.Count;

                if ((highBits >> (Vector128<ushort>.Count - 1)) != 0 && i < n && IsLowSurrogate(Unsafe.Add(ref srcRef, i)))
                {
                    pairs++;
                }

                byteCount += 3 * Vector128<ushort>.Count - ascii - lt800 - 2 * pairs;
            } while (n - i >= Vector128<ushort>.Count);
        }

        // Scalar tail (and fallback for non-SIMD hardware). Uses the same accounting as the SIMD
        // blocks: a low surrogate that starts the tail is counted as 3 here because the block that
        // held its high surrogate has already subtracted the pair's 2.
        while (i < n)
        {
            var c = Unsafe.Add(ref srcRef, i);
            if (c < 0x80)
            {
                byteCount += 1;
            }
            else if (c < 0x800)
            {
                byteCount += 2;
            }
            else if ((c & SurrogateTagMask) == HighSurrogateTag && i + 1 < n && IsLowSurrogate(Unsafe.Add(ref srcRef, i + 1)))
            {
                byteCount += 4;  // well-formed pair → one 4-byte sequence
                i += 2;
                continue;
            }
            else
            {
                byteCount += 3;  // BMP char >= 0x800, or an unpaired surrogate (→ U+FFFD)
            }
            i += 1;
        }

        return byteCount;
    }

    /// <summary>
    /// Counts UTF-16 chars produced by decoding the given UTF-8 byte span.
    /// </summary>
    /// <remarks>
    /// Layered SIMD: Vector512 (64 byte/iter) on AVX-512BW hosts → Vector256 (32 byte/iter)
    /// → Vector128 (16 byte/iter) → scalar tail. Every path applies the same two bit-pattern checks
    /// to each byte independently:
    /// • Non-continuation bytes (NOT 10xxxxxx, i.e. <c>(b &amp; 0xC0) != 0x80</c>): 1 char each.
    /// • 4-byte start bytes (11110xxx, i.e. <c>(b &amp; 0xF8) == 0xF0</c>): 1 EXTRA char each
    ///   (the code point decodes to a UTF-16 surrogate pair).
    ///
    /// SIMD per-block result: <c>(N - popcount(continuationMask)) + popcount(fourByteStartMask)</c>.
    ///
    /// For well-formed UTF-8 this is the exact decoded length. The input is not validated; for
    /// malformed input the value is an upper bound on what <see cref="DecodeUtf8SinglePass"/> writes.
    /// </remarks>
    /// <param name="bytes">UTF-8 input; may be empty.</param>
    /// <returns>The number of UTF-16 chars needed to hold the decoded text.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static int CountUtf8Chars(ReadOnlySpan<byte> bytes)
    {
        var count = 0;
        var i = 0;
        ref var bytesRef = ref MemoryMarshal.GetReference(bytes);

        // SIMD path 1: 64 bytes/iter via Vector512 (AVX-512BW hosts).
        if (Avx512BW.IsSupported && bytes.Length >= 64)
        {
            var contMask512 = Vector512.Create((byte)0xC0);
            var contValue512 = Vector512.Create((byte)0x80);
            var fourByteMask512 = Vector512.Create((byte)0xF8);
            var fourByteValue512 = Vector512.Create((byte)0xF0);

            do
            {
                var v = Vector512.LoadUnsafe(ref bytesRef, (uint)i);

                // One mask bit per lane; popcount gives the number of matching bytes.
                var contBits = Vector512.Equals(v & contMask512, contValue512).ExtractMostSignificantBits();
                var fourByteBits = Vector512.Equals(v & fourByteMask512, fourByteValue512).ExtractMostSignificantBits();

                count += 64 - BitOperations.PopCount(contBits);   // non-continuation bytes
                count += BitOperations.PopCount(fourByteBits);    // extra char per 4-byte lead

                i += 64;
            } while (bytes.Length - i >= 64);
        }

        // SIMD path 2: 32 bytes/iter via Vector256 (also handles a 32..63 byte AVX-512 remainder).
        if (Vector256.IsHardwareAccelerated && bytes.Length - i >= 32)
        {
            var contMask = Vector256.Create((byte)0xC0);
            var contValue = Vector256.Create((byte)0x80);
            var fourByteMask = Vector256.Create((byte)0xF8);
            var fourByteValue = Vector256.Create((byte)0xF0);

            do
            {
                var v = Vector256.LoadUnsafe(ref bytesRef, (uint)i);

                var contBits = Vector256.Equals(v & contMask, contValue).ExtractMostSignificantBits();
                var fourByteBits = Vector256.Equals(v & fourByteMask, fourByteValue).ExtractMostSignificantBits();

                count += 32 - BitOperations.PopCount(contBits);
                count += BitOperations.PopCount(fourByteBits);

                i += 32;
            } while (bytes.Length - i >= 32);
        }

        // SIMD path 3: 16 bytes/iter via Vector128 (also handles a 16..31 byte remainder).
        if (Vector128.IsHardwareAccelerated && bytes.Length - i >= 16)
        {
            var contMask128 = Vector128.Create((byte)0xC0);
            var contValue128 = Vector128.Create((byte)0x80);
            var fourByteMask128 = Vector128.Create((byte)0xF8);
            var fourByteValue128 = Vector128.Create((byte)0xF0);

            do
            {
                var v = Vector128.LoadUnsafe(ref bytesRef, (uint)i);

                var contBits = Vector128.Equals(v & contMask128, contValue128).ExtractMostSignificantBits();
                var fourByteBits = Vector128.Equals(v & fourByteMask128, fourByteValue128).ExtractMostSignificantBits();

                count += 16 - BitOperations.PopCount(contBits);
                count += BitOperations.PopCount(fourByteBits);

                i += 16;
            } while (bytes.Length - i >= 16);
        }

        // Scalar tail (and fallback for non-SIMD hardware).
        for (; i < bytes.Length; i++)
        {
            var b = Unsafe.Add(ref bytesRef, i);
            if ((b & 0xC0) != 0x80) count++;       // non-continuation byte
            if ((b & 0xF8) == 0xF0) count++;       // 4-byte start: extra char for surrogate pair
        }
        return count;
    }

    /// <summary>
    /// Single-pass UTF-8 → UTF-16 decoder. Returns the actual char count written to <paramref name="dst"/>.
    /// </summary>
    /// <remarks>
    /// Layered approach for maximum throughput across mixed content:
    /// • <b>Phase 1a — Vector256 ASCII prefix bulk widen:</b> 32 bytes/iter while all top bits are zero.
    /// • <b>Phase 1b — Vector128 ASCII prefix bulk widen:</b> 16 bytes/iter; the SIMD tier for hosts
    ///   without 256-bit acceleration and the 16..31 byte remainder after Phase 1a.
    /// • <b>Phase 2 — DWORD ASCII batch:</b> when ≥4 bytes remain, read as <c>uint</c>, test
    ///   <c>(dword &amp; 0x80808080u) == 0</c>; on hit, widen 4 bytes to 4 chars and continue.
    /// • <b>Phase 3 — Scalar per-char switch:</b> 1-byte (ASCII), 2-byte, 3-byte, 4-byte
    ///   (supplementary plane → surrogate pair). Direct bit-extract.
    ///
    /// A SIMD phase stops at the first block containing a non-ASCII byte; the rest of the input is
    /// handled by Phase 2/3.
    ///
    /// <para><b>Malformed input.</b> Continuation-byte values, overlong forms and code-point range
    /// are NOT validated — such input decodes to unspecified chars. What is guaranteed:</para>
    /// <list type="bullet">
    ///   <item>No read past the end of <paramref name="src"/>: a multi-byte lead whose sequence is
    ///   cut off by the end of input yields U+FFFD and only the lead byte is consumed.</item>
    ///   <item>No more chars are written than <see cref="CountUtf8Chars"/> reports for the same
    ///   input: a stray continuation byte in lead position is skipped (0 chars), and a byte that
    ///   can never start a sequence (0xF8–0xFF) yields a single U+FFFD.</item>
    /// </list>
    /// <para>Writes to <paramref name="dst"/> are NOT bounds-checked; the caller must provide at
    /// least <see cref="CountUtf8Chars"/>(<paramref name="src"/>) chars.</para>
    ///
    /// <para>History: a run-length scalar decoder ("V4N2 Phase 2.5") was attempted 2026-05-07 and
    /// reverted to this per-char switch because no stable benchmark gain could be measured.</para>
    /// </remarks>
    /// <param name="src">UTF-8 input; may be empty.</param>
    /// <param name="dst">Destination buffer, sized by the caller as described in the remarks.</param>
    /// <returns>The number of chars written to <paramref name="dst"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static int DecodeUtf8SinglePass(ReadOnlySpan<byte> src, Span<char> dst)
    {
        int srcIdx = 0, dstIdx = 0;
        ref byte srcRef = ref MemoryMarshal.GetReference(src);
        ref ushort dstRef = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(dst));

        // Phase 1a — Vector256 ASCII prefix bulk widen (32 bytes/iter).
        if (Vector256.IsHardwareAccelerated)
        {
            while (src.Length - srcIdx >= Vector256<byte>.Count)
            {
                var v = Vector256.LoadUnsafe(ref srcRef, (uint)srcIdx);
                // A set top bit in any lane means a non-ASCII byte → leave the fast path.
                if (v.ExtractMostSignificantBits() != 0) break;

                // 32 bytes → 2 × Vector256<ushort>; the upper half lands 16 chars further on.
                var (lower, upper) = Vector256.Widen(v);
                lower.StoreUnsafe(ref dstRef, (uint)dstIdx);
                upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector256<ushort>.Count));
                srcIdx += Vector256<byte>.Count;
                dstIdx += Vector256<byte>.Count;  // 32 bytes → 32 chars
            }
        }

        // Phase 1b — Vector128 ASCII prefix bulk widen (16 bytes/iter).
        if (Vector128.IsHardwareAccelerated)
        {
            while (src.Length - srcIdx >= Vector128<byte>.Count)
            {
                var v = Vector128.LoadUnsafe(ref srcRef, (uint)srcIdx);
                if (v.ExtractMostSignificantBits() != 0) break;

                // 16 bytes → 2 × Vector128<ushort>; the upper half lands 8 chars further on.
                var (lower, upper) = Vector128.Widen(v);
                lower.StoreUnsafe(ref dstRef, (uint)dstIdx);
                upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector128<ushort>.Count));
                srcIdx += Vector128<byte>.Count;
                dstIdx += Vector128<byte>.Count;  // 16 bytes → 16 chars
            }
        }

        // Phase 2/3 — DWORD ASCII batch + per-char scalar decode.
        while (srcIdx < src.Length)
        {
            var remaining = src.Length - srcIdx;

            // Phase 2 — 4 ASCII bytes → 4 chars per iteration.
            if (remaining >= 4)
            {
                var probe = Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref srcRef, srcIdx));
                if ((probe & 0x80808080u) == 0)
                {
                    // The mask test is byte-order independent; the bytes themselves are copied
                    // from memory (not extracted from the uint) so the char order is correct on
                    // big-endian hosts as well.
                    Unsafe.Add(ref dstRef, dstIdx)     = Unsafe.Add(ref srcRef, srcIdx);
                    Unsafe.Add(ref dstRef, dstIdx + 1) = Unsafe.Add(ref srcRef, srcIdx + 1);
                    Unsafe.Add(ref dstRef, dstIdx + 2) = Unsafe.Add(ref srcRef, srcIdx + 2);
                    Unsafe.Add(ref dstRef, dstIdx + 3) = Unsafe.Add(ref srcRef, srcIdx + 3);
                    srcIdx += 4;
                    dstIdx += 4;
                    continue;
                }
            }

            // Phase 3 — dispatch on the lead byte. The `when` guards require the whole sequence to
            // be inside src. They get strictly stricter from case to case (2, 3, 4 bytes), so a lead
            // whose own guard fails cannot match a later case and always ends up in `default`.
            var b0 = Unsafe.Add(ref srcRef, srcIdx);
            switch (b0)
            {
                case < 0x80:
                    // 1-byte ASCII that the DWORD batch could not cover (fewer than 4 bytes left,
                    // or a non-ASCII byte among the next 4).
                    Unsafe.Add(ref dstRef, dstIdx++) = b0;
                    srcIdx += 1;
                    break;
                case < 0xC0:
                    // Stray continuation byte (10xxxxxx) in lead position. CountUtf8Chars assigns
                    // it 0 chars, so it is skipped rather than decoded.
                    srcIdx += 1;
                    break;
                case < 0xE0 when remaining >= 2:
                {
                    // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
                    var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
                    Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x1F) << 6) | (b1 & 0x3F));
                    srcIdx += 2;
                    break;
                }
                case < 0xF0 when remaining >= 3:
                {
                    // 3-byte: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF
                    var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
                    var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
                    Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
                    srcIdx += 3;
                    break;
                }
                case < 0xF8 when remaining >= 4:
                {
                    // 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx → U+10000–U+10FFFF,
                    // written as a UTF-16 surrogate pair.
                    var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
                    var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
                    var b3 = Unsafe.Add(ref srcRef, srcIdx + 3);
                    var codepoint = ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
                    codepoint -= 0x10000;
                    Unsafe.Add(ref dstRef, dstIdx)     = (ushort)(HighSurrogateTag | (codepoint >> 10));
                    Unsafe.Add(ref dstRef, dstIdx + 1) = (ushort)(LowSurrogateTag | (codepoint & 0x3FF));
                    dstIdx += 2;
                    srcIdx += 4;
                    break;
                }
                default:
                    // Either a multi-byte lead truncated by the end of input, or a byte that can
                    // never start a sequence (0xF8–0xFF). Emit U+FFFD and consume only this byte;
                    // CountUtf8Chars counts at least 1 char for any such byte.
                    Unsafe.Add(ref dstRef, dstIdx++) = ReplacementChar;
                    srcIdx += 1;
                    break;
            }
        }

        return dstIdx;
    }
}