diff --git a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs
index 2fc1f8d..e71dc3b 100644
--- a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs
+++ b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs
@@ -484,242 +484,14 @@ public static partial class AcBinaryDeserializer
             var pos = _position;
             _position += byteLength;
             var src = _buffer.AsSpan(pos, byteLength);
-            var charCount = CountUtf8Chars(src);
+            var charCount = Utf8Transcoder.CountUtf8Chars(src);
 
             return string.Create(charCount, (Buffer: _buffer, Pos: pos, Len: byteLength), static (chars, state) =>
             {
-                DecodeUtf8SinglePass(state.Buffer.AsSpan(state.Pos, state.Len), chars);
+                Utf8Transcoder.DecodeUtf8SinglePass(state.Buffer.AsSpan(state.Pos, state.Len), chars);
             });
         }
 
-        /// <summary>
-        /// Counts UTF-16 chars produced by decoding the given UTF-8 byte span.
-        /// </summary>
-        /// <remarks>
-        /// Layered SIMD: Vector512 (64 byte/iter) on AVX-512BW hosts → Vector256 (32 byte/iter)
-        /// on AVX2 hosts → scalar tail. Both SIMD paths use the same two bit-pattern checks:
-        /// • Non-continuation bytes (NOT 10xxxxxx, mask 0xC0 ≠ 0x80): each contributes 1 char.
-        /// • 4-byte start bytes (11110xxx, mask 0xF8 == 0xF0): each contributes an EXTRA char (surrogate pair).
-        ///
-        /// SIMD per-block result: <c>(N - popcount(continuationMask)) + popcount(fourByteStartMask)</c>
-        /// where N = 64 (Vector512) or 32 (Vector256). Scalar tail handles the remaining bytes.
-        ///
-        /// Char-count rules:
-        /// • Continuation bytes (10xxxxxx, 0x80–0xBF) — produce no char, skip.
-        /// • All other start bytes (0xxxxxxx, 110xxxxx, 1110xxxx) — produce 1 char each.
-        /// • 4-byte start bytes (11110xxx, 0xF0–0xF7) — produce 2 chars (UTF-16 surrogate pair).
-        ///
-        /// JIT-time path-selection: <c>Avx512BW.IsSupported</c> and <c>Vector256.IsHardwareAccelerated</c>
-        /// are <c>[Intrinsic]</c> static booleans — the JIT/AOT constant-folds the dead branches per host.
-        /// </remarks>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static int CountUtf8Chars(ReadOnlySpan<byte> bytes)
-        {
-            var count = 0;
-            var i = 0;
-            ref var bytesRef = ref MemoryMarshal.GetReference(bytes);
-
-            // SIMD path 1: 64 bytes/iter via Vector512 (AVX-512BW hosts)
-            if (Avx512BW.IsSupported && bytes.Length >= 64)
-            {
-                var contMask512 = Vector512.Create((byte)0xC0);
-                var contValue512 = Vector512.Create((byte)0x80);
-                var fourByteMask512 = Vector512.Create((byte)0xF8);
-                var fourByteValue512 = Vector512.Create((byte)0xF0);
-
-                do
-                {
-                    var v = Vector512.LoadUnsafe(ref bytesRef, (uint)i);
-
-                    // Non-continuation count: 64 - popcount(continuation byte mask)
-                    var contMatches = Vector512.Equals(v & contMask512, contValue512);
-                    var contBits = contMatches.ExtractMostSignificantBits();  // ulong
-                    count += 64 - System.Numerics.BitOperations.PopCount(contBits);
-
-                    // 4-byte start count: popcount(fourByte start byte mask)
-                    var fourByteMatches = Vector512.Equals(v & fourByteMask512, fourByteValue512);
-                    var fourByteBits = fourByteMatches.ExtractMostSignificantBits();
-                    count += System.Numerics.BitOperations.PopCount(fourByteBits);
-
-                    i += 64;
-                } while (bytes.Length - i >= 64);
-            }
-
-            // SIMD path 2: 32 bytes/iter via Vector256 (AVX2 hosts, also handles AVX-512 tail < 64)
-            if (Vector256.IsHardwareAccelerated && bytes.Length - i >= 32)
-            {
-                var contMask = Vector256.Create((byte)0xC0);
-                var contValue = Vector256.Create((byte)0x80);
-                var fourByteMask = Vector256.Create((byte)0xF8);
-                var fourByteValue = Vector256.Create((byte)0xF0);
-
-                do
-                {
-                    var v = Vector256.LoadUnsafe(ref bytesRef, (uint)i);
-
-                    // Non-continuation count: 32 - popcount(continuation byte mask)
-                    var contMatches = Vector256.Equals(v & contMask, contValue);
-                    var contBits = contMatches.ExtractMostSignificantBits();
-                    count += 32 - System.Numerics.BitOperations.PopCount(contBits);
-
-                    // 4-byte start count: popcount(fourByte start byte mask)
-                    var fourByteMatches = Vector256.Equals(v & fourByteMask, fourByteValue);
-                    var fourByteBits = fourByteMatches.ExtractMostSignificantBits();
-                    count += System.Numerics.BitOperations.PopCount(fourByteBits);
-
-                    i += 32;
-                } while (bytes.Length - i >= 32);
-            }
-
-            // SIMD path 3: 16 bytes/iter via Vector128 (Apple Silicon NEON, WASM SIMD, legacy SSE2;
-            // also handles tail < 32 from higher tiers). Cross-platform — Vector128.IsHardwareAccelerated
-            // returns true on any host with a 128-bit SIMD ISA (NEON / SSE2 / WASM SIMD).
-            if (Vector128.IsHardwareAccelerated && bytes.Length - i >= 16)
-            {
-                var contMask128 = Vector128.Create((byte)0xC0);
-                var contValue128 = Vector128.Create((byte)0x80);
-                var fourByteMask128 = Vector128.Create((byte)0xF8);
-                var fourByteValue128 = Vector128.Create((byte)0xF0);
-
-                do
-                {
-                    var v = Vector128.LoadUnsafe(ref bytesRef, (uint)i);
-
-                    // Non-continuation count: 16 - popcount(continuation byte mask)
-                    var contMatches = Vector128.Equals(v & contMask128, contValue128);
-                    var contBits = contMatches.ExtractMostSignificantBits();
-                    count += 16 - System.Numerics.BitOperations.PopCount(contBits);
-
-                    // 4-byte start count: popcount(fourByte start byte mask)
-                    var fourByteMatches = Vector128.Equals(v & fourByteMask128, fourByteValue128);
-                    var fourByteBits = fourByteMatches.ExtractMostSignificantBits();
-                    count += System.Numerics.BitOperations.PopCount(fourByteBits);
-
-                    i += 16;
-                } while (bytes.Length - i >= 16);
-            }
-
-            // Scalar tail (and fallback for non-SIMD hardware)
-            for (; i < bytes.Length; i++)
-            {
-                var b = Unsafe.Add(ref bytesRef, i);
-                if ((b & 0xC0) != 0x80) count++;       // non-continuation byte
-                if ((b & 0xF8) == 0xF0) count++;        // 4-byte start: extra char for surrogate pair
-            }
-            return count;
-        }
-
-        /// <summary>
-        /// Single-pass UTF-8 → UTF-16 decoder. Returns the actual char count written to <paramref name="dst"/>.
-        /// </summary>
-        /// <remarks>
-        /// Layered approach for maximum throughput across mixed content:
-        /// • <b>Phase 1 — Vector256 ASCII prefix bulk widen:</b> 32 bytes/iter while all top bits are zero.
-        ///   Uses <see cref="Vector256.Widen(Vector256{byte})"/> to produce two Vector256&lt;ushort&gt; lanes
-        ///   = 32 chars per iteration. Breaks on first non-ASCII byte found in the loaded vector.
-        /// • <b>Phase 2 — DWORD ASCII batch:</b> when ≥4 bytes remain, read as <c>uint</c>, test
-        ///   <c>(dword &amp; 0x80808080u) == 0</c>; on hit, widen 4 chars in 4 instructions and continue.
-        /// • <b>Phase 3 — Scalar multi-byte branch:</b> 1-byte (ASCII single), 2-byte (Latin extended,
-        ///   Cyrillic, Greek, Hebrew, Arabic), 3-byte (CJK BMP), 4-byte (supplementary plane → surrogate pair).
-        ///   Direct bit-extract, no validation — input is trusted.
-        ///
-        /// JIT compiles the switch into a jump table for predictable dispatch on mixed content.
-        /// Hungarian text typical pattern: ASCII run (Phase 1/2 widening) → 2-byte char (Phase 3
-        /// case &lt; 0xE0) → ASCII run → 2-byte char → ... — each phase optimal for its segment.
-        /// </remarks>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static int DecodeUtf8SinglePass(ReadOnlySpan<byte> src, Span<char> dst)
-        {
-            int srcIdx = 0, dstIdx = 0;
-            ref byte srcRef = ref MemoryMarshal.GetReference(src);
-            ref ushort dstRef = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(dst));
-
-            // Phase 1 — Vector256 ASCII prefix bulk widen (32 bytes/iter)
-            if (Vector256.IsHardwareAccelerated)
-            {
-                while (src.Length - srcIdx >= Vector256<byte>.Count)
-                {
-                    var v = Vector256.LoadUnsafe(ref srcRef, (uint)srcIdx);
-                    // ASCII detect: any high bit set among the 32 bytes?
-                    if (v.ExtractMostSignificantBits() != 0) break;
-
-                    // Widen 32 bytes → 2 × Vector256<ushort> (32 chars total)
-                    var (lower, upper) = Vector256.Widen(v);
-                    lower.StoreUnsafe(ref dstRef, (uint)dstIdx);
-                    upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector128<ushort>.Count));
-                    srcIdx += Vector256<byte>.Count;
-                    dstIdx += Vector256<byte>.Count;  // 32 bytes → 32 chars
-                }
-            }
-
-            // Phase 2/3 — scalar loop with DWORD ASCII batch
-            while (srcIdx < src.Length)
-            {
-                // DWORD ASCII batch: 4 ASCII bytes → 4 chars per iter
-                if (src.Length - srcIdx >= 4)
-                {
-                    var dword = Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref srcRef, srcIdx));
-                    if ((dword & 0x80808080u) == 0)
-                    {
-                        Unsafe.Add(ref dstRef, dstIdx)     = (byte)dword;
-                        Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(dword >> 8);
-                        Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(dword >> 16);
-                        Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(dword >> 24);
-                        srcIdx += 4;
-                        dstIdx += 4;
-                        continue;
-                    }
-                }
-
-                // Scalar multi-byte branch (jump-table compile via switch)
-                var b0 = Unsafe.Add(ref srcRef, srcIdx);
-                switch (b0)
-                {
-                    case < 0x80:
-                        // 1-byte ASCII (U+0000–U+007F)
-                        Unsafe.Add(ref dstRef, dstIdx++) = b0;
-                        srcIdx += 1;
-                        break;
-                    case < 0xE0:
-                    {
-                        // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
-                        // Latin extended, Cyrillic, Greek, Hebrew, Arabic.
-                        var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
-                        Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x1F) << 6) | (b1 & 0x3F));
-                        srcIdx += 2;
-                        break;
-                    }
-                    case < 0xF0:
-                    {
-                        // 3-byte: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF
-                        // CJK BMP, various other scripts.
-                        var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
-                        var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
-                        Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
-                        srcIdx += 3;
-                        break;
-                    }
-                    default:
-                    {
-                        // 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx → U+10000–U+10FFFF
-                        // Supplementary plane (emoji, rare CJK ext) → UTF-16 surrogate pair.
-                        var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
-                        var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
-                        var b3 = Unsafe.Add(ref srcRef, srcIdx + 3);
-                        var codepoint = ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
-                        codepoint -= 0x10000;
-                        Unsafe.Add(ref dstRef, dstIdx)     = (ushort)(0xD800 | (codepoint >> 10));
-                        Unsafe.Add(ref dstRef, dstIdx + 1) = (ushort)(0xDC00 | (codepoint & 0x3FF));
-                        dstIdx += 2;
-                        srcIdx += 4;
-                        break;
-                    }
-                }
-            }
-
-            return dstIdx;
-        }
-
         private string ReadStringUtf8Cached(int length)
         {
             var slice = _buffer.AsSpan(_position, length);
diff --git a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs
index 45b256a..ce9af11 100644
--- a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs
+++ b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs
@@ -699,7 +699,7 @@ public static partial class AcBinarySerializer
 
             var savedPos = _position;
             var encodeStart = savedPos + reserveSize;
-            var bytesWritten = EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
+            var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
 
             var actualVarUIntSize = VarUIntSize((uint)bytesWritten);
             if (actualVarUIntSize < reserveSize)
@@ -768,7 +768,7 @@ public static partial class AcBinarySerializer
                 EnsureCapacity(2 + maxBytesShort);   // marker + 1-byte VarUInt + bytes (worst case)
 
                 var savedPosShort = _position;
-                var bytesWrittenShort = EncodeUtf8SinglePass(
+                var bytesWrittenShort = Utf8Transcoder.EncodeUtf8SinglePass(
                     value.AsSpan(),
                     _buffer.AsSpan(savedPosShort + 1, maxBytesShort));
                 var isAsciiShort = bytesWrittenShort == charLength;
@@ -805,7 +805,7 @@ public static partial class AcBinarySerializer
 
             var savedPos = _position;
             var encodeStart = savedPos + 1 + reserveVarUInt;
-            var bytesWritten = EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
+            var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
             var isAscii = bytesWritten == charLength;
 
             _buffer[savedPos] = isAscii ? BinaryTypeCode.StringAscii : BinaryTypeCode.String;
@@ -876,161 +876,6 @@ public static partial class AcBinarySerializer
             _position += byteCount;
         }
 
-        /// <summary>
-        /// Custom UTF-16 → UTF-8 single-pass encoder. Symmetric with the deserializer's custom decoder
-        /// (<see cref="AcBinaryDeserializer"/>'s <c>DecodeUtf8SinglePass</c>).
-        /// </summary>
-        /// <remarks>
-        /// Bypasses <see cref="System.Text.Encoding.UTF8"/>.GetBytes virtual-dispatch + encoder-fallback
-        /// overhead. Trusted-input encoder — no validation (writer side, the input is a .NET string
-        /// which always has valid UTF-16 surrogate pairs).
-        ///
-        /// Layered for max throughput on mixed content:
-        /// • <b>Phase 1 — Vector256 ASCII narrow:</b> 16 chars/iter. Loads <c>Vector256&lt;ushort&gt;</c>,
-        ///   tests <c>(v &amp; 0xFF80) == 0</c> for all-ASCII; on hit, narrows to <c>Vector128&lt;byte&gt;</c>
-        ///   via <c>Vector128.Narrow(GetLower, GetUpper)</c> = 16 bytes per iter.
-        /// • <b>Phase 2 — DWORD ASCII batch:</b> 4 chars/iter. OR-mask test
-        ///   <c>(c0 | c1 | c2 | c3) &amp; 0xFF80 == 0</c>; on hit, 4 byte writes per iter.
-        /// • <b>Phase 3 — Scalar multi-byte encode:</b> 1-byte (ASCII), 2-byte (Latin extended,
-        ///   Cyrillic, Greek), 3-byte (CJK BMP), 4-byte (supplementary plane via UTF-16 surrogate pair).
-        ///
-        /// Returns actual byte count written. Caller must ensure <paramref name="dst"/> has at least
-        /// <c>src.Length * 4</c> capacity (UTF-8 worst case).
-        /// </remarks>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static int EncodeUtf8SinglePass(ReadOnlySpan<char> src, Span<byte> dst)
-        {
-            int srcIdx = 0, dstIdx = 0;
-            ref char srcRefChar = ref MemoryMarshal.GetReference(src);
-            ref ushort srcRefU16 = ref Unsafe.As<char, ushort>(ref srcRefChar);
-            ref byte dstRef = ref MemoryMarshal.GetReference(dst);
-
-            // Phase 1a — Vector512 ASCII narrow (32 chars/iter on AVX-512BW hosts).
-            // JIT-time path-selection via Avx512BW.IsSupported [Intrinsic] static bool — non-AVX-512
-            // hosts get this branch eliminated by constant-folding (zero overhead in the generated asm).
-            if (Avx512BW.IsSupported)
-            {
-                var asciiMask512 = Vector512.Create((ushort)0xFF80);
-                while (src.Length - srcIdx >= Vector512<ushort>.Count)  // 32 chars per Vector512<ushort>
-                {
-                    var v = Vector512.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
-                    // ASCII detect: any char's high bits set (>= 0x80)?
-                    if ((v & asciiMask512) != Vector512<ushort>.Zero) break;
-                    // Narrow 32 ushorts (Vector512) → 32 bytes (Vector256) via two 256-bit halves.
-                    // The JIT lowers this to AVX-512 VPACKUSWB on capable hosts (single-instruction pack).
-                    var bytes = Vector256.Narrow(v.GetLower(), v.GetUpper());
-                    bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
-                    srcIdx += Vector512<ushort>.Count;
-                    dstIdx += Vector512<ushort>.Count;  // 32 chars → 32 bytes (1:1 for ASCII)
-                }
-            }
-
-            // Phase 1b — Vector256 ASCII narrow (16 chars/iter on AVX2 hosts; also handles tail < 32 chars
-            // after the AVX-512 path on capable hosts).
-            if (Vector256.IsHardwareAccelerated)
-            {
-                var asciiMask = Vector256.Create((ushort)0xFF80);
-                while (src.Length - srcIdx >= Vector256<ushort>.Count)  // 16 chars per Vector256<ushort>
-                {
-                    var v = Vector256.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
-                    // ASCII detect: any char's high bits set (>= 0x80)?
-                    if ((v & asciiMask) != Vector256<ushort>.Zero) break;
-                    // Narrow 16 ushorts (Vector256) → 16 bytes (Vector128) via two halves
-                    var bytes = Vector128.Narrow(v.GetLower(), v.GetUpper());
-                    bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
-                    srcIdx += Vector256<ushort>.Count;
-                    dstIdx += Vector256<ushort>.Count;  // 16 chars → 16 bytes (1:1 for ASCII)
-                }
-            }
-
-            // Phase 1c — Vector128 ASCII narrow (16 chars/iter on Apple Silicon NEON, WASM SIMD,
-            // legacy SSE2; also handles tail < 16 chars after higher tiers). Cross-platform —
-            // Vector128.IsHardwareAccelerated is true on any 128-bit-SIMD-capable host.
-            // Two Vector128<ushort> loads (8 + 8 = 16 chars) narrow to one Vector128<byte> (16 bytes).
-            if (Vector128.IsHardwareAccelerated)
-            {
-                var asciiMask128 = Vector128.Create((ushort)0xFF80);
-                while (src.Length - srcIdx >= 16)  // 16 chars = 2 × Vector128<ushort>
-                {
-                    var lo = Vector128.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
-                    var hi = Vector128.LoadUnsafe(ref srcRefU16, (uint)(srcIdx + 8));
-                    // ASCII detect: any char's high bits set in either half?
-                    if (((lo | hi) & asciiMask128) != Vector128<ushort>.Zero) break;
-                    // Narrow 2× Vector128<ushort> (16 chars) → Vector128<byte> (16 bytes)
-                    var bytes = Vector128.Narrow(lo, hi);
-                    bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
-                    srcIdx += 16;
-                    dstIdx += 16;
-                }
-            }
-
-            // Phase 2/3 — scalar with DWORD ASCII batch
-            while (srcIdx < src.Length)
-            {
-                // DWORD ASCII batch: 4 chars → 4 bytes when all ASCII
-                if (src.Length - srcIdx >= 4)
-                {
-                    var c0 = Unsafe.Add(ref srcRefChar, srcIdx);
-                    var c1 = Unsafe.Add(ref srcRefChar, srcIdx + 1);
-                    var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 2);
-                    var c3 = Unsafe.Add(ref srcRefChar, srcIdx + 3);
-                    if (((c0 | c1 | c2 | c3) & 0xFF80) == 0)
-                    {
-                        Unsafe.Add(ref dstRef, dstIdx)     = (byte)c0;
-                        Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)c1;
-                        Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)c2;
-                        Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)c3;
-                        srcIdx += 4;
-                        dstIdx += 4;
-                        continue;
-                    }
-                }
-
-                // Scalar single-char encode
-                var c = Unsafe.Add(ref srcRefChar, srcIdx);
-                if (c < 0x80)
-                {
-                    // 1-byte ASCII (U+0000–U+007F)
-                    Unsafe.Add(ref dstRef, dstIdx++) = (byte)c;
-                    srcIdx += 1;
-                }
-                else if (c < 0x800)
-                {
-                    // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
-                    // Latin extended (Hungarian, Polish, Czech, Spanish, French, German diacritics),
-                    // Greek, Cyrillic, Hebrew, Arabic.
-                    Unsafe.Add(ref dstRef, dstIdx)     = (byte)(0xC0 | (c >> 6));
-                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | (c & 0x3F));
-                    dstIdx += 2;
-                    srcIdx += 1;
-                }
-                else if ((c & 0xF800) != 0xD800)
-                {
-                    // 3-byte BMP: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF (excluding surrogate range)
-                    // CJK BMP, various other BMP scripts.
-                    Unsafe.Add(ref dstRef, dstIdx)     = (byte)(0xE0 | (c >> 12));
-                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((c >> 6) & 0x3F));
-                    Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | (c & 0x3F));
-                    dstIdx += 3;
-                    srcIdx += 1;
-                }
-                else
-                {
-                    // 4-byte: surrogate pair → supplementary plane codepoint (U+10000–U+10FFFF)
-                    // High surrogate (0xD800–0xDBFF) followed by low surrogate (0xDC00–0xDFFF).
-                    var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 1);
-                    var codepoint = 0x10000 + ((c - 0xD800) << 10) + (c2 - 0xDC00);
-                    Unsafe.Add(ref dstRef, dstIdx)     = (byte)(0xF0 | (codepoint >> 18));
-                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((codepoint >> 12) & 0x3F));
-                    Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | ((codepoint >> 6) & 0x3F));
-                    Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(0x80 | (codepoint & 0x3F));
-                    dstIdx += 4;
-                    srcIdx += 2;  // consumed 2 chars (surrogate pair)
-                }
-            }
-
-            return dstIdx;
-        }
 
         #endregion
 
diff --git a/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs b/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs
new file mode 100644
index 0000000..a69adab
--- /dev/null
+++ b/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs
@@ -0,0 +1,416 @@
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace AyCode.Core.Serializers.Binaries;
+
+/// <summary>
+/// In-house UTF-8 ↔ UTF-16 transcoder used by the binary serializer hot path. Trust-input
+/// semantics — no validation (writer-side input is a .NET string, reader-side input is valid
+/// UTF-8 by writer contract). Bypasses <see cref="System.Text.Encoding.UTF8"/> virtual-dispatch
+/// + EncoderFallback / DecoderFallback overhead.
+///
+/// <para><b>SIMD path hierarchy</b> (cascading tail-handler):</para>
+/// <list type="bullet">
+///   <item>Vector512 / AVX-512BW — 64 byte/iter (Intel server, Intel 11th gen client, AMD Zen 4+)</item>
+///   <item>Vector256 / AVX2 — 32 byte/iter (Intel 12-14th gen client, AMD Zen 3 and earlier)</item>
+///   <item>Vector128 / SSE-NEON-WASM — 16 byte/iter (Apple Silicon NEON, WASM SIMD, legacy SSE2)</item>
+///   <item>Scalar — final tail (&lt; 16 byte) and no-SIMD fall-back</item>
+/// </list>
+///
+/// <para>JIT/AOT path-selection: <c>Avx512BW.IsSupported</c> / <c>Vector256.IsHardwareAccelerated</c> /
+/// <c>Vector128.IsHardwareAccelerated</c> are <c>[Intrinsic]</c> static booleans — the compiler
+/// constant-folds the dead branches per host. Non-supported tiers eliminate to zero generated code.</para>
+///
+/// <para>Algorithm reference: see <c>BINARY_TODO.md#accore-bin-t-v4n2</c> for the multi-tier SIMD
+/// transcoder design and per-tier acceptance criteria.</para>
+/// </summary>
+internal static class Utf8Transcoder
+{
+    /// <summary>
+    /// Custom UTF-16 → UTF-8 single-pass encoder. Symmetric with <see cref="DecodeUtf8SinglePass"/>.
+    /// </summary>
+    /// <remarks>
+    /// Bypasses <see cref="System.Text.Encoding.UTF8"/>.GetBytes virtual-dispatch + encoder-fallback
+    /// overhead. Trusted-input encoder — no validation (writer side, the input is a .NET string that is
+    /// assumed to be well-formed UTF-16; unpaired surrogates, which a string may contain, are not detected).
+    ///
+    /// Layered for max throughput on mixed content:
+    /// • <b>Phase 1a — Vector512 ASCII narrow:</b> 32 chars/iter on AVX-512BW hosts. JIT lowers
+    ///   <c>Vector256.Narrow</c> to AVX-512 VPACKUSWB (single-instruction pack).
+    /// • <b>Phase 1b — Vector256 ASCII narrow:</b> 16 chars/iter on AVX2 hosts (also handles tail
+    ///   &lt; 32 chars after the AVX-512 path on capable hosts).
+    /// • <b>Phase 1c — Vector128 ASCII narrow:</b> 16 chars/iter on Apple Silicon NEON / WASM SIMD
+    ///   / legacy SSE2 hosts (two 8-char loads per iteration; a tail &lt; 16 chars falls through to Phase 2/3).
+    /// • <b>Phase 2 — DWORD ASCII batch:</b> 4 chars/iter. OR-mask test
+    ///   <c>(c0 | c1 | c2 | c3) &amp; 0xFF80 == 0</c>; on hit, 4 byte writes per iter.
+    /// • <b>Phase 3 — Scalar multi-byte encode:</b> 1-byte (ASCII), 2-byte (Latin extended,
+    ///   Cyrillic, Greek), 3-byte (CJK BMP), 4-byte (supplementary plane via UTF-16 surrogate pair).
+    ///
+    /// Returns actual byte count written. Caller must ensure <paramref name="dst"/> has at least
+    /// <c>src.Length * 4</c> capacity (UTF-8 worst case).
+    /// </remarks>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static int EncodeUtf8SinglePass(ReadOnlySpan<char> src, Span<byte> dst)
+    {
+        int srcIdx = 0, dstIdx = 0;
+        ref char srcRefChar = ref MemoryMarshal.GetReference(src);
+        ref ushort srcRefU16 = ref Unsafe.As<char, ushort>(ref srcRefChar);
+        ref byte dstRef = ref MemoryMarshal.GetReference(dst);
+
+        // Phase 1a — Vector512 ASCII narrow (32 chars/iter on AVX-512BW hosts).
+        // JIT-time path-selection via Avx512BW.IsSupported [Intrinsic] static bool — non-AVX-512
+        // hosts get this branch eliminated by constant-folding (zero overhead in the generated asm).
+        if (Avx512BW.IsSupported)
+        {
+            var asciiMask512 = Vector512.Create((ushort)0xFF80);
+            while (src.Length - srcIdx >= Vector512<ushort>.Count)  // 32 chars per Vector512<ushort>
+            {
+                var v = Vector512.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
+                // ASCII detect: any char's high bits set (>= 0x80)?
+                if ((v & asciiMask512) != Vector512<ushort>.Zero) break;
+                // Narrow 32 ushorts (Vector512) → 32 bytes (Vector256) via two 256-bit halves.
+                // The JIT lowers this to AVX-512 VPACKUSWB on capable hosts (single-instruction pack).
+                var bytes = Vector256.Narrow(v.GetLower(), v.GetUpper());
+                bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
+                srcIdx += Vector512<ushort>.Count;
+                dstIdx += Vector512<ushort>.Count;  // 32 chars → 32 bytes (1:1 for ASCII)
+            }
+        }
+
+        // Phase 1b — Vector256 ASCII narrow (16 chars/iter on AVX2 hosts; also handles tail < 32 chars
+        // after the AVX-512 path on capable hosts).
+        if (Vector256.IsHardwareAccelerated)
+        {
+            var asciiMask = Vector256.Create((ushort)0xFF80);
+            while (src.Length - srcIdx >= Vector256<ushort>.Count)  // 16 chars per Vector256<ushort>
+            {
+                var v = Vector256.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
+                // ASCII detect: any char's high bits set (>= 0x80)?
+                if ((v & asciiMask) != Vector256<ushort>.Zero) break;
+                // Narrow 16 ushorts (Vector256) → 16 bytes (Vector128) via two halves
+                var bytes = Vector128.Narrow(v.GetLower(), v.GetUpper());
+                bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
+                srcIdx += Vector256<ushort>.Count;
+                dstIdx += Vector256<ushort>.Count;  // 16 chars → 16 bytes (1:1 for ASCII)
+            }
+        }
+
+        // Phase 1c — Vector128 ASCII narrow (16 chars/iter on Apple Silicon NEON, WASM SIMD,
+        // legacy SSE2). A tail < 16 chars is left to the scalar loop below. Cross-platform —
+        // Vector128.IsHardwareAccelerated is true on any 128-bit-SIMD-capable host.
+        // Two Vector128<ushort> loads (8 + 8 = 16 chars) narrow to one Vector128<byte> (16 bytes).
+        if (Vector128.IsHardwareAccelerated)
+        {
+            var asciiMask128 = Vector128.Create((ushort)0xFF80);
+            while (src.Length - srcIdx >= 16)  // 16 chars = 2 × Vector128<ushort>
+            {
+                var lo = Vector128.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
+                var hi = Vector128.LoadUnsafe(ref srcRefU16, (uint)(srcIdx + 8));
+                // ASCII detect: any char's high bits set in either half?
+                if (((lo | hi) & asciiMask128) != Vector128<ushort>.Zero) break;
+                // Narrow 2× Vector128<ushort> (16 chars) → Vector128<byte> (16 bytes)
+                var bytes = Vector128.Narrow(lo, hi);
+                bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
+                srcIdx += 16;
+                dstIdx += 16;
+            }
+        }
+
+        // Phase 2/3 — scalar with DWORD ASCII batch
+        while (srcIdx < src.Length)
+        {
+            // DWORD ASCII batch: 4 chars → 4 bytes when all ASCII
+            if (src.Length - srcIdx >= 4)
+            {
+                var c0 = Unsafe.Add(ref srcRefChar, srcIdx);
+                var c1 = Unsafe.Add(ref srcRefChar, srcIdx + 1);
+                var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 2);
+                var c3 = Unsafe.Add(ref srcRefChar, srcIdx + 3);
+                if (((c0 | c1 | c2 | c3) & 0xFF80) == 0)
+                {
+                    Unsafe.Add(ref dstRef, dstIdx)     = (byte)c0;
+                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)c1;
+                    Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)c2;
+                    Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)c3;
+                    srcIdx += 4;
+                    dstIdx += 4;
+                    continue;
+                }
+            }
+
+            // Scalar single-char encode
+            var c = Unsafe.Add(ref srcRefChar, srcIdx);
+            if (c < 0x80)
+            {
+                // 1-byte ASCII (U+0000–U+007F)
+                Unsafe.Add(ref dstRef, dstIdx++) = (byte)c;
+                srcIdx += 1;
+            }
+            else if (c < 0x800)
+            {
+                // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
+                // Latin extended (Hungarian, Polish, Czech, Spanish, French, German diacritics),
+                // Greek, Cyrillic, Hebrew, Arabic.
+                Unsafe.Add(ref dstRef, dstIdx)     = (byte)(0xC0 | (c >> 6));
+                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | (c & 0x3F));
+                dstIdx += 2;
+                srcIdx += 1;
+            }
+            else if ((c & 0xF800) != 0xD800)
+            {
+                // 3-byte BMP: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF (excluding surrogate range)
+                // CJK BMP, various other BMP scripts.
+                Unsafe.Add(ref dstRef, dstIdx)     = (byte)(0xE0 | (c >> 12));
+                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((c >> 6) & 0x3F));
+                Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | (c & 0x3F));
+                dstIdx += 3;
+                srcIdx += 1;
+            }
+            else
+            {
+                // 4-byte: surrogate pair → supplementary plane codepoint (U+10000–U+10FFFF)
+                // High surrogate (0xD800–0xDBFF) followed by low surrogate (0xDC00–0xDFFF).
+                var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 1);
+                var codepoint = 0x10000 + ((c - 0xD800) << 10) + (c2 - 0xDC00);
+                Unsafe.Add(ref dstRef, dstIdx)     = (byte)(0xF0 | (codepoint >> 18));
+                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((codepoint >> 12) & 0x3F));
+                Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | ((codepoint >> 6) & 0x3F));
+                Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(0x80 | (codepoint & 0x3F));
+                dstIdx += 4;
+                srcIdx += 2;  // consumed 2 chars (surrogate pair)
+            }
+        }
+
+        return dstIdx;
+    }
+
+    /// <summary>
+    /// Computes how many UTF-16 code units the given UTF-8 bytes decode to, without decoding them.
+    /// </summary>
+    /// <remarks>
+    /// The input is consumed in progressively narrower blocks: 64 bytes per step (Vector512, when
+    /// AVX-512BW is supported), then 32 bytes (Vector256), then 16 bytes (Vector128), and finally
+    /// byte by byte. Every tier classifies bytes with the same two masked comparisons:
+    /// • <c>(b &amp; 0xC0) == 0x80</c> — continuation byte (10xxxxxx), contributes no char.
+    /// • <c>(b &amp; 0xF8) == 0xF0</c> — 4-byte lead (11110xxx), contributes one EXTRA char.
+    ///
+    /// For a block of N bytes the tally is <c>(N - popcount(continuationBits)) + popcount(fourByteLeadBits)</c>;
+    /// N is 64, 32 or 16 depending on the tier. No validation of the byte sequence is performed.
+    ///
+    /// Resulting per-byte contribution:
+    /// • 10xxxxxx (0x80–0xBF) — 0 chars.
+    /// • 0xxxxxxx, 110xxxxx, 1110xxxx — 1 char.
+    /// • 11110xxx (0xF0–0xF7) — 2 chars (a UTF-16 surrogate pair).
+    /// </remarks>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static int CountUtf8Chars(ReadOnlySpan<byte> bytes)
+    {
+        ref var head = ref MemoryMarshal.GetReference(bytes);
+        var offset = 0;
+        var total = 0;
+
+        // Tier 1 — 64-byte blocks through Vector512 (taken only when AVX-512BW is supported).
+        if (Avx512BW.IsSupported)
+        {
+            var topTwoBits = Vector512.Create((byte)0xC0);
+            var continuationTag = Vector512.Create((byte)0x80);
+            var topFiveBits = Vector512.Create((byte)0xF8);
+            var fourByteLeadTag = Vector512.Create((byte)0xF0);
+
+            while (bytes.Length - offset >= 64)
+            {
+                var block = Vector512.LoadUnsafe(ref head, (uint)offset);
+
+                // One bit per continuation byte (10xxxxxx); every other byte in the block yields a char.
+                var continuationBits = Vector512.Equals(block & topTwoBits, continuationTag)
+                    .ExtractMostSignificantBits();  // ulong, 64 lanes
+                total += 64 - BitOperations.PopCount(continuationBits);
+
+                // One bit per 4-byte lead (11110xxx); each adds the second half of a surrogate pair.
+                var fourByteLeadBits = Vector512.Equals(block & topFiveBits, fourByteLeadTag)
+                    .ExtractMostSignificantBits();
+                total += BitOperations.PopCount(fourByteLeadBits);
+
+                offset += 64;
+            }
+        }
+
+        // Tier 2 — 32-byte blocks through Vector256; also drains what tier 1 left over (< 64 bytes).
+        if (Vector256.IsHardwareAccelerated)
+        {
+            var topTwoBits = Vector256.Create((byte)0xC0);
+            var continuationTag = Vector256.Create((byte)0x80);
+            var topFiveBits = Vector256.Create((byte)0xF8);
+            var fourByteLeadTag = Vector256.Create((byte)0xF0);
+
+            while (bytes.Length - offset >= 32)
+            {
+                var block = Vector256.LoadUnsafe(ref head, (uint)offset);
+
+                // One bit per continuation byte (10xxxxxx); every other byte in the block yields a char.
+                var continuationBits = Vector256.Equals(block & topTwoBits, continuationTag)
+                    .ExtractMostSignificantBits();
+                total += 32 - BitOperations.PopCount(continuationBits);
+
+                // One bit per 4-byte lead (11110xxx); each adds the second half of a surrogate pair.
+                var fourByteLeadBits = Vector256.Equals(block & topFiveBits, fourByteLeadTag)
+                    .ExtractMostSignificantBits();
+                total += BitOperations.PopCount(fourByteLeadBits);
+
+                offset += 32;
+            }
+        }
+
+        // Tier 3 — 16-byte blocks through Vector128, for hosts whose widest accelerated vector is
+        // 128 bits (the original notes name Apple Silicon NEON, WASM SIMD and legacy SSE2); it also
+        // drains the < 32-byte remainder left by the wider tiers.
+        if (Vector128.IsHardwareAccelerated)
+        {
+            var topTwoBits = Vector128.Create((byte)0xC0);
+            var continuationTag = Vector128.Create((byte)0x80);
+            var topFiveBits = Vector128.Create((byte)0xF8);
+            var fourByteLeadTag = Vector128.Create((byte)0xF0);
+
+            while (bytes.Length - offset >= 16)
+            {
+                var block = Vector128.LoadUnsafe(ref head, (uint)offset);
+
+                // One bit per continuation byte (10xxxxxx); every other byte in the block yields a char.
+                var continuationBits = Vector128.Equals(block & topTwoBits, continuationTag)
+                    .ExtractMostSignificantBits();
+                total += 16 - BitOperations.PopCount(continuationBits);
+
+                // One bit per 4-byte lead (11110xxx); each adds the second half of a surrogate pair.
+                var fourByteLeadBits = Vector128.Equals(block & topFiveBits, fourByteLeadTag)
+                    .ExtractMostSignificantBits();
+                total += BitOperations.PopCount(fourByteLeadBits);
+
+                offset += 16;
+            }
+        }
+
+        // Leftover bytes, one at a time (the only path taken when no SIMD tier is accelerated).
+        while (offset < bytes.Length)
+        {
+            var current = Unsafe.Add(ref head, offset++);
+            total += (current & 0xC0) != 0x80 ? 1 : 0;   // anything but a continuation byte
+            total += (current & 0xF8) == 0xF0 ? 1 : 0;   // 4-byte lead: second surrogate
+        }
+        return total;
+    }
+
+    /// <summary>
+    /// Single-pass UTF-8 → UTF-16 decoder. Returns the actual char count written to <paramref name="dst"/>.
+    /// </summary>
+    /// <remarks>
+    /// Layered approach for maximum throughput across mixed content:
+    /// • <b>Phase 1 — Vector256 ASCII prefix bulk widen:</b> 32 bytes/iter while all top bits are zero.
+    ///   <see cref="Vector256.Widen(Vector256{byte})"/> yields two Vector256&lt;ushort&gt; halves of 16 chars
+    ///   each, stored back-to-back (32 chars per iteration). Stops at the first block holding a non-ASCII byte.
+    /// • <b>Phase 2 — DWORD ASCII batch:</b> when ≥4 bytes remain, read as <c>uint</c>, test
+    ///   <c>(dword &amp; 0x80808080u) == 0</c>; on hit, widen the 4 bytes to 4 chars and continue.
+    /// • <b>Phase 3 — Scalar multi-byte branch:</b> 1-byte (ASCII single), 2-byte (Latin extended,
+    ///   Cyrillic, Greek, Hebrew, Arabic), 3-byte (CJK BMP), 4-byte (supplementary plane → surrogate pair).
+    ///   Direct bit-extract, no validation — input is trusted.
+    ///
+    /// Reads and writes are unchecked: <paramref name="src"/> must be well-formed, untruncated UTF-8 and
+    /// <paramref name="dst"/> must be large enough for the decoded text. Typical Hungarian text alternates ASCII
+    /// runs (Phase 1/2 widening) with 2-byte chars (Phase 3, case &lt; 0xE0), so each segment takes its own path.
+    /// </remarks>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static int DecodeUtf8SinglePass(ReadOnlySpan<byte> src, Span<char> dst)
+    {
+        int srcIdx = 0, dstIdx = 0;
+        ref byte srcRef = ref MemoryMarshal.GetReference(src);
+        ref ushort dstRef = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(dst));
+
+        // Phase 1 — Vector256 ASCII prefix bulk widen (32 bytes/iter)
+        if (Vector256.IsHardwareAccelerated)
+        {
+            while (src.Length - srcIdx >= Vector256<byte>.Count)
+            {
+                var v = Vector256.LoadUnsafe(ref srcRef, (uint)srcIdx);
+                // ASCII detect: any high bit set among the 32 bytes? If so, leave the block to the scalar loop.
+                if (v.ExtractMostSignificantBits() != 0) break;
+
+                // Widen 32 bytes → 2 × Vector256<ushort>; each half holds 16 chars, so `upper` starts 16 chars after `lower`.
+                var (lower, upper) = Vector256.Widen(v);
+                lower.StoreUnsafe(ref dstRef, (uint)dstIdx);
+                upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector256<ushort>.Count));
+                srcIdx += Vector256<byte>.Count;
+                dstIdx += Vector256<byte>.Count;  // 32 bytes → 32 chars
+            }
+        }
+
+        // Phase 2/3 — scalar loop with DWORD ASCII batch
+        while (srcIdx < src.Length)
+        {
+            // DWORD ASCII batch: 4 ASCII bytes → 4 chars per iter
+            if (src.Length - srcIdx >= 4)
+            {
+                var dword = Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref srcRef, srcIdx));
+                if ((dword & 0x80808080u) == 0)
+                {
+                    Unsafe.Add(ref dstRef, dstIdx)     = (byte)dword;
+                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(dword >> 8);
+                    Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(dword >> 16);
+                    Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(dword >> 24);
+                    srcIdx += 4;
+                    dstIdx += 4;
+                    continue;
+                }
+            }
+
+            // Scalar multi-byte branch: dispatch on the value range of the lead byte
+            var b0 = Unsafe.Add(ref srcRef, srcIdx);
+            switch (b0)
+            {
+                case < 0x80:
+                    // 1-byte ASCII (U+0000–U+007F)
+                    Unsafe.Add(ref dstRef, dstIdx++) = b0;
+                    srcIdx += 1;
+                    break;
+                case < 0xE0:
+                {
+                    // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
+                    // Latin extended, Cyrillic, Greek, Hebrew, Arabic. (Not validated: a stray 0x80–0xBF byte also lands here.)
+                    var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
+                    Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x1F) << 6) | (b1 & 0x3F));
+                    srcIdx += 2;
+                    break;
+                }
+                case < 0xF0:
+                {
+                    // 3-byte: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF
+                    // CJK BMP, various other scripts.
+                    var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
+                    var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
+                    Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
+                    srcIdx += 3;
+                    break;
+                }
+                default:
+                {
+                    // 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx → U+10000–U+10FFFF
+                    // Supplementary plane (emoji, rare CJK ext) → UTF-16 surrogate pair.
+                    var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
+                    var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
+                    var b3 = Unsafe.Add(ref srcRef, srcIdx + 3);
+                    var codepoint = ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
+                    codepoint -= 0x10000;  // surrogates encode the 20-bit offset above U+10000
+                    Unsafe.Add(ref dstRef, dstIdx)     = (ushort)(0xD800 | (codepoint >> 10));
+                    Unsafe.Add(ref dstRef, dstIdx + 1) = (ushort)(0xDC00 | (codepoint & 0x3FF));
+                    dstIdx += 2;
+                    srcIdx += 4;
+                    break;
+                }
+            }
+        }
+
+        return dstIdx;
+    }
+}
diff --git a/AyCode.Core/docs/BINARY/BINARY_TODO.md b/AyCode.Core/docs/BINARY/BINARY_TODO.md
index b3358da..81aa3f3 100644
--- a/AyCode.Core/docs/BINARY/BINARY_TODO.md
+++ b/AyCode.Core/docs/BINARY/BINARY_TODO.md
@@ -693,9 +693,18 @@ JIT/AOT path-selection via `[Intrinsic]` `IsSupported` static booleans — non-s
 | 1 | `CountUtf8Chars` (decode 1st pass) | ✅ done | ✅ existing | ✅ done | ✅ existing |
 | 2 | `EncodeUtf8SinglePass` Phase 1 (ASCII narrow) | ✅ done | ✅ existing | ✅ done | ✅ existing |
 | 3a | `DecodeUtf8SinglePass` multi-byte transcoder (Vector512) | ⏳ TODO | bail-out only | bail-out only | ✅ existing |
-| 3b | `DecodeUtf8SinglePass` multi-byte transcoder (Vector256) | — | ⏳ TODO | bail-out only | ✅ existing |
+| 3b | `DecodeUtf8SinglePass` multi-byte transcoder (Vector256) | — | 🔍 **deferred — see note** | bail-out only | ✅ existing |
 | 3c | `DecodeUtf8SinglePass` multi-byte transcoder (Vector128) | — | — | ⏳ TODO | ✅ existing |
 
+**Note on Phase 3b (Vector256 / AVX2) — deferred, not dropped.** AVX2 lacks the AVX-512BW primitives (`CompareEqualMask` producing a `__mmask` k-register, full-width cross-lane `vpermb`, mask-driven `vpcompressb`) that make the classify-mask-compress-widen pipeline efficient. On AVX2, `vpshufb` shuffles only within each 128-bit lane, so a cross-lane `Vector256.Shuffle` has to be assembled from two `vpshufb` operations, which complicates leader-byte extraction for multi-byte sequences that span the lane boundary. The simdutf C++ project — the canonical reference for this algorithm class — implements only **SSE4 (16-byte)** and **AVX-512 (64-byte)** paths; it explicitly skips AVX2 because the cost-benefit of implementing this algorithm there is unfavorable.
+
+On AVX2 hosts, the Phase 3c (Vector128) transcoder runs as the primary multi-byte path AND as the tail handler — covering AVX2 hosts at 16 bytes/iter, which is already a significant win over the current scalar multi-byte branch. Phase 3b would require either:
+
+1. Hand-rolling an AVX2-specific 32-byte algorithm with cross-lane permute workarounds (research-grade complexity, uncertain net win — could be SLOWER than the Vector128 path due to cross-lane shuffle latency)
+2. Waiting for `Avx10v1` / `Avx10v2` to expose AVX-512BW-class primitives in 256-bit form (Intel's unified vector ISA — `Avx10v1` already in .NET 9, `Avx10v2` arrives with future Intel hardware)
+
+**Re-evaluation triggers:** if benchmarks on AVX2 hosts show that the Phase 3c Vector128 path leaves a > 10% Deser gap vs MemPack on multi-byte content; or if the `Avx10v1` 256-bit primitives mature enough to make the algorithm tractable. Until then: **Phase 3b stays in the TODO as a research / future-work item** — not actively scheduled, but documented so a future contributor doesn't have to re-derive the AVX2 limitations.
+
 **Phase 3 is the remaining gap — UTF-8 multi-byte decode on every host class**. ASCII path is already fast across all SIMD tiers (Vector256 + Vector128 prefix widen + `Encoding.Latin1.GetString` BCL fast path). The gap is on **multi-byte UTF-8 content** — Hungarian / Cyrillic / Greek (2-byte) and CJK BMP (3-byte) sequences — where the SIMD prefix bails out on the first non-ASCII byte and falls back to scalar bit-extract. The Repeated benchmark cell (Hungarian content) is the canonical witness; with all-Hungarian content (current bench data), Small / Repeated Deser cells trail MemPack by 6-14%.
 
 **Why all 3 SIMD tiers (not just AVX-512BW)** — public NuGet package goal: i18n payloads must be fast on every supported host (cloud server, desktop, mobile, Blazor WASM), not only AVX-512-capable cloud servers. The saját scalar multi-byte branch is the bottleneck on **all** non-ASCII content regardless of host class. The BCL `Encoding.UTF8` falls back to a similar scalar path on multi-byte content (with virtual dispatch + EncoderFallback overhead), so even where the BCL has its own SIMD 2-byte handler (.NET 9 PR #92580), our trust-input scalar wins on net — but a saját SIMD multi-byte path would dominate on every host.