diff --git a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs
index 2fc1f8d..e71dc3b 100644
--- a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs
+++ b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs
@@ -484,242 +484,14 @@ public static partial class AcBinaryDeserializer
var pos = _position;
_position += byteLength;
var src = _buffer.AsSpan(pos, byteLength);
- var charCount = CountUtf8Chars(src);
+ var charCount = Utf8Transcoder.CountUtf8Chars(src);
return string.Create(charCount, (Buffer: _buffer, Pos: pos, Len: byteLength), static (chars, state) =>
{
- DecodeUtf8SinglePass(state.Buffer.AsSpan(state.Pos, state.Len), chars);
+ Utf8Transcoder.DecodeUtf8SinglePass(state.Buffer.AsSpan(state.Pos, state.Len), chars);
});
}
- ///
- /// Counts UTF-16 chars produced by decoding the given UTF-8 byte span.
- ///
- ///
- /// Layered SIMD: Vector512 (64 byte/iter) on AVX-512BW hosts → Vector256 (32 byte/iter)
- /// on AVX2 hosts → scalar tail. Both SIMD paths use the same two bit-pattern checks:
- /// • Non-continuation bytes (NOT 10xxxxxx, mask 0xC0 ≠ 0x80): each contributes 1 char.
- /// • 4-byte start bytes (11110xxx, mask 0xF8 == 0xF0): each contributes an EXTRA char (surrogate pair).
- ///
- /// SIMD per-block result: (N - popcount(continuationMask)) + popcount(fourByteStartMask)
- /// where N = 64 (Vector512) or 32 (Vector256). Scalar tail handles the remaining bytes.
- ///
- /// Char-count rules:
- /// • Continuation bytes (10xxxxxx, 0x80–0xBF) — produce no char, skip.
- /// • All other start bytes (0xxxxxxx, 110xxxxx, 1110xxxx) — produce 1 char each.
- /// • 4-byte start bytes (11110xxx, 0xF0–0xF7) — produce 2 chars (UTF-16 surrogate pair).
- ///
- /// JIT-time path-selection: Avx512BW.IsSupported and Vector256.IsHardwareAccelerated
- /// are [Intrinsic] static booleans — the JIT/AOT constant-folds the dead branches per host.
- ///
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static int CountUtf8Chars(ReadOnlySpan bytes)
- {
- var count = 0;
- var i = 0;
- ref var bytesRef = ref MemoryMarshal.GetReference(bytes);
-
- // SIMD path 1: 64 bytes/iter via Vector512 (AVX-512BW hosts)
- if (Avx512BW.IsSupported && bytes.Length >= 64)
- {
- var contMask512 = Vector512.Create((byte)0xC0);
- var contValue512 = Vector512.Create((byte)0x80);
- var fourByteMask512 = Vector512.Create((byte)0xF8);
- var fourByteValue512 = Vector512.Create((byte)0xF0);
-
- do
- {
- var v = Vector512.LoadUnsafe(ref bytesRef, (uint)i);
-
- // Non-continuation count: 64 - popcount(continuation byte mask)
- var contMatches = Vector512.Equals(v & contMask512, contValue512);
- var contBits = contMatches.ExtractMostSignificantBits(); // ulong
- count += 64 - System.Numerics.BitOperations.PopCount(contBits);
-
- // 4-byte start count: popcount(fourByte start byte mask)
- var fourByteMatches = Vector512.Equals(v & fourByteMask512, fourByteValue512);
- var fourByteBits = fourByteMatches.ExtractMostSignificantBits();
- count += System.Numerics.BitOperations.PopCount(fourByteBits);
-
- i += 64;
- } while (bytes.Length - i >= 64);
- }
-
- // SIMD path 2: 32 bytes/iter via Vector256 (AVX2 hosts, also handles AVX-512 tail < 64)
- if (Vector256.IsHardwareAccelerated && bytes.Length - i >= 32)
- {
- var contMask = Vector256.Create((byte)0xC0);
- var contValue = Vector256.Create((byte)0x80);
- var fourByteMask = Vector256.Create((byte)0xF8);
- var fourByteValue = Vector256.Create((byte)0xF0);
-
- do
- {
- var v = Vector256.LoadUnsafe(ref bytesRef, (uint)i);
-
- // Non-continuation count: 32 - popcount(continuation byte mask)
- var contMatches = Vector256.Equals(v & contMask, contValue);
- var contBits = contMatches.ExtractMostSignificantBits();
- count += 32 - System.Numerics.BitOperations.PopCount(contBits);
-
- // 4-byte start count: popcount(fourByte start byte mask)
- var fourByteMatches = Vector256.Equals(v & fourByteMask, fourByteValue);
- var fourByteBits = fourByteMatches.ExtractMostSignificantBits();
- count += System.Numerics.BitOperations.PopCount(fourByteBits);
-
- i += 32;
- } while (bytes.Length - i >= 32);
- }
-
- // SIMD path 3: 16 bytes/iter via Vector128 (Apple Silicon NEON, WASM SIMD, legacy SSE2;
- // also handles tail < 32 from higher tiers). Cross-platform — Vector128.IsHardwareAccelerated
- // returns true on any host with a 128-bit SIMD ISA (NEON / SSE2 / WASM SIMD).
- if (Vector128.IsHardwareAccelerated && bytes.Length - i >= 16)
- {
- var contMask128 = Vector128.Create((byte)0xC0);
- var contValue128 = Vector128.Create((byte)0x80);
- var fourByteMask128 = Vector128.Create((byte)0xF8);
- var fourByteValue128 = Vector128.Create((byte)0xF0);
-
- do
- {
- var v = Vector128.LoadUnsafe(ref bytesRef, (uint)i);
-
- // Non-continuation count: 16 - popcount(continuation byte mask)
- var contMatches = Vector128.Equals(v & contMask128, contValue128);
- var contBits = contMatches.ExtractMostSignificantBits();
- count += 16 - System.Numerics.BitOperations.PopCount(contBits);
-
- // 4-byte start count: popcount(fourByte start byte mask)
- var fourByteMatches = Vector128.Equals(v & fourByteMask128, fourByteValue128);
- var fourByteBits = fourByteMatches.ExtractMostSignificantBits();
- count += System.Numerics.BitOperations.PopCount(fourByteBits);
-
- i += 16;
- } while (bytes.Length - i >= 16);
- }
-
- // Scalar tail (and fallback for non-SIMD hardware)
- for (; i < bytes.Length; i++)
- {
- var b = Unsafe.Add(ref bytesRef, i);
- if ((b & 0xC0) != 0x80) count++; // non-continuation byte
- if ((b & 0xF8) == 0xF0) count++; // 4-byte start: extra char for surrogate pair
- }
- return count;
- }
-
- ///
- /// Single-pass UTF-8 → UTF-16 decoder. Returns the actual char count written to .
- ///
- ///
- /// Layered approach for maximum throughput across mixed content:
- /// • Phase 1 — Vector256 ASCII prefix bulk widen: 32 bytes/iter while all top bits are zero.
- /// Uses to produce two Vector256<ushort> lanes
- /// = 32 chars per iteration. Breaks on first non-ASCII byte found in the loaded vector.
- /// • Phase 2 — DWORD ASCII batch: when ≥4 bytes remain, read as uint, test
- /// (dword & 0x80808080u) == 0; on hit, widen 4 chars in 4 instructions and continue.
- /// • Phase 3 — Scalar multi-byte branch: 1-byte (ASCII single), 2-byte (Latin extended,
- /// Cyrillic, Greek, Hebrew, Arabic), 3-byte (CJK BMP), 4-byte (supplementary plane → surrogate pair).
- /// Direct bit-extract, no validation — input is trusted.
- ///
- /// JIT compiles the switch into a jump table for predictable dispatch on mixed content.
- /// Hungarian text typical pattern: ASCII run (Phase 1/2 widening) → 2-byte char (Phase 3
- /// case < 0xE0) → ASCII run → 2-byte char → ... — each phase optimal for its segment.
- ///
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static int DecodeUtf8SinglePass(ReadOnlySpan src, Span dst)
- {
- int srcIdx = 0, dstIdx = 0;
- ref byte srcRef = ref MemoryMarshal.GetReference(src);
- ref ushort dstRef = ref Unsafe.As(ref MemoryMarshal.GetReference(dst));
-
- // Phase 1 — Vector256 ASCII prefix bulk widen (32 bytes/iter)
- if (Vector256.IsHardwareAccelerated)
- {
- while (src.Length - srcIdx >= Vector256.Count)
- {
- var v = Vector256.LoadUnsafe(ref srcRef, (uint)srcIdx);
- // ASCII detect: any high bit set among the 32 bytes?
- if (v.ExtractMostSignificantBits() != 0) break;
-
- // Widen 32 bytes → 2 × Vector256 (32 chars total)
- var (lower, upper) = Vector256.Widen(v);
- lower.StoreUnsafe(ref dstRef, (uint)dstIdx);
- upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector128.Count));
- srcIdx += Vector256.Count;
- dstIdx += Vector256.Count; // 32 bytes → 32 chars
- }
- }
-
- // Phase 2/3 — scalar loop with DWORD ASCII batch
- while (srcIdx < src.Length)
- {
- // DWORD ASCII batch: 4 ASCII bytes → 4 chars per iter
- if (src.Length - srcIdx >= 4)
- {
- var dword = Unsafe.ReadUnaligned(ref Unsafe.Add(ref srcRef, srcIdx));
- if ((dword & 0x80808080u) == 0)
- {
- Unsafe.Add(ref dstRef, dstIdx) = (byte)dword;
- Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(dword >> 8);
- Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(dword >> 16);
- Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(dword >> 24);
- srcIdx += 4;
- dstIdx += 4;
- continue;
- }
- }
-
- // Scalar multi-byte branch (jump-table compile via switch)
- var b0 = Unsafe.Add(ref srcRef, srcIdx);
- switch (b0)
- {
- case < 0x80:
- // 1-byte ASCII (U+0000–U+007F)
- Unsafe.Add(ref dstRef, dstIdx++) = b0;
- srcIdx += 1;
- break;
- case < 0xE0:
- {
- // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
- // Latin extended, Cyrillic, Greek, Hebrew, Arabic.
- var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
- Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x1F) << 6) | (b1 & 0x3F));
- srcIdx += 2;
- break;
- }
- case < 0xF0:
- {
- // 3-byte: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF
- // CJK BMP, various other scripts.
- var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
- var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
- Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
- srcIdx += 3;
- break;
- }
- default:
- {
- // 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx → U+10000–U+10FFFF
- // Supplementary plane (emoji, rare CJK ext) → UTF-16 surrogate pair.
- var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
- var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
- var b3 = Unsafe.Add(ref srcRef, srcIdx + 3);
- var codepoint = ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
- codepoint -= 0x10000;
- Unsafe.Add(ref dstRef, dstIdx) = (ushort)(0xD800 | (codepoint >> 10));
- Unsafe.Add(ref dstRef, dstIdx + 1) = (ushort)(0xDC00 | (codepoint & 0x3FF));
- dstIdx += 2;
- srcIdx += 4;
- break;
- }
- }
- }
-
- return dstIdx;
- }
-
private string ReadStringUtf8Cached(int length)
{
var slice = _buffer.AsSpan(_position, length);
diff --git a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs
index 45b256a..ce9af11 100644
--- a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs
+++ b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs
@@ -699,7 +699,7 @@ public static partial class AcBinarySerializer
var savedPos = _position;
var encodeStart = savedPos + reserveSize;
- var bytesWritten = EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
+ var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
var actualVarUIntSize = VarUIntSize((uint)bytesWritten);
if (actualVarUIntSize < reserveSize)
@@ -768,7 +768,7 @@ public static partial class AcBinarySerializer
EnsureCapacity(2 + maxBytesShort); // marker + 1-byte VarUInt + bytes (worst case)
var savedPosShort = _position;
- var bytesWrittenShort = EncodeUtf8SinglePass(
+ var bytesWrittenShort = Utf8Transcoder.EncodeUtf8SinglePass(
value.AsSpan(),
_buffer.AsSpan(savedPosShort + 1, maxBytesShort));
var isAsciiShort = bytesWrittenShort == charLength;
@@ -805,7 +805,7 @@ public static partial class AcBinarySerializer
var savedPos = _position;
var encodeStart = savedPos + 1 + reserveVarUInt;
- var bytesWritten = EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
+ var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
var isAscii = bytesWritten == charLength;
_buffer[savedPos] = isAscii ? BinaryTypeCode.StringAscii : BinaryTypeCode.String;
@@ -876,161 +876,6 @@ public static partial class AcBinarySerializer
_position += byteCount;
}
- ///
- /// Custom UTF-16 → UTF-8 single-pass encoder. Symmetric with the deserializer's custom decoder
- /// ('s DecodeUtf8SinglePass).
- ///
- ///
- /// Bypasses .GetBytes virtual-dispatch + encoder-fallback
- /// overhead. Trusted-input encoder — no validation (writer side, the input is a .NET string
- /// which always has valid UTF-16 surrogate pairs).
- ///
- /// Layered for max throughput on mixed content:
- /// • Phase 1 — Vector256 ASCII narrow: 16 chars/iter. Loads Vector256<ushort>,
- /// tests (v & 0xFF80) == 0 for all-ASCII; on hit, narrows to Vector128<byte>
- /// via Vector128.Narrow(GetLower, GetUpper) = 16 bytes per iter.
- /// • Phase 2 — DWORD ASCII batch: 4 chars/iter. OR-mask test
- /// (c0 | c1 | c2 | c3) & 0xFF80 == 0; on hit, 4 byte writes per iter.
- /// • Phase 3 — Scalar multi-byte encode: 1-byte (ASCII), 2-byte (Latin extended,
- /// Cyrillic, Greek), 3-byte (CJK BMP), 4-byte (supplementary plane via UTF-16 surrogate pair).
- ///
- /// Returns actual byte count written. Caller must ensure has at least
- /// src.Length * 4 capacity (UTF-8 worst case).
- ///
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static int EncodeUtf8SinglePass(ReadOnlySpan src, Span dst)
- {
- int srcIdx = 0, dstIdx = 0;
- ref char srcRefChar = ref MemoryMarshal.GetReference(src);
- ref ushort srcRefU16 = ref Unsafe.As(ref srcRefChar);
- ref byte dstRef = ref MemoryMarshal.GetReference(dst);
-
- // Phase 1a — Vector512 ASCII narrow (32 chars/iter on AVX-512BW hosts).
- // JIT-time path-selection via Avx512BW.IsSupported [Intrinsic] static bool — non-AVX-512
- // hosts get this branch eliminated by constant-folding (zero overhead in the generated asm).
- if (Avx512BW.IsSupported)
- {
- var asciiMask512 = Vector512.Create((ushort)0xFF80);
- while (src.Length - srcIdx >= Vector512.Count) // 32 chars per Vector512
- {
- var v = Vector512.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
- // ASCII detect: any char's high bits set (>= 0x80)?
- if ((v & asciiMask512) != Vector512.Zero) break;
- // Narrow 32 ushorts (Vector512) → 32 bytes (Vector256) via two 256-bit halves.
- // The JIT lowers this to AVX-512 VPACKUSWB on capable hosts (single-instruction pack).
- var bytes = Vector256.Narrow(v.GetLower(), v.GetUpper());
- bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
- srcIdx += Vector512.Count;
- dstIdx += Vector512.Count; // 32 chars → 32 bytes (1:1 for ASCII)
- }
- }
-
- // Phase 1b — Vector256 ASCII narrow (16 chars/iter on AVX2 hosts; also handles tail < 32 chars
- // after the AVX-512 path on capable hosts).
- if (Vector256.IsHardwareAccelerated)
- {
- var asciiMask = Vector256.Create((ushort)0xFF80);
- while (src.Length - srcIdx >= Vector256.Count) // 16 chars per Vector256
- {
- var v = Vector256.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
- // ASCII detect: any char's high bits set (>= 0x80)?
- if ((v & asciiMask) != Vector256.Zero) break;
- // Narrow 16 ushorts (Vector256) → 16 bytes (Vector128) via two halves
- var bytes = Vector128.Narrow(v.GetLower(), v.GetUpper());
- bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
- srcIdx += Vector256.Count;
- dstIdx += Vector256.Count; // 16 chars → 16 bytes (1:1 for ASCII)
- }
- }
-
- // Phase 1c — Vector128 ASCII narrow (16 chars/iter on Apple Silicon NEON, WASM SIMD,
- // legacy SSE2; also handles tail < 16 chars after higher tiers). Cross-platform —
- // Vector128.IsHardwareAccelerated is true on any 128-bit-SIMD-capable host.
- // Two Vector128 loads (8 + 8 = 16 chars) narrow to one Vector128 (16 bytes).
- if (Vector128.IsHardwareAccelerated)
- {
- var asciiMask128 = Vector128.Create((ushort)0xFF80);
- while (src.Length - srcIdx >= 16) // 16 chars = 2 × Vector128
- {
- var lo = Vector128.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
- var hi = Vector128.LoadUnsafe(ref srcRefU16, (uint)(srcIdx + 8));
- // ASCII detect: any char's high bits set in either half?
- if (((lo | hi) & asciiMask128) != Vector128.Zero) break;
- // Narrow 2× Vector128 (16 chars) → Vector128 (16 bytes)
- var bytes = Vector128.Narrow(lo, hi);
- bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
- srcIdx += 16;
- dstIdx += 16;
- }
- }
-
- // Phase 2/3 — scalar with DWORD ASCII batch
- while (srcIdx < src.Length)
- {
- // DWORD ASCII batch: 4 chars → 4 bytes when all ASCII
- if (src.Length - srcIdx >= 4)
- {
- var c0 = Unsafe.Add(ref srcRefChar, srcIdx);
- var c1 = Unsafe.Add(ref srcRefChar, srcIdx + 1);
- var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 2);
- var c3 = Unsafe.Add(ref srcRefChar, srcIdx + 3);
- if (((c0 | c1 | c2 | c3) & 0xFF80) == 0)
- {
- Unsafe.Add(ref dstRef, dstIdx) = (byte)c0;
- Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)c1;
- Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)c2;
- Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)c3;
- srcIdx += 4;
- dstIdx += 4;
- continue;
- }
- }
-
- // Scalar single-char encode
- var c = Unsafe.Add(ref srcRefChar, srcIdx);
- if (c < 0x80)
- {
- // 1-byte ASCII (U+0000–U+007F)
- Unsafe.Add(ref dstRef, dstIdx++) = (byte)c;
- srcIdx += 1;
- }
- else if (c < 0x800)
- {
- // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
- // Latin extended (Hungarian, Polish, Czech, Spanish, French, German diacritics),
- // Greek, Cyrillic, Hebrew, Arabic.
- Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xC0 | (c >> 6));
- Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | (c & 0x3F));
- dstIdx += 2;
- srcIdx += 1;
- }
- else if ((c & 0xF800) != 0xD800)
- {
- // 3-byte BMP: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF (excluding surrogate range)
- // CJK BMP, various other BMP scripts.
- Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xE0 | (c >> 12));
- Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((c >> 6) & 0x3F));
- Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | (c & 0x3F));
- dstIdx += 3;
- srcIdx += 1;
- }
- else
- {
- // 4-byte: surrogate pair → supplementary plane codepoint (U+10000–U+10FFFF)
- // High surrogate (0xD800–0xDBFF) followed by low surrogate (0xDC00–0xDFFF).
- var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 1);
- var codepoint = 0x10000 + ((c - 0xD800) << 10) + (c2 - 0xDC00);
- Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xF0 | (codepoint >> 18));
- Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((codepoint >> 12) & 0x3F));
- Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | ((codepoint >> 6) & 0x3F));
- Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(0x80 | (codepoint & 0x3F));
- dstIdx += 4;
- srcIdx += 2; // consumed 2 chars (surrogate pair)
- }
- }
-
- return dstIdx;
- }
#endregion
diff --git a/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs b/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs
new file mode 100644
index 0000000..a69adab
--- /dev/null
+++ b/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs
@@ -0,0 +1,416 @@
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace AyCode.Core.Serializers.Binaries;
+
+/// <summary>
+/// In-house UTF-8 ↔ UTF-16 transcoder used by the binary serializer hot path. Trust-input
+/// semantics — no validation (writer-side input is a .NET string, reader-side input is valid
+/// UTF-8 by writer contract). Bypasses the <c>Encoding.UTF8</c> virtual-dispatch
+/// + EncoderFallback / DecoderFallback overhead.
+/// <para>
+/// SIMD path hierarchy (cascading tail-handler):
+/// <list type="bullet">
+/// <item>Vector512 / AVX-512BW — 64 byte/iter (Intel server, Intel 11th gen client, AMD Zen 4+)</item>
+/// <item>Vector256 / AVX2 — 32 byte/iter (Intel 12-14th gen client, AMD Zen 3 and earlier)</item>
+/// <item>Vector128 / SSE-NEON-WASM — 16 byte/iter (Apple Silicon NEON, WASM SIMD, legacy SSE2)</item>
+/// <item>Scalar — final tail (&lt; 16 byte) and no-SIMD fall-back</item>
+/// </list>
+/// </para>
+/// <para>JIT/AOT path-selection: <c>Avx512BW.IsSupported</c> / <c>Vector256.IsHardwareAccelerated</c> /
+/// <c>Vector128.IsHardwareAccelerated</c> are [Intrinsic] static booleans — the compiler
+/// constant-folds the dead branches per host. Non-supported tiers eliminate to zero generated code.</para>
+/// <para>
+/// Algorithm reference: see <c>BINARY_TODO.md#accore-bin-t-v4n2</c> for the multi-tier SIMD
+/// transcoder design and per-tier acceptance criteria.</para>
+/// </summary>
+internal static class Utf8Transcoder
+{
+    /// <summary>
+    /// Single-pass UTF-16 → UTF-8 encoder; the write-side counterpart of <see cref="DecodeUtf8SinglePass"/>.
+    /// </summary>
+    /// <remarks>
+    /// Avoids the virtual-dispatch and encoder-fallback overhead of <c>Encoding.UTF8.GetBytes</c>.
+    /// A .NET string may contain unpaired surrogates (e.g. after slicing through a pair); each one is
+    /// encoded as U+FFFD (EF BF BD) so the output is always well-formed UTF-8.
+    /// <para>
+    /// Layered for throughput on mixed content:
+    /// • Phase 1a — Vector512 ASCII narrow: 32 chars/iter when Avx512BW.IsSupported.
+    /// • Phase 1b — Vector256 ASCII narrow: 16 chars/iter when Vector256.IsHardwareAccelerated.
+    /// • Phase 1c — Vector128 ASCII narrow: 16 chars/iter (two 8-char loads) when Vector128.IsHardwareAccelerated.
+    /// • Phase 2 — 4-char ASCII batch: OR-mask test <c>(c0 | c1 | c2 | c3) &amp; 0xFF80 == 0</c>.
+    /// • Phase 3 — Scalar encode: 1-byte (ASCII), 2-byte (U+0080–U+07FF), 3-byte (rest of the BMP),
+    ///   4-byte (supplementary plane via a UTF-16 surrogate pair).
+    /// </para>
+    /// Writes to <paramref name="dst"/> are not bounds-checked: the caller must size it for the worst
+    /// case of 3 bytes per UTF-16 code unit (a surrogate pair yields 4 bytes for 2 code units).
+    /// </remarks>
+    /// <param name="src">UTF-16 text to encode.</param>
+    /// <param name="dst">Destination buffer for the UTF-8 bytes.</param>
+    /// <returns>The number of bytes written to <paramref name="dst"/>.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static int EncodeUtf8SinglePass(ReadOnlySpan<char> src, Span<byte> dst)
+    {
+        int srcIdx = 0, dstIdx = 0;
+        ref char srcRefChar = ref MemoryMarshal.GetReference(src);
+        ref ushort srcRefU16 = ref Unsafe.As<char, ushort>(ref srcRefChar);
+        ref byte dstRef = ref MemoryMarshal.GetReference(dst);
+
+        // Phase 1a — Vector512 ASCII narrow (32 chars/iter). Avx512BW.IsSupported is a JIT-time
+        // constant, so hosts without AVX-512BW get this branch removed from the generated code.
+        if (Avx512BW.IsSupported)
+        {
+            var asciiMask512 = Vector512.Create((ushort)0xFF80);
+            while (src.Length - srcIdx >= Vector512<ushort>.Count) // 32 chars per Vector512
+            {
+                var v = Vector512.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
+                // ASCII detect: any char's high bits set (>= 0x80)?
+                if ((v & asciiMask512) != Vector512<ushort>.Zero) break;
+                // Narrow 32 ushorts (Vector512) → 32 bytes (Vector256) from the two 256-bit halves.
+                var bytes = Vector256.Narrow(v.GetLower(), v.GetUpper());
+                bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
+                srcIdx += Vector512<ushort>.Count;
+                dstIdx += Vector512<ushort>.Count; // 32 chars → 32 bytes (1:1 for ASCII)
+            }
+        }
+
+        // Phase 1b — Vector256 ASCII narrow (16 chars/iter); also picks up a 16–31 char ASCII tail left by Phase 1a.
+        if (Vector256.IsHardwareAccelerated)
+        {
+            var asciiMask = Vector256.Create((ushort)0xFF80);
+            while (src.Length - srcIdx >= Vector256<ushort>.Count) // 16 chars per Vector256
+            {
+                var v = Vector256.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
+                // ASCII detect: any char's high bits set (>= 0x80)?
+                if ((v & asciiMask) != Vector256<ushort>.Zero) break;
+                // Narrow 16 ushorts (Vector256) → 16 bytes (Vector128) via two halves
+                var bytes = Vector128.Narrow(v.GetLower(), v.GetUpper());
+                bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
+                srcIdx += Vector256<ushort>.Count;
+                dstIdx += Vector256<ushort>.Count; // 16 chars → 16 bytes (1:1 for ASCII)
+            }
+        }
+
+        // Phase 1c — Vector128 ASCII narrow: two 8-char loads → one 16-byte store per iteration (NEON, WASM SIMD, SSE2).
+        // Same 16-char stride as Phase 1b, so it only adds coverage on hosts without Vector256 acceleration.
+        if (Vector128.IsHardwareAccelerated)
+        {
+            var asciiMask128 = Vector128.Create((ushort)0xFF80);
+            while (src.Length - srcIdx >= 16) // 16 chars = 2 × Vector128<ushort>
+            {
+                var lo = Vector128.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
+                var hi = Vector128.LoadUnsafe(ref srcRefU16, (uint)(srcIdx + 8));
+                // ASCII detect: any char's high bits set in either half?
+                if (((lo | hi) & asciiMask128) != Vector128<ushort>.Zero) break;
+                // Narrow 2× Vector128<ushort> (16 chars) → Vector128<byte> (16 bytes)
+                var bytes = Vector128.Narrow(lo, hi);
+                bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
+                srcIdx += 16;
+                dstIdx += 16;
+            }
+        }
+
+        // Phase 2/3 — scalar with 4-char ASCII batch
+        while (srcIdx < src.Length)
+        {
+            // 4-char ASCII batch: 4 chars → 4 bytes when all ASCII
+            if (src.Length - srcIdx >= 4)
+            {
+                var c0 = Unsafe.Add(ref srcRefChar, srcIdx);
+                var c1 = Unsafe.Add(ref srcRefChar, srcIdx + 1);
+                var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 2);
+                var c3 = Unsafe.Add(ref srcRefChar, srcIdx + 3);
+                if (((c0 | c1 | c2 | c3) & 0xFF80) == 0)
+                {
+                    Unsafe.Add(ref dstRef, dstIdx) = (byte)c0;
+                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)c1;
+                    Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)c2;
+                    Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)c3;
+                    srcIdx += 4;
+                    dstIdx += 4;
+                    continue;
+                }
+            }
+
+            // Scalar single-char encode
+            var c = Unsafe.Add(ref srcRefChar, srcIdx);
+            if (c < 0x80)
+            {
+                // 1-byte ASCII (U+0000–U+007F)
+                Unsafe.Add(ref dstRef, dstIdx++) = (byte)c;
+                srcIdx += 1;
+            }
+            else if (c < 0x800)
+            {
+                // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
+                // (Latin diacritics, Greek, Cyrillic, Hebrew, Arabic).
+                Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xC0 | (c >> 6));
+                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | (c & 0x3F));
+                dstIdx += 2;
+                srcIdx += 1;
+            }
+            else if ((c & 0xF800) != 0xD800)
+            {
+                // 3-byte: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF outside the surrogate range (CJK, other BMP scripts).
+                Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xE0 | (c >> 12));
+                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((c >> 6) & 0x3F));
+                Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | (c & 0x3F));
+                dstIdx += 3;
+                srcIdx += 1;
+            }
+            else if (c < 0xDC00 && srcIdx + 1 < src.Length && (Unsafe.Add(ref srcRefChar, srcIdx + 1) & 0xFC00) == 0xDC00)
+            {
+                // 4-byte: high surrogate (0xD800–0xDBFF) + low surrogate (0xDC00–0xDFFF) → U+10000–U+10FFFF.
+                var low = Unsafe.Add(ref srcRefChar, srcIdx + 1);
+                var codepoint = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
+                Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xF0 | (codepoint >> 18));
+                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((codepoint >> 12) & 0x3F));
+                Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | ((codepoint >> 6) & 0x3F));
+                Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(0x80 | (codepoint & 0x3F));
+                dstIdx += 4;
+                srcIdx += 2; // consumed 2 chars (surrogate pair)
+            }
+            else
+            {
+                // Unpaired surrogate (lone low, or high without a following low): emit U+FFFD like Encoding.UTF8 does.
+                Unsafe.Add(ref dstRef, dstIdx) = 0xEF;
+                Unsafe.Add(ref dstRef, dstIdx + 1) = 0xBF;
+                Unsafe.Add(ref dstRef, dstIdx + 2) = 0xBD;
+                dstIdx += 3;
+                srcIdx += 1; // only the stray surrogate is consumed; the next char is encoded normally
+            }
+        }
+
+        return dstIdx;
+    }
+
+    /// <summary>
+    /// Counts the UTF-16 code units that decoding the given UTF-8 byte span produces.
+    /// </summary>
+    /// <remarks>
+    /// Tiers: Vector512 (64 bytes/iter, Avx512BW) → Vector256 (32 bytes/iter) → Vector128 (16 bytes/iter)
+    /// → scalar tail; each lower tier also consumes what the tier above left over. Per block of N bytes
+    /// the result is <c>(N - popcount(continuationMask)) + popcount(fourByteStartMask)</c>.
+    /// <para>
+    /// Char-count rules (no validation — the count is only meaningful for well-formed UTF-8):
+    /// • Continuation bytes (10xxxxxx, 0x80–0xBF) produce no char.
+    /// • Every non-continuation byte produces 1 char.
+    /// • 4-byte start bytes (11110xxx, 0xF0–0xF7) produce 1 EXTRA char (UTF-16 surrogate pair).
+    /// </para>
+    /// </remarks>
+    /// <param name="bytes">UTF-8 encoded bytes.</param>
+    /// <returns>
+    /// The number of UTF-16 chars needed to hold the decoded text.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static int CountUtf8Chars(ReadOnlySpan<byte> bytes)
+    {
+        var count = 0;
+        var i = 0;
+        ref var bytesRef = ref MemoryMarshal.GetReference(bytes);
+
+        // SIMD path 1: 64 bytes/iter via Vector512 (AVX-512BW hosts)
+        if (Avx512BW.IsSupported && bytes.Length >= 64)
+        {
+            var contMask512 = Vector512.Create((byte)0xC0);
+            var contValue512 = Vector512.Create((byte)0x80);
+            var fourByteMask512 = Vector512.Create((byte)0xF8);
+            var fourByteValue512 = Vector512.Create((byte)0xF0);
+
+            do
+            {
+                var v = Vector512.LoadUnsafe(ref bytesRef, (uint)i);
+
+                // Non-continuation count: 64 - popcount(continuation byte mask)
+                var contMatches = Vector512.Equals(v & contMask512, contValue512);
+                var contBits = contMatches.ExtractMostSignificantBits(); // ulong
+                count += 64 - BitOperations.PopCount(contBits);
+
+                // 4-byte start count: popcount(fourByte start byte mask)
+                var fourByteMatches = Vector512.Equals(v & fourByteMask512, fourByteValue512);
+                var fourByteBits = fourByteMatches.ExtractMostSignificantBits();
+                count += BitOperations.PopCount(fourByteBits);
+
+                i += 64;
+            } while (bytes.Length - i >= 64);
+        }
+
+        // SIMD path 2: 32 bytes/iter via Vector256 (AVX2 hosts, also handles AVX-512 tail < 64)
+        if (Vector256.IsHardwareAccelerated && bytes.Length - i >= 32)
+        {
+            var contMask = Vector256.Create((byte)0xC0);
+            var contValue = Vector256.Create((byte)0x80);
+            var fourByteMask = Vector256.Create((byte)0xF8);
+            var fourByteValue = Vector256.Create((byte)0xF0);
+
+            do
+            {
+                var v = Vector256.LoadUnsafe(ref bytesRef, (uint)i);
+
+                // Non-continuation count: 32 - popcount(continuation byte mask)
+                var contMatches = Vector256.Equals(v & contMask, contValue);
+                var contBits = contMatches.ExtractMostSignificantBits();
+                count += 32 - BitOperations.PopCount(contBits);
+
+                // 4-byte start count: popcount(fourByte start byte mask)
+                var fourByteMatches = Vector256.Equals(v & fourByteMask, fourByteValue);
+                var fourByteBits = fourByteMatches.ExtractMostSignificantBits();
+                count += BitOperations.PopCount(fourByteBits);
+
+                i += 32;
+            } while (bytes.Length - i >= 32);
+        }
+
+        // SIMD path 3: 16 bytes/iter via Vector128 (Apple Silicon NEON, WASM SIMD, legacy SSE2;
+        // also handles tail < 32 from higher tiers). Cross-platform — Vector128.IsHardwareAccelerated
+        // returns true on any host with a 128-bit SIMD ISA (NEON / SSE2 / WASM SIMD).
+        if (Vector128.IsHardwareAccelerated && bytes.Length - i >= 16)
+        {
+            var contMask128 = Vector128.Create((byte)0xC0);
+            var contValue128 = Vector128.Create((byte)0x80);
+            var fourByteMask128 = Vector128.Create((byte)0xF8);
+            var fourByteValue128 = Vector128.Create((byte)0xF0);
+
+            do
+            {
+                var v = Vector128.LoadUnsafe(ref bytesRef, (uint)i);
+
+                // Non-continuation count: 16 - popcount(continuation byte mask)
+                var contMatches = Vector128.Equals(v & contMask128, contValue128);
+                var contBits = contMatches.ExtractMostSignificantBits();
+                count += 16 - BitOperations.PopCount(contBits);
+
+                // 4-byte start count: popcount(fourByte start byte mask)
+                var fourByteMatches = Vector128.Equals(v & fourByteMask128, fourByteValue128);
+                var fourByteBits = fourByteMatches.ExtractMostSignificantBits();
+                count += BitOperations.PopCount(fourByteBits);
+
+                i += 16;
+            } while (bytes.Length - i >= 16);
+        }
+
+        // Scalar tail (and fallback for non-SIMD hardware)
+        for (; i < bytes.Length; i++)
+        {
+            var b = Unsafe.Add(ref bytesRef, i);
+            if ((b & 0xC0) != 0x80) count++; // non-continuation byte
+            if ((b & 0xF8) == 0xF0) count++; // 4-byte start: extra char for surrogate pair
+        }
+        return count;
+    }
+
+    /// <summary>
+    /// Single-pass UTF-8 → UTF-16 decoder. Returns the actual char count written to <paramref name="dst"/>.
+    /// </summary>
+    /// <remarks>
+    /// Layered for throughput on mixed content:
+    /// • Phase 1 — Vector256 ASCII prefix widen: 32 bytes/iter while no byte has its top bit set;
+    ///   <c>Vector256.Widen</c> yields two Vector256&lt;ushort&gt; halves = 32 chars. Covers only the
+    ///   leading ASCII run: it stops at the first vector containing a non-ASCII byte and is not re-entered.
+    /// • Phase 2 — DWORD ASCII batch: with ≥4 bytes left, read a uint and test
+    ///   <c>(dword &amp; 0x80808080u) == 0</c>; on hit, widen 4 bytes to 4 chars and continue.
+    /// • Phase 3 — Scalar branch on the lead byte: 1-byte (ASCII), 2-byte, 3-byte (BMP), 4-byte
+    ///   (supplementary plane → surrogate pair). Direct bit-extract.
+    /// <para>
+    /// No validation and no bounds checks: a malformed or truncated sequence makes the scalar branch read
+    /// past <paramref name="src"/> and can write past <paramref name="dst"/>. Pass only trusted, well-formed
+    /// UTF-8, with <paramref name="dst"/> sized for the decoded text (e.g. via CountUtf8Chars).
+    /// </para>
+    /// </remarks>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static int DecodeUtf8SinglePass(ReadOnlySpan<byte> src, Span<char> dst)
+    {
+        int srcIdx = 0, dstIdx = 0;
+        ref byte srcRef = ref MemoryMarshal.GetReference(src);
+        ref ushort dstRef = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(dst));
+
+        // Phase 1 — Vector256 ASCII prefix bulk widen (32 bytes/iter)
+        if (Vector256.IsHardwareAccelerated)
+        {
+            while (src.Length - srcIdx >= Vector256<byte>.Count)
+            {
+                var v = Vector256.LoadUnsafe(ref srcRef, (uint)srcIdx);
+                // ASCII detect: any high bit set among the 32 bytes?
+                if (v.ExtractMostSignificantBits() != 0) break;
+
+                // Widen 32 bytes → 2 × Vector256<ushort> (32 chars total); upper half lands 16 chars after lower
+                var (lower, upper) = Vector256.Widen(v);
+                lower.StoreUnsafe(ref dstRef, (uint)dstIdx);
+                upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector256<ushort>.Count));
+                srcIdx += Vector256<byte>.Count;
+                dstIdx += Vector256<byte>.Count; // 32 bytes → 32 chars
+            }
+        }
+
+        // Phase 2/3 — scalar loop with DWORD ASCII batch
+        while (srcIdx < src.Length)
+        {
+            // DWORD ASCII batch: 4 ASCII bytes → 4 chars per iter (shift-based extraction assumes little-endian)
+            if (src.Length - srcIdx >= 4)
+            {
+                var dword = Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref srcRef, srcIdx));
+                if ((dword & 0x80808080u) == 0)
+                {
+                    Unsafe.Add(ref dstRef, dstIdx) = (byte)dword;
+                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(dword >> 8);
+                    Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(dword >> 16);
+                    Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(dword >> 24);
+                    srcIdx += 4;
+                    dstIdx += 4;
+                    continue;
+                }
+            }
+
+            // Scalar multi-byte branch, dispatched on the lead byte
+            var b0 = Unsafe.Add(ref srcRef, srcIdx);
+            switch (b0)
+            {
+                case < 0x80:
+                    // 1-byte ASCII (U+0000–U+007F)
+                    Unsafe.Add(ref dstRef, dstIdx++) = b0;
+                    srcIdx += 1;
+                    break;
+                case < 0xE0:
+                {
+                    // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
+                    // Stray continuation bytes (0x80–0xBF) also land here — input is trusted, not validated.
+                    var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
+                    Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x1F) << 6) | (b1 & 0x3F));
+                    srcIdx += 2;
+                    break;
+                }
+                case < 0xF0:
+                {
+                    // 3-byte: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF
+                    // CJK BMP, various other scripts.
+                    var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
+                    var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
+                    Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
+                    srcIdx += 3;
+                    break;
+                }
+                default:
+                {
+                    // 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx → U+10000–U+10FFFF
+                    // Supplementary plane (emoji, rare CJK ext) → UTF-16 surrogate pair.
+                    var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
+                    var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
+                    var b3 = Unsafe.Add(ref srcRef, srcIdx + 3);
+                    var codepoint = ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
+                    codepoint -= 0x10000;
+                    Unsafe.Add(ref dstRef, dstIdx) = (ushort)(0xD800 | (codepoint >> 10));
+                    Unsafe.Add(ref dstRef, dstIdx + 1) = (ushort)(0xDC00 | (codepoint & 0x3FF));
+                    dstIdx += 2;
+                    srcIdx += 4;
+                    break;
+                }
+            }
+        }
+
+        return dstIdx;
+    }
+}
diff --git a/AyCode.Core/docs/BINARY/BINARY_TODO.md b/AyCode.Core/docs/BINARY/BINARY_TODO.md
index b3358da..81aa3f3 100644
--- a/AyCode.Core/docs/BINARY/BINARY_TODO.md
+++ b/AyCode.Core/docs/BINARY/BINARY_TODO.md
@@ -693,9 +693,18 @@ JIT/AOT path-selection via `[Intrinsic]` `IsSupported` static booleans — non-s
| 1 | `CountUtf8Chars` (decode 1st pass) | ✅ done | ✅ existing | ✅ done | ✅ existing |
| 2 | `EncodeUtf8SinglePass` Phase 1 (ASCII narrow) | ✅ done | ✅ existing | ✅ done | ✅ existing |
| 3a | `DecodeUtf8SinglePass` multi-byte transcoder (Vector512) | ⏳ TODO | bail-out only | bail-out only | ✅ existing |
-| 3b | `DecodeUtf8SinglePass` multi-byte transcoder (Vector256) | — | ⏳ TODO | bail-out only | ✅ existing |
+| 3b | `DecodeUtf8SinglePass` multi-byte transcoder (Vector256) | — | 🔍 **deferred — see note** | bail-out only | ✅ existing |
| 3c | `DecodeUtf8SinglePass` multi-byte transcoder (Vector128) | — | — | ⏳ TODO | ✅ existing |
+**Note on Phase 3b (Vector256 / AVX2) — deferred, not dropped.** AVX2 lacks the AVX-512BW primitives (`CompareEqualMask` producing a `__mmask` k-register, cross-lane `vpermb`, mask-driven `vpcompressb`) that make the classify-mask-compress-widen pipeline efficient. On AVX2, `vpshufb` shuffles only within each 128-bit lane, so a cross-lane `Vector256.Shuffle` has to be composed from two `vpshufb` operations, which complicates leader-byte extraction for multi-byte sequences that span the lane boundary. The simdutf C++ project — the canonical reference for this algorithm class — implements only **SSE4 (16-byte)** and **AVX-512 (64-byte)** paths; it explicitly skips AVX2 because the implementation cost-benefit is unfavorable for this algorithm.
+
+On AVX2 hosts, the Phase 3c (Vector128) transcoder runs as the primary multi-byte path AND as the tail handler — covering AVX2 hosts at 16 bytes/iter, which is already a significant win over the current scalar multi-byte branch. Phase 3b would require either:
+
+1. Hand-rolling an AVX2-specific 32-byte algorithm with cross-lane permute workarounds (research-grade complexity, uncertain net win — could be SLOWER than the Vector128 path due to cross-lane shuffle latency)
+2. Waiting for `Avx10v1` / `Avx10v2` to expose AVX-512BW-class primitives in 256-bit form (Intel's unified vector ISA — `Avx10v1` already in .NET 9, `Avx10v2` arrives with future Intel hardware)
+
+**Re-evaluation triggers:** if benchmarks on AVX2 hosts show that the Phase 3c Vector128 path leaves a > 10% Deser gap vs MemPack on multi-byte content; or if the `Avx10v1` 256-bit primitives mature enough to make the algorithm tractable. Until then: **Phase 3b stays in the TODO as a research / future-work item** — not actively scheduled, but documented so a future contributor doesn't re-derive the AVX2 limitations.
+
**Phase 3 is the remaining gap — UTF-8 multi-byte decode on every host class**. The ASCII path is already fast across all SIMD tiers (Vector256 + Vector128 prefix widen + `Encoding.Latin1.GetString` BCL fast path). The gap is on **multi-byte UTF-8 content** — Hungarian / Cyrillic / Greek (2-byte) and CJK BMP (3-byte) sequences — where the SIMD prefix bails out on the first non-ASCII byte and falls back to scalar bit-extract. The Repeated benchmark cell (Hungarian content) is the canonical witness; with all-Hungarian content (current bench data), Small / Repeated Deser cells trail MemPack by 6-14%.
**Why all 3 SIMD tiers (not just AVX-512BW)** — public NuGet package goal: i18n payloads must be fast on every supported host (cloud server, desktop, mobile, Blazor WASM), not only AVX-512-capable cloud servers. Our own scalar multi-byte branch is the bottleneck on **all** non-ASCII content regardless of host class. The BCL `Encoding.UTF8` falls back to a similar scalar path on multi-byte content (with virtual dispatch + EncoderFallback overhead), so even where the BCL has its own SIMD 2-byte handler (.NET 9 PR #92580), our trust-input scalar path wins on net — but our own SIMD multi-byte path would dominate on every host.