AyCode.Core/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs

603 lines
31 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace AyCode.Core.Serializers.Binaries;
/// <summary>
/// In-house UTF-8 / UTF-16 transcoder used by the binary serializer hot path. Bypasses
/// <see cref="System.Text.Encoding.UTF8"/> (virtual dispatch plus encoder/decoder fallback machinery).
///
/// <para><b>Input contract.</b> Trust-input semantics: sequences are not validated. The only checks
/// performed are bounds checks on look-ahead reads, so that an unpaired UTF-16 surrogate or a
/// truncated trailing UTF-8 sequence is replaced by U+FFFD instead of reading past the end of the
/// source span. Destination capacity is always the caller's responsibility — writes to
/// <c>dst</c> are not bounds-checked.</para>
///
/// <para><b>SIMD tiers</b> (each tier also consumes the tail left over by the wider tier above it):</para>
/// <list type="bullet">
/// <item>Vector512, gated on <c>Avx512BW.IsSupported</c> — encode and both counting helpers.</item>
/// <item>Vector256, gated on <c>Vector256.IsHardwareAccelerated</c> — all four methods.</item>
/// <item>Vector128, gated on <c>Vector128.IsHardwareAccelerated</c> (NEON / WASM SIMD / SSE2) — all four methods.</item>
/// <item>Scalar — final tail and fallback when no SIMD tier is available.</item>
/// </list>
///
/// <para>The tier guards are JIT-time constants, so tiers that the host does not support are
/// expected to be removed from the generated code.</para>
///
/// <para>Algorithm reference: see <c>BINARY_TODO.md#accore-bin-t-v4n2</c> for the multi-tier SIMD
/// transcoder design and per-tier acceptance criteria.</para>
/// </summary>
internal static class Utf8Transcoder
{
    /// <summary>U+FFFD REPLACEMENT CHARACTER, emitted by the decoder for a truncated trailing sequence.</summary>
    private const ushort ReplacementChar = 0xFFFD;

    /// <summary>
    /// Single-pass UTF-16 to UTF-8 encoder. Counterpart of <see cref="DecodeUtf8SinglePass"/>.
    /// </summary>
    /// <remarks>
    /// Phases, in order:
    /// <list type="bullet">
    /// <item><b>1a — Vector512 ASCII narrow:</b> 32 chars per iteration while every char is below 0x80.</item>
    /// <item><b>1b — Vector256 ASCII narrow:</b> 16 chars per iteration.</item>
    /// <item><b>1c — Vector128 ASCII narrow:</b> 16 chars per iteration (two 8-char loads).</item>
    /// <item><b>2 — 4-char ASCII batch:</b> <c>((c0 | c1 | c2 | c3) &amp; 0xFF80) == 0</c> → four byte stores.</item>
    /// <item><b>3 — Scalar encode:</b> 1-, 2-, 3-byte forms and the 4-byte form for a surrogate pair.</item>
    /// </list>
    /// The SIMD phases only run on the leading ASCII prefix; after the first non-ASCII char the
    /// remaining input is handled by phases 2 and 3.
    ///
    /// <para><b>Unpaired surrogates.</b> A .NET string or char span may legally contain a surrogate
    /// code unit without its partner. Such a unit is encoded as U+FFFD (<c>EF BF BD</c>) and exactly
    /// one char is consumed — the same substitution the default <see cref="System.Text.Encoding.UTF8"/>
    /// performs. No read past the end of <paramref name="src"/> occurs.</para>
    ///
    /// <para><b>Capacity.</b> Writes are not bounds-checked. At most 3 bytes are produced per UTF-16
    /// code unit (a surrogate pair is 2 units → 4 bytes), so <c>src.Length * 3</c> is always
    /// sufficient. When <paramref name="dst"/> is sized with <see cref="GetUtf8ByteCount"/> the input
    /// must be well-formed UTF-16; for ill-formed input the two methods can disagree.</para>
    /// </remarks>
    /// <param name="src">UTF-16 code units to encode.</param>
    /// <param name="dst">Destination buffer; see the capacity note above.</param>
    /// <returns>The number of bytes written to <paramref name="dst"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static int EncodeUtf8SinglePass(ReadOnlySpan<char> src, Span<byte> dst)
    {
        int srcIdx = 0, dstIdx = 0;
        ref char srcRefChar = ref MemoryMarshal.GetReference(src);
        ref ushort srcRefU16 = ref Unsafe.As<char, ushort>(ref srcRefChar);
        ref byte dstRef = ref MemoryMarshal.GetReference(dst);

        // Phase 1a — Vector512 ASCII narrow (32 chars per iteration).
        if (Avx512BW.IsSupported)
        {
            var asciiMask512 = Vector512.Create((ushort)0xFF80);
            while (src.Length - srcIdx >= Vector512<ushort>.Count)
            {
                var v = Vector512.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
                // Any bit above the low 7 set in any lane means a non-ASCII char: leave the fast path.
                if ((v & asciiMask512) != Vector512<ushort>.Zero) break;
                // Narrow 32 ushorts to 32 bytes via the two 256-bit halves.
                var bytes = Vector256.Narrow(v.GetLower(), v.GetUpper());
                bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
                srcIdx += Vector512<ushort>.Count;
                dstIdx += Vector512<ushort>.Count; // ASCII is 1 char : 1 byte
            }
        }

        // Phase 1b — Vector256 ASCII narrow (16 chars per iteration; also the tail of phase 1a).
        if (Vector256.IsHardwareAccelerated)
        {
            var asciiMask = Vector256.Create((ushort)0xFF80);
            while (src.Length - srcIdx >= Vector256<ushort>.Count)
            {
                var v = Vector256.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
                if ((v & asciiMask) != Vector256<ushort>.Zero) break;
                // Narrow 16 ushorts to 16 bytes via the two 128-bit halves.
                var bytes = Vector128.Narrow(v.GetLower(), v.GetUpper());
                bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
                srcIdx += Vector256<ushort>.Count;
                dstIdx += Vector256<ushort>.Count;
            }
        }

        // Phase 1c — Vector128 ASCII narrow. Two 8-char loads are narrowed into one 16-byte store.
        if (Vector128.IsHardwareAccelerated)
        {
            var asciiMask128 = Vector128.Create((ushort)0xFF80);
            while (src.Length - srcIdx >= 16)
            {
                var lo = Vector128.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
                var hi = Vector128.LoadUnsafe(ref srcRefU16, (uint)(srcIdx + 8));
                if (((lo | hi) & asciiMask128) != Vector128<ushort>.Zero) break;
                var bytes = Vector128.Narrow(lo, hi);
                bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
                srcIdx += 16;
                dstIdx += 16;
            }
        }

        // Phases 2 and 3 — 4-char ASCII batch, then per-char scalar encode.
        while (srcIdx < src.Length)
        {
            if (src.Length - srcIdx >= 4)
            {
                var c0 = Unsafe.Add(ref srcRefChar, srcIdx);
                var c1 = Unsafe.Add(ref srcRefChar, srcIdx + 1);
                var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 2);
                var c3 = Unsafe.Add(ref srcRefChar, srcIdx + 3);
                if (((c0 | c1 | c2 | c3) & 0xFF80) == 0)
                {
                    Unsafe.Add(ref dstRef, dstIdx) = (byte)c0;
                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)c1;
                    Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)c2;
                    Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)c3;
                    srcIdx += 4;
                    dstIdx += 4;
                    continue;
                }
            }

            var c = Unsafe.Add(ref srcRefChar, srcIdx);
            if (c < 0x80)
            {
                // 1 byte: U+0000..U+007F
                Unsafe.Add(ref dstRef, dstIdx++) = (byte)c;
                srcIdx += 1;
            }
            else if (c < 0x800)
            {
                // 2 bytes: 110xxxxx 10xxxxxx, U+0080..U+07FF
                Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xC0 | (c >> 6));
                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | (c & 0x3F));
                dstIdx += 2;
                srcIdx += 1;
            }
            else if ((c & 0xF800) != 0xD800)
            {
                // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx, U+0800..U+FFFF outside the surrogate range
                Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xE0 | (c >> 12));
                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((c >> 6) & 0x3F));
                Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | (c & 0x3F));
                dstIdx += 3;
                srcIdx += 1;
            }
            else
            {
                // Surrogate code unit (0xD800..0xDFFF). A 4-byte sequence is only produced for a
                // high surrogate (0xD800..0xDBFF) that is immediately followed, inside the span,
                // by a low surrogate (0xDC00..0xDFFF).
                if (c < 0xDC00 && src.Length - srcIdx >= 2)
                {
                    var low = Unsafe.Add(ref srcRefChar, srcIdx + 1);
                    if ((low & 0xFC00) == 0xDC00)
                    {
                        // 4 bytes: U+10000..U+10FFFF
                        var codepoint = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
                        Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xF0 | (codepoint >> 18));
                        Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((codepoint >> 12) & 0x3F));
                        Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | ((codepoint >> 6) & 0x3F));
                        Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(0x80 | (codepoint & 0x3F));
                        dstIdx += 4;
                        srcIdx += 2; // both halves of the pair consumed
                        continue;
                    }
                }

                // Unpaired surrogate: substitute U+FFFD (EF BF BD) and consume just this unit, so
                // the following char is encoded normally and nothing is read beyond the span.
                Unsafe.Add(ref dstRef, dstIdx) = 0xEF;
                Unsafe.Add(ref dstRef, dstIdx + 1) = 0xBF;
                Unsafe.Add(ref dstRef, dstIdx + 2) = 0xBD;
                dstIdx += 3;
                srcIdx += 1;
            }
        }

        return dstIdx;
    }

    /// <summary>
    /// Computes the UTF-8 byte length of the given UTF-16 char span without encoding it.
    /// Encode-side counterpart of <see cref="CountUtf8Chars"/>.
    /// </summary>
    /// <remarks>
    /// <para>For well-formed UTF-16 (every high surrogate immediately followed by a low surrogate)
    /// the result equals the value returned by <see cref="EncodeUtf8SinglePass"/>. For ill-formed
    /// input the two can differ: this method never inspects pairing, whereas the encoder emits
    /// 3 bytes (U+FFFD) for each unpaired surrogate.</para>
    ///
    /// <para>Per-char contribution used by every tier, including the scalar tail:</para>
    /// <list type="bullet">
    /// <item><c>c &lt; 0x80</c> → 1 byte</item>
    /// <item><c>0x80 ≤ c &lt; 0x800</c> → 2 bytes</item>
    /// <item><c>0x800 ≤ c &lt; 0xD800</c> or <c>c ≥ 0xE000</c> → 3 bytes</item>
    /// <item><c>0xD800 ≤ c &lt; 0xDC00</c> (high surrogate) → 4 bytes (owns the whole pair)</item>
    /// <item><c>0xDC00 ≤ c &lt; 0xE000</c> (low surrogate) → 0 bytes</item>
    /// </list>
    ///
    /// <para>SIMD blocks take five threshold popcounts (&lt; 0x80, &lt; 0x800, &lt; 0xD800,
    /// &lt; 0xDC00, &lt; 0xE000) and aggregate them in closed form:
    /// <c>bytes = 3*N - ascii - c_lt_0x800 + highSur - 3*lowSur</c>, with
    /// <c>highSur = lt(0xDC00) - lt(0xD800)</c> and <c>lowSur = lt(0xE000) - lt(0xDC00)</c>.</para>
    ///
    /// <para>High and low surrogates are counted independently on purpose: a pair may straddle a
    /// block boundary, so within one block their counts are not necessarily equal. Because each
    /// char is accounted for on its own (high → 4, low → 0) the total is 4 bytes per pair wherever
    /// the boundary falls.</para>
    /// </remarks>
    /// <param name="src">UTF-16 code units to measure.</param>
    /// <returns>The UTF-8 byte count under the per-char model above.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static int GetUtf8ByteCount(ReadOnlySpan<char> src)
    {
        var byteCount = 0;
        var i = 0;
        var n = src.Length;
        ref ushort srcRef = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(src));

        // Tier 1: Vector512, 32 chars per iteration.
        if (Avx512BW.IsSupported && n >= Vector512<ushort>.Count)
        {
            var v_0x80 = Vector512.Create((ushort)0x80);
            var v_0x800 = Vector512.Create((ushort)0x800);
            var v_0xD800 = Vector512.Create((ushort)0xD800);
            var v_0xDC00 = Vector512.Create((ushort)0xDC00);
            var v_0xE000 = Vector512.Create((ushort)0xE000);
            do
            {
                var v = Vector512.LoadUnsafe(ref srcRef, (uint)i);
                var c_lt_0x80 = BitOperations.PopCount(Vector512.LessThan(v, v_0x80).ExtractMostSignificantBits());
                var c_lt_0x800 = BitOperations.PopCount(Vector512.LessThan(v, v_0x800).ExtractMostSignificantBits());
                var c_lt_0xD800 = BitOperations.PopCount(Vector512.LessThan(v, v_0xD800).ExtractMostSignificantBits());
                var c_lt_0xDC00 = BitOperations.PopCount(Vector512.LessThan(v, v_0xDC00).ExtractMostSignificantBits());
                var c_lt_0xE000 = BitOperations.PopCount(Vector512.LessThan(v, v_0xE000).ExtractMostSignificantBits());
                var highSur = c_lt_0xDC00 - c_lt_0xD800;
                var lowSur = c_lt_0xE000 - c_lt_0xDC00;
                byteCount += 3 * Vector512<ushort>.Count - c_lt_0x80 - c_lt_0x800 + highSur - 3 * lowSur;
                i += Vector512<ushort>.Count;
            } while (n - i >= Vector512<ushort>.Count);
        }

        // Tier 2: Vector256, 16 chars per iteration (also the tail of tier 1).
        if (Vector256.IsHardwareAccelerated && n - i >= Vector256<ushort>.Count)
        {
            var v_0x80 = Vector256.Create((ushort)0x80);
            var v_0x800 = Vector256.Create((ushort)0x800);
            var v_0xD800 = Vector256.Create((ushort)0xD800);
            var v_0xDC00 = Vector256.Create((ushort)0xDC00);
            var v_0xE000 = Vector256.Create((ushort)0xE000);
            do
            {
                var v = Vector256.LoadUnsafe(ref srcRef, (uint)i);
                var c_lt_0x80 = BitOperations.PopCount(Vector256.LessThan(v, v_0x80).ExtractMostSignificantBits());
                var c_lt_0x800 = BitOperations.PopCount(Vector256.LessThan(v, v_0x800).ExtractMostSignificantBits());
                var c_lt_0xD800 = BitOperations.PopCount(Vector256.LessThan(v, v_0xD800).ExtractMostSignificantBits());
                var c_lt_0xDC00 = BitOperations.PopCount(Vector256.LessThan(v, v_0xDC00).ExtractMostSignificantBits());
                var c_lt_0xE000 = BitOperations.PopCount(Vector256.LessThan(v, v_0xE000).ExtractMostSignificantBits());
                var highSur = c_lt_0xDC00 - c_lt_0xD800;
                var lowSur = c_lt_0xE000 - c_lt_0xDC00;
                byteCount += 3 * Vector256<ushort>.Count - c_lt_0x80 - c_lt_0x800 + highSur - 3 * lowSur;
                i += Vector256<ushort>.Count;
            } while (n - i >= Vector256<ushort>.Count);
        }

        // Tier 3: Vector128, 8 chars per iteration (also the tail of the wider tiers).
        if (Vector128.IsHardwareAccelerated && n - i >= Vector128<ushort>.Count)
        {
            var v_0x80 = Vector128.Create((ushort)0x80);
            var v_0x800 = Vector128.Create((ushort)0x800);
            var v_0xD800 = Vector128.Create((ushort)0xD800);
            var v_0xDC00 = Vector128.Create((ushort)0xDC00);
            var v_0xE000 = Vector128.Create((ushort)0xE000);
            do
            {
                var v = Vector128.LoadUnsafe(ref srcRef, (uint)i);
                var c_lt_0x80 = BitOperations.PopCount(Vector128.LessThan(v, v_0x80).ExtractMostSignificantBits());
                var c_lt_0x800 = BitOperations.PopCount(Vector128.LessThan(v, v_0x800).ExtractMostSignificantBits());
                var c_lt_0xD800 = BitOperations.PopCount(Vector128.LessThan(v, v_0xD800).ExtractMostSignificantBits());
                var c_lt_0xDC00 = BitOperations.PopCount(Vector128.LessThan(v, v_0xDC00).ExtractMostSignificantBits());
                var c_lt_0xE000 = BitOperations.PopCount(Vector128.LessThan(v, v_0xE000).ExtractMostSignificantBits());
                var highSur = c_lt_0xDC00 - c_lt_0xD800;
                var lowSur = c_lt_0xE000 - c_lt_0xDC00;
                byteCount += 3 * Vector128<ushort>.Count - c_lt_0x80 - c_lt_0x800 + highSur - 3 * lowSur;
                i += Vector128<ushort>.Count;
            } while (n - i >= Vector128<ushort>.Count);
        }

        // Scalar tail (and the only path when no SIMD tier is available).
        // It MUST use the same per-char model as the SIMD tiers and advance one char at a time.
        // If it instead skipped two chars on a high surrogate, a pair split across the
        // SIMD/scalar boundary would be miscounted: the high half is already counted as 4 bytes
        // in the last SIMD block, and the low half must then contribute 0 here.
        while (i < n)
        {
            var c = Unsafe.Add(ref srcRef, i);
            if (c < 0x80)
            {
                byteCount += 1;
            }
            else if (c < 0x800)
            {
                byteCount += 2;
            }
            else if (c < 0xD800)
            {
                byteCount += 3; // BMP below the surrogate range
            }
            else if (c < 0xDC00)
            {
                byteCount += 4; // high surrogate owns the 4 bytes of the pair
            }
            else if (c < 0xE000)
            {
                // low surrogate: 0 bytes, already accounted for by its high surrogate
            }
            else
            {
                byteCount += 3; // BMP at or above 0xE000
            }
            i += 1;
        }

        return byteCount;
    }

    /// <summary>
    /// Counts the UTF-16 code units that decoding the given UTF-8 bytes produces.
    /// </summary>
    /// <remarks>
    /// Every tier applies the same two bit-pattern tests per byte:
    /// <list type="bullet">
    /// <item>Not a continuation byte (<c>(b &amp; 0xC0) != 0x80</c>) → 1 char.</item>
    /// <item>4-byte lead (<c>(b &amp; 0xF8) == 0xF0</c>, i.e. 0xF0..0xF7) → 1 additional char,
    /// because the code point decodes to a surrogate pair.</item>
    /// </list>
    /// Continuation bytes (0x80..0xBF) contribute nothing. Per SIMD block of N bytes the result is
    /// <c>(N - popcount(continuationMask)) + popcount(fourByteLeadMask)</c>.
    /// The input is not validated.
    /// </remarks>
    /// <param name="bytes">UTF-8 bytes to measure.</param>
    /// <returns>The number of UTF-16 code units under the rules above.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static int CountUtf8Chars(ReadOnlySpan<byte> bytes)
    {
        var count = 0;
        var i = 0;
        ref var bytesRef = ref MemoryMarshal.GetReference(bytes);

        // Tier 1: Vector512, 64 bytes per iteration.
        if (Avx512BW.IsSupported && bytes.Length >= 64)
        {
            var contMask512 = Vector512.Create((byte)0xC0);
            var contValue512 = Vector512.Create((byte)0x80);
            var fourByteMask512 = Vector512.Create((byte)0xF8);
            var fourByteValue512 = Vector512.Create((byte)0xF0);
            do
            {
                var v = Vector512.LoadUnsafe(ref bytesRef, (uint)i);
                // Non-continuation bytes: 64 minus the number of continuation bytes.
                var contMatches = Vector512.Equals(v & contMask512, contValue512);
                var contBits = contMatches.ExtractMostSignificantBits();
                count += 64 - BitOperations.PopCount(contBits);
                // One extra char per 4-byte lead.
                var fourByteMatches = Vector512.Equals(v & fourByteMask512, fourByteValue512);
                var fourByteBits = fourByteMatches.ExtractMostSignificantBits();
                count += BitOperations.PopCount(fourByteBits);
                i += 64;
            } while (bytes.Length - i >= 64);
        }

        // Tier 2: Vector256, 32 bytes per iteration (also the tail of tier 1).
        if (Vector256.IsHardwareAccelerated && bytes.Length - i >= 32)
        {
            var contMask = Vector256.Create((byte)0xC0);
            var contValue = Vector256.Create((byte)0x80);
            var fourByteMask = Vector256.Create((byte)0xF8);
            var fourByteValue = Vector256.Create((byte)0xF0);
            do
            {
                var v = Vector256.LoadUnsafe(ref bytesRef, (uint)i);
                var contMatches = Vector256.Equals(v & contMask, contValue);
                var contBits = contMatches.ExtractMostSignificantBits();
                count += 32 - BitOperations.PopCount(contBits);
                var fourByteMatches = Vector256.Equals(v & fourByteMask, fourByteValue);
                var fourByteBits = fourByteMatches.ExtractMostSignificantBits();
                count += BitOperations.PopCount(fourByteBits);
                i += 32;
            } while (bytes.Length - i >= 32);
        }

        // Tier 3: Vector128, 16 bytes per iteration (also the tail of the wider tiers).
        if (Vector128.IsHardwareAccelerated && bytes.Length - i >= 16)
        {
            var contMask128 = Vector128.Create((byte)0xC0);
            var contValue128 = Vector128.Create((byte)0x80);
            var fourByteMask128 = Vector128.Create((byte)0xF8);
            var fourByteValue128 = Vector128.Create((byte)0xF0);
            do
            {
                var v = Vector128.LoadUnsafe(ref bytesRef, (uint)i);
                var contMatches = Vector128.Equals(v & contMask128, contValue128);
                var contBits = contMatches.ExtractMostSignificantBits();
                count += 16 - BitOperations.PopCount(contBits);
                var fourByteMatches = Vector128.Equals(v & fourByteMask128, fourByteValue128);
                var fourByteBits = fourByteMatches.ExtractMostSignificantBits();
                count += BitOperations.PopCount(fourByteBits);
                i += 16;
            } while (bytes.Length - i >= 16);
        }

        // Scalar tail (and the only path when no SIMD tier is available).
        for (; i < bytes.Length; i++)
        {
            var b = Unsafe.Add(ref bytesRef, i);
            if ((b & 0xC0) != 0x80) count++; // any non-continuation byte starts a char
            if ((b & 0xF8) == 0xF0) count++; // 4-byte lead: second half of the surrogate pair
        }

        return count;
    }

    /// <summary>
    /// Single-pass UTF-8 to UTF-16 decoder. Counterpart of <see cref="EncodeUtf8SinglePass"/>.
    /// </summary>
    /// <remarks>
    /// Phases, in order:
    /// <list type="bullet">
    /// <item><b>1a — Vector256 ASCII widen:</b> 32 bytes per iteration while no byte has its top bit set.</item>
    /// <item><b>1b — Vector128 ASCII widen:</b> 16 bytes per iteration; covers hosts without 256-bit
    /// SIMD and the 16..31-byte tail of phase 1a.</item>
    /// <item><b>2 — DWORD ASCII batch:</b> when at least 4 bytes remain, read them as one
    /// <c>uint</c> and test <c>(dword &amp; 0x80808080u) == 0</c>; on a hit, emit 4 chars.</item>
    /// <item><b>3 — Scalar per-char decode:</b> a switch on the lead byte selects the 1-, 2-, 3- or
    /// 4-byte form (the last one produces a surrogate pair). Bits are extracted directly.</item>
    /// </list>
    /// The SIMD phases only run on the leading ASCII prefix; after the first non-ASCII byte the
    /// remaining input is handled by phases 2 and 3.
    ///
    /// <para><b>Validation.</b> Sequences are not validated: lead and continuation bytes are taken
    /// at face value (a stray continuation byte in lead position is decoded as a 2-byte lead).
    /// The single exception is a multi-byte sequence that is cut off by the end of
    /// <paramref name="src"/>: it yields one U+FFFD and decoding stops, so no byte beyond the span
    /// is read.</para>
    ///
    /// <para><b>Capacity.</b> Writes to <paramref name="dst"/> are not bounds-checked; the caller
    /// must provide enough room (for valid UTF-8, <see cref="CountUtf8Chars"/> chars).</para>
    ///
    /// <para>History: a run-length scalar decoder ("V4N2 Phase 2.5") was tried on 2026-05-07 and
    /// reverted for lack of a measurable benefit; the scalar phase is the per-char switch.</para>
    /// </remarks>
    /// <param name="src">UTF-8 bytes to decode.</param>
    /// <param name="dst">Destination buffer for UTF-16 code units.</param>
    /// <returns>The number of chars written to <paramref name="dst"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static int DecodeUtf8SinglePass(ReadOnlySpan<byte> src, Span<char> dst)
    {
        int srcIdx = 0, dstIdx = 0;
        ref byte srcRef = ref MemoryMarshal.GetReference(src);
        ref ushort dstRef = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(dst));

        // Phase 1a — Vector256 ASCII widen (32 bytes per iteration).
        if (Vector256.IsHardwareAccelerated)
        {
            while (src.Length - srcIdx >= Vector256<byte>.Count)
            {
                var v = Vector256.LoadUnsafe(ref srcRef, (uint)srcIdx);
                // A set top bit in any lane means a non-ASCII byte: leave the fast path.
                if (v.ExtractMostSignificantBits() != 0) break;
                // 32 bytes widen into two 16-lane ushort vectors; the upper one is stored
                // 16 chars after the lower one.
                var (lower, upper) = Vector256.Widen(v);
                lower.StoreUnsafe(ref dstRef, (uint)dstIdx);
                upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector256<ushort>.Count));
                srcIdx += Vector256<byte>.Count;
                dstIdx += Vector256<byte>.Count; // ASCII is 1 byte : 1 char
            }
        }

        // Phase 1b — Vector128 ASCII widen (16 bytes per iteration). Mirrors the Vector128 tier
        // of the other methods in this class.
        if (Vector128.IsHardwareAccelerated)
        {
            while (src.Length - srcIdx >= Vector128<byte>.Count)
            {
                var v = Vector128.LoadUnsafe(ref srcRef, (uint)srcIdx);
                if (v.ExtractMostSignificantBits() != 0) break;
                // 16 bytes widen into two 8-lane ushort vectors.
                var (lower, upper) = Vector128.Widen(v);
                lower.StoreUnsafe(ref dstRef, (uint)dstIdx);
                upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector128<ushort>.Count));
                srcIdx += Vector128<byte>.Count;
                dstIdx += Vector128<byte>.Count;
            }
        }

        // Phases 2 and 3 — DWORD ASCII batch, then per-char scalar decode.
        while (srcIdx < src.Length)
        {
            var remaining = src.Length - srcIdx;

            // Phase 2 — four ASCII bytes at once.
            if (remaining >= 4)
            {
                var dword = Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref srcRef, srcIdx));
                if ((dword & 0x80808080u) == 0)
                {
                    // NOTE(review): taking the low byte of the dword as the first source byte
                    // assumes a little-endian host.
                    Unsafe.Add(ref dstRef, dstIdx) = (byte)dword;
                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(dword >> 8);
                    Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(dword >> 16);
                    Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(dword >> 24);
                    srcIdx += 4;
                    dstIdx += 4;
                    continue;
                }
            }

            // Phase 3 — the lead byte selects the sequence length. Each multi-byte case first
            // checks that the whole sequence lies inside the span.
            var b0 = Unsafe.Add(ref srcRef, srcIdx);
            switch (b0)
            {
                case < 0x80:
                    // 1 byte: U+0000..U+007F (ASCII that the DWORD batch did not cover).
                    Unsafe.Add(ref dstRef, dstIdx++) = b0;
                    srcIdx += 1;
                    break;
                case < 0xE0:
                {
                    // 2 bytes: 110xxxxx 10xxxxxx, U+0080..U+07FF
                    if (remaining < 2) goto TruncatedTail;
                    var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
                    Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x1F) << 6) | (b1 & 0x3F));
                    srcIdx += 2;
                    break;
                }
                case < 0xF0:
                {
                    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx, U+0800..U+FFFF
                    if (remaining < 3) goto TruncatedTail;
                    var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
                    var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
                    Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
                    srcIdx += 3;
                    break;
                }
                default:
                {
                    // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx, U+10000..U+10FFFF,
                    // written as a UTF-16 surrogate pair.
                    if (remaining < 4) goto TruncatedTail;
                    var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
                    var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
                    var b3 = Unsafe.Add(ref srcRef, srcIdx + 3);
                    var codepoint = ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
                    codepoint -= 0x10000;
                    Unsafe.Add(ref dstRef, dstIdx) = (ushort)(0xD800 | (codepoint >> 10));
                    Unsafe.Add(ref dstRef, dstIdx + 1) = (ushort)(0xDC00 | (codepoint & 0x3FF));
                    dstIdx += 2;
                    srcIdx += 4;
                    break;
                }
            }
        }

        return dstIdx;

    TruncatedTail:
        // The final sequence needs more bytes than the span contains. Emit a single U+FFFD for
        // the whole remainder and stop, rather than reading beyond the end of src.
        Unsafe.Add(ref dstRef, dstIdx) = ReplacementChar;
        return dstIdx + 1;
    }
}