[LOADED_DOCS: 2 files, no new loads]
Refactor: extract UTF-8 transcoder to Utf8Transcoder. Moved all UTF-8/UTF-16 encoding, decoding, and char-counting logic from AcBinarySerializer/AcBinaryDeserializer into a new internal Utf8Transcoder class. Updated all call sites to use the new class. Removed the now-redundant private methods from the original classes. Updated BINARY_TODO.md to clarify the SIMD decode status and the rationale for deferring the AVX2 multi-byte SIMD path. No functional changes — pure refactor for maintainability and future SIMD work.
This commit is contained in:
parent
651e2a0b9f
commit
8f3bbeacc1
|
|
@ -484,242 +484,14 @@ public static partial class AcBinaryDeserializer
|
|||
var pos = _position;
|
||||
_position += byteLength;
|
||||
var src = _buffer.AsSpan(pos, byteLength);
|
||||
var charCount = CountUtf8Chars(src);
|
||||
var charCount = Utf8Transcoder.CountUtf8Chars(src);
|
||||
|
||||
return string.Create(charCount, (Buffer: _buffer, Pos: pos, Len: byteLength), static (chars, state) =>
|
||||
{
|
||||
DecodeUtf8SinglePass(state.Buffer.AsSpan(state.Pos, state.Len), chars);
|
||||
Utf8Transcoder.DecodeUtf8SinglePass(state.Buffer.AsSpan(state.Pos, state.Len), chars);
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
/// Computes how many UTF-16 code units the given UTF-8 bytes decode to, without decoding them.
/// </summary>
/// <remarks>
/// Each byte is classified by its leading bits only; the input is not validated:
/// <list type="bullet">
/// <item>Continuation byte (<c>10xxxxxx</c>): contributes nothing.</item>
/// <item>Any other byte: contributes one code unit.</item>
/// <item>4-byte lead (<c>11110xxx</c>): contributes one additional code unit (surrogate pair).</item>
/// </list>
/// Blocks of 64, 32 and 16 bytes are classified with Vector512, Vector256 and Vector128 respectively
/// (each tier also consumes what the wider tier left over); the remaining bytes are handled one at a time.
/// Per block the contribution is <c>N - popcount(continuations) + popcount(fourByteLeads)</c>.
/// </remarks>
/// <param name="bytes">UTF-8 encoded bytes.</param>
/// <returns>The UTF-16 length of the decoded text.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int CountUtf8Chars(ReadOnlySpan<byte> bytes)
{
    ref byte first = ref MemoryMarshal.GetReference(bytes);
    var length = bytes.Length;
    var offset = 0;
    var total = 0;

    // Tier 1: 64-byte blocks (AVX-512BW hosts).
    if (Avx512BW.IsSupported && length >= 64)
    {
        var topTwoBits = Vector512.Create((byte)0xC0);
        var continuationTag = Vector512.Create((byte)0x80);
        var topFiveBits = Vector512.Create((byte)0xF8);
        var fourByteLeadTag = Vector512.Create((byte)0xF0);

        for (; length - offset >= 64; offset += 64)
        {
            var block = Vector512.LoadUnsafe(ref first, (uint)offset);
            var continuations = Vector512.Equals(block & topTwoBits, continuationTag).ExtractMostSignificantBits();
            var fourByteLeads = Vector512.Equals(block & topFiveBits, fourByteLeadTag).ExtractMostSignificantBits();

            total += 64 - System.Numerics.BitOperations.PopCount(continuations);
            total += System.Numerics.BitOperations.PopCount(fourByteLeads);
        }
    }

    // Tier 2: 32-byte blocks (also picks up what tier 1 left behind).
    if (Vector256.IsHardwareAccelerated && length - offset >= 32)
    {
        var topTwoBits = Vector256.Create((byte)0xC0);
        var continuationTag = Vector256.Create((byte)0x80);
        var topFiveBits = Vector256.Create((byte)0xF8);
        var fourByteLeadTag = Vector256.Create((byte)0xF0);

        for (; length - offset >= 32; offset += 32)
        {
            var block = Vector256.LoadUnsafe(ref first, (uint)offset);
            var continuations = Vector256.Equals(block & topTwoBits, continuationTag).ExtractMostSignificantBits();
            var fourByteLeads = Vector256.Equals(block & topFiveBits, fourByteLeadTag).ExtractMostSignificantBits();

            total += 32 - System.Numerics.BitOperations.PopCount(continuations);
            total += System.Numerics.BitOperations.PopCount(fourByteLeads);
        }
    }

    // Tier 3: 16-byte blocks (any host with 128-bit SIMD; also picks up what tier 2 left behind).
    if (Vector128.IsHardwareAccelerated && length - offset >= 16)
    {
        var topTwoBits = Vector128.Create((byte)0xC0);
        var continuationTag = Vector128.Create((byte)0x80);
        var topFiveBits = Vector128.Create((byte)0xF8);
        var fourByteLeadTag = Vector128.Create((byte)0xF0);

        for (; length - offset >= 16; offset += 16)
        {
            var block = Vector128.LoadUnsafe(ref first, (uint)offset);
            var continuations = Vector128.Equals(block & topTwoBits, continuationTag).ExtractMostSignificantBits();
            var fourByteLeads = Vector128.Equals(block & topFiveBits, fourByteLeadTag).ExtractMostSignificantBits();

            total += 16 - System.Numerics.BitOperations.PopCount(continuations);
            total += System.Numerics.BitOperations.PopCount(fourByteLeads);
        }
    }

    // Scalar remainder, and the only path on hosts without SIMD acceleration.
    while (offset < length)
    {
        var current = Unsafe.Add(ref first, offset++);

        // Continuation bytes never start a character.
        if ((current & 0xC0) == 0x80)
        {
            continue;
        }

        // A 4-byte lead decodes to a surrogate pair (2 code units); every other lead is 1.
        total += (current & 0xF8) == 0xF0 ? 2 : 1;
    }

    return total;
}
|
||||
|
||||
/// <summary>
/// Single-pass UTF-8 → UTF-16 decoder. Returns the number of UTF-16 code units written to
/// <paramref name="dst"/>.
/// </summary>
/// <remarks>
/// Layered for throughput on mixed content:
/// <list type="bullet">
/// <item><b>Phase 1 — Vector256 ASCII prefix:</b> while a full 32-byte block has no high bit set, it is
/// widened with <see cref="Vector256.Widen(Vector256{byte})"/> into two 16-element halves and stored
/// as 32 chars. The loop stops at the first block containing a non-ASCII byte and is not re-entered.</item>
/// <item><b>Phase 2 — DWORD ASCII batch:</b> when at least 4 bytes remain and
/// <c>(dword &amp; 0x80808080u) == 0</c>, four bytes are widened to four chars at once.</item>
/// <item><b>Phase 3 — Scalar:</b> 1-, 2-, 3- and 4-byte sequences are decoded by direct bit extraction;
/// a 4-byte sequence produces a surrogate pair.</item>
/// </list>
/// The input is trusted: sequences are not validated, a byte in 0x80–0xBF at a lead position is decoded
/// as if it were a 2-byte lead, and neither <paramref name="src"/> nor <paramref name="dst"/> accesses
/// are bounds-checked. A truncated multi-byte sequence at the end of <paramref name="src"/> therefore
/// reads past the span.
/// </remarks>
/// <param name="src">UTF-8 bytes to decode.</param>
/// <param name="dst">Destination for the decoded UTF-16 code units; must be large enough for the
/// whole result (the visible caller sizes it with <c>CountUtf8Chars</c>).</param>
/// <returns>The number of chars written to <paramref name="dst"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int DecodeUtf8SinglePass(ReadOnlySpan<byte> src, Span<char> dst)
{
    int srcIdx = 0, dstIdx = 0;
    ref byte srcRef = ref MemoryMarshal.GetReference(src);
    ref ushort dstRef = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(dst));

    // Phase 1 — Vector256 ASCII prefix bulk widen (32 bytes/iter)
    if (Vector256.IsHardwareAccelerated)
    {
        while (src.Length - srcIdx >= Vector256<byte>.Count)
        {
            var v = Vector256.LoadUnsafe(ref srcRef, (uint)srcIdx);

            // Any high bit set means the block contains a non-ASCII byte: hand over to the scalar loop.
            if (v.ExtractMostSignificantBits() != 0) break;

            // Widen 32 bytes → 2 × Vector256<ushort>. Each half holds Vector256<ushort>.Count (16)
            // chars, so the upper half belongs 16 chars after the lower one. (Storing it at
            // Vector128<ushort>.Count = 8 would overwrite chars 8..15 of the lower half and leave
            // the last 8 chars of the block unwritten.)
            var (lower, upper) = Vector256.Widen(v);
            lower.StoreUnsafe(ref dstRef, (uint)dstIdx);
            upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector256<ushort>.Count));

            srcIdx += Vector256<byte>.Count;
            dstIdx += Vector256<byte>.Count; // 32 bytes → 32 chars
        }
    }

    // Phase 2/3 — scalar loop with DWORD ASCII batch
    while (srcIdx < src.Length)
    {
        // DWORD ASCII batch: 4 ASCII bytes → 4 chars per iteration
        if (src.Length - srcIdx >= 4)
        {
            var dword = Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref srcRef, srcIdx));
            if ((dword & 0x80808080u) == 0)
            {
                // The low-order byte of the dword is emitted first, i.e. this maps memory order to
                // char order on little-endian hosts.
                Unsafe.Add(ref dstRef, dstIdx) = (byte)dword;
                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(dword >> 8);
                Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(dword >> 16);
                Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(dword >> 24);
                srcIdx += 4;
                dstIdx += 4;
                continue;
            }
        }

        // Scalar branch keyed on the lead byte
        var b0 = Unsafe.Add(ref srcRef, srcIdx);
        switch (b0)
        {
            case < 0x80:
                // 1-byte ASCII (U+0000–U+007F)
                Unsafe.Add(ref dstRef, dstIdx++) = b0;
                srcIdx += 1;
                break;

            case < 0xE0:
            {
                // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
                var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
                Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x1F) << 6) | (b1 & 0x3F));
                srcIdx += 2;
                break;
            }

            case < 0xF0:
            {
                // 3-byte: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF
                var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
                var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
                Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
                srcIdx += 3;
                break;
            }

            default:
            {
                // 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx → U+10000–U+10FFFF,
                // written as a UTF-16 surrogate pair.
                var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
                var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
                var b3 = Unsafe.Add(ref srcRef, srcIdx + 3);
                var codepoint = ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
                codepoint -= 0x10000;
                Unsafe.Add(ref dstRef, dstIdx) = (ushort)(0xD800 | (codepoint >> 10));
                Unsafe.Add(ref dstRef, dstIdx + 1) = (ushort)(0xDC00 | (codepoint & 0x3FF));
                dstIdx += 2;
                srcIdx += 4;
                break;
            }
        }
    }

    return dstIdx;
}
|
||||
|
||||
private string ReadStringUtf8Cached(int length)
|
||||
{
|
||||
var slice = _buffer.AsSpan(_position, length);
|
||||
|
|
|
|||
|
|
@ -699,7 +699,7 @@ public static partial class AcBinarySerializer
|
|||
|
||||
var savedPos = _position;
|
||||
var encodeStart = savedPos + reserveSize;
|
||||
var bytesWritten = EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
|
||||
var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
|
||||
|
||||
var actualVarUIntSize = VarUIntSize((uint)bytesWritten);
|
||||
if (actualVarUIntSize < reserveSize)
|
||||
|
|
@ -768,7 +768,7 @@ public static partial class AcBinarySerializer
|
|||
EnsureCapacity(2 + maxBytesShort); // marker + 1-byte VarUInt + bytes (worst case)
|
||||
|
||||
var savedPosShort = _position;
|
||||
var bytesWrittenShort = EncodeUtf8SinglePass(
|
||||
var bytesWrittenShort = Utf8Transcoder.EncodeUtf8SinglePass(
|
||||
value.AsSpan(),
|
||||
_buffer.AsSpan(savedPosShort + 1, maxBytesShort));
|
||||
var isAsciiShort = bytesWrittenShort == charLength;
|
||||
|
|
@ -805,7 +805,7 @@ public static partial class AcBinarySerializer
|
|||
|
||||
var savedPos = _position;
|
||||
var encodeStart = savedPos + 1 + reserveVarUInt;
|
||||
var bytesWritten = EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
|
||||
var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
|
||||
var isAscii = bytesWritten == charLength;
|
||||
|
||||
_buffer[savedPos] = isAscii ? BinaryTypeCode.StringAscii : BinaryTypeCode.String;
|
||||
|
|
@ -876,161 +876,6 @@ public static partial class AcBinarySerializer
|
|||
_position += byteCount;
|
||||
}
|
||||
|
||||
/// <summary>
/// Custom UTF-16 → UTF-8 single-pass encoder. Counterpart of the deserializer's custom decoder
/// (<see cref="AcBinaryDeserializer"/>'s <c>DecodeUtf8SinglePass</c>).
/// </summary>
/// <remarks>
/// Hand-rolled replacement for <see cref="System.Text.Encoding.UTF8"/>.GetBytes on the serializer path.
///
/// Layered for throughput on mixed content:
/// <list type="bullet">
/// <item><b>Phase 1a — Vector512:</b> 32 chars/iter on AVX-512BW hosts while the block is all ASCII.</item>
/// <item><b>Phase 1b — Vector256:</b> 16 chars/iter while the block is all ASCII.</item>
/// <item><b>Phase 1c — Vector128:</b> 16 chars/iter (two 8-char loads) while the block is all ASCII.</item>
/// <item><b>Phase 2 — 4-char ASCII batch:</b> <c>((c0 | c1 | c2 | c3) &amp; 0xFF80) == 0</c> → 4 byte writes.</item>
/// <item><b>Phase 3 — Scalar:</b> 1-, 2- and 3-byte encodings, and 4-byte encoding of surrogate pairs.</item>
/// </list>
/// Each SIMD phase stops at the first block containing a non-ASCII char and is not re-entered; the
/// scalar loop (phases 2/3) finishes the input.
///
/// A .NET string may contain unpaired surrogates. An unpaired surrogate (a high surrogate not followed
/// by a low surrogate, including one at the very end of <paramref name="src"/>, or a low surrogate with
/// no preceding high surrogate) is encoded as U+FFFD (<c>EF BF BD</c>), the same replacement
/// <see cref="System.Text.Encoding.UTF8"/> produces, and consumes exactly one char. No read goes past
/// the end of <paramref name="src"/>.
///
/// Writes to <paramref name="dst"/> are not bounds-checked. The worst case is 3 bytes per UTF-16 code
/// unit, so a destination of <c>src.Length * 4</c> bytes is always sufficient.
/// </remarks>
/// <param name="src">UTF-16 code units to encode.</param>
/// <param name="dst">Destination buffer for the UTF-8 bytes.</param>
/// <returns>The number of bytes written to <paramref name="dst"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int EncodeUtf8SinglePass(ReadOnlySpan<char> src, Span<byte> dst)
{
    int srcIdx = 0, dstIdx = 0;
    ref char srcRefChar = ref MemoryMarshal.GetReference(src);
    ref ushort srcRefU16 = ref Unsafe.As<char, ushort>(ref srcRefChar);
    ref byte dstRef = ref MemoryMarshal.GetReference(dst);

    // Phase 1a — Vector512 ASCII narrow (32 chars/iter on AVX-512BW hosts).
    if (Avx512BW.IsSupported)
    {
        var asciiMask512 = Vector512.Create((ushort)0xFF80);
        while (src.Length - srcIdx >= Vector512<ushort>.Count) // 32 chars per Vector512<ushort>
        {
            var v = Vector512.LoadUnsafe(ref srcRefU16, (uint)srcIdx);

            // Any bit above the low 7 set → block contains a char >= 0x80.
            if ((v & asciiMask512) != Vector512<ushort>.Zero) break;

            // Narrow 32 ushorts → 32 bytes via the two 256-bit halves.
            var bytes = Vector256.Narrow(v.GetLower(), v.GetUpper());
            bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
            srcIdx += Vector512<ushort>.Count;
            dstIdx += Vector512<ushort>.Count; // 32 chars → 32 bytes (1:1 for ASCII)
        }
    }

    // Phase 1b — Vector256 ASCII narrow (16 chars/iter; also continues after phase 1a).
    if (Vector256.IsHardwareAccelerated)
    {
        var asciiMask = Vector256.Create((ushort)0xFF80);
        while (src.Length - srcIdx >= Vector256<ushort>.Count) // 16 chars per Vector256<ushort>
        {
            var v = Vector256.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
            if ((v & asciiMask) != Vector256<ushort>.Zero) break;

            // Narrow 16 ushorts → 16 bytes via the two 128-bit halves.
            var bytes = Vector128.Narrow(v.GetLower(), v.GetUpper());
            bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
            srcIdx += Vector256<ushort>.Count;
            dstIdx += Vector256<ushort>.Count; // 16 chars → 16 bytes (1:1 for ASCII)
        }
    }

    // Phase 1c — Vector128 ASCII narrow: two 8-char loads narrow to one 16-byte store.
    if (Vector128.IsHardwareAccelerated)
    {
        var asciiMask128 = Vector128.Create((ushort)0xFF80);
        while (src.Length - srcIdx >= 16) // 16 chars = 2 × Vector128<ushort>
        {
            var lo = Vector128.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
            var hi = Vector128.LoadUnsafe(ref srcRefU16, (uint)(srcIdx + 8));
            if (((lo | hi) & asciiMask128) != Vector128<ushort>.Zero) break;

            var bytes = Vector128.Narrow(lo, hi);
            bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
            srcIdx += 16;
            dstIdx += 16;
        }
    }

    // Phase 2/3 — scalar with 4-char ASCII batch
    while (srcIdx < src.Length)
    {
        // 4-char ASCII batch: 4 chars → 4 bytes when all are ASCII
        if (src.Length - srcIdx >= 4)
        {
            var c0 = Unsafe.Add(ref srcRefChar, srcIdx);
            var c1 = Unsafe.Add(ref srcRefChar, srcIdx + 1);
            var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 2);
            var c3 = Unsafe.Add(ref srcRefChar, srcIdx + 3);
            if (((c0 | c1 | c2 | c3) & 0xFF80) == 0)
            {
                Unsafe.Add(ref dstRef, dstIdx) = (byte)c0;
                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)c1;
                Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)c2;
                Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)c3;
                srcIdx += 4;
                dstIdx += 4;
                continue;
            }
        }

        // Scalar single-char encode
        var c = Unsafe.Add(ref srcRefChar, srcIdx);
        if (c < 0x80)
        {
            // 1-byte ASCII (U+0000–U+007F)
            Unsafe.Add(ref dstRef, dstIdx++) = (byte)c;
            srcIdx += 1;
        }
        else if (c < 0x800)
        {
            // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
            Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xC0 | (c >> 6));
            Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | (c & 0x3F));
            dstIdx += 2;
            srcIdx += 1;
        }
        else if ((c & 0xF800) != 0xD800)
        {
            // 3-byte BMP: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF (excluding the surrogate range)
            Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xE0 | (c >> 12));
            Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((c >> 6) & 0x3F));
            Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | (c & 0x3F));
            dstIdx += 3;
            srcIdx += 1;
        }
        else
        {
            // Surrogate range. Only a high surrogate (0xD800–0xDBFF) may start a pair, and the
            // partner is read only when it lies inside src.
            var low = (c & 0xFC00) == 0xD800 && srcIdx + 1 < src.Length
                ? Unsafe.Add(ref srcRefChar, srcIdx + 1)
                : '\0';

            if ((low & 0xFC00) == 0xDC00)
            {
                // 4-byte: well-formed pair → supplementary plane code point (U+10000–U+10FFFF)
                var codepoint = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
                Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xF0 | (codepoint >> 18));
                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((codepoint >> 12) & 0x3F));
                Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | ((codepoint >> 6) & 0x3F));
                Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(0x80 | (codepoint & 0x3F));
                dstIdx += 4;
                srcIdx += 2; // consumed 2 chars (surrogate pair)
            }
            else
            {
                // Unpaired surrogate → U+FFFD REPLACEMENT CHARACTER (EF BF BD); the next char is
                // left in place to be encoded on its own.
                Unsafe.Add(ref dstRef, dstIdx) = 0xEF;
                Unsafe.Add(ref dstRef, dstIdx + 1) = 0xBF;
                Unsafe.Add(ref dstRef, dstIdx + 2) = 0xBD;
                dstIdx += 3;
                srcIdx += 1;
            }
        }
    }

    return dstIdx;
}
|
||||
|
||||
#endregion
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,416 @@
|
|||
using System;
|
||||
using System.Numerics;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Runtime.Intrinsics;
|
||||
using System.Runtime.Intrinsics.X86;
|
||||
|
||||
namespace AyCode.Core.Serializers.Binaries;
|
||||
|
||||
/// <summary>
|
||||
/// In-house UTF-8 ↔ UTF-16 transcoder used by the binary serializer hot path. Trust-input
|
||||
/// semantics — no validation (writer-side input is a .NET string, reader-side input is valid
|
||||
/// UTF-8 by writer contract). Bypasses <see cref="System.Text.Encoding.UTF8"/> virtual-dispatch
|
||||
/// + EncoderFallback / DecoderFallback overhead.
|
||||
///
|
||||
/// <para><b>SIMD path hierarchy</b> (cascading tail-handler):</para>
|
||||
/// <list type="bullet">
|
||||
/// <item>Vector512 / AVX-512BW — 64 byte/iter (Intel server, Intel 11th gen client, AMD Zen 4+)</item>
|
||||
/// <item>Vector256 / AVX2 — 32 byte/iter (Intel 12-14th gen client, AMD Zen 3 and earlier)</item>
|
||||
/// <item>Vector128 / SSE-NEON-WASM — 16 byte/iter (Apple Silicon NEON, WASM SIMD, legacy SSE2)</item>
|
||||
/// <item>Scalar — final tail (< 16 byte) and no-SIMD fall-back</item>
|
||||
/// </list>
|
||||
///
|
||||
/// <para>JIT/AOT path-selection: <c>Avx512BW.IsSupported</c> / <c>Vector256.IsHardwareAccelerated</c> /
|
||||
/// <c>Vector128.IsHardwareAccelerated</c> are <c>[Intrinsic]</c> static booleans — the compiler
|
||||
/// constant-folds the dead branches per host. Non-supported tiers eliminate to zero generated code.</para>
|
||||
///
|
||||
/// <para>Algorithm reference: see <c>BINARY_TODO.md#accore-bin-t-v4n2</c> for the multi-tier SIMD
|
||||
/// transcoder design and per-tier acceptance criteria.</para>
|
||||
/// </summary>
|
||||
internal static class Utf8Transcoder
|
||||
{
|
||||
/// <summary>
/// Custom UTF-16 → UTF-8 single-pass encoder. Counterpart of <see cref="DecodeUtf8SinglePass"/>.
/// </summary>
/// <remarks>
/// Hand-rolled replacement for <see cref="System.Text.Encoding.UTF8"/>.GetBytes on the serializer path.
///
/// Layered for throughput on mixed content:
/// <list type="bullet">
/// <item><b>Phase 1a — Vector512:</b> 32 chars/iter on AVX-512BW hosts while the block is all ASCII.</item>
/// <item><b>Phase 1b — Vector256:</b> 16 chars/iter while the block is all ASCII.</item>
/// <item><b>Phase 1c — Vector128:</b> 16 chars/iter (two 8-char loads) while the block is all ASCII.</item>
/// <item><b>Phase 2 — 4-char ASCII batch:</b> <c>((c0 | c1 | c2 | c3) &amp; 0xFF80) == 0</c> → 4 byte writes.</item>
/// <item><b>Phase 3 — Scalar:</b> 1-, 2- and 3-byte encodings, and 4-byte encoding of surrogate pairs.</item>
/// </list>
/// Each SIMD phase stops at the first block containing a non-ASCII char and is not re-entered; the
/// scalar loop (phases 2/3) finishes the input.
///
/// A .NET string may contain unpaired surrogates. An unpaired surrogate (a high surrogate not followed
/// by a low surrogate, including one at the very end of <paramref name="src"/>, or a low surrogate with
/// no preceding high surrogate) is encoded as U+FFFD (<c>EF BF BD</c>), the same replacement
/// <see cref="System.Text.Encoding.UTF8"/> produces, and consumes exactly one char. No read goes past
/// the end of <paramref name="src"/>.
///
/// Writes to <paramref name="dst"/> are not bounds-checked. The worst case is 3 bytes per UTF-16 code
/// unit, so a destination of <c>src.Length * 4</c> bytes is always sufficient.
/// </remarks>
/// <param name="src">UTF-16 code units to encode.</param>
/// <param name="dst">Destination buffer for the UTF-8 bytes.</param>
/// <returns>The number of bytes written to <paramref name="dst"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int EncodeUtf8SinglePass(ReadOnlySpan<char> src, Span<byte> dst)
{
    int srcIdx = 0, dstIdx = 0;
    ref char srcRefChar = ref MemoryMarshal.GetReference(src);
    ref ushort srcRefU16 = ref Unsafe.As<char, ushort>(ref srcRefChar);
    ref byte dstRef = ref MemoryMarshal.GetReference(dst);

    // Phase 1a — Vector512 ASCII narrow (32 chars/iter on AVX-512BW hosts).
    if (Avx512BW.IsSupported)
    {
        var asciiMask512 = Vector512.Create((ushort)0xFF80);
        while (src.Length - srcIdx >= Vector512<ushort>.Count) // 32 chars per Vector512<ushort>
        {
            var v = Vector512.LoadUnsafe(ref srcRefU16, (uint)srcIdx);

            // Any bit above the low 7 set → block contains a char >= 0x80.
            if ((v & asciiMask512) != Vector512<ushort>.Zero) break;

            // Narrow 32 ushorts → 32 bytes via the two 256-bit halves.
            var bytes = Vector256.Narrow(v.GetLower(), v.GetUpper());
            bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
            srcIdx += Vector512<ushort>.Count;
            dstIdx += Vector512<ushort>.Count; // 32 chars → 32 bytes (1:1 for ASCII)
        }
    }

    // Phase 1b — Vector256 ASCII narrow (16 chars/iter; also continues after phase 1a).
    if (Vector256.IsHardwareAccelerated)
    {
        var asciiMask = Vector256.Create((ushort)0xFF80);
        while (src.Length - srcIdx >= Vector256<ushort>.Count) // 16 chars per Vector256<ushort>
        {
            var v = Vector256.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
            if ((v & asciiMask) != Vector256<ushort>.Zero) break;

            // Narrow 16 ushorts → 16 bytes via the two 128-bit halves.
            var bytes = Vector128.Narrow(v.GetLower(), v.GetUpper());
            bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
            srcIdx += Vector256<ushort>.Count;
            dstIdx += Vector256<ushort>.Count; // 16 chars → 16 bytes (1:1 for ASCII)
        }
    }

    // Phase 1c — Vector128 ASCII narrow: two 8-char loads narrow to one 16-byte store.
    if (Vector128.IsHardwareAccelerated)
    {
        var asciiMask128 = Vector128.Create((ushort)0xFF80);
        while (src.Length - srcIdx >= 16) // 16 chars = 2 × Vector128<ushort>
        {
            var lo = Vector128.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
            var hi = Vector128.LoadUnsafe(ref srcRefU16, (uint)(srcIdx + 8));
            if (((lo | hi) & asciiMask128) != Vector128<ushort>.Zero) break;

            var bytes = Vector128.Narrow(lo, hi);
            bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
            srcIdx += 16;
            dstIdx += 16;
        }
    }

    // Phase 2/3 — scalar with 4-char ASCII batch
    while (srcIdx < src.Length)
    {
        // 4-char ASCII batch: 4 chars → 4 bytes when all are ASCII
        if (src.Length - srcIdx >= 4)
        {
            var c0 = Unsafe.Add(ref srcRefChar, srcIdx);
            var c1 = Unsafe.Add(ref srcRefChar, srcIdx + 1);
            var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 2);
            var c3 = Unsafe.Add(ref srcRefChar, srcIdx + 3);
            if (((c0 | c1 | c2 | c3) & 0xFF80) == 0)
            {
                Unsafe.Add(ref dstRef, dstIdx) = (byte)c0;
                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)c1;
                Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)c2;
                Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)c3;
                srcIdx += 4;
                dstIdx += 4;
                continue;
            }
        }

        // Scalar single-char encode
        var c = Unsafe.Add(ref srcRefChar, srcIdx);
        if (c < 0x80)
        {
            // 1-byte ASCII (U+0000–U+007F)
            Unsafe.Add(ref dstRef, dstIdx++) = (byte)c;
            srcIdx += 1;
        }
        else if (c < 0x800)
        {
            // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
            Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xC0 | (c >> 6));
            Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | (c & 0x3F));
            dstIdx += 2;
            srcIdx += 1;
        }
        else if ((c & 0xF800) != 0xD800)
        {
            // 3-byte BMP: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF (excluding the surrogate range)
            Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xE0 | (c >> 12));
            Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((c >> 6) & 0x3F));
            Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | (c & 0x3F));
            dstIdx += 3;
            srcIdx += 1;
        }
        else
        {
            // Surrogate range. Only a high surrogate (0xD800–0xDBFF) may start a pair, and the
            // partner is read only when it lies inside src.
            var low = (c & 0xFC00) == 0xD800 && srcIdx + 1 < src.Length
                ? Unsafe.Add(ref srcRefChar, srcIdx + 1)
                : '\0';

            if ((low & 0xFC00) == 0xDC00)
            {
                // 4-byte: well-formed pair → supplementary plane code point (U+10000–U+10FFFF)
                var codepoint = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
                Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xF0 | (codepoint >> 18));
                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((codepoint >> 12) & 0x3F));
                Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | ((codepoint >> 6) & 0x3F));
                Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(0x80 | (codepoint & 0x3F));
                dstIdx += 4;
                srcIdx += 2; // consumed 2 chars (surrogate pair)
            }
            else
            {
                // Unpaired surrogate → U+FFFD REPLACEMENT CHARACTER (EF BF BD); the next char is
                // left in place to be encoded on its own.
                Unsafe.Add(ref dstRef, dstIdx) = 0xEF;
                Unsafe.Add(ref dstRef, dstIdx + 1) = 0xBF;
                Unsafe.Add(ref dstRef, dstIdx + 2) = 0xBD;
                dstIdx += 3;
                srcIdx += 1;
            }
        }
    }

    return dstIdx;
}
|
||||
|
||||
/// <summary>
/// Returns the number of UTF-16 code units that decoding <paramref name="bytes"/> as UTF-8 yields.
/// </summary>
/// <remarks>
/// The count is derived purely from byte bit patterns; nothing is decoded or validated:
///   • every byte that is not a continuation byte (<c>10xxxxxx</c>) starts a sequence and adds one char;
///   • every 4-byte lead byte (<c>11110xxx</c>) adds one more char for the second half of the surrogate pair.
///
/// Blocks are consumed widest-first: 64 bytes via Vector512 when AVX-512BW is supported, then 32 bytes
/// via Vector256 and 16 bytes via Vector128 when those widths are hardware accelerated. Each narrower
/// tier also picks up whatever the wider tier left over. A block of N bytes contributes
/// <c>N - popcount(continuation mask) + popcount(4-byte lead mask)</c>. Bytes that do not fill a SIMD
/// block (or all bytes when no SIMD tier applies) go through a scalar loop using the same two checks.
/// </remarks>
/// <param name="bytes">UTF-8 bytes to measure.</param>
/// <returns>The UTF-16 char count implied by the lead bytes found in <paramref name="bytes"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int CountUtf8Chars(ReadOnlySpan<byte> bytes)
{
    var length = bytes.Length;
    var total = 0;
    var offset = 0;
    ref byte origin = ref MemoryMarshal.GetReference(bytes);

    // Tier 1: 64-byte blocks (Vector512, AVX-512BW hosts).
    if (Avx512BW.IsSupported && length >= 64)
    {
        var topTwoBits = Vector512.Create((byte)0xC0);
        var continuationTag = Vector512.Create((byte)0x80);
        var topFiveBits = Vector512.Create((byte)0xF8);
        var fourByteLeadTag = Vector512.Create((byte)0xF0);
        var lastBlockStart = length - 64;

        do
        {
            var block = Vector512.LoadUnsafe(ref origin, (uint)offset);
            var continuations = Vector512.Equals(block & topTwoBits, continuationTag).ExtractMostSignificantBits();
            var fourByteLeads = Vector512.Equals(block & topFiveBits, fourByteLeadTag).ExtractMostSignificantBits();

            // One char per non-continuation byte, plus one extra per 4-byte lead.
            total += 64 - BitOperations.PopCount(continuations) + BitOperations.PopCount(fourByteLeads);
            offset += 64;
        } while (offset <= lastBlockStart);
    }

    // Tier 2: 32-byte blocks (Vector256); also drains what tier 1 left behind.
    if (Vector256.IsHardwareAccelerated && length - offset >= 32)
    {
        var topTwoBits = Vector256.Create((byte)0xC0);
        var continuationTag = Vector256.Create((byte)0x80);
        var topFiveBits = Vector256.Create((byte)0xF8);
        var fourByteLeadTag = Vector256.Create((byte)0xF0);
        var lastBlockStart = length - 32;

        do
        {
            var block = Vector256.LoadUnsafe(ref origin, (uint)offset);
            var continuations = Vector256.Equals(block & topTwoBits, continuationTag).ExtractMostSignificantBits();
            var fourByteLeads = Vector256.Equals(block & topFiveBits, fourByteLeadTag).ExtractMostSignificantBits();

            total += 32 - BitOperations.PopCount(continuations) + BitOperations.PopCount(fourByteLeads);
            offset += 32;
        } while (offset <= lastBlockStart);
    }

    // Tier 3: 16-byte blocks (Vector128); also drains what the wider tiers left behind.
    if (Vector128.IsHardwareAccelerated && length - offset >= 16)
    {
        var topTwoBits = Vector128.Create((byte)0xC0);
        var continuationTag = Vector128.Create((byte)0x80);
        var topFiveBits = Vector128.Create((byte)0xF8);
        var fourByteLeadTag = Vector128.Create((byte)0xF0);
        var lastBlockStart = length - 16;

        do
        {
            var block = Vector128.LoadUnsafe(ref origin, (uint)offset);
            var continuations = Vector128.Equals(block & topTwoBits, continuationTag).ExtractMostSignificantBits();
            var fourByteLeads = Vector128.Equals(block & topFiveBits, fourByteLeadTag).ExtractMostSignificantBits();

            total += 16 - BitOperations.PopCount(continuations) + BitOperations.PopCount(fourByteLeads);
            offset += 16;
        } while (offset <= lastBlockStart);
    }

    // Scalar remainder; the only path taken when no SIMD tier applies.
    while (offset < length)
    {
        var current = Unsafe.Add(ref origin, offset);
        offset++;

        if ((current & 0xC0) == 0x80)
        {
            continue; // continuation byte: produces no char
        }

        total++;

        // A 4-byte lead (11110xxx) is never a continuation byte, so testing it here is exhaustive.
        if ((current & 0xF8) == 0xF0)
        {
            total++; // second UTF-16 code unit of the surrogate pair
        }
    }

    return total;
}
|
||||
|
||||
/// <summary>
/// Decodes the UTF-8 bytes in <paramref name="src"/> into UTF-16 chars in <paramref name="dst"/> in a
/// single pass and returns the number of chars written.
/// </summary>
/// <remarks>
/// Three layers, each suited to a different kind of content:
///   • <b>Phase 1 — Vector256 ASCII bulk widen:</b> while at least 32 bytes remain and none of them has
///     its top bit set, <see cref="Vector256.Widen(Vector256{byte})"/> turns the block into two
///     Vector256&lt;ushort&gt; halves (16 chars each) that are stored back to back. The phase ends at the
///     first block containing a non-ASCII byte.
///   • <b>Phase 2 — DWORD ASCII batch:</b> when at least 4 bytes remain they are read as one <c>uint</c>;
///     if <c>(dword &amp; 0x80808080u) == 0</c> all four are ASCII and are widened together.
///   • <b>Phase 3 — scalar sequence decode:</b> one sequence per iteration, chosen by lead byte:
///     1-byte ASCII, 2-byte (lead below 0xE0), 3-byte (lead below 0xF0), otherwise 4-byte, which is
///     written as a UTF-16 surrogate pair.
///
/// The input is treated as trusted, well-formed UTF-8. Continuation bytes are not validated, and all
/// reads and writes go through unchecked references, so a truncated trailing sequence or an undersized
/// <paramref name="dst"/> is not detected.
/// </remarks>
/// <param name="src">Well-formed UTF-8 bytes to decode.</param>
/// <param name="dst">
/// Destination for the decoded chars. It must be large enough for the whole result, for example sized
/// with <see cref="CountUtf8Chars"/>.
/// </param>
/// <returns>The number of UTF-16 chars written to <paramref name="dst"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int DecodeUtf8SinglePass(ReadOnlySpan<byte> src, Span<char> dst)
{
    int srcIdx = 0, dstIdx = 0;
    ref byte srcRef = ref MemoryMarshal.GetReference(src);
    ref ushort dstRef = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(dst));

    // Phase 1 — Vector256 ASCII bulk widen (32 bytes → 32 chars per iteration).
    if (Vector256.IsHardwareAccelerated)
    {
        while (src.Length - srcIdx >= Vector256<byte>.Count)
        {
            var v = Vector256.LoadUnsafe(ref srcRef, (uint)srcIdx);

            // Any top bit set means a non-ASCII byte is in this block: hand over to the scalar loop.
            if (v.ExtractMostSignificantBits() != 0)
            {
                break;
            }

            // Widen yields two Vector256<ushort> halves of 16 chars each. The upper half must land
            // exactly Vector256<ushort>.Count (16) chars after the lower one; a smaller offset would
            // overwrite part of the lower half and leave the end of the block unwritten.
            var (lower, upper) = Vector256.Widen(v);
            lower.StoreUnsafe(ref dstRef, (uint)dstIdx);
            upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector256<ushort>.Count));

            srcIdx += Vector256<byte>.Count;
            dstIdx += Vector256<byte>.Count; // 32 bytes → 32 chars
        }
    }

    // Phase 2/3 — scalar loop with a 4-byte ASCII fast path.
    while (srcIdx < src.Length)
    {
        if (src.Length - srcIdx >= 4)
        {
            var dword = Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref srcRef, srcIdx));
            if ((dword & 0x80808080u) == 0)
            {
                // NOTE(review): the low-order byte of the dword is written first, which matches memory
                // order only on little-endian hosts — confirm big-endian targets are out of scope.
                Unsafe.Add(ref dstRef, dstIdx) = (byte)dword;
                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(dword >> 8);
                Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(dword >> 16);
                Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(dword >> 24);
                srcIdx += 4;
                dstIdx += 4;
                continue;
            }
        }

        // Decode one sequence, selected by its lead byte.
        var b0 = Unsafe.Add(ref srcRef, srcIdx);
        switch (b0)
        {
            case < 0x80:
                // 1-byte ASCII (U+0000–U+007F)
                Unsafe.Add(ref dstRef, dstIdx++) = b0;
                srcIdx += 1;
                break;

            case < 0xE0:
            {
                // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
                var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
                Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x1F) << 6) | (b1 & 0x3F));
                srcIdx += 2;
                break;
            }

            case < 0xF0:
            {
                // 3-byte: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF
                var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
                var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
                Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
                srcIdx += 3;
                break;
            }

            default:
            {
                // 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx → U+10000–U+10FFFF,
                // emitted as a high/low surrogate pair.
                var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
                var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
                var b3 = Unsafe.Add(ref srcRef, srcIdx + 3);
                var codepoint = ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
                codepoint -= 0x10000;
                Unsafe.Add(ref dstRef, dstIdx) = (ushort)(0xD800 | (codepoint >> 10));
                Unsafe.Add(ref dstRef, dstIdx + 1) = (ushort)(0xDC00 | (codepoint & 0x3FF));
                dstIdx += 2;
                srcIdx += 4;
                break;
            }
        }
    }

    return dstIdx;
}
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue