diff --git a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs index 2fc1f8d..e71dc3b 100644 --- a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs +++ b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs @@ -484,242 +484,14 @@ public static partial class AcBinaryDeserializer var pos = _position; _position += byteLength; var src = _buffer.AsSpan(pos, byteLength); - var charCount = CountUtf8Chars(src); + var charCount = Utf8Transcoder.CountUtf8Chars(src); return string.Create(charCount, (Buffer: _buffer, Pos: pos, Len: byteLength), static (chars, state) => { - DecodeUtf8SinglePass(state.Buffer.AsSpan(state.Pos, state.Len), chars); + Utf8Transcoder.DecodeUtf8SinglePass(state.Buffer.AsSpan(state.Pos, state.Len), chars); }); } - /// - /// Counts UTF-16 chars produced by decoding the given UTF-8 byte span. - /// - /// - /// Layered SIMD: Vector512 (64 byte/iter) on AVX-512BW hosts → Vector256 (32 byte/iter) - /// on AVX2 hosts → scalar tail. Both SIMD paths use the same two bit-pattern checks: - /// • Non-continuation bytes (NOT 10xxxxxx, mask 0xC0 ≠ 0x80): each contributes 1 char. - /// • 4-byte start bytes (11110xxx, mask 0xF8 == 0xF0): each contributes an EXTRA char (surrogate pair). - /// - /// SIMD per-block result: (N - popcount(continuationMask)) + popcount(fourByteStartMask) - /// where N = 64 (Vector512) or 32 (Vector256). Scalar tail handles the remaining bytes. - /// - /// Char-count rules: - /// • Continuation bytes (10xxxxxx, 0x80–0xBF) — produce no char, skip. - /// • All other start bytes (0xxxxxxx, 110xxxxx, 1110xxxx) — produce 1 char each. - /// • 4-byte start bytes (11110xxx, 0xF0–0xF7) — produce 2 chars (UTF-16 surrogate pair). 
- /// - /// JIT-time path-selection: Avx512BW.IsSupported and Vector256.IsHardwareAccelerated - /// are [Intrinsic] static booleans — the JIT/AOT constant-folds the dead branches per host. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int CountUtf8Chars(ReadOnlySpan bytes) - { - var count = 0; - var i = 0; - ref var bytesRef = ref MemoryMarshal.GetReference(bytes); - - // SIMD path 1: 64 bytes/iter via Vector512 (AVX-512BW hosts) - if (Avx512BW.IsSupported && bytes.Length >= 64) - { - var contMask512 = Vector512.Create((byte)0xC0); - var contValue512 = Vector512.Create((byte)0x80); - var fourByteMask512 = Vector512.Create((byte)0xF8); - var fourByteValue512 = Vector512.Create((byte)0xF0); - - do - { - var v = Vector512.LoadUnsafe(ref bytesRef, (uint)i); - - // Non-continuation count: 64 - popcount(continuation byte mask) - var contMatches = Vector512.Equals(v & contMask512, contValue512); - var contBits = contMatches.ExtractMostSignificantBits(); // ulong - count += 64 - System.Numerics.BitOperations.PopCount(contBits); - - // 4-byte start count: popcount(fourByte start byte mask) - var fourByteMatches = Vector512.Equals(v & fourByteMask512, fourByteValue512); - var fourByteBits = fourByteMatches.ExtractMostSignificantBits(); - count += System.Numerics.BitOperations.PopCount(fourByteBits); - - i += 64; - } while (bytes.Length - i >= 64); - } - - // SIMD path 2: 32 bytes/iter via Vector256 (AVX2 hosts, also handles AVX-512 tail < 64) - if (Vector256.IsHardwareAccelerated && bytes.Length - i >= 32) - { - var contMask = Vector256.Create((byte)0xC0); - var contValue = Vector256.Create((byte)0x80); - var fourByteMask = Vector256.Create((byte)0xF8); - var fourByteValue = Vector256.Create((byte)0xF0); - - do - { - var v = Vector256.LoadUnsafe(ref bytesRef, (uint)i); - - // Non-continuation count: 32 - popcount(continuation byte mask) - var contMatches = Vector256.Equals(v & contMask, contValue); - var contBits = 
contMatches.ExtractMostSignificantBits(); - count += 32 - System.Numerics.BitOperations.PopCount(contBits); - - // 4-byte start count: popcount(fourByte start byte mask) - var fourByteMatches = Vector256.Equals(v & fourByteMask, fourByteValue); - var fourByteBits = fourByteMatches.ExtractMostSignificantBits(); - count += System.Numerics.BitOperations.PopCount(fourByteBits); - - i += 32; - } while (bytes.Length - i >= 32); - } - - // SIMD path 3: 16 bytes/iter via Vector128 (Apple Silicon NEON, WASM SIMD, legacy SSE2; - // also handles tail < 32 from higher tiers). Cross-platform — Vector128.IsHardwareAccelerated - // returns true on any host with a 128-bit SIMD ISA (NEON / SSE2 / WASM SIMD). - if (Vector128.IsHardwareAccelerated && bytes.Length - i >= 16) - { - var contMask128 = Vector128.Create((byte)0xC0); - var contValue128 = Vector128.Create((byte)0x80); - var fourByteMask128 = Vector128.Create((byte)0xF8); - var fourByteValue128 = Vector128.Create((byte)0xF0); - - do - { - var v = Vector128.LoadUnsafe(ref bytesRef, (uint)i); - - // Non-continuation count: 16 - popcount(continuation byte mask) - var contMatches = Vector128.Equals(v & contMask128, contValue128); - var contBits = contMatches.ExtractMostSignificantBits(); - count += 16 - System.Numerics.BitOperations.PopCount(contBits); - - // 4-byte start count: popcount(fourByte start byte mask) - var fourByteMatches = Vector128.Equals(v & fourByteMask128, fourByteValue128); - var fourByteBits = fourByteMatches.ExtractMostSignificantBits(); - count += System.Numerics.BitOperations.PopCount(fourByteBits); - - i += 16; - } while (bytes.Length - i >= 16); - } - - // Scalar tail (and fallback for non-SIMD hardware) - for (; i < bytes.Length; i++) - { - var b = Unsafe.Add(ref bytesRef, i); - if ((b & 0xC0) != 0x80) count++; // non-continuation byte - if ((b & 0xF8) == 0xF0) count++; // 4-byte start: extra char for surrogate pair - } - return count; - } - - /// - /// Single-pass UTF-8 → UTF-16 decoder. 
Returns the actual char count written to . - /// - /// - /// Layered approach for maximum throughput across mixed content: - /// • Phase 1 — Vector256 ASCII prefix bulk widen: 32 bytes/iter while all top bits are zero. - /// Uses to produce two Vector256<ushort> lanes - /// = 32 chars per iteration. Breaks on first non-ASCII byte found in the loaded vector. - /// • Phase 2 — DWORD ASCII batch: when ≥4 bytes remain, read as uint, test - /// (dword & 0x80808080u) == 0; on hit, widen 4 chars in 4 instructions and continue. - /// • Phase 3 — Scalar multi-byte branch: 1-byte (ASCII single), 2-byte (Latin extended, - /// Cyrillic, Greek, Hebrew, Arabic), 3-byte (CJK BMP), 4-byte (supplementary plane → surrogate pair). - /// Direct bit-extract, no validation — input is trusted. - /// - /// JIT compiles the switch into a jump table for predictable dispatch on mixed content. - /// Hungarian text typical pattern: ASCII run (Phase 1/2 widening) → 2-byte char (Phase 3 - /// case < 0xE0) → ASCII run → 2-byte char → ... — each phase optimal for its segment. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int DecodeUtf8SinglePass(ReadOnlySpan src, Span dst) - { - int srcIdx = 0, dstIdx = 0; - ref byte srcRef = ref MemoryMarshal.GetReference(src); - ref ushort dstRef = ref Unsafe.As(ref MemoryMarshal.GetReference(dst)); - - // Phase 1 — Vector256 ASCII prefix bulk widen (32 bytes/iter) - if (Vector256.IsHardwareAccelerated) - { - while (src.Length - srcIdx >= Vector256.Count) - { - var v = Vector256.LoadUnsafe(ref srcRef, (uint)srcIdx); - // ASCII detect: any high bit set among the 32 bytes? 
- if (v.ExtractMostSignificantBits() != 0) break; - - // Widen 32 bytes → 2 × Vector256 (32 chars total) - var (lower, upper) = Vector256.Widen(v); - lower.StoreUnsafe(ref dstRef, (uint)dstIdx); - upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector128.Count)); - srcIdx += Vector256.Count; - dstIdx += Vector256.Count; // 32 bytes → 32 chars - } - } - - // Phase 2/3 — scalar loop with DWORD ASCII batch - while (srcIdx < src.Length) - { - // DWORD ASCII batch: 4 ASCII bytes → 4 chars per iter - if (src.Length - srcIdx >= 4) - { - var dword = Unsafe.ReadUnaligned(ref Unsafe.Add(ref srcRef, srcIdx)); - if ((dword & 0x80808080u) == 0) - { - Unsafe.Add(ref dstRef, dstIdx) = (byte)dword; - Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(dword >> 8); - Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(dword >> 16); - Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(dword >> 24); - srcIdx += 4; - dstIdx += 4; - continue; - } - } - - // Scalar multi-byte branch (jump-table compile via switch) - var b0 = Unsafe.Add(ref srcRef, srcIdx); - switch (b0) - { - case < 0x80: - // 1-byte ASCII (U+0000–U+007F) - Unsafe.Add(ref dstRef, dstIdx++) = b0; - srcIdx += 1; - break; - case < 0xE0: - { - // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF - // Latin extended, Cyrillic, Greek, Hebrew, Arabic. - var b1 = Unsafe.Add(ref srcRef, srcIdx + 1); - Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x1F) << 6) | (b1 & 0x3F)); - srcIdx += 2; - break; - } - case < 0xF0: - { - // 3-byte: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF - // CJK BMP, various other scripts. - var b1 = Unsafe.Add(ref srcRef, srcIdx + 1); - var b2 = Unsafe.Add(ref srcRef, srcIdx + 2); - Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F)); - srcIdx += 3; - break; - } - default: - { - // 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx → U+10000–U+10FFFF - // Supplementary plane (emoji, rare CJK ext) → UTF-16 surrogate pair. 
- var b1 = Unsafe.Add(ref srcRef, srcIdx + 1); - var b2 = Unsafe.Add(ref srcRef, srcIdx + 2); - var b3 = Unsafe.Add(ref srcRef, srcIdx + 3); - var codepoint = ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); - codepoint -= 0x10000; - Unsafe.Add(ref dstRef, dstIdx) = (ushort)(0xD800 | (codepoint >> 10)); - Unsafe.Add(ref dstRef, dstIdx + 1) = (ushort)(0xDC00 | (codepoint & 0x3FF)); - dstIdx += 2; - srcIdx += 4; - break; - } - } - } - - return dstIdx; - } - private string ReadStringUtf8Cached(int length) { var slice = _buffer.AsSpan(_position, length); diff --git a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs index 45b256a..ce9af11 100644 --- a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs +++ b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs @@ -699,7 +699,7 @@ public static partial class AcBinarySerializer var savedPos = _position; var encodeStart = savedPos + reserveSize; - var bytesWritten = EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes)); + var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes)); var actualVarUIntSize = VarUIntSize((uint)bytesWritten); if (actualVarUIntSize < reserveSize) @@ -768,7 +768,7 @@ public static partial class AcBinarySerializer EnsureCapacity(2 + maxBytesShort); // marker + 1-byte VarUInt + bytes (worst case) var savedPosShort = _position; - var bytesWrittenShort = EncodeUtf8SinglePass( + var bytesWrittenShort = Utf8Transcoder.EncodeUtf8SinglePass( value.AsSpan(), _buffer.AsSpan(savedPosShort + 1, maxBytesShort)); var isAsciiShort = bytesWrittenShort == charLength; @@ -805,7 +805,7 @@ public static partial class AcBinarySerializer var savedPos = _position; var encodeStart = savedPos + 1 + reserveVarUInt; - var bytesWritten = 
EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes)); + var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes)); var isAscii = bytesWritten == charLength; _buffer[savedPos] = isAscii ? BinaryTypeCode.StringAscii : BinaryTypeCode.String; @@ -876,161 +876,6 @@ public static partial class AcBinarySerializer _position += byteCount; } - /// - /// Custom UTF-16 → UTF-8 single-pass encoder. Symmetric with the deserializer's custom decoder - /// ('s DecodeUtf8SinglePass). - /// - /// - /// Bypasses .GetBytes virtual-dispatch + encoder-fallback - /// overhead. Trusted-input encoder — no validation (writer side, the input is a .NET string - /// which always has valid UTF-16 surrogate pairs). - /// - /// Layered for max throughput on mixed content: - /// • Phase 1 — Vector256 ASCII narrow: 16 chars/iter. Loads Vector256<ushort>, - /// tests (v & 0xFF80) == 0 for all-ASCII; on hit, narrows to Vector128<byte> - /// via Vector128.Narrow(GetLower, GetUpper) = 16 bytes per iter. - /// • Phase 2 — DWORD ASCII batch: 4 chars/iter. OR-mask test - /// (c0 | c1 | c2 | c3) & 0xFF80 == 0; on hit, 4 byte writes per iter. - /// • Phase 3 — Scalar multi-byte encode: 1-byte (ASCII), 2-byte (Latin extended, - /// Cyrillic, Greek), 3-byte (CJK BMP), 4-byte (supplementary plane via UTF-16 surrogate pair). - /// - /// Returns actual byte count written. Caller must ensure has at least - /// src.Length * 4 capacity (UTF-8 worst case). - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int EncodeUtf8SinglePass(ReadOnlySpan src, Span dst) - { - int srcIdx = 0, dstIdx = 0; - ref char srcRefChar = ref MemoryMarshal.GetReference(src); - ref ushort srcRefU16 = ref Unsafe.As(ref srcRefChar); - ref byte dstRef = ref MemoryMarshal.GetReference(dst); - - // Phase 1a — Vector512 ASCII narrow (32 chars/iter on AVX-512BW hosts). 
- // JIT-time path-selection via Avx512BW.IsSupported [Intrinsic] static bool — non-AVX-512 - // hosts get this branch eliminated by constant-folding (zero overhead in the generated asm). - if (Avx512BW.IsSupported) - { - var asciiMask512 = Vector512.Create((ushort)0xFF80); - while (src.Length - srcIdx >= Vector512.Count) // 32 chars per Vector512 - { - var v = Vector512.LoadUnsafe(ref srcRefU16, (uint)srcIdx); - // ASCII detect: any char's high bits set (>= 0x80)? - if ((v & asciiMask512) != Vector512.Zero) break; - // Narrow 32 ushorts (Vector512) → 32 bytes (Vector256) via two 256-bit halves. - // The JIT lowers this to AVX-512 VPACKUSWB on capable hosts (single-instruction pack). - var bytes = Vector256.Narrow(v.GetLower(), v.GetUpper()); - bytes.StoreUnsafe(ref dstRef, (uint)dstIdx); - srcIdx += Vector512.Count; - dstIdx += Vector512.Count; // 32 chars → 32 bytes (1:1 for ASCII) - } - } - - // Phase 1b — Vector256 ASCII narrow (16 chars/iter on AVX2 hosts; also handles tail < 32 chars - // after the AVX-512 path on capable hosts). - if (Vector256.IsHardwareAccelerated) - { - var asciiMask = Vector256.Create((ushort)0xFF80); - while (src.Length - srcIdx >= Vector256.Count) // 16 chars per Vector256 - { - var v = Vector256.LoadUnsafe(ref srcRefU16, (uint)srcIdx); - // ASCII detect: any char's high bits set (>= 0x80)? - if ((v & asciiMask) != Vector256.Zero) break; - // Narrow 16 ushorts (Vector256) → 16 bytes (Vector128) via two halves - var bytes = Vector128.Narrow(v.GetLower(), v.GetUpper()); - bytes.StoreUnsafe(ref dstRef, (uint)dstIdx); - srcIdx += Vector256.Count; - dstIdx += Vector256.Count; // 16 chars → 16 bytes (1:1 for ASCII) - } - } - - // Phase 1c — Vector128 ASCII narrow (16 chars/iter on Apple Silicon NEON, WASM SIMD, - // legacy SSE2; also handles tail < 16 chars after higher tiers). Cross-platform — - // Vector128.IsHardwareAccelerated is true on any 128-bit-SIMD-capable host. 
- // Two Vector128 loads (8 + 8 = 16 chars) narrow to one Vector128 (16 bytes). - if (Vector128.IsHardwareAccelerated) - { - var asciiMask128 = Vector128.Create((ushort)0xFF80); - while (src.Length - srcIdx >= 16) // 16 chars = 2 × Vector128 - { - var lo = Vector128.LoadUnsafe(ref srcRefU16, (uint)srcIdx); - var hi = Vector128.LoadUnsafe(ref srcRefU16, (uint)(srcIdx + 8)); - // ASCII detect: any char's high bits set in either half? - if (((lo | hi) & asciiMask128) != Vector128.Zero) break; - // Narrow 2× Vector128 (16 chars) → Vector128 (16 bytes) - var bytes = Vector128.Narrow(lo, hi); - bytes.StoreUnsafe(ref dstRef, (uint)dstIdx); - srcIdx += 16; - dstIdx += 16; - } - } - - // Phase 2/3 — scalar with DWORD ASCII batch - while (srcIdx < src.Length) - { - // DWORD ASCII batch: 4 chars → 4 bytes when all ASCII - if (src.Length - srcIdx >= 4) - { - var c0 = Unsafe.Add(ref srcRefChar, srcIdx); - var c1 = Unsafe.Add(ref srcRefChar, srcIdx + 1); - var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 2); - var c3 = Unsafe.Add(ref srcRefChar, srcIdx + 3); - if (((c0 | c1 | c2 | c3) & 0xFF80) == 0) - { - Unsafe.Add(ref dstRef, dstIdx) = (byte)c0; - Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)c1; - Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)c2; - Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)c3; - srcIdx += 4; - dstIdx += 4; - continue; - } - } - - // Scalar single-char encode - var c = Unsafe.Add(ref srcRefChar, srcIdx); - if (c < 0x80) - { - // 1-byte ASCII (U+0000–U+007F) - Unsafe.Add(ref dstRef, dstIdx++) = (byte)c; - srcIdx += 1; - } - else if (c < 0x800) - { - // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF - // Latin extended (Hungarian, Polish, Czech, Spanish, French, German diacritics), - // Greek, Cyrillic, Hebrew, Arabic. 
-                    Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xC0 | (c >> 6));
-                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | (c & 0x3F));
-                    dstIdx += 2;
-                    srcIdx += 1;
-                }
-                else if ((c & 0xF800) != 0xD800)
-                {
-                    // 3-byte BMP: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF (excluding surrogate range)
-                    // CJK BMP, various other BMP scripts.
-                    Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xE0 | (c >> 12));
-                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((c >> 6) & 0x3F));
-                    Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | (c & 0x3F));
-                    dstIdx += 3;
-                    srcIdx += 1;
-                }
-                else
-                {
-                    // 4-byte: surrogate pair → supplementary plane codepoint (U+10000–U+10FFFF)
-                    // High surrogate (0xD800–0xDBFF) followed by low surrogate (0xDC00–0xDFFF).
-                    var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 1);
-                    var codepoint = 0x10000 + ((c - 0xD800) << 10) + (c2 - 0xDC00);
-                    Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xF0 | (codepoint >> 18));
-                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((codepoint >> 12) & 0x3F));
-                    Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | ((codepoint >> 6) & 0x3F));
-                    Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(0x80 | (codepoint & 0x3F));
-                    dstIdx += 4;
-                    srcIdx += 2; // consumed 2 chars (surrogate pair)
-                }
-            }
-
-            return dstIdx;
-        }
 
         #endregion
 
diff --git a/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs b/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs
new file mode 100644
index 0000000..a69adab
--- /dev/null
+++ b/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs
@@ -0,0 +1,445 @@
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace AyCode.Core.Serializers.Binaries;
+
+/// <summary>
+/// In-house UTF-8 ↔ UTF-16 transcoder used by the binary serializer hot path. Trusted-input
+/// semantics: the decoder performs no validation (reader-side input must be valid UTF-8 by
+/// writer contract); the encoder only guards against unpaired UTF-16 surrogates. Bypasses
+/// <see cref="System.Text.Encoding.UTF8"/> virtual-dispatch + EncoderFallback / DecoderFallback
+/// overhead.
+/// </summary>
+/// <remarks>
+/// SIMD path hierarchy (cascading tail-handler):
+/// <list type="bullet">
+/// <item>Vector512 / AVX-512BW — 64 byte/iter (Intel server, Intel 11th gen client, AMD Zen 4+)</item>
+/// <item>Vector256 / AVX2 — 32 byte/iter (Intel 12-14th gen client, AMD Zen 3 and earlier)</item>
+/// <item>Vector128 / SSE-NEON-WASM — 16 byte/iter (Apple Silicon NEON, WASM SIMD, legacy SSE2)</item>
+/// <item>Scalar — final tail (&lt; 16 byte) and no-SIMD fall-back</item>
+/// </list>
+///
+/// JIT/AOT path-selection: <c>Avx512BW.IsSupported</c> / <c>Vector256.IsHardwareAccelerated</c> /
+/// <c>Vector128.IsHardwareAccelerated</c> are <c>[Intrinsic]</c> static booleans — the compiler
+/// constant-folds the dead branches per host. Non-supported tiers eliminate to zero generated code.
+///
+/// Algorithm reference: see <c>BINARY_TODO.md#accore-bin-t-v4n2</c> for the multi-tier SIMD
+/// transcoder design and per-tier acceptance criteria.
+/// </remarks>
+internal static class Utf8Transcoder
+{
+    /// <summary>
+    /// Custom UTF-16 → UTF-8 single-pass encoder. Symmetric with <see cref="DecodeUtf8SinglePass"/>.
+    /// </summary>
+    /// <remarks>
+    /// Bypasses <see cref="System.Text.Encoding.UTF8"/>.GetBytes virtual-dispatch + encoder-fallback
+    /// overhead. The only validation performed is surrogate pairing: a .NET string may contain an
+    /// unpaired surrogate (e.g. after a substring cut through an emoji), so a surrogate that is not
+    /// part of a well-formed high/low pair is written as U+FFFD (EF BF BD) instead of reading past
+    /// the end of <paramref name="src"/> or emitting an ill-formed 4-byte sequence.
+    ///
+    /// Layered for max throughput on mixed content:
+    /// • Phase 1a — Vector512 ASCII narrow: 32 chars/iter on AVX-512BW hosts.
+    /// • Phase 1b — Vector256 ASCII narrow: 16 chars/iter on AVX2 hosts (also handles tail
+    ///   &lt; 32 chars after the AVX-512 path on capable hosts).
+    /// • Phase 1c — Vector128 ASCII narrow: 16 chars/iter (two 8-char loads) on Apple Silicon
+    ///   NEON / WASM SIMD / legacy SSE2 hosts.
+    /// • Phase 2 — DWORD ASCII batch: 4 chars/iter. OR-mask test
+    ///   <c>(c0 | c1 | c2 | c3) &amp; 0xFF80 == 0</c>; on hit, 4 byte writes per iter.
+    /// • Phase 3 — Scalar multi-byte encode: 1-byte (ASCII), 2-byte (Latin extended,
+    ///   Cyrillic, Greek), 3-byte (CJK BMP), 4-byte (supplementary plane via UTF-16 surrogate pair).
+    /// </remarks>
+    /// <param name="src">UTF-16 code units to encode.</param>
+    /// <param name="dst">
+    /// Destination buffer; writes are not bounds-checked. At most 3 bytes are produced per UTF-16
+    /// code unit, so the previously documented <c>src.Length * 4</c> capacity is always sufficient.
+    /// </param>
+    /// <returns>Actual byte count written to <paramref name="dst"/>.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static int EncodeUtf8SinglePass(ReadOnlySpan<char> src, Span<byte> dst)
+    {
+        int srcIdx = 0, dstIdx = 0;
+        ref char srcRefChar = ref MemoryMarshal.GetReference(src);
+        ref ushort srcRefU16 = ref Unsafe.As<char, ushort>(ref srcRefChar);
+        ref byte dstRef = ref MemoryMarshal.GetReference(dst);
+
+        // Phase 1a — Vector512 ASCII narrow (32 chars/iter on AVX-512BW hosts).
+        // JIT-time path-selection via Avx512BW.IsSupported [Intrinsic] static bool — non-AVX-512
+        // hosts get this branch eliminated by constant-folding (zero overhead in the generated asm).
+        if (Avx512BW.IsSupported)
+        {
+            var asciiMask512 = Vector512.Create((ushort)0xFF80);
+            while (src.Length - srcIdx >= Vector512<ushort>.Count) // 32 chars per Vector512
+            {
+                var v = Vector512.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
+                // ASCII detect: any char's high bits set (>= 0x80)?
+                if ((v & asciiMask512) != Vector512<ushort>.Zero) break;
+                // Narrow 32 ushorts (Vector512) → 32 bytes (Vector256) via two 256-bit halves.
+                var bytes = Vector256.Narrow(v.GetLower(), v.GetUpper());
+                bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
+                srcIdx += Vector512<ushort>.Count;
+                dstIdx += Vector512<ushort>.Count; // 32 chars → 32 bytes (1:1 for ASCII)
+            }
+        }
+
+        // Phase 1b — Vector256 ASCII narrow (16 chars/iter on AVX2 hosts; also handles tail < 32 chars
+        // after the AVX-512 path on capable hosts).
+        if (Vector256.IsHardwareAccelerated)
+        {
+            var asciiMask = Vector256.Create((ushort)0xFF80);
+            while (src.Length - srcIdx >= Vector256<ushort>.Count) // 16 chars per Vector256
+            {
+                var v = Vector256.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
+                // ASCII detect: any char's high bits set (>= 0x80)?
+                if ((v & asciiMask) != Vector256<ushort>.Zero) break;
+                // Narrow 16 ushorts (Vector256) → 16 bytes (Vector128) via two halves
+                var bytes = Vector128.Narrow(v.GetLower(), v.GetUpper());
+                bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
+                srcIdx += Vector256<ushort>.Count;
+                dstIdx += Vector256<ushort>.Count; // 16 chars → 16 bytes (1:1 for ASCII)
+            }
+        }
+
+        // Phase 1c — Vector128 ASCII narrow (16 chars/iter on Apple Silicon NEON, WASM SIMD,
+        // legacy SSE2). Cross-platform — Vector128.IsHardwareAccelerated is true on any
+        // 128-bit-SIMD-capable host.
+        // Two Vector128 loads (8 + 8 = 16 chars) narrow to one Vector128 (16 bytes).
+        if (Vector128.IsHardwareAccelerated)
+        {
+            var asciiMask128 = Vector128.Create((ushort)0xFF80);
+            while (src.Length - srcIdx >= 16) // 16 chars = 2 × Vector128
+            {
+                var lo = Vector128.LoadUnsafe(ref srcRefU16, (uint)srcIdx);
+                var hi = Vector128.LoadUnsafe(ref srcRefU16, (uint)(srcIdx + 8));
+                // ASCII detect: any char's high bits set in either half?
+                if (((lo | hi) & asciiMask128) != Vector128<ushort>.Zero) break;
+                // Narrow 2× Vector128 (16 chars) → Vector128 (16 bytes)
+                var bytes = Vector128.Narrow(lo, hi);
+                bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
+                srcIdx += 16;
+                dstIdx += 16;
+            }
+        }
+
+        // Phase 2/3 — scalar with DWORD ASCII batch
+        while (srcIdx < src.Length)
+        {
+            // DWORD ASCII batch: 4 chars → 4 bytes when all ASCII
+            if (src.Length - srcIdx >= 4)
+            {
+                var c0 = Unsafe.Add(ref srcRefChar, srcIdx);
+                var c1 = Unsafe.Add(ref srcRefChar, srcIdx + 1);
+                var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 2);
+                var c3 = Unsafe.Add(ref srcRefChar, srcIdx + 3);
+                if (((c0 | c1 | c2 | c3) & 0xFF80) == 0)
+                {
+                    Unsafe.Add(ref dstRef, dstIdx) = (byte)c0;
+                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)c1;
+                    Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)c2;
+                    Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)c3;
+                    srcIdx += 4;
+                    dstIdx += 4;
+                    continue;
+                }
+            }
+
+            // Scalar single-char encode
+            var c = Unsafe.Add(ref srcRefChar, srcIdx);
+            if (c < 0x80)
+            {
+                // 1-byte ASCII (U+0000–U+007F)
+                Unsafe.Add(ref dstRef, dstIdx++) = (byte)c;
+                srcIdx += 1;
+            }
+            else if (c < 0x800)
+            {
+                // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
+                // Latin extended (Hungarian, Polish, Czech, Spanish, French, German diacritics),
+                // Greek, Cyrillic, Hebrew, Arabic.
+                Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xC0 | (c >> 6));
+                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | (c & 0x3F));
+                dstIdx += 2;
+                srcIdx += 1;
+            }
+            else if ((c & 0xF800) != 0xD800)
+            {
+                // 3-byte BMP: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF (excluding surrogate range)
+                // CJK BMP, various other BMP scripts.
+                Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xE0 | (c >> 12));
+                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((c >> 6) & 0x3F));
+                Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | (c & 0x3F));
+                dstIdx += 3;
+                srcIdx += 1;
+            }
+            else if (c <= 0xDBFF && srcIdx + 1 < src.Length && (Unsafe.Add(ref srcRefChar, srcIdx + 1) & 0xFC00) == 0xDC00)
+            {
+                // 4-byte: well-formed surrogate pair → supplementary plane codepoint (U+10000–U+10FFFF)
+                // High surrogate (0xD800–0xDBFF) followed by low surrogate (0xDC00–0xDFFF).
+                var low = Unsafe.Add(ref srcRefChar, srcIdx + 1);
+                var codepoint = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
+                Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xF0 | (codepoint >> 18));
+                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((codepoint >> 12) & 0x3F));
+                Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | ((codepoint >> 6) & 0x3F));
+                Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(0x80 | (codepoint & 0x3F));
+                dstIdx += 4;
+                srcIdx += 2; // consumed 2 chars (surrogate pair)
+            }
+            else
+            {
+                // Unpaired surrogate (lone low surrogate, or high surrogate not followed by a low
+                // one, including at the very end of src): emit U+FFFD REPLACEMENT CHARACTER
+                // (EF BF BD) and consume a single char, so src is never read past its end and
+                // the output stays well-formed UTF-8.
+                Unsafe.Add(ref dstRef, dstIdx) = 0xEF;
+                Unsafe.Add(ref dstRef, dstIdx + 1) = 0xBF;
+                Unsafe.Add(ref dstRef, dstIdx + 2) = 0xBD;
+                dstIdx += 3;
+                srcIdx += 1;
+            }
+        }
+
+        return dstIdx;
+    }
+
+    /// <summary>
+    /// Counts UTF-16 chars produced by decoding the given UTF-8 byte span.
+    /// </summary>
+    /// <remarks>
+    /// Layered SIMD: Vector512 (64 byte/iter) on AVX-512BW hosts → Vector256 (32 byte/iter) on AVX2
+    /// hosts → Vector128 (16 byte/iter) on Apple Silicon / WASM / SSE2 → scalar tail. All SIMD paths
+    /// use the same two bit-pattern checks:
+    /// • Non-continuation bytes (NOT 10xxxxxx, mask 0xC0 ≠ 0x80): each contributes 1 char.
+    /// • 4-byte start bytes (11110xxx, mask 0xF8 == 0xF0): each contributes an EXTRA char (surrogate pair).
+    ///
+    /// SIMD per-block result: (N - popcount(continuationMask)) + popcount(fourByteStartMask).
+    /// Scalar tail handles the remaining bytes.
+    ///
+    /// Char-count rules:
+    /// • Continuation bytes (10xxxxxx, 0x80–0xBF) — produce no char, skip.
+    /// • All other start bytes (0xxxxxxx, 110xxxxx, 1110xxxx) — produce 1 char each.
+    /// • 4-byte start bytes (11110xxx, 0xF0–0xF7) — produce 2 chars (UTF-16 surrogate pair).
+    /// </remarks>
+    /// <param name="bytes">UTF-8 bytes to measure; not validated.</param>
+    /// <returns>The UTF-16 char count implied by the lead bytes in <paramref name="bytes"/>.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static int CountUtf8Chars(ReadOnlySpan<byte> bytes)
+    {
+        var count = 0;
+        var i = 0;
+        ref var bytesRef = ref MemoryMarshal.GetReference(bytes);
+
+        // SIMD path 1: 64 bytes/iter via Vector512 (AVX-512BW hosts)
+        if (Avx512BW.IsSupported && bytes.Length >= 64)
+        {
+            var contMask512 = Vector512.Create((byte)0xC0);
+            var contValue512 = Vector512.Create((byte)0x80);
+            var fourByteMask512 = Vector512.Create((byte)0xF8);
+            var fourByteValue512 = Vector512.Create((byte)0xF0);
+
+            do
+            {
+                var v = Vector512.LoadUnsafe(ref bytesRef, (uint)i);
+
+                // Non-continuation count: 64 - popcount(continuation byte mask)
+                var contMatches = Vector512.Equals(v & contMask512, contValue512);
+                var contBits = contMatches.ExtractMostSignificantBits(); // ulong
+                count += 64 - BitOperations.PopCount(contBits);
+
+                // 4-byte start count: popcount(fourByte start byte mask)
+                var fourByteMatches = Vector512.Equals(v & fourByteMask512, fourByteValue512);
+                var fourByteBits = fourByteMatches.ExtractMostSignificantBits();
+                count += BitOperations.PopCount(fourByteBits);
+
+                i += 64;
+            } while (bytes.Length - i >= 64);
+        }
+
+        // SIMD path 2: 32 bytes/iter via Vector256 (AVX2 hosts, also handles AVX-512 tail < 64)
+        if (Vector256.IsHardwareAccelerated && bytes.Length - i >= 32)
+        {
+            var contMask = Vector256.Create((byte)0xC0);
+            var contValue = Vector256.Create((byte)0x80);
+            var fourByteMask = Vector256.Create((byte)0xF8);
+            var fourByteValue = Vector256.Create((byte)0xF0);
+
+            do
+            {
+                var v = Vector256.LoadUnsafe(ref bytesRef, (uint)i);
+
+                // Non-continuation count: 32 - popcount(continuation byte mask)
+                var contMatches = Vector256.Equals(v & contMask, contValue);
+                var contBits = contMatches.ExtractMostSignificantBits();
+                count += 32 - BitOperations.PopCount(contBits);
+
+                // 4-byte start count: popcount(fourByte start byte mask)
+                var fourByteMatches = Vector256.Equals(v & fourByteMask, fourByteValue);
+                var fourByteBits = fourByteMatches.ExtractMostSignificantBits();
+                count += BitOperations.PopCount(fourByteBits);
+
+                i += 32;
+            } while (bytes.Length - i >= 32);
+        }
+
+        // SIMD path 3: 16 bytes/iter via Vector128 (Apple Silicon NEON, WASM SIMD, legacy SSE2;
+        // also handles tail < 32 from higher tiers). Cross-platform — Vector128.IsHardwareAccelerated
+        // returns true on any host with a 128-bit SIMD ISA (NEON / SSE2 / WASM SIMD).
+        if (Vector128.IsHardwareAccelerated && bytes.Length - i >= 16)
+        {
+            var contMask128 = Vector128.Create((byte)0xC0);
+            var contValue128 = Vector128.Create((byte)0x80);
+            var fourByteMask128 = Vector128.Create((byte)0xF8);
+            var fourByteValue128 = Vector128.Create((byte)0xF0);
+
+            do
+            {
+                var v = Vector128.LoadUnsafe(ref bytesRef, (uint)i);
+
+                // Non-continuation count: 16 - popcount(continuation byte mask)
+                var contMatches = Vector128.Equals(v & contMask128, contValue128);
+                var contBits = contMatches.ExtractMostSignificantBits();
+                count += 16 - BitOperations.PopCount(contBits);
+
+                // 4-byte start count: popcount(fourByte start byte mask)
+                var fourByteMatches = Vector128.Equals(v & fourByteMask128, fourByteValue128);
+                var fourByteBits = fourByteMatches.ExtractMostSignificantBits();
+                count += BitOperations.PopCount(fourByteBits);
+
+                i += 16;
+            } while (bytes.Length - i >= 16);
+        }
+
+        // Scalar tail (and fallback for non-SIMD hardware)
+        for (; i < bytes.Length; i++)
+        {
+            var b = Unsafe.Add(ref bytesRef, i);
+            if ((b & 0xC0) != 0x80) count++; // non-continuation byte
+            if ((b & 0xF8) == 0xF0) count++; // 4-byte start: extra char for surrogate pair
+        }
+        return count;
+    }
+
+    /// <summary>
+    /// Single-pass UTF-8 → UTF-16 decoder. Returns the actual char count written to <paramref name="dst"/>.
+    /// </summary>
+    /// <remarks>
+    /// Layered approach for maximum throughput across mixed content:
+    /// • Phase 1 — Vector256 ASCII prefix bulk widen: 32 bytes/iter while all top bits are zero.
+    ///   Uses <c>Vector256.Widen</c> to produce two Vector256&lt;ushort&gt; lanes
+    ///   = 32 chars per iteration. Breaks on first non-ASCII byte found in the loaded vector.
+    /// • Phase 2 — DWORD ASCII batch: when ≥4 bytes remain, read as uint, test
+    ///   <c>(dword &amp; 0x80808080u) == 0</c>; on hit, widen 4 chars in 4 instructions and continue.
+    /// • Phase 3 — Scalar multi-byte branch: 1-byte (ASCII single), 2-byte (Latin extended,
+    ///   Cyrillic, Greek, Hebrew, Arabic), 3-byte (CJK BMP), 4-byte (supplementary plane → surrogate pair).
+    ///   Direct bit-extract, no validation — input is trusted.
+    ///
+    /// JIT compiles the switch into a jump table for predictable dispatch on mixed content.
+    /// Hungarian text typical pattern: ASCII run (Phase 1/2 widening) → 2-byte char (Phase 3
+    /// case &lt; 0xE0) → ASCII run → 2-byte char → ... — each phase optimal for its segment.
+    ///
+    /// NOTE(review): neither the reads from <paramref name="src"/> nor the writes to
+    /// <paramref name="dst"/> are bounds-checked. A truncated multi-byte sequence reads past the end
+    /// of <paramref name="src"/>, and a stray continuation byte (0x80–0xBF) is decoded here as a
+    /// 2-byte lead (one char written) while <see cref="CountUtf8Chars"/> counts it as zero chars, so
+    /// a destination sized from that count can be overrun. Confirm every caller only passes bytes
+    /// produced by <see cref="EncodeUtf8SinglePass"/> or otherwise validated input.
+    /// </remarks>
+    /// <param name="src">Well-formed UTF-8 bytes.</param>
+    /// <param name="dst">Destination chars; must hold at least <c>CountUtf8Chars(src)</c> elements.</param>
+    /// <returns>The number of chars written to <paramref name="dst"/>.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static int DecodeUtf8SinglePass(ReadOnlySpan<byte> src, Span<char> dst)
+    {
+        int srcIdx = 0, dstIdx = 0;
+        ref byte srcRef = ref MemoryMarshal.GetReference(src);
+        ref ushort dstRef = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(dst));
+
+        // Phase 1 — Vector256 ASCII prefix bulk widen (32 bytes/iter)
+        if (Vector256.IsHardwareAccelerated)
+        {
+            while (src.Length - srcIdx >= Vector256<byte>.Count)
+            {
+                var v = Vector256.LoadUnsafe(ref srcRef, (uint)srcIdx);
+                // ASCII detect: any high bit set among the 32 bytes?
+                if (v.ExtractMostSignificantBits() != 0) break;
+
+                // Widen 32 bytes → 2 × Vector256<ushort> (32 chars total)
+                var (lower, upper) = Vector256.Widen(v);
+                lower.StoreUnsafe(ref dstRef, (uint)dstIdx);
+                upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector128<byte>.Count));
+                srcIdx += Vector256<byte>.Count;
+                dstIdx += Vector256<byte>.Count; // 32 bytes → 32 chars
+            }
+        }
+
+        // Phase 2/3 — scalar loop with DWORD ASCII batch
+        while (srcIdx < src.Length)
+        {
+            // DWORD ASCII batch: 4 ASCII bytes → 4 chars per iter
+            if (src.Length - srcIdx >= 4)
+            {
+                var dword = Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref srcRef, srcIdx));
+                if ((dword & 0x80808080u) == 0)
+                {
+                    Unsafe.Add(ref dstRef, dstIdx) = (byte)dword;
+                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(dword >> 8);
+                    Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(dword >> 16);
+                    Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(dword >> 24);
+                    srcIdx += 4;
+                    dstIdx += 4;
+                    continue;
+                }
+            }
+
+            // Scalar multi-byte branch (jump-table compile via switch)
+            var b0 = Unsafe.Add(ref srcRef, srcIdx);
+            switch (b0)
+            {
+                case < 0x80:
+                    // 1-byte ASCII (U+0000–U+007F)
+                    Unsafe.Add(ref dstRef, dstIdx++) = b0;
+                    srcIdx += 1;
+                    break;
+                case < 0xE0:
+                {
+                    // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
+                    // Latin extended, Cyrillic, Greek, Hebrew, Arabic.
+                    var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
+                    Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x1F) << 6) | (b1 & 0x3F));
+                    srcIdx += 2;
+                    break;
+                }
+                case < 0xF0:
+                {
+                    // 3-byte: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF
+                    // CJK BMP, various other scripts.
+                    var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
+                    var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
+                    Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
+                    srcIdx += 3;
+                    break;
+                }
+                default:
+                {
+                    // 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx → U+10000–U+10FFFF
+                    // Supplementary plane (emoji, rare CJK ext) → UTF-16 surrogate pair.
+                    var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
+                    var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
+                    var b3 = Unsafe.Add(ref srcRef, srcIdx + 3);
+                    var codepoint = ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
+                    codepoint -= 0x10000;
+                    Unsafe.Add(ref dstRef, dstIdx) = (ushort)(0xD800 | (codepoint >> 10));
+                    Unsafe.Add(ref dstRef, dstIdx + 1) = (ushort)(0xDC00 | (codepoint & 0x3FF));
+                    dstIdx += 2;
+                    srcIdx += 4;
+                    break;
+                }
+            }
+        }
+
+        return dstIdx;
+    }
+}
diff --git a/AyCode.Core/docs/BINARY/BINARY_TODO.md b/AyCode.Core/docs/BINARY/BINARY_TODO.md
index b3358da..81aa3f3 100644
--- a/AyCode.Core/docs/BINARY/BINARY_TODO.md
+++ b/AyCode.Core/docs/BINARY/BINARY_TODO.md
@@ -693,9 +693,18 @@ JIT/AOT path-selection via `[Intrinsic]` `IsSupported` static booleans — non-s
 | 1 | `CountUtf8Chars` (decode 1st pass) | ✅ done | ✅ existing | ✅ done | ✅ existing |
 | 2 | `EncodeUtf8SinglePass` Phase 1 (ASCII narrow) | ✅ done | ✅ existing | ✅ done | ✅ existing |
 | 3a | `DecodeUtf8SinglePass` multi-byte transcoder (Vector512) | ⏳ TODO | bail-out only | bail-out only | ✅ existing |
-| 3b | `DecodeUtf8SinglePass` multi-byte transcoder (Vector256) | — | ⏳ TODO | bail-out only | ✅ existing |
+| 3b | `DecodeUtf8SinglePass` multi-byte transcoder (Vector256) | — | 🔍 **deferred — see note** | bail-out only | ✅ existing |
 | 3c | `DecodeUtf8SinglePass` multi-byte transcoder (Vector128) | — | — | ⏳ TODO | ✅ existing |
 
+**Note on Phase 3b (Vector256 / AVX2) — deferred, not dropped.** AVX2 lacks the AVX-512BW primitives (`CompareEqualMask` producing a `__mmask` k-register, in-lane `vpermb`, mask-driven `vpcompressb`) that make the classify-mask-compress-widen pipeline efficient. The `Vector256.Shuffle` is cross-lane via two `vpshufb` (per-128-bit-lane), which complicates leader-byte extraction across multi-byte sequences spanning the lane boundary.
The simdutf C++ project — the canonical reference for this algorithm class — implements only **SSE4 (16-byte)** and **AVX-512 (64-byte)** paths; it explicitly skips AVX2 because the implementation cost-benefit is unfavorable on this algorithm. + +On AVX2 hosts, the Phase 3c (Vector128) transcoder runs as the primary multi-byte path AND as the tail handler — covering AVX2 hosts with 16-byte/iter, which is already a significant win over the current scalar multi-byte branch. Phase 3b would require either: + +1. Hand-rolling an AVX2-specific 32-byte algorithm with cross-lane permute workarounds (research-grade complexity, uncertain net win — could be SLOWER than the Vector128 path due to cross-lane shuffle latency) +2. Waiting for `Avx10v1` / `Avx10v2` to expose AVX-512BW-class primitives in 256-bit form (Intel's unified vector ISA — `Avx10v1` already in .NET 9, `Avx10v2` arrives with future Intel hardware) + +**Re-evaluation triggers:** if a benchmark on AVX2 hosts shows that the Phase 3c Vector128 path leaves a > 10% Deser gap vs MemPack on multi-byte content; or if `Avx10v1` 256-bit primitives mature enough to make the algorithm tractable. Until then: **Phase 3b stays in the TODO as a research / future-work item** — not actively scheduled, but documented so a future contributor doesn't re-derive the AVX2 limitations. + **Phase 3 is the remaining gap — UTF-8 multi-byte decode on every host class**. The ASCII path is already fast across all SIMD tiers (Vector256 + Vector128 prefix widen + `Encoding.Latin1.GetString` BCL fast path). The gap is on **multi-byte UTF-8 content** — Hungarian / Cyrillic / Greek (2-byte) and CJK BMP (3-byte) sequences — where the SIMD prefix bails out on the first non-ASCII byte and falls back to scalar bit-extract. The Repeated benchmark cell (Hungarian content) is the canonical witness; with all-Hungarian content (current bench data), Small / Repeated Deser cells trail MemPack by 6-14%.
**Why all 3 SIMD tiers (not just AVX-512BW)** — public NuGet package goal: i18n payloads must be fast on every supported host (cloud server, desktop, mobile, Blazor WASM), not only AVX-512-capable cloud servers. Our own scalar multi-byte branch is the bottleneck on **all** non-ASCII content regardless of host class. The BCL `Encoding.UTF8` falls back to a similar scalar path on multi-byte content (with virtual dispatch + EncoderFallback overhead), so even where the BCL has its own SIMD 2-byte handler (.NET 9 PR #92580), our trusted-input scalar path still wins on net — but our own SIMD multi-byte path would dominate on every host.