diff --git a/AyCode.Core.Serializers.Console/Program.cs b/AyCode.Core.Serializers.Console/Program.cs index 97ac547..ed4ca4a 100644 --- a/AyCode.Core.Serializers.Console/Program.cs +++ b/AyCode.Core.Serializers.Console/Program.cs @@ -480,15 +480,21 @@ public static class Program private static List CreateSerializers(TestDataSet testData, string serializerMode) { // FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path. - // ONLY two benchmarks: AcBinary FastMode Byte[] (SGen) + MemoryPack Byte[]. Used for tight - // optimization-iteration cycles: if AcBinary improves on this comparison, every other config - // (BufWr, Pipe, Default) inherits the gain. The minimal suite removes noise from peripheral - // benchmarks and keeps the iteration loop fast (~20-30 sec instead of full 2-3 min). + // THREE benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + AcBinary FastMode Byte[] + // (WireMode.Fast = UTF-16 raw memcpy) + MemoryPack Byte[]. Shows BOTH sides of AcBinary's + // positioning vs MemPack: + // - Compact: smallest wire, UTF-8 encode/decode CPU cost + // - Fast (UTF-16 raw): comparable wire to MemPack, no encoding cost + // Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min. 
if (serializerMode == "fastestbyte") { + var fastWireOptions = AcBinarySerializerOptions.FastMode; + fastWireOptions.WireMode = WireMode.Fast; + return new List { new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.FastMode, "FastMode"), + new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"), new MemoryPackBenchmark(testData.Order, "Default"), }; } diff --git a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs index cba5167..9fd809c 100644 --- a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs +++ b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs @@ -455,11 +455,15 @@ public static partial class AcBinaryDeserializer /// /// Counts UTF-16 chars produced by decoding the given UTF-8 byte span. - /// Tight scalar loop the JIT auto-vectorizes for the common 1-byte ASCII branch; predictable - /// branches for 2/3/4-byte sequences. Result is the exact charCount for - /// allocation. /// /// + /// Vectorized via Vector256 (32 bytes/iter) using two bit-pattern checks: + /// • Non-continuation bytes (NOT 10xxxxxx, mask 0xC0 ≠ 0x80): each contributes 1 char. + /// • 4-byte start bytes (11110xxx, mask 0xF8 == 0xF0): each contributes an EXTRA char (surrogate pair). + /// + /// SIMD per-block result: (32 - popcount(continuationMask)) + popcount(fourByteStartMask). + /// Scalar tail handles the remaining <32 bytes. + /// /// Char-count rules: /// • Continuation bytes (10xxxxxx, 0x80–0xBF) — produce no char, skip. /// • All other start bytes (0xxxxxxx, 110xxxxx, 1110xxxx) — produce 1 char each. 
@@ -469,9 +473,39 @@ public static partial class AcBinaryDeserializer private static int CountUtf8Chars(ReadOnlySpan bytes) { var count = 0; - for (var i = 0; i < bytes.Length; i++) + var i = 0; + ref var bytesRef = ref MemoryMarshal.GetReference(bytes); + + // SIMD path: 32 bytes/iter via Vector256 + if (Vector256.IsHardwareAccelerated && bytes.Length >= 32) { - var b = bytes[i]; + var contMask = Vector256.Create((byte)0xC0); + var contValue = Vector256.Create((byte)0x80); + var fourByteMask = Vector256.Create((byte)0xF8); + var fourByteValue = Vector256.Create((byte)0xF0); + + do + { + var v = Vector256.LoadUnsafe(ref bytesRef, (uint)i); + + // Non-continuation count: 32 - popcount(continuation byte mask) + var contMatches = Vector256.Equals(v & contMask, contValue); + var contBits = contMatches.ExtractMostSignificantBits(); + count += 32 - System.Numerics.BitOperations.PopCount(contBits); + + // 4-byte start count: popcount(fourByte start byte mask) + var fourByteMatches = Vector256.Equals(v & fourByteMask, fourByteValue); + var fourByteBits = fourByteMatches.ExtractMostSignificantBits(); + count += System.Numerics.BitOperations.PopCount(fourByteBits); + + i += 32; + } while (bytes.Length - i >= 32); + } + + // Scalar tail (and fallback for non-SIMD hardware) + for (; i < bytes.Length; i++) + { + var b = Unsafe.Add(ref bytesRef, i); if ((b & 0xC0) != 0x80) count++; // non-continuation byte if ((b & 0xF8) == 0xF0) count++; // 4-byte start: extra char for surrogate pair } diff --git a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs index 4b9eb1a..a7cea6d 100644 --- a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs +++ b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs @@ -5,6 +5,7 @@ using System.Collections.Generic; using System.Numerics; using System.Runtime.CompilerServices; 
using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; using System.Text; using System.Threading; using static AyCode.Core.Helpers.JsonUtilities; @@ -697,7 +698,7 @@ public static partial class AcBinarySerializer var savedPos = _position; var encodeStart = savedPos + reserveSize; - var bytesWritten = Utf8NoBom.GetBytes(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes)); + var bytesWritten = EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes)); var actualVarUIntSize = VarUIntSize((uint)bytesWritten); if (actualVarUIntSize < reserveSize) @@ -768,6 +769,120 @@ public static partial class AcBinarySerializer _position += byteCount; } + /// + /// Custom UTF-16 → UTF-8 single-pass encoder. Symmetric with the deserializer's custom decoder + /// ('s DecodeUtf8SinglePass). + /// + /// + /// Bypasses .GetBytes virtual-dispatch + encoder-fallback + /// overhead. Trusted-input encoder — no validation (writer side, the input is a .NET string + /// which always has valid UTF-16 surrogate pairs). + /// + /// Layered for max throughput on mixed content: + /// • Phase 1 — Vector256 ASCII narrow: 16 chars/iter. Loads Vector256<ushort>, + /// tests (v & 0xFF80) == 0 for all-ASCII; on hit, narrows to Vector128<byte> + /// via Vector128.Narrow(GetLower, GetUpper) = 16 bytes per iter. + /// • Phase 2 — DWORD ASCII batch: 4 chars/iter. OR-mask test + /// (c0 | c1 | c2 | c3) & 0xFF80 == 0; on hit, 4 byte writes per iter. + /// • Phase 3 — Scalar multi-byte encode: 1-byte (ASCII), 2-byte (Latin extended, + /// Cyrillic, Greek), 3-byte (CJK BMP), 4-byte (supplementary plane via UTF-16 surrogate pair). + /// + /// Returns actual byte count written. Caller must ensure has at least + /// src.Length * 4 capacity (UTF-8 worst case). 
/// <param name="src">
/// UTF-16 input to encode; may be empty. A high surrogate immediately followed by a low surrogate
/// is encoded as one 4-byte sequence. Any unpaired surrogate (a lone high or lone low surrogate,
/// which a .NET string may legally contain) is encoded as U+FFFD (EF BF BD) — the same substitution
/// the framework's default UTF-8 replacement fallback performs — so the output is always
/// well-formed UTF-8 and no char beyond the end of <paramref name="src"/> is ever read.
/// </param>
/// <param name="dst">
/// Destination buffer. Writes are NOT bounds-checked, so it must hold at least
/// <c>src.Length * 3</c> bytes: every UTF-16 code unit produces at most 3 bytes
/// (a surrogate pair produces 4 bytes for 2 code units).
/// </param>
/// <returns>The number of bytes written to <paramref name="dst"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int EncodeUtf8SinglePass(ReadOnlySpan<char> src, Span<byte> dst)
{
    int srcIdx = 0, dstIdx = 0;
    ref char srcRefChar = ref MemoryMarshal.GetReference(src);
    ref ushort srcRefU16 = ref Unsafe.As<char, ushort>(ref srcRefChar);
    ref byte dstRef = ref MemoryMarshal.GetReference(dst);

    // Phase 1 — Vector256 ASCII narrow: 16 chars per iteration, stops at the first block
    // that contains any non-ASCII char.
    if (Vector256.IsHardwareAccelerated)
    {
        var asciiMask = Vector256.Create((ushort)0xFF80);
        while (src.Length - srcIdx >= Vector256<ushort>.Count)
        {
            var v = Vector256.LoadUnsafe(ref srcRefU16, (uint)srcIdx);

            // Any bit above the low 7 set in any lane => at least one char >= 0x80.
            if ((v & asciiMask) != Vector256<ushort>.Zero)
            {
                break;
            }

            // Narrow 16 ushorts to 16 bytes (1:1 for ASCII) and store them in one write.
            var narrowed = Vector128.Narrow(v.GetLower(), v.GetUpper());
            narrowed.StoreUnsafe(ref dstRef, (uint)dstIdx);
            srcIdx += Vector256<ushort>.Count;
            dstIdx += Vector256<ushort>.Count;
        }
    }

    // Phase 2/3 — scalar loop with a 4-char ASCII fast path.
    while (srcIdx < src.Length)
    {
        // Phase 2: 4 chars -> 4 bytes when all four are ASCII.
        if (src.Length - srcIdx >= 4)
        {
            var c0 = Unsafe.Add(ref srcRefChar, srcIdx);
            var c1 = Unsafe.Add(ref srcRefChar, srcIdx + 1);
            var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 2);
            var c3 = Unsafe.Add(ref srcRefChar, srcIdx + 3);
            if (((c0 | c1 | c2 | c3) & 0xFF80) == 0)
            {
                Unsafe.Add(ref dstRef, dstIdx) = (byte)c0;
                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)c1;
                Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)c2;
                Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)c3;
                srcIdx += 4;
                dstIdx += 4;
                continue;
            }
        }

        // Phase 3: encode a single code point.
        var c = Unsafe.Add(ref srcRefChar, srcIdx);
        if (c < 0x80)
        {
            // 1 byte: U+0000–U+007F.
            Unsafe.Add(ref dstRef, dstIdx++) = (byte)c;
            srcIdx += 1;
        }
        else if (c < 0x800)
        {
            // 2 bytes: 110xxxxx 10xxxxxx, U+0080–U+07FF.
            Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xC0 | (c >> 6));
            Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | (c & 0x3F));
            dstIdx += 2;
            srcIdx += 1;
        }
        else if ((c & 0xF800) != 0xD800)
        {
            // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx, U+0800–U+FFFF outside the surrogate range.
            Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xE0 | (c >> 12));
            Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((c >> 6) & 0x3F));
            Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | (c & 0x3F));
            dstIdx += 3;
            srcIdx += 1;
        }
        else
        {
            // Surrogate code unit (0xD800–0xDFFF). Only a high surrogate (<= 0xDBFF) that is
            // followed, inside the span, by a low surrogate (0xDC00–0xDFFF) forms a pair.
            if (c <= 0xDBFF && srcIdx + 1 < src.Length)
            {
                var low = Unsafe.Add(ref srcRefChar, srcIdx + 1);
                if ((low & 0xFC00) == 0xDC00)
                {
                    // 4 bytes: supplementary-plane code point U+10000–U+10FFFF.
                    var codepoint = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
                    Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xF0 | (codepoint >> 18));
                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((codepoint >> 12) & 0x3F));
                    Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | ((codepoint >> 6) & 0x3F));
                    Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(0x80 | (codepoint & 0x3F));
                    dstIdx += 4;
                    srcIdx += 2; // consumed both halves of the pair
                    continue;
                }
            }

            // Unpaired surrogate: emit U+FFFD (EF BF BD) and consume exactly one char.
            // This keeps the output at <= 3 bytes per input char and avoids reading
            // past the end of the source span.
            Unsafe.Add(ref dstRef, dstIdx) = 0xEF;
            Unsafe.Add(ref dstRef, dstIdx + 1) = 0xBF;
            Unsafe.Add(ref dstRef, dstIdx + 2) = 0xBD;
            dstIdx += 3;
            srcIdx += 1;
        }
    }

    return dstIdx;
}