[LOADED_DOCS: 2 files, no new loads]

SIMD-accelerated UTF-8 encode/decode for AcBinary

- Added Vector256-based SIMD path for UTF-8 char counting in deserializer, replacing scalar loop for faster ASCII/multibyte handling.
- Introduced EncodeUtf8SinglePass in serializer: layered SIMD/DWORD/scalar UTF-16→UTF-8 encoding, bypassing Encoding.UTF8.GetBytes.
- Updated serializer to use new encoder for string writes.
- Expanded "fastestbyte" benchmark mode to compare both AcBinary (UTF-8/UTF-16) and MemoryPack strategies.
- Improved comments and docs to clarify new SIMD logic.
This commit is contained in:
Loretta 2026-05-04 11:15:32 +02:00
parent 3a75210c70
commit ed59a0c031
3 changed files with 165 additions and 10 deletions

View File

@ -480,15 +480,21 @@ public static class Program
private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, string serializerMode) private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, string serializerMode)
{ {
// FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path. // FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path.
// ONLY two benchmarks: AcBinary FastMode Byte[] (SGen) + MemoryPack Byte[]. Used for tight // THREE benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + AcBinary FastMode Byte[]
// optimization-iteration cycles: if AcBinary improves on this comparison, every other config // (WireMode.Fast = UTF-16 raw memcpy) + MemoryPack Byte[]. Shows BOTH sides of AcBinary's
// (BufWr, Pipe, Default) inherits the gain. The minimal suite removes noise from peripheral // positioning vs MemPack:
// benchmarks and keeps the iteration loop fast (~20-30 sec instead of full 2-3 min). // - Compact: smallest wire, UTF-8 encode/decode CPU cost
// - Fast (UTF-16 raw): comparable wire to MemPack, no encoding cost
// Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min.
if (serializerMode == "fastestbyte") if (serializerMode == "fastestbyte")
{ {
var fastWireOptions = AcBinarySerializerOptions.FastMode;
fastWireOptions.WireMode = WireMode.Fast;
return new List<ISerializerBenchmark> return new List<ISerializerBenchmark>
{ {
new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.FastMode, "FastMode"), new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.FastMode, "FastMode"),
new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"),
new MemoryPackBenchmark(testData.Order, "Default"), new MemoryPackBenchmark(testData.Order, "Default"),
}; };
} }

View File

@ -455,11 +455,15 @@ public static partial class AcBinaryDeserializer
/// <summary> /// <summary>
/// Counts UTF-16 chars produced by decoding the given UTF-8 byte span. /// Counts UTF-16 chars produced by decoding the given UTF-8 byte span.
/// Tight scalar loop the JIT auto-vectorizes for the common 1-byte ASCII branch; predictable
/// branches for 2/3/4-byte sequences. Result is the exact <c>charCount</c> for
/// <see cref="string.Create{TState}"/> allocation.
/// </summary> /// </summary>
/// <remarks> /// <remarks>
/// Vectorized via Vector256 (32 bytes/iter) using two bit-pattern checks:
/// • Non-continuation bytes (NOT 10xxxxxx, mask 0xC0 ≠ 0x80): each contributes 1 char.
/// • 4-byte start bytes (11110xxx, mask 0xF8 == 0xF0): each contributes an EXTRA char (surrogate pair).
///
/// SIMD per-block result: <c>(32 - popcount(continuationMask)) + popcount(fourByteStartMask)</c>.
/// Scalar tail handles the remaining &lt;32 bytes.
///
/// Char-count rules: /// Char-count rules:
/// • Continuation bytes (10xxxxxx, 0x80–0xBF) — produce no char, skip. /// • Continuation bytes (10xxxxxx, 0x80–0xBF) — produce no char, skip.
/// • All other start bytes (0xxxxxxx, 110xxxxx, 1110xxxx) — produce 1 char each. /// • All other start bytes (0xxxxxxx, 110xxxxx, 1110xxxx) — produce 1 char each.
@ -469,9 +473,39 @@ public static partial class AcBinaryDeserializer
private static int CountUtf8Chars(ReadOnlySpan<byte> bytes) private static int CountUtf8Chars(ReadOnlySpan<byte> bytes)
{ {
var count = 0; var count = 0;
for (var i = 0; i < bytes.Length; i++) var i = 0;
ref var bytesRef = ref MemoryMarshal.GetReference(bytes);
// SIMD path: 32 bytes/iter via Vector256
if (Vector256.IsHardwareAccelerated && bytes.Length >= 32)
{ {
var b = bytes[i]; var contMask = Vector256.Create((byte)0xC0);
var contValue = Vector256.Create((byte)0x80);
var fourByteMask = Vector256.Create((byte)0xF8);
var fourByteValue = Vector256.Create((byte)0xF0);
do
{
var v = Vector256.LoadUnsafe(ref bytesRef, (uint)i);
// Non-continuation count: 32 - popcount(continuation byte mask)
var contMatches = Vector256.Equals(v & contMask, contValue);
var contBits = contMatches.ExtractMostSignificantBits();
count += 32 - System.Numerics.BitOperations.PopCount(contBits);
// 4-byte start count: popcount(fourByte start byte mask)
var fourByteMatches = Vector256.Equals(v & fourByteMask, fourByteValue);
var fourByteBits = fourByteMatches.ExtractMostSignificantBits();
count += System.Numerics.BitOperations.PopCount(fourByteBits);
i += 32;
} while (bytes.Length - i >= 32);
}
// Scalar tail (and fallback for non-SIMD hardware)
for (; i < bytes.Length; i++)
{
var b = Unsafe.Add(ref bytesRef, i);
if ((b & 0xC0) != 0x80) count++; // non-continuation byte if ((b & 0xC0) != 0x80) count++; // non-continuation byte
if ((b & 0xF8) == 0xF0) count++; // 4-byte start: extra char for surrogate pair if ((b & 0xF8) == 0xF0) count++; // 4-byte start: extra char for surrogate pair
} }

View File

@ -5,6 +5,7 @@ using System.Collections.Generic;
using System.Numerics; using System.Numerics;
using System.Runtime.CompilerServices; using System.Runtime.CompilerServices;
using System.Runtime.InteropServices; using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Text; using System.Text;
using System.Threading; using System.Threading;
using static AyCode.Core.Helpers.JsonUtilities; using static AyCode.Core.Helpers.JsonUtilities;
@ -697,7 +698,7 @@ public static partial class AcBinarySerializer
var savedPos = _position; var savedPos = _position;
var encodeStart = savedPos + reserveSize; var encodeStart = savedPos + reserveSize;
var bytesWritten = Utf8NoBom.GetBytes(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes)); var bytesWritten = EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
var actualVarUIntSize = VarUIntSize((uint)bytesWritten); var actualVarUIntSize = VarUIntSize((uint)bytesWritten);
if (actualVarUIntSize < reserveSize) if (actualVarUIntSize < reserveSize)
@ -768,6 +769,120 @@ public static partial class AcBinarySerializer
_position += byteCount; _position += byteCount;
} }
/// <summary>
/// Custom UTF-16 → UTF-8 single-pass encoder. Symmetric with the deserializer's custom decoder
/// (<see cref="AcBinaryDeserializer"/>'s <c>DecodeUtf8SinglePass</c>).
/// </summary>
/// <remarks>
/// Bypasses <see cref="System.Text.Encoding.UTF8"/>.GetBytes virtual-dispatch + encoder-fallback
/// overhead.
///
/// Surrogate handling: a .NET <see cref="string"/> is a sequence of arbitrary 16-bit code units and
/// MAY contain unpaired surrogates. A high surrogate (0xD800–0xDBFF) immediately followed by a low
/// surrogate (0xDC00–0xDFFF) is encoded as one 4-byte sequence. Any other surrogate code unit
/// (lone low, lone high, or high at the very end of the input) is encoded as U+FFFD
/// (<c>EF BF BD</c>), the same output the default <see cref="System.Text.Encoding.UTF8"/>
/// replacement fallback produces. The source is never read past <c>src.Length</c>.
///
/// Layered for max throughput on mixed content:
/// • <b>Phase 1 — Vector256 ASCII narrow:</b> 16 chars/iter. Loads <c>Vector256&lt;ushort&gt;</c>,
///   tests <c>(v &amp; 0xFF80) == 0</c> for all-ASCII; on hit, narrows to <c>Vector128&lt;byte&gt;</c>
///   via <c>Vector128.Narrow(GetLower, GetUpper)</c> = 16 bytes per iter. Exits on the first block
///   that contains a non-ASCII char.
/// • <b>Phase 2 — DWORD ASCII batch:</b> 4 chars/iter. OR-mask test
///   <c>(c0 | c1 | c2 | c3) &amp; 0xFF80 == 0</c>; on hit, 4 byte writes per iter.
/// • <b>Phase 3 — Scalar multi-byte encode:</b> 1-byte (ASCII), 2-byte (U+0080–U+07FF),
///   3-byte (rest of the BMP, plus U+FFFD for unpaired surrogates), 4-byte (supplementary plane
///   via a valid UTF-16 surrogate pair).
///
/// Destination writes are NOT bounds-checked. The worst case is 3 bytes per UTF-16 code unit
/// (a surrogate pair is 2 code units → 4 bytes), so <c>src.Length * 3</c> bytes of capacity in
/// <paramref name="dst"/> is always sufficient; the caller must guarantee it.
/// </remarks>
/// <param name="src">UTF-16 code units to encode.</param>
/// <param name="dst">Destination buffer; must hold at least <c>src.Length * 3</c> bytes.</param>
/// <returns>The number of bytes written to <paramref name="dst"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int EncodeUtf8SinglePass(ReadOnlySpan<char> src, Span<byte> dst)
{
    int srcIdx = 0, dstIdx = 0;
    ref char srcRefChar = ref MemoryMarshal.GetReference(src);
    ref ushort srcRefU16 = ref Unsafe.As<char, ushort>(ref srcRefChar);
    ref byte dstRef = ref MemoryMarshal.GetReference(dst);

    // Phase 1 — Vector256 ASCII narrow (16 chars/iter, falls out on first non-ASCII block)
    if (Vector256.IsHardwareAccelerated)
    {
        var asciiMask = Vector256.Create((ushort)0xFF80);
        while (src.Length - srcIdx >= Vector256<ushort>.Count) // 16 chars per Vector256<ushort>
        {
            var v = Vector256.LoadUnsafe(ref srcRefU16, (uint)srcIdx);

            // ASCII detect: any char with bits above 0x7F set means this block needs the scalar path.
            if ((v & asciiMask) != Vector256<ushort>.Zero) break;

            // Narrow 16 ushorts (Vector256) → 16 bytes (Vector128) via two halves
            var bytes = Vector128.Narrow(v.GetLower(), v.GetUpper());
            bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);

            srcIdx += Vector256<ushort>.Count;
            dstIdx += Vector256<ushort>.Count; // 16 chars → 16 bytes (1:1 for ASCII)
        }
    }

    // Phase 2/3 — scalar with DWORD ASCII batch
    while (srcIdx < src.Length)
    {
        // DWORD ASCII batch: 4 chars → 4 bytes when all ASCII
        if (src.Length - srcIdx >= 4)
        {
            var c0 = Unsafe.Add(ref srcRefChar, srcIdx);
            var c1 = Unsafe.Add(ref srcRefChar, srcIdx + 1);
            var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 2);
            var c3 = Unsafe.Add(ref srcRefChar, srcIdx + 3);
            if (((c0 | c1 | c2 | c3) & 0xFF80) == 0)
            {
                Unsafe.Add(ref dstRef, dstIdx) = (byte)c0;
                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)c1;
                Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)c2;
                Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)c3;
                srcIdx += 4;
                dstIdx += 4;
                continue;
            }
        }

        // Scalar single-char encode
        var c = Unsafe.Add(ref srcRefChar, srcIdx);
        if (c < 0x80)
        {
            // 1-byte ASCII (U+0000–U+007F)
            Unsafe.Add(ref dstRef, dstIdx++) = (byte)c;
            srcIdx += 1;
        }
        else if (c < 0x800)
        {
            // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
            // Latin extended (Hungarian, Polish, Czech, Spanish, French, German diacritics),
            // Greek, Cyrillic, Hebrew, Arabic.
            Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xC0 | (c >> 6));
            Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | (c & 0x3F));
            dstIdx += 2;
            srcIdx += 1;
        }
        else if ((c & 0xF800) != 0xD800)
        {
            // 3-byte BMP: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF (excluding surrogate range)
            // CJK BMP, various other BMP scripts.
            Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xE0 | (c >> 12));
            Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((c >> 6) & 0x3F));
            Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | (c & 0x3F));
            dstIdx += 3;
            srcIdx += 1;
        }
        else
        {
            // Surrogate code unit (0xD800–0xDFFF). Only a high surrogate (0xD800–0xDBFF) that is
            // immediately followed by a low surrogate (0xDC00–0xDFFF) forms a valid pair. The
            // length check comes first so the second code unit is never read out of bounds.
            if (c <= 0xDBFF && srcIdx + 1 < src.Length)
            {
                var low = Unsafe.Add(ref srcRefChar, srcIdx + 1);
                if ((low & 0xFC00) == 0xDC00)
                {
                    // 4-byte: surrogate pair → supplementary plane codepoint (U+10000–U+10FFFF)
                    var codepoint = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
                    Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xF0 | (codepoint >> 18));
                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((codepoint >> 12) & 0x3F));
                    Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | ((codepoint >> 6) & 0x3F));
                    Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(0x80 | (codepoint & 0x3F));
                    dstIdx += 4;
                    srcIdx += 2; // consumed 2 chars (surrogate pair)
                    continue;
                }
            }

            // Unpaired surrogate → U+FFFD REPLACEMENT CHARACTER (EF BF BD). Only the offending
            // code unit is consumed, so the following char is still encoded on its own.
            Unsafe.Add(ref dstRef, dstIdx) = 0xEF;
            Unsafe.Add(ref dstRef, dstIdx + 1) = 0xBF;
            Unsafe.Add(ref dstRef, dstIdx + 2) = 0xBD;
            dstIdx += 3;
            srcIdx += 1;
        }
    }

    return dstIdx;
}
#endregion #endregion
#region Bulk Array Writes inline #region Bulk Array Writes inline