[LOADED_DOCS: 2 files, no new loads]
SIMD-accelerated UTF-8 encode/decode for AcBinary - Added Vector256-based SIMD path for UTF-8 char counting in deserializer, replacing scalar loop for faster ASCII/multibyte handling. - Introduced EncodeUtf8SinglePass in serializer: layered SIMD/DWORD/scalar UTF-16→UTF-8 encoding, bypassing Encoding.UTF8.GetBytes. - Updated serializer to use new encoder for string writes. - Expanded "fastestbyte" benchmark mode to compare both AcBinary (UTF-8/UTF-16) and MemoryPack strategies. - Improved comments and docs to clarify new SIMD logic.
This commit is contained in:
parent
3a75210c70
commit
ed59a0c031
|
|
@ -480,15 +480,21 @@ public static class Program
|
||||||
private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, string serializerMode)
|
private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, string serializerMode)
|
||||||
{
|
{
|
||||||
// FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path.
|
// FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path.
|
||||||
// ONLY two benchmarks: AcBinary FastMode Byte[] (SGen) + MemoryPack Byte[]. Used for tight
|
// THREE benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + AcBinary FastMode Byte[]
|
||||||
// optimization-iteration cycles: if AcBinary improves on this comparison, every other config
|
// (WireMode.Fast = UTF-16 raw memcpy) + MemoryPack Byte[]. Shows BOTH sides of AcBinary's
|
||||||
// (BufWr, Pipe, Default) inherits the gain. The minimal suite removes noise from peripheral
|
// positioning vs MemPack:
|
||||||
// benchmarks and keeps the iteration loop fast (~20-30 sec instead of full 2-3 min).
|
// - Compact: smallest wire, UTF-8 encode/decode CPU cost
|
||||||
|
// - Fast (UTF-16 raw): comparable wire to MemPack, no encoding cost
|
||||||
|
// Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min.
|
||||||
if (serializerMode == "fastestbyte")
|
if (serializerMode == "fastestbyte")
|
||||||
{
|
{
|
||||||
|
var fastWireOptions = AcBinarySerializerOptions.FastMode;
|
||||||
|
fastWireOptions.WireMode = WireMode.Fast;
|
||||||
|
|
||||||
return new List<ISerializerBenchmark>
|
return new List<ISerializerBenchmark>
|
||||||
{
|
{
|
||||||
new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.FastMode, "FastMode"),
|
new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.FastMode, "FastMode"),
|
||||||
|
new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"),
|
||||||
new MemoryPackBenchmark(testData.Order, "Default"),
|
new MemoryPackBenchmark(testData.Order, "Default"),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -455,11 +455,15 @@ public static partial class AcBinaryDeserializer
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Counts UTF-16 chars produced by decoding the given UTF-8 byte span.
|
/// Counts UTF-16 chars produced by decoding the given UTF-8 byte span.
|
||||||
/// Tight scalar loop the JIT auto-vectorizes for the common 1-byte ASCII branch; predictable
|
|
||||||
/// branches for 2/3/4-byte sequences. Result is the exact <c>charCount</c> for
|
|
||||||
/// <see cref="string.Create{TState}"/> allocation.
|
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <remarks>
|
/// <remarks>
|
||||||
|
/// Vectorized via Vector256 (32 bytes/iter) using two bit-pattern checks:
|
||||||
|
/// • Non-continuation bytes (NOT 10xxxxxx, mask 0xC0 ≠ 0x80): each contributes 1 char.
|
||||||
|
/// • 4-byte start bytes (11110xxx, mask 0xF8 == 0xF0): each contributes an EXTRA char (surrogate pair).
|
||||||
|
///
|
||||||
|
/// SIMD per-block result: <c>(32 - popcount(continuationMask)) + popcount(fourByteStartMask)</c>.
|
||||||
|
/// Scalar tail handles the remaining <32 bytes.
|
||||||
|
///
|
||||||
/// Char-count rules:
|
/// Char-count rules:
|
||||||
/// • Continuation bytes (10xxxxxx, 0x80–0xBF) — produce no char, skip.
|
/// • Continuation bytes (10xxxxxx, 0x80–0xBF) — produce no char, skip.
|
||||||
/// • All other start bytes (0xxxxxxx, 110xxxxx, 1110xxxx) — produce 1 char each.
|
/// • All other start bytes (0xxxxxxx, 110xxxxx, 1110xxxx) — produce 1 char each.
|
||||||
|
|
@ -469,9 +473,39 @@ public static partial class AcBinaryDeserializer
|
||||||
private static int CountUtf8Chars(ReadOnlySpan<byte> bytes)
|
private static int CountUtf8Chars(ReadOnlySpan<byte> bytes)
|
||||||
{
|
{
|
||||||
var count = 0;
|
var count = 0;
|
||||||
for (var i = 0; i < bytes.Length; i++)
|
var i = 0;
|
||||||
|
ref var bytesRef = ref MemoryMarshal.GetReference(bytes);
|
||||||
|
|
||||||
|
// SIMD path: 32 bytes/iter via Vector256
|
||||||
|
if (Vector256.IsHardwareAccelerated && bytes.Length >= 32)
|
||||||
{
|
{
|
||||||
var b = bytes[i];
|
var contMask = Vector256.Create((byte)0xC0);
|
||||||
|
var contValue = Vector256.Create((byte)0x80);
|
||||||
|
var fourByteMask = Vector256.Create((byte)0xF8);
|
||||||
|
var fourByteValue = Vector256.Create((byte)0xF0);
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
var v = Vector256.LoadUnsafe(ref bytesRef, (uint)i);
|
||||||
|
|
||||||
|
// Non-continuation count: 32 - popcount(continuation byte mask)
|
||||||
|
var contMatches = Vector256.Equals(v & contMask, contValue);
|
||||||
|
var contBits = contMatches.ExtractMostSignificantBits();
|
||||||
|
count += 32 - System.Numerics.BitOperations.PopCount(contBits);
|
||||||
|
|
||||||
|
// 4-byte start count: popcount(fourByte start byte mask)
|
||||||
|
var fourByteMatches = Vector256.Equals(v & fourByteMask, fourByteValue);
|
||||||
|
var fourByteBits = fourByteMatches.ExtractMostSignificantBits();
|
||||||
|
count += System.Numerics.BitOperations.PopCount(fourByteBits);
|
||||||
|
|
||||||
|
i += 32;
|
||||||
|
} while (bytes.Length - i >= 32);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scalar tail (and fallback for non-SIMD hardware)
|
||||||
|
for (; i < bytes.Length; i++)
|
||||||
|
{
|
||||||
|
var b = Unsafe.Add(ref bytesRef, i);
|
||||||
if ((b & 0xC0) != 0x80) count++; // non-continuation byte
|
if ((b & 0xC0) != 0x80) count++; // non-continuation byte
|
||||||
if ((b & 0xF8) == 0xF0) count++; // 4-byte start: extra char for surrogate pair
|
if ((b & 0xF8) == 0xF0) count++; // 4-byte start: extra char for surrogate pair
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ using System.Collections.Generic;
|
||||||
using System.Numerics;
|
using System.Numerics;
|
||||||
using System.Runtime.CompilerServices;
|
using System.Runtime.CompilerServices;
|
||||||
using System.Runtime.InteropServices;
|
using System.Runtime.InteropServices;
|
||||||
|
using System.Runtime.Intrinsics;
|
||||||
using System.Text;
|
using System.Text;
|
||||||
using System.Threading;
|
using System.Threading;
|
||||||
using static AyCode.Core.Helpers.JsonUtilities;
|
using static AyCode.Core.Helpers.JsonUtilities;
|
||||||
|
|
@ -697,7 +698,7 @@ public static partial class AcBinarySerializer
|
||||||
|
|
||||||
var savedPos = _position;
|
var savedPos = _position;
|
||||||
var encodeStart = savedPos + reserveSize;
|
var encodeStart = savedPos + reserveSize;
|
||||||
var bytesWritten = Utf8NoBom.GetBytes(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
|
var bytesWritten = EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
|
||||||
|
|
||||||
var actualVarUIntSize = VarUIntSize((uint)bytesWritten);
|
var actualVarUIntSize = VarUIntSize((uint)bytesWritten);
|
||||||
if (actualVarUIntSize < reserveSize)
|
if (actualVarUIntSize < reserveSize)
|
||||||
|
|
@ -768,6 +769,120 @@ public static partial class AcBinarySerializer
|
||||||
_position += byteCount;
|
_position += byteCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Custom UTF-16 → UTF-8 single-pass encoder. Counterpart of the deserializer's custom decoder
/// (<see cref="AcBinaryDeserializer"/>'s <c>DecodeUtf8SinglePass</c>).
/// </summary>
/// <remarks>
/// Used in place of <see cref="System.Text.Encoding.UTF8"/>.GetBytes on the string write path.
///
/// Layered for throughput on mixed content:
///   • <b>Phase 1 — Vector256 ASCII narrow:</b> 16 chars/iter. Loads <c>Vector256&lt;ushort&gt;</c>,
///     tests <c>(v &amp; 0xFF80) == 0</c> for all-ASCII; on hit, narrows to <c>Vector128&lt;byte&gt;</c>
///     via <c>Vector128.Narrow(GetLower, GetUpper)</c> = 16 bytes per iter. Leaves the phase on the
///     first block that contains a non-ASCII char.
///   • <b>Phase 2 — 4-char ASCII batch:</b> OR-mask test
///     <c>(c0 | c1 | c2 | c3) &amp; 0xFF80 == 0</c>; on hit, 4 byte writes per iter.
///   • <b>Phase 3 — Scalar multi-byte encode:</b> 1-byte (ASCII), 2-byte (U+0080–U+07FF),
///     3-byte (rest of the BMP), 4-byte (supplementary plane via a UTF-16 surrogate pair).
///
/// <b>Unpaired surrogates:</b> a .NET string may legally contain a high surrogate without a
/// following low surrogate, or a stray low surrogate. Each such code unit is encoded as
/// U+FFFD (<c>EF BF BD</c>), the same substitution the default <c>Encoding.UTF8</c> replacement
/// fallback performs. The code unit after an unpaired high surrogate is never consumed or read
/// past the end of <paramref name="src"/>.
///
/// No bounds checks are performed on <paramref name="dst"/>: the caller must ensure it has at
/// least <c>src.Length * 3</c> bytes of capacity (the worst case is 3 bytes per UTF-16 code unit;
/// a surrogate pair is 4 bytes for 2 units). A <c>src.Length * 4</c> buffer is therefore also sufficient.
/// </remarks>
/// <param name="src">UTF-16 code units to encode.</param>
/// <param name="dst">Destination buffer; written from index 0.</param>
/// <returns>The number of bytes written to <paramref name="dst"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int EncodeUtf8SinglePass(ReadOnlySpan<char> src, Span<byte> dst)
{
    int srcIdx = 0, dstIdx = 0;
    ref char srcRefChar = ref MemoryMarshal.GetReference(src);
    ref ushort srcRefU16 = ref Unsafe.As<char, ushort>(ref srcRefChar);
    ref byte dstRef = ref MemoryMarshal.GetReference(dst);

    // Phase 1 — Vector256 ASCII narrow (16 chars/iter, falls out on first non-ASCII block)
    if (Vector256.IsHardwareAccelerated)
    {
        var asciiMask = Vector256.Create((ushort)0xFF80);
        while (src.Length - srcIdx >= Vector256<ushort>.Count) // 16 chars per Vector256<ushort>
        {
            var v = Vector256.LoadUnsafe(ref srcRefU16, (uint)srcIdx);

            // Any char with bits above 0x7F set means the block is not pure ASCII.
            if ((v & asciiMask) != Vector256<ushort>.Zero)
            {
                break;
            }

            // Narrow 16 ushorts (Vector256) → 16 bytes (Vector128) via the two halves.
            var bytes = Vector128.Narrow(v.GetLower(), v.GetUpper());
            bytes.StoreUnsafe(ref dstRef, (uint)dstIdx);
            srcIdx += Vector256<ushort>.Count;
            dstIdx += Vector256<ushort>.Count; // 16 chars → 16 bytes (1:1 for ASCII)
        }
    }

    // Phase 2/3 — scalar loop with a 4-char ASCII fast path
    while (srcIdx < src.Length)
    {
        // 4-char ASCII batch: 4 chars → 4 bytes when all are ASCII
        if (src.Length - srcIdx >= 4)
        {
            var c0 = Unsafe.Add(ref srcRefChar, srcIdx);
            var c1 = Unsafe.Add(ref srcRefChar, srcIdx + 1);
            var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 2);
            var c3 = Unsafe.Add(ref srcRefChar, srcIdx + 3);
            if (((c0 | c1 | c2 | c3) & 0xFF80) == 0)
            {
                Unsafe.Add(ref dstRef, dstIdx) = (byte)c0;
                Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)c1;
                Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)c2;
                Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)c3;
                srcIdx += 4;
                dstIdx += 4;
                continue;
            }
        }

        // Scalar single-char encode
        var c = Unsafe.Add(ref srcRefChar, srcIdx);
        if (c < 0x80)
        {
            // 1-byte ASCII (U+0000–U+007F)
            Unsafe.Add(ref dstRef, dstIdx++) = (byte)c;
            srcIdx += 1;
        }
        else if (c < 0x800)
        {
            // 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
            Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xC0 | (c >> 6));
            Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | (c & 0x3F));
            dstIdx += 2;
            srcIdx += 1;
        }
        else if ((c & 0xF800) != 0xD800)
        {
            // 3-byte BMP: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF (excluding surrogate range)
            Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xE0 | (c >> 12));
            Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((c >> 6) & 0x3F));
            Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | (c & 0x3F));
            dstIdx += 3;
            srcIdx += 1;
        }
        else
        {
            // c is a surrogate code unit (0xD800–0xDFFF).
            // A valid pair is a high surrogate (0xD800–0xDBFF) immediately followed by a
            // low surrogate (0xDC00–0xDFFF). The length check must come first so that a
            // trailing high surrogate never triggers a read past the end of src.
            if (c <= 0xDBFF && srcIdx + 1 < src.Length)
            {
                var low = Unsafe.Add(ref srcRefChar, srcIdx + 1);
                if ((low & 0xFC00) == 0xDC00)
                {
                    // 4-byte: supplementary plane code point (U+10000–U+10FFFF)
                    var codepoint = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
                    Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xF0 | (codepoint >> 18));
                    Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((codepoint >> 12) & 0x3F));
                    Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | ((codepoint >> 6) & 0x3F));
                    Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(0x80 | (codepoint & 0x3F));
                    dstIdx += 4;
                    srcIdx += 2; // consumed 2 chars (surrogate pair)
                    continue;
                }
            }

            // Unpaired surrogate → U+FFFD REPLACEMENT CHARACTER (EF BF BD).
            // Only this one code unit is consumed; the next char is encoded on its own.
            Unsafe.Add(ref dstRef, dstIdx) = 0xEF;
            Unsafe.Add(ref dstRef, dstIdx + 1) = 0xBF;
            Unsafe.Add(ref dstRef, dstIdx + 2) = 0xBD;
            dstIdx += 3;
            srcIdx += 1;
        }
    }

    return dstIdx;
}
|
||||||
|
|
||||||
#endregion
|
#endregion
|
||||||
|
|
||||||
#region Bulk Array Writes — inline
|
#region Bulk Array Writes — inline
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue