[LOADED_DOCS: 2 files, no new loads]
Disable ASCII fast paths; add FastestByte mode, plan tasks. Temporarily disable ASCII string fast paths in AcBinarySerializer and AcBinaryDeserializer to isolate and benchmark the custom UTF-8 encoder/decoder. Add "FastestByte" benchmark mode for focused AcBinary vs MemoryPack Byte[] comparison. Update BINARY_TODO.md with new technical tasks for .NET 11 SIMD decoder, sentinel-length encoding, ASCII marker-dispatch, and a custom UTF-8 encoder. These changes support staged optimization and future performance improvements.
This commit is contained in:
parent
dc10315fc3
commit
3a75210c70
|
|
@ -45,7 +45,7 @@ public static class Program
|
||||||
private static int TestIterations = 1;
|
private static int TestIterations = 1;
|
||||||
private static int BenchmarkSamples = 1; // Debug: single sample, fast iteration
|
private static int BenchmarkSamples = 1; // Debug: single sample, fast iteration
|
||||||
#else
|
#else
|
||||||
private static int WarmupIterations = 5000; //5000
|
private static int WarmupIterations = 10000; //5000
|
||||||
private static int TestIterations = 1000; //1000
|
private static int TestIterations = 1000; //1000
|
||||||
private static int BenchmarkSamples = 3;
|
private static int BenchmarkSamples = 3;
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -479,6 +479,20 @@ public static class Program
|
||||||
|
|
||||||
private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, string serializerMode)
|
private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, string serializerMode)
|
||||||
{
|
{
|
||||||
|
// FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path.
|
||||||
|
// ONLY two benchmarks: AcBinary FastMode Byte[] (SGen) + MemoryPack Byte[]. Used for tight
|
||||||
|
// optimization-iteration cycles: if AcBinary improves on this comparison, every other config
|
||||||
|
// (BufWr, Pipe, Default) inherits the gain. The minimal suite removes noise from peripheral
|
||||||
|
// benchmarks and keeps the iteration loop fast (~20-30 sec instead of full 2-3 min).
|
||||||
|
if (serializerMode == "fastestbyte")
|
||||||
|
{
|
||||||
|
return new List<ISerializerBenchmark>
|
||||||
|
{
|
||||||
|
new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.FastMode, "FastMode"),
|
||||||
|
new MemoryPackBenchmark(testData.Order, "Default"),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
// AsyncPipe-only mode — return ONLY the AsyncPipe streaming benchmark (no other serializer).
|
// AsyncPipe-only mode — return ONLY the AsyncPipe streaming benchmark (no other serializer).
|
||||||
// Streaming I/O has long-lived pipe setup + kernel-buffer overhead that, when interleaved with
|
// Streaming I/O has long-lived pipe setup + kernel-buffer overhead that, when interleaved with
|
||||||
// the standard byte-array / IBufferWriter measurements, masks the steady-state numbers. Run it
|
// the standard byte-array / IBufferWriter measurements, masks the steady-state numbers. Run it
|
||||||
|
|
@ -837,6 +851,7 @@ public static class Program
|
||||||
System.Console.WriteLine(" [2] Comprehensive — release validation");
|
System.Console.WriteLine(" [2] Comprehensive — release validation");
|
||||||
System.Console.WriteLine(" [3] Edge cases — refactor verification");
|
System.Console.WriteLine(" [3] Edge cases — refactor verification");
|
||||||
System.Console.WriteLine(" [A] All layers");
|
System.Console.WriteLine(" [A] All layers");
|
||||||
|
System.Console.WriteLine(" [F] FastestByte — AcBinary FastMode Byte[] vs MemoryPack Byte[] only (tight optimization loop)");
|
||||||
System.Console.WriteLine(" [P] AsyncPipe — streaming I/O isolation (only AsyncPipe, all test data)");
|
System.Console.WriteLine(" [P] AsyncPipe — streaming I/O isolation (only AsyncPipe, all test data)");
|
||||||
System.Console.WriteLine($" [S] Settings — modify Warmup ({WarmupIterations}) / Iterations ({TestIterations}) / Samples ({BenchmarkSamples})");
|
System.Console.WriteLine($" [S] Settings — modify Warmup ({WarmupIterations}) / Iterations ({TestIterations}) / Samples ({BenchmarkSamples})");
|
||||||
System.Console.WriteLine(" [Q] Quit");
|
System.Console.WriteLine(" [Q] Quit");
|
||||||
|
|
@ -851,6 +866,7 @@ public static class Program
|
||||||
case '2': return ("comprehensive", "standard");
|
case '2': return ("comprehensive", "standard");
|
||||||
case '3': return ("edge", "standard");
|
case '3': return ("edge", "standard");
|
||||||
case 'a': return ("all", "standard");
|
case 'a': return ("all", "standard");
|
||||||
|
case 'f': return ("all", "fastestbyte");
|
||||||
case 'p': return ("all", "asyncpipe");
|
case 'p': return ("all", "asyncpipe");
|
||||||
case 's':
|
case 's':
|
||||||
ShowSettingsMenu();
|
ShowSettingsMenu();
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,10 @@
|
||||||
using System;
|
using System;
|
||||||
|
using System.Buffers;
|
||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
using System.Diagnostics;
|
using System.Diagnostics;
|
||||||
using System.Runtime.CompilerServices;
|
using System.Runtime.CompilerServices;
|
||||||
using System.Runtime.InteropServices;
|
using System.Runtime.InteropServices;
|
||||||
|
using System.Runtime.Intrinsics;
|
||||||
using System.Text;
|
using System.Text;
|
||||||
|
|
||||||
namespace AyCode.Core.Serializers.Binaries;
|
namespace AyCode.Core.Serializers.Binaries;
|
||||||
|
|
@ -381,21 +383,25 @@ public static partial class AcBinaryDeserializer
|
||||||
return ReadStringUtf8Cached(length);
|
return ReadStringUtf8Cached(length);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ASCII fast path: short strings (≤128 bytes) with all ASCII bytes
|
// BASELINE TEMP: ASCII fast path disabled — every string takes the custom UTF-8 decoder.
|
||||||
// use string.Create + direct byte→char widening, avoiding UTF8Encoding overhead.
|
// Used to measure custom decoder performance in isolation, without ASCII-fast-path-vs-decoder
|
||||||
if (length <= 128 && System.Text.Ascii.IsValid(_buffer.AsSpan(_position, length)))
|
// dispatch interference. Re-enable once decoder optimization is benchmarked and verified.
|
||||||
{
|
//
|
||||||
var pos = _position;
|
//// ASCII fast path: short strings (≤128 bytes) with all ASCII bytes
|
||||||
_position += length;
|
//// use string.Create + direct byte→char widening, avoiding UTF8Encoding overhead.
|
||||||
return string.Create(length, (Buffer: _buffer, Start: pos), static (chars, state) =>
|
//if (length <= 128 && System.Text.Ascii.IsValid(_buffer.AsSpan(_position, length)))
|
||||||
{
|
//{
|
||||||
var src = state.Buffer.AsSpan(state.Start, chars.Length);
|
// var pos = _position;
|
||||||
for (var i = 0; i < chars.Length; i++)
|
// _position += length;
|
||||||
chars[i] = (char)src[i];
|
// return string.Create(length, (Buffer: _buffer, Start: pos), static (chars, state) =>
|
||||||
});
|
// {
|
||||||
}
|
// var src = state.Buffer.AsSpan(state.Start, chars.Length);
|
||||||
|
// for (var i = 0; i < chars.Length; i++)
|
||||||
|
// chars[i] = (char)src[i];
|
||||||
|
// });
|
||||||
|
//}
|
||||||
|
|
||||||
// Non-ASCII path: custom UTF-8 decoder.
|
// All strings — custom UTF-8 decoder.
|
||||||
// Beats Encoding.UTF8.GetString by skipping the virtual-dispatch + encoder-fallback
|
// Beats Encoding.UTF8.GetString by skipping the virtual-dispatch + encoder-fallback
|
||||||
// overhead the BCL adds for arbitrary inputs. Two passes (count + decode) over the
|
// overhead the BCL adds for arbitrary inputs. Two passes (count + decode) over the
|
||||||
// bytes — both passes are tight scalar loops the JIT can auto-vectorize for the
|
// bytes — both passes are tight scalar loops the JIT can auto-vectorize for the
|
||||||
|
|
@ -413,34 +419,51 @@ public static partial class AcBinaryDeserializer
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Custom UTF-8 → UTF-16 string decoder. Single-allocation via <c>string.Create</c>;
|
/// Custom UTF-8 → UTF-16 string decoder.
|
||||||
/// counts chars first (vectorizable scalar loop), then decodes directly into the
|
|
||||||
/// allocated string's buffer.
|
|
||||||
/// </summary>
|
/// </summary>
|
||||||
[MethodImpl(MethodImplOptions.NoInlining)] // cold path; let JIT keep ReadStringUtf8 caller small
|
/// <remarks>
|
||||||
|
/// Two-pass over bytes (count + decode) with zero intermediate allocation:
|
||||||
|
/// • Pass 1 — <see cref="CountUtf8Chars"/>: counts UTF-16 chars produced (scalar, JIT-vectorizable).
|
||||||
|
/// • Pass 2 — <see cref="DecodeUtf8SinglePass"/> inside <see cref="string.Create{TState}"/> callback:
|
||||||
|
/// decodes directly into the newly-allocated string's char buffer. No memcpy, no temp buffer,
|
||||||
|
/// no <c>ArrayPool</c> rent.
|
||||||
|
///
|
||||||
|
/// Beats <see cref="System.Text.Encoding.UTF8"/>.GetString by:
|
||||||
|
/// 1. Skipping virtual-dispatch + encoder-fallback overhead the BCL adds for arbitrary inputs.
|
||||||
|
/// 2. Multi-byte branches via direct bit-extract — no overlong/surrogate range checks.
|
||||||
|
/// 3. Vector256 ASCII prefix bulk widen (32 bytes/iter while all-ASCII) inside Pass 2.
|
||||||
|
/// 4. DWORD ASCII batch (4 bytes/iter when ASCII-aligned) inside Pass 2's scalar loop.
|
||||||
|
///
|
||||||
|
/// The bytes are guaranteed valid UTF-8 because the writer used <c>Encoding.UTF8.GetBytes</c>.
|
||||||
|
/// If a wire payload is corrupt (incomplete multi-byte sequence), the decoder's continuation-byte
|
||||||
|
/// reads go through unchecked <c>Unsafe.Add</c>, so no exception is guaranteed: the decoder may
|
||||||
|
/// read past the string's bytes instead of failing fast. Untrusted payloads must be validated upstream.
|
||||||
|
/// </remarks>
|
||||||
|
[MethodImpl(MethodImplOptions.NoInlining)] // cold path; keep ReadStringUtf8 caller small
|
||||||
private string DecodeUtf8(int byteLength)
|
private string DecodeUtf8(int byteLength)
|
||||||
{
|
{
|
||||||
var pos = _position;
|
var pos = _position;
|
||||||
_position += byteLength;
|
_position += byteLength;
|
||||||
|
var src = _buffer.AsSpan(pos, byteLength);
|
||||||
var srcSpan = _buffer.AsSpan(pos, byteLength);
|
var charCount = CountUtf8Chars(src);
|
||||||
var charCount = CountUtf8Chars(srcSpan);
|
|
||||||
|
|
||||||
return string.Create(charCount, (Buffer: _buffer, Pos: pos, Len: byteLength), static (chars, state) =>
|
return string.Create(charCount, (Buffer: _buffer, Pos: pos, Len: byteLength), static (chars, state) =>
|
||||||
{
|
{
|
||||||
DecodeUtf8ToChars(state.Buffer.AsSpan(state.Pos, state.Len), chars);
|
DecodeUtf8SinglePass(state.Buffer.AsSpan(state.Pos, state.Len), chars);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Counts UTF-16 chars produced by decoding the given UTF-8 byte span.
|
/// Counts UTF-16 chars produced by decoding the given UTF-8 byte span.
|
||||||
/// JIT-vectorizable scalar loop: every iteration is a constant-shape branch on bit patterns.
|
/// Tight scalar loop the JIT auto-vectorizes for the common 1-byte ASCII branch; predictable
|
||||||
|
/// branches for 2/3/4-byte sequences. Result is the exact <c>charCount</c> for
|
||||||
|
/// <see cref="string.Create{TState}"/> allocation.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <remarks>
|
/// <remarks>
|
||||||
/// Char-count rules:
|
/// Char-count rules:
|
||||||
/// • Continuation bytes (10xxxxxx, 0x80–0xBF) — produced no char, skip.
|
/// • Continuation bytes (10xxxxxx, 0x80–0xBF) — produce no char, skip.
|
||||||
/// • All other start bytes (0xxxxxxx, 110xxxxx, 1110xxxx) — produce 1 char each.
|
/// • All other start bytes (0xxxxxxx, 110xxxxx, 1110xxxx) — produce 1 char each.
|
||||||
/// • 4-byte start bytes (11110xxx, 0xF0–0xF7) — produce 2 chars (surrogate pair).
|
/// • 4-byte start bytes (11110xxx, 0xF0–0xF7) — produce 2 chars (UTF-16 surrogate pair).
|
||||||
/// </remarks>
|
/// </remarks>
|
||||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||||
private static int CountUtf8Chars(ReadOnlySpan<byte> bytes)
|
private static int CountUtf8Chars(ReadOnlySpan<byte> bytes)
|
||||||
|
|
@ -449,72 +472,121 @@ public static partial class AcBinaryDeserializer
|
||||||
for (var i = 0; i < bytes.Length; i++)
|
for (var i = 0; i < bytes.Length; i++)
|
||||||
{
|
{
|
||||||
var b = bytes[i];
|
var b = bytes[i];
|
||||||
// Non-continuation byte: increments char count
|
if ((b & 0xC0) != 0x80) count++; // non-continuation byte
|
||||||
if ((b & 0xC0) != 0x80) count++;
|
if ((b & 0xF8) == 0xF0) count++; // 4-byte start: extra char for surrogate pair
|
||||||
// 4-byte start (11110xxx): adds extra char for surrogate pair
|
|
||||||
if ((b & 0xF8) == 0xF0) count++;
|
|
||||||
}
|
}
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Decodes UTF-8 bytes into UTF-16 chars in place. Caller guarantees <paramref name="dst"/>
|
/// Single-pass UTF-8 → UTF-16 decoder. Returns the actual char count written to <paramref name="dst"/>.
|
||||||
/// has at least the char count returned by <see cref="CountUtf8Chars"/>.
|
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
/// <remarks>
|
||||||
|
/// Layered approach for maximum throughput across mixed content:
|
||||||
|
/// • <b>Phase 1 — Vector256 ASCII prefix bulk widen:</b> 32 bytes/iter while all top bits are zero.
|
||||||
|
/// Uses <see cref="Vector256.Widen(Vector256{byte})"/> to produce two Vector256<ushort> lanes
|
||||||
|
/// = 32 chars per iteration. Breaks on first non-ASCII byte found in the loaded vector.
|
||||||
|
/// • <b>Phase 2 — DWORD ASCII batch:</b> when ≥4 bytes remain, read as <c>uint</c>, test
|
||||||
|
/// <c>(dword & 0x80808080u) == 0</c>; on hit, widen 4 chars in 4 instructions and continue.
|
||||||
|
/// • <b>Phase 3 — Scalar multi-byte branch:</b> 1-byte (ASCII single), 2-byte (Latin extended,
|
||||||
|
/// Cyrillic, Greek, Hebrew, Arabic), 3-byte (CJK BMP), 4-byte (supplementary plane → surrogate pair).
|
||||||
|
/// Direct bit-extract, no validation — input is trusted.
|
||||||
|
///
|
||||||
|
/// JIT compiles the switch into a jump table for predictable dispatch on mixed content.
|
||||||
|
/// Hungarian text typical pattern: ASCII run (Phase 1/2 widening) → 2-byte char (Phase 3
|
||||||
|
/// case < 0xE0) → ASCII run → 2-byte char → ... — each phase optimal for its segment.
|
||||||
|
/// </remarks>
|
||||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||||
private static void DecodeUtf8ToChars(ReadOnlySpan<byte> src, Span<char> dst)
|
private static int DecodeUtf8SinglePass(ReadOnlySpan<byte> src, Span<char> dst)
|
||||||
{
|
{
|
||||||
int srcIdx = 0, dstIdx = 0;
|
int srcIdx = 0, dstIdx = 0;
|
||||||
|
ref byte srcRef = ref MemoryMarshal.GetReference(src);
|
||||||
|
ref ushort dstRef = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(dst));
|
||||||
|
|
||||||
|
// Phase 1 — Vector256 ASCII prefix bulk widen (32 bytes/iter)
|
||||||
|
if (Vector256.IsHardwareAccelerated)
|
||||||
|
{
|
||||||
|
while (src.Length - srcIdx >= Vector256<byte>.Count)
|
||||||
|
{
|
||||||
|
var v = Vector256.LoadUnsafe(ref srcRef, (uint)srcIdx);
|
||||||
|
// ASCII detect: any high bit set among the 32 bytes?
|
||||||
|
if (v.ExtractMostSignificantBits() != 0) break;
|
||||||
|
|
||||||
|
// Widen 32 bytes → 2 × Vector256<ushort> (32 chars total)
|
||||||
|
var (lower, upper) = Vector256.Widen(v);
|
||||||
|
lower.StoreUnsafe(ref dstRef, (uint)dstIdx);
|
||||||
|
upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector256<ushort>.Count)); // upper half starts 16 chars after lower; Vector128<ushort>.Count (8) would overlap lower and leave chars 24..31 unwritten
|
||||||
|
srcIdx += Vector256<byte>.Count;
|
||||||
|
dstIdx += Vector256<byte>.Count; // 32 bytes → 32 chars
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Phase 2/3 — scalar loop with DWORD ASCII batch
|
||||||
while (srcIdx < src.Length)
|
while (srcIdx < src.Length)
|
||||||
{
|
{
|
||||||
var b0 = src[srcIdx];
|
// DWORD ASCII batch: 4 ASCII bytes → 4 chars per iter
|
||||||
|
if (src.Length - srcIdx >= 4)
|
||||||
|
{
|
||||||
|
var dword = Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref srcRef, srcIdx));
|
||||||
|
if ((dword & 0x80808080u) == 0)
|
||||||
|
{
|
||||||
|
Unsafe.Add(ref dstRef, dstIdx) = (byte)dword;
|
||||||
|
Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(dword >> 8);
|
||||||
|
Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(dword >> 16);
|
||||||
|
Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(dword >> 24);
|
||||||
|
srcIdx += 4;
|
||||||
|
dstIdx += 4;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scalar multi-byte branch (jump-table compile via switch)
|
||||||
|
var b0 = Unsafe.Add(ref srcRef, srcIdx);
|
||||||
switch (b0)
|
switch (b0)
|
||||||
{
|
{
|
||||||
case < 0x80:
|
case < 0x80:
|
||||||
// 1-byte ASCII (U+0000–U+007F)
|
// 1-byte ASCII (U+0000–U+007F)
|
||||||
dst[dstIdx++] = (char)b0;
|
Unsafe.Add(ref dstRef, dstIdx++) = b0;
|
||||||
srcIdx += 1;
|
srcIdx += 1;
|
||||||
break;
|
break;
|
||||||
case < 0xE0:
|
case < 0xE0:
|
||||||
{
|
{
|
||||||
// 2-byte sequence: 110xxxxx 10xxxxxx → U+0080–U+07FF
|
// 2-byte: 110xxxxx 10xxxxxx → U+0080–U+07FF
|
||||||
// Latin extended (Hungarian, Polish, Czech, Spanish, French diacritics),
|
// Latin extended, Cyrillic, Greek, Hebrew, Arabic.
|
||||||
// Greek, Cyrillic, Hebrew, Arabic, etc.
|
var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
|
||||||
var b1 = src[srcIdx + 1];
|
Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x1F) << 6) | (b1 & 0x3F));
|
||||||
|
|
||||||
dst[dstIdx++] = (char)(((b0 & 0x1F) << 6) | (b1 & 0x3F));
|
|
||||||
srcIdx += 2;
|
srcIdx += 2;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case < 0xF0:
|
case < 0xF0:
|
||||||
{
|
{
|
||||||
// 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF
|
// 3-byte: 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF
|
||||||
// CJK BMP (most Chinese, Japanese, Korean), various other scripts.
|
// CJK BMP, various other scripts.
|
||||||
var b1 = src[srcIdx + 1];
|
var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
|
||||||
var b2 = src[srcIdx + 2];
|
var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
|
||||||
|
Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
|
||||||
dst[dstIdx++] = (char)(((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
|
|
||||||
srcIdx += 3;
|
srcIdx += 3;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
// 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx → U+10000–U+10FFFF
|
// 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx → U+10000–U+10FFFF
|
||||||
// Supplementary plane (emoji, rare CJK ext, ancient scripts) — encoded as
|
// Supplementary plane (emoji, rare CJK ext) → UTF-16 surrogate pair.
|
||||||
// a UTF-16 surrogate pair.
|
var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
|
||||||
var b1 = src[srcIdx + 1];
|
var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
|
||||||
var b2 = src[srcIdx + 2];
|
var b3 = Unsafe.Add(ref srcRef, srcIdx + 3);
|
||||||
var b3 = src[srcIdx + 3];
|
|
||||||
var codepoint = ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
|
var codepoint = ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
|
||||||
|
|
||||||
codepoint -= 0x10000;
|
codepoint -= 0x10000;
|
||||||
dst[dstIdx++] = (char)(0xD800 | (codepoint >> 10));
|
Unsafe.Add(ref dstRef, dstIdx) = (ushort)(0xD800 | (codepoint >> 10));
|
||||||
dst[dstIdx++] = (char)(0xDC00 | (codepoint & 0x3FF));
|
Unsafe.Add(ref dstRef, dstIdx + 1) = (ushort)(0xDC00 | (codepoint & 0x3FF));
|
||||||
|
dstIdx += 2;
|
||||||
srcIdx += 4;
|
srcIdx += 4;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return dstIdx;
|
||||||
}
|
}
|
||||||
|
|
||||||
private string ReadStringUtf8Cached(int length)
|
private string ReadStringUtf8Cached(int length)
|
||||||
|
|
|
||||||
|
|
@ -671,40 +671,44 @@ public static partial class AcBinarySerializer
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// D-2: single-pass UTF-8 encode with VarUInt backfill.
|
// D-2 + tight reserve: single-pass UTF-8 encode with input-bound-aware VarUInt slot.
|
||||||
// Replaces the prior try-ASCII-then-rewind-and-encode-UTF-8 pattern (1 scan ASCII / 3 scans
|
// Replaces the prior try-ASCII-then-rewind-and-encode-UTF-8 pattern (1 scan ASCII / 3 scans
|
||||||
// non-ASCII) with a single GetBytes call that works identically for both content classes.
|
// non-ASCII) with a single GetBytes call that works identically for both content classes.
|
||||||
//
|
//
|
||||||
// Layout: [reserved 5 bytes for max VarUInt][UTF-8 bytes...]
|
// Layout: [reserved N bytes for VarUInt][UTF-8 bytes...]
|
||||||
// 1. EnsureCapacity for worst-case (5 + charLength*4)
|
// 1. Compute worst-case byte count from charLength (UTF-8 max = 4 bytes/char) and the
|
||||||
// 2. GetBytes directly into buffer at savedPos+5 → returns exact byteCount
|
// VarUInt size needed for that upper bound. For charLength ≤ 31, reserveSize = 1
|
||||||
// 3. If actual VarUInt size < 5, memmove encoded bytes left to compact the gap
|
// (since 4*31 = 124 < 128 ⇒ VarUInt(124) = 1 byte). Most short strings hit this.
|
||||||
// 4. WriteVarUInt at savedPos and advance
|
// 2. EnsureCapacity for reserveSize + maxBytes.
|
||||||
|
// 3. GetBytes directly into buffer at savedPos+reserveSize → returns exact byteCount.
|
||||||
|
// 4. If actual VarUInt < reserveSize (rare), memmove encoded bytes left to compact.
|
||||||
|
// 5. WriteVarUInt at savedPos and advance.
|
||||||
//
|
//
|
||||||
// Span<byte>.CopyTo is overlap-safe via Buffer.Memmove. For typical short strings
|
// Win vs the prior fixed-5-byte reserve: short strings (the common case) skip the memmove
|
||||||
// (≤127 bytes UTF-8 → 1-byte VarUInt), the shift is 4 bytes — a few ns memcopy cost
|
// entirely. For 32-char strings the reserve is 2 bytes; if actual byteCount < 128 we
|
||||||
// that's dwarfed by the saved ASCII-scan-then-rewind overhead on non-ASCII content,
|
// memmove a smaller distance (1 byte) than the prior fixed approach (4 bytes).
|
||||||
// and is essentially free on ASCII content (cache-resident write).
|
//
|
||||||
|
// Span<byte>.CopyTo is overlap-safe via Buffer.Memmove on byte arrays.
|
||||||
var charLength = value.Length;
|
var charLength = value.Length;
|
||||||
const int maxVarUIntSize = 5;
|
|
||||||
var maxBytes = charLength * 4;
|
var maxBytes = charLength * 4;
|
||||||
|
var reserveSize = VarUIntSize((uint)maxBytes);
|
||||||
|
|
||||||
EnsureCapacity(maxVarUIntSize + maxBytes);
|
EnsureCapacity(reserveSize + maxBytes);
|
||||||
|
|
||||||
var savedPos = _position;
|
var savedPos = _position;
|
||||||
var encodeStart = savedPos + maxVarUIntSize;
|
var encodeStart = savedPos + reserveSize;
|
||||||
var bytesWritten = Utf8NoBom.GetBytes(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
|
var bytesWritten = Utf8NoBom.GetBytes(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
|
||||||
|
|
||||||
var varUIntSize = VarUIntSize((uint)bytesWritten);
|
var actualVarUIntSize = VarUIntSize((uint)bytesWritten);
|
||||||
if (varUIntSize < maxVarUIntSize)
|
if (actualVarUIntSize < reserveSize)
|
||||||
{
|
{
|
||||||
var shift = maxVarUIntSize - varUIntSize;
|
var shift = reserveSize - actualVarUIntSize;
|
||||||
_buffer.AsSpan(encodeStart, bytesWritten).CopyTo(_buffer.AsSpan(encodeStart - shift, bytesWritten));
|
_buffer.AsSpan(encodeStart, bytesWritten).CopyTo(_buffer.AsSpan(encodeStart - shift, bytesWritten));
|
||||||
}
|
}
|
||||||
|
|
||||||
_position = savedPos;
|
_position = savedPos;
|
||||||
|
|
||||||
WriteVarUIntUnsafe((uint)bytesWritten); // advances _position by varUIntSize
|
WriteVarUIntUnsafe((uint)bytesWritten); // advances _position by actualVarUIntSize
|
||||||
_position += bytesWritten;
|
_position += bytesWritten;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1333,17 +1333,22 @@ public static partial class AcBinarySerializer
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fast path for short strings: check length first (cheap), then ASCII
|
// BASELINE TEMP: ASCII fast paths disabled — every string takes the pure UTF-8 D-2 path
|
||||||
// FixStr encodes type+length in single byte for strings <= 31 chars
|
// (String marker + VarUInt byte count + UTF-8 bytes). Used to measure custom UTF-8 decoder
|
||||||
var length = value.Length;
|
// performance in isolation, without FixStr-vs-String dispatch interference. Re-enable the
|
||||||
if (length <= BinaryTypeCode.FixStrMaxLength)
|
// FixStr dispatch below once the decoder optimization is benchmarked and verified.
|
||||||
{
|
//
|
||||||
// For short strings, use direct ASCII copy (avoids double validation)
|
//// Fast path for short strings: check length first (cheap), then ASCII
|
||||||
context.WriteFixStrDirect(value);
|
//// FixStr encodes type+length in single byte for strings <= 31 chars
|
||||||
return;
|
//var length = value.Length;
|
||||||
}
|
//if (length <= BinaryTypeCode.FixStrMaxLength)
|
||||||
|
//{
|
||||||
|
// // For short strings, use direct ASCII copy (avoids double validation)
|
||||||
|
// context.WriteFixStrDirect(value);
|
||||||
|
// return;
|
||||||
|
//}
|
||||||
|
|
||||||
// Long strings - standard encoding
|
// All strings (short and long) — standard UTF-8 encoding via D-2 single-pass path
|
||||||
context.WriteByte(BinaryTypeCode.String);
|
context.WriteByte(BinaryTypeCode.String);
|
||||||
context.WriteStringUtf8(value);
|
context.WriteStringUtf8(value);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -658,3 +658,142 @@ Two `.csproj` files:
|
||||||
- `.Aot` throws a clear `InvalidOperationException` (not `MissingMethodException`) when a non-`[AcBinarySerializable]` type is encountered at deser time
|
- `.Aot` throws a clear `InvalidOperationException` (not `MissingMethodException`) when a non-`[AcBinarySerializable]` type is encountered at deser time
|
||||||
- `BINARY_FEATURES.md` NativeAOT Compatibility section documents both packages and when to choose which
|
- `BINARY_FEATURES.md` NativeAOT Compatibility section documents both packages and when to choose which
|
||||||
|
|
||||||
|
## ACCORE-BIN-T-V4N2: .NET 11 SIMD-specialized UTF-8 decoder via multi-targeting
|
||||||
|
**Priority:** P3 · **Type:** Performance · **Related:** `AcBinaryDeserializer.BinaryDeserializationContext.Read.cs::DecodeUtf8`
|
||||||
|
|
||||||
|
The custom UTF-8 → UTF-16 decoder in `DecodeUtf8` / `CountUtf8Chars` / `DecodeUtf8ToChars` currently targets .NET 9 — scalar two-pass with optional Vector256 ASCII prefix widen + DWORD ASCII batch (per Phase 1 optimization). .NET 11 (planned ~Nov 2026) exposes additional SIMD intrinsics that can meaningfully accelerate the decoder on AVX-512-capable hosts, particularly the `vpcompressb`-style mask-driven byte compression that simdutf relies on for its 64-byte AVX-512 transcoder.
|
||||||
|
|
||||||
|
### Why .NET 11 specifically (and not .NET 10)
|
||||||
|
|
||||||
|
- **.NET 10**: incremental SIMD improvements, but the changes that affect us are mostly inside the BCL (`Encoding.UTF8.GetString` internal SIMD widening). Our custom decoder bypasses the BCL — we don't benefit unless we hand-roll the same SIMD ourselves with .NET 9 intrinsics, which already work today. Multi-targeting `net9.0;net10.0` adds CI/test overhead with marginal payoff. **Skip.**
|
||||||
|
- **.NET 11**: PR #120628 (Vector512/Vector256 SIMD for UTF-8 utilities) was closed without merge but signals upcoming work in this area. Future iterations are expected to expose `Avx512Vbmi`-style mask-compress intrinsics that today require unsafe / Vector128-emulation paths. Target this once the framework lands.
|
||||||
|
|
||||||
|
### Implementation outline (when triggered)
|
||||||
|
|
||||||
|
- Multi-target `<TargetFrameworks>net9.0;net11.0</TargetFrameworks>` on `AyCode.Core.csproj`
|
||||||
|
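- `#if NET11_0_OR_GREATER` block in `DecodeUtf8` selects an AVX-512-aware path: process 64-byte blocks via `Vector512<byte>` + `vpcompressb` for byte-stream extraction, fall back to the .NET 9 scalar+Vector256 path on hardware without the required AVX-512 extensions (runtime `IsSupported` check; note that `vpcompressb` belongs to AVX-512 VBMI2, so `Avx512Vbmi.IsSupported` alone is not a sufficient guard — check the VBMI2 ISA class)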
|
||||||
|
- Reuse the .NET 9 scalar path for short strings (<64 bytes) — SIMD setup cost dominates
|
||||||
|
- New benchmark cells comparing .NET 9 vs .NET 11 builds on the same hardware
|
||||||
|
|
||||||
|
### Acceptance
|
||||||
|
|
||||||
|
- `dotnet test` passes on both target frameworks
|
||||||
|
- Benchmark on AVX-512 hardware (Sapphire Rapids / Zen 4+) shows ≥1.5x non-ASCII deser speedup vs .NET 9 build for strings ≥256 bytes
|
||||||
|
- Short-string perf (≤64 bytes) within ±5% of .NET 9 build (no regression from multi-target setup)
|
||||||
|
- `BINARY_FEATURES.md` documents the SIMD path selection logic
|
||||||
|
|
||||||
|
### Trigger
|
||||||
|
|
||||||
|
- Wait for .NET 11 release (or RC)
|
||||||
|
- Re-evaluate once `dotnet/runtime` UTF-8 SIMD utilities re-land (post-PR #120628 follow-up)
|
||||||
|
- Skip entirely if .NET 11 BCL `Encoding.UTF8.GetString` becomes fast enough that hybrid (≥256 bytes → BCL, <256 → custom) wins without hand-rolled SIMD
|
||||||
|
|
||||||
|
## ACCORE-BIN-T-S5L8: Sentinel-length encoding for strings (wire-size optimization, both modes)
|
||||||
|
**Priority:** P3 · **Type:** Wire-format optimization · **Related:** `AcBinarySerializer.WriteString`, `AcBinaryDeserializer.ReadValue` string dispatch
|
||||||
|
|
||||||
|
The leading string-marker byte (`String` / `StringEmpty` / `Null`) exists primarily to distinguish null vs empty vs non-empty before dispatching. For **non-polymorphic, non-interned string properties** the marker can be replaced by a single sentinel-length VarUInt:
|
||||||
|
|
||||||
|
```
|
||||||
|
[VarUInt sentinelLength] [content bytes if applicable]
|
||||||
|
sentinelLength == 0 → null
|
||||||
|
sentinelLength == 1 → empty string
|
||||||
|
sentinelLength == N+1 → string of N bytes/chars, content follows
|
||||||
|
```
|
||||||
|
|
||||||
|
MemoryPack-style encoding pattern. Applies to **both** Compact (UTF-8) and FastWire (UTF-16 raw) modes; the content following the sentinel differs by mode.
|
||||||
|
|
||||||
|
### Per-mode impact
|
||||||
|
|
||||||
|
**FastWire mode** — wire layout today: `[String marker][VarUInt charCount][UTF-16 raw bytes]`. Sentinel saves 1 byte per non-null string.
|
||||||
|
|
||||||
|
| TestData | Current FastWire wire | Estimated with sentinel | Δ |
|
||||||
|
|---|---|---|---|
|
||||||
|
| Small | 3122 B | ~3050 B | -2% |
|
||||||
|
| Medium | 10905 B | ~10500 B | -4% |
|
||||||
|
| Large | 68603 B | ~67000 B | -2% |
|
||||||
|
| Repeated | 16244 B | ~15700 B | -3% |
|
||||||
|
| Deep | 15514 B | ~14900 B | -4% |
|
||||||
|
|
||||||
|
Closes the +1.7-8.1% FastWire wire gap vs MemoryPack to near zero or favorable while keeping AcBinary FastWire's +9-20% speed advantage.
|
||||||
|
|
||||||
|
**Compact mode** — wire layout today varies by length:
|
||||||
|
- Short (≤31 byte): `[FixStr+length][UTF-8 bytes]` — already 1-byte marker, ties sentinel.
|
||||||
|
- Long (>31 byte): `[String marker][VarUInt byteCount][UTF-8 bytes]` — sentinel saves 1 byte (the marker).
|
||||||
|
|
||||||
|
Compact gain: **only on long strings** (>31 byte UTF-8). Estimated −1 byte per long string. Workload-dependent: if most strings are short or use interning, gain is small. If many long mixed-content strings, meaningful saving.
|
||||||
|
|
||||||
|
### Limitations (both modes)
|
||||||
|
|
||||||
|
- **Polymorphic `object` properties**: marker needed for type discrimination. Sentinel encoding only applies when the property type is statically `string` or `string?`.
|
||||||
|
- **Interning incompatible**: sentinel cannot express `StringInternFirst` / `StringInterned` markers (those carry cache-index semantics). Interned properties keep marker-based encoding. FastWire mode already disables interning by design (consistent); Compact mode needs per-property dispatch (interned → marker, non-interned → sentinel).
|
||||||
|
- **Compact-mode FixStr ties**: short strings (≤31 byte UTF-8) gain nothing in Compact (FixStr is already 1-byte marker+length). The optimization wins only on long strings in Compact.
|
||||||
|
|
||||||
|
### Implementation outline (rough — refine when implementing)
|
||||||
|
|
||||||
|
1. Writer: branch in `WriteString` on property metadata flags `(IsString, IsNotInterned, IsNotPolymorphic)`. If sentinel-eligible, emit `VarUInt sentinelLength` + content. Else fall through to existing marker-based encoding.
|
||||||
|
2. Reader: matching branch in property reader. If sentinel-eligible (per property metadata), read `VarUInt sentinelLength`, dispatch on 0/1/N+1.
|
||||||
|
3. SGen: emit sentinel-encoding variant for non-polymorphic non-interned `string` typed properties; emit existing marker-encoding for the rest.
|
||||||
|
4. Wire format version bump OR header flag indicating sentinel-encoding-active. (Cross-version compat policy decided when implementing.)
|
||||||
|
|
||||||
|
### Trigger
|
||||||
|
|
||||||
|
- After D-2 / decoder optimization / marker-dispatch land (compact-mode focus completes)
|
||||||
|
- When wire-size positioning becomes a primary pillar for NuGet release
|
||||||
|
- Re-evaluate scope at implementation time — exact gain in Compact depends on consumer workload (long-string ratio, interning patterns)
|
||||||
|
|
||||||
|
### Acceptance
|
||||||
|
|
||||||
|
- FastWire mode: AcBinary wire ≤ MemoryPack on at least 4 of 5 test cells
|
||||||
|
- Compact mode: long-string wire bytes -1 each, no regression on short or interned strings
|
||||||
|
- Speed benchmark: no regression vs current encoding (essentially zero CPU cost — sentinel is shifted bookkeeping)
|
||||||
|
- Cross-version compat: documented format version bump + clean fail on old reader / new wire mismatch
|
||||||
|
- Polymorphic + interned property test cases pass unchanged (use existing marker-based encoding)
|
||||||
|
|
||||||
|
## ACCORE-BIN-T-M3R7: ASCII marker-dispatch — writer detect + reader dedicated path
|
||||||
|
**Priority:** P2 · **Type:** Performance + wire optimization · **Related:** `BinaryTypeCode.FixStrAsciiBase..StringAscii` markers (already defined), `WriteStringUtf8`, `ReadStringUtf8`, `WriteFixStrDirect`
|
||||||
|
|
||||||
|
> **Ordering note:** do this **AFTER THE ENCODER OPTIMIZATION** (see `ACCORE-BIN-T-E2F9`). Rationale: the Vector256 ASCII narrow/widen paths of the custom encoder/decoder already handle ASCII bytes quickly on their own. On top of that, marker-dispatch only adds the per-call dispatch-overhead saving (no `Ascii.IsValid` scan, no decoder layer). It is a guaranteed win, but an additive one — from a measurement standpoint it is cleaner to leave it until after the decoder/encoder work.
|
||||||
|
|
||||||
|
The `FixStrAscii*` (135-166) and `StringAscii` (167) markers are defined in `BinaryTypeCode.cs` with helper methods (`IsAsciiString`, `IsFixStrAscii`, `EncodeFixStrAscii`, `DecodeFixStrAsciiLength`). Encoding/decoding logic NOT yet implemented — currently both writer and reader use the universal `String` / `FixStr` markers.
|
||||||
|
|
||||||
|
### Implementation
|
||||||
|
- **Writer**: in `WriteStringUtf8` / `WriteFixStrDirect`, after UTF-8 encoding (D-2 path), check `bytesWritten == charLength` (= ASCII iff equal). If ASCII, emit `FixStrAscii` (≤31 byte) or `StringAscii` (>31 byte). Else emit existing `FixStr` / `String`. Free detect — both numbers already computed by D-2.
|
||||||
|
- **Reader**: in `ReadStringUtf8` (or upstream marker dispatch), branch on marker. ASCII markers → dedicated byte→char widening path (no UTF-8 decode, no `Ascii.IsValid` scan, no decoder dispatch). Non-ASCII markers → existing custom UTF-8 decoder.
|
||||||
|
- **SGen**: regenerate readers/writers to dispatch on the new markers.
|
||||||
|
- **Re-enable ASCII fast paths**: uncomment writer FixStr dispatch in `AcBinarySerializer.cs` and reader `Ascii.IsValid` block in `ReadStringUtf8` — these temporarily disabled blocks become the marker-aware paths (no IsValid scan needed since the marker is the contract).
|
||||||
|
|
||||||
|
### Wire format change
|
||||||
|
- Format version bump (1 → 2). Old readers fail clean on new wire (version mismatch). New readers must reject old wire OR support backward read.
|
||||||
|
|
||||||
|
### Acceptance
|
||||||
|
- Repeated Strings (Hungarian content) Deser: AcBinary closes the ~10% gap vs MemoryPack
|
||||||
|
- Pure ASCII tests (Small/Medium/Large/Deep): AcBinary Ser AND Deser ≥ MemoryPack
|
||||||
|
- Wire size: minimum -25% vs MemoryPack across all test cells
|
||||||
|
- SGen-generated code compiles and round-trips on all `[AcBinarySerializable]` types
|
||||||
|
- Decision documented: backward-compat policy for v2 vs v1 wire
|
||||||
|
|
||||||
|
## ACCORE-BIN-T-E2F9: Custom UTF-8 encoder (writer-side, symmetric with custom decoder)
|
||||||
|
**Priority:** P1 · **Type:** Performance · **Related:** decoder optimization (`AcBinaryDeserializer.BinaryDeserializationContext.Read.cs::DecodeUtf8SinglePass`)
|
||||||
|
|
||||||
|
> **Ordering note:** do this **BEFORE MARKER-DISPATCH** (see `ACCORE-BIN-T-M3R7`). Rationale: the custom encoder/decoder optimization is the "harder, less certain" win — it is what closes the gap on non-ASCII / mixed-content workloads (Repeated Strings, Hungarian content). Marker-dispatch afterwards is only an additive cleanup of the dispatch overhead on the pure-ASCII path.
|
||||||
|
|
||||||
|
Replace `Encoding.UTF8.GetBytes` calls in `WriteStringUtf8` / `WriteStringUtf8Internal` / `WriteFixStrDirect` (collectively the writer's UTF-8 encode path, post-D-2) with a hand-rolled SIMD encoder. Symmetric to the decoder optimization (V4N2 / Read.cs::DecodeUtf8SinglePass).
|
||||||
|
|
||||||
|
### Layered structure (mirrors decoder)
|
||||||
|
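- **Phase 1 — Vector256 ASCII narrow**: 16 chars (`Vector256<ushort>`) → 16 bytes (`Vector128<byte>`) via `Vector128.Narrow(v.GetLower(), v.GetUpper())` (`Vector256.Narrow` takes two `Vector256<ushort>` inputs and yields a 32-byte `Vector256<byte>`, so it only fits a 32-chars-per-iteration loop). ASCII detect via `(v & Vector256.Create((ushort)0xFF80)) == Vector256<ushort>.Zero` (any bit above the low 7 bits marks a non-ASCII UTF-16 char; `ExtractMostSignificantBits()` on the masked vector would test only bit 15 of each lane and miss chars in 0x0080–0x7FFF). Break on first non-ASCII char.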
|
||||||
|
- **Phase 2 — DWORD ASCII batch**: 4 chars at a time, OR-mask test, 4 bytes per iter when ASCII.
|
||||||
|
- **Phase 3 — Scalar multi-byte encode**: 1-byte (ASCII) / 2-byte (Latin extended) / 3-byte (BMP) / 4-byte (surrogate pair → supplementary plane) UTF-8 encoding via direct bit-extract. No fallback dispatch — input is trusted UTF-16 (string).
|
||||||
|
- Use `System.Text.Unicode.Utf8.FromUtf16` as fallback target for scalar correctness — or skip BCL entirely with manual bit-pack.
|
||||||
|
|
||||||
|
### Why
|
||||||
|
`Encoding.UTF8.GetBytes` carries virtual-dispatch + encoder-fallback overhead even with SIMD ASCII fast path internally. Custom encoder skips this. ~15-30% Ser improvement on ASCII content, ~5-10% on non-ASCII (multi-byte path stays scalar).
|
||||||
|
|
||||||
|
### Trigger
|
||||||
|
- **NEXT** — implementation order P1 before marker-dispatch (M3R7)
|
||||||
|
- Re-evaluate if .NET 11 BCL UTF-8 GetBytes becomes faster (PR #120628 follow-up)
|
||||||
|
|
||||||
|
### Acceptance
|
||||||
|
- Writer-side benchmark: ≥15% Ser speedup on ASCII content (Small/Medium/Large/Deep), ≥5% on non-ASCII (Repeated)
|
||||||
|
- Wire format unchanged (custom encoder produces same bytes as `Encoding.UTF8`)
|
||||||
|
- Round-trip tests pass
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue