AyCode.Core/AyCode.Core.Serializers.Con.../Program.cs

1829 lines
100 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using AyCode.Core.Compression;
using AyCode.Core.Serializers.Attributes;
using AyCode.Core.Serializers.Binaries;
using AyCode.Core.Tests.Serialization; // DrainFromAsync extension (test-only, used by benchmark)
using AyCode.Core.Tests.TestModels;
using MemoryPack;
#if !AYCODE_NATIVEAOT
using MessagePack;
using MessagePack.Resolvers;
#endif
using Microsoft.Extensions.Options;
using System.Buffers;
using System.Diagnostics;
using System.IO.Pipelines;
using System.IO.Pipes;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Text;
using System.Text.Json;
using AyCode.Core.Serializers.Console.Benchmarks;
namespace AyCode.Core.Serializers.Console;
/// <summary>
/// Comprehensive benchmark application for all serializers.
/// Compares: AcBinary (all options), MemoryPack, MessagePack, Newtonsoft.Json, System.Text.Json
///
/// Usage:
/// dotnet run # Run all benchmarks
/// dotnet run -- quick # Quick mode (fewer iterations)
/// dotnet run -- serialize # Serialize only
/// dotnet run -- deserialize # Deserialize only
/// </summary>
public static class Program
{
// Configuration (constants, mutable state, attribute-flag aggregation) → Configuration.cs
/// <summary>
/// Builds the Options-column text shared by every AcBinary benchmark row. Features that can also be
/// switched off per type via attribute are rendered as a pair — the options-level value next to the
/// aggregated attribute-level flag (e.g. <c>Interning=All(opt) | False (attr)</c>) — so a feature that
/// is suppressed by an attribute stays visible in the output instead of silently misleading.
/// </summary>
/// <param name="options">Serializer options whose values are rendered.</param>
/// <param name="extra">
/// Optional benchmark-specific suffix (e.g. <c>", BufferSize=4096B"</c>); appended verbatim after the
/// common fields, so it must bring its own leading separator.
/// </param>
/// <returns>A single-line, comma-separated description of the effective configuration.</returns>
private static string BuildAcBinaryOptionsDescription(AcBinarySerializerOptions options, string extra = "")
{
    // PropertyFilter is a callback, which has no meaningful display value — only show whether one is set.
    // The attribute-side flag next to it is the cross-type aggregate: true = every tagged type has the
    // feature enabled, false = at least one type opted out via
    // [AcBinarySerializable(enablePropertyFilterFeature: false)].
    string filterState;
    if (options.PropertyFilter == null)
    {
        filterState = "None";
    }
    else
    {
        filterState = "Set";
    }

    var fields = new[]
    {
        $"WireMode={options.WireMode}",
        $"RefHandling={options.ReferenceHandling}(opt) | {Configuration.AttrFlags.refHandling} (attr)",
        $"Interning={options.UseStringInterning}(opt) | {Configuration.AttrFlags.internString} (attr)",
        $"Metadata={options.UseMetadata}(opt) | {Configuration.AttrFlags.metadata} (attr)",
        $"PropertyFilter={filterState}(opt) | {Configuration.AttrFlags.propertyFilter} (attr)",
        $"SGen={options.UseGeneratedCode}",
        $"Compression={options.UseCompression}",
    };

    // 'extra' follows the last field directly (no separator is inserted for it).
    return string.Join(", ", fields) + extra;
}
/// <summary>
/// Picks the MemoryPack serializer options that match <see cref="Configuration.SelectedWireMode"/>, so
/// the AcBinary-vs-MemoryPack comparison stays within one string-encoding family:
/// <list type="bullet">
/// <item><see cref="WireMode.Fast"/> → <see cref="MemoryPackSerializerOptions.Utf16"/>: both engines
/// write raw UTF-16 string bytes, so wire size and CPU cost are directly comparable.</item>
/// <item>Any other wire mode (e.g. <see cref="WireMode.Compact"/>) →
/// <see cref="MemoryPackSerializerOptions.Default"/> (UTF-8): both engines encode UTF-8, leaving only
/// header / tier / dispatch overhead as the difference.</item>
/// </list>
/// Without this alignment a FastWire vs MemoryPack-default run would mix two unrelated dimensions
/// (raw UTF-16 vs encoded UTF-8) and report a wire-size delta that is really the encoding-family
/// difference rather than an AcBinary-specific overhead.
/// </summary>
/// <returns>The MemoryPack options instance to use for the currently selected wire mode.</returns>
private static MemoryPackSerializerOptions GetMemPackOptions()
{
    if (Configuration.SelectedWireMode == WireMode.Fast)
    {
        return MemoryPackSerializerOptions.Utf16;
    }

    return MemoryPackSerializerOptions.Default;
}
// Per-op µs conversion (ToPerOpMicros) → Output.cs
// Output helpers (PrintResult, SaveResults, OverallStats, FormatMicrosWithRange, etc.) → Output.cs
// BenchmarkResult DTO → BenchmarkResult.cs
/// <summary>
/// Application entry point. Validates the MemoryPack baseline setup, then runs in one of two modes:
/// <list type="bullet">
/// <item>CLI mode (at least one argument): parses the arguments, runs the benchmark once and exits.</item>
/// <item>Interactive mode (no arguments): shows the menu in a loop so several runs can be done without
/// restarting the process; Q leaves the menu and the application.</item>
/// </list>
/// </summary>
/// <param name="args">Command-line arguments; only the first one is interpreted (see <see cref="TryParseCliArgs"/>).</param>
public static void Main(string[] args)
{
    // Set console encoding to UTF-8 for proper Unicode character display
    System.Console.OutputEncoding = Encoding.UTF8;

    // Setup validation — abort BEFORE any benchmark logic if MemoryPack baseline is invalid.
    // Done early so user is told immediately, not after warmup.
    BenchmarkLoop.ValidateMemoryPackSetup();

    // CLI mode (args provided): run once, parse args, exit. Backward-compatible behaviour.
    if (args.Length > 0)
    {
        if (!TryParseCliArgs(args, out var layer, out var opMode, out var serializerMode))
        {
            return; // invalid args
        }

        RunBenchmark(layer, opMode, serializerMode);
        return;
    }

    // Interactive mode (no args): loop the menu so the user doesn't have to restart between runs.
    // Q exits the menu (and the application).
    while (true)
    {
        var selection = Menu.ShowInteractiveMenu();
        if (selection == null)
        {
            return; // user pressed Q
        }

        // The interactive menu always measures both phases, hence the fixed "all" op mode.
        RunBenchmark(selection.Value.layer, "all", selection.Value.serializerMode);

        System.Console.WriteLine();
        System.Console.WriteLine("─────────────────────────────────────────────────────────────────────");
        System.Console.WriteLine("Returning to menu — press any key to continue, or Q to quit...");

        // intercept: true keeps the pressed key from being echoed to the console.
        var key = System.Console.ReadKey(intercept: true);
        if (key.Key == ConsoleKey.Q)
        {
            return;
        }

        System.Console.WriteLine();
    }
}
/// <summary>
/// Parses the first CLI argument into (layer, opMode, serializerMode). Matching is case-insensitive
/// and culture-invariant. Recognised keywords:
/// <list type="bullet">
/// <item><c>quick</c> — short warmup, few iterations, small sample count; all layers.</item>
/// <item><c>asyncpipe</c> / <c>pipe</c> — AsyncPipe-only serializer mode; all layers.</item>
/// <item><c>ser</c> / <c>serialize</c> and <c>des</c> / <c>deserialize</c> — restrict the measured phase; all layers.</item>
/// <item>Layer keywords (<c>core</c>, <c>comprehensive</c>, <c>edge</c>, <c>all</c>, <c>small</c>,
/// <c>medium</c>, <c>large</c>, <c>repeated</c>, <c>deep</c>) — select that layer.</item>
/// </list>
/// Any other value is forwarded unchanged (lower-cased) as the layer keyword for backwards compatibility.
/// </summary>
/// <param name="args">Command-line arguments; only <c>args[0]</c> is interpreted.</param>
/// <param name="layer">Receives the layer keyword; defaults to <c>"all"</c>.</param>
/// <param name="opMode">Receives <c>"all"</c>, <c>"serialize"</c> or <c>"deserialize"</c>.</param>
/// <param name="serializerMode">Receives <c>"standard"</c> or <c>"asyncpipe"</c>.</param>
/// <returns>
/// <c>true</c> when the benchmark should run. The current rules never reject an argument, so this is
/// always <c>true</c>; the caller nevertheless treats <c>false</c> as "exit without running".
/// </returns>
private static bool TryParseCliArgs(string[] args, out string layer, out string opMode, out string serializerMode)
{
    layer = "all";
    opMode = "all";
    serializerMode = "standard";

    // Nothing to parse — keep the defaults instead of indexing into an empty array.
    if (args.Length == 0)
    {
        return true;
    }

    // Invariant lower-casing: the culture-sensitive ToLower() would, under e.g. a Turkish culture,
    // turn "QUICK" into "quıck" and the keyword would no longer match.
    var arg = args[0].ToLowerInvariant();

    switch (arg)
    {
        case "quick":
            // Quick mode: short warmup, few iterations, small sample count
            Configuration.WarmupIterations = 5;
            Configuration.TestIterations = 100;
            Configuration.BenchmarkSamples = 3;
            break;

        case "asyncpipe":
        case "pipe":
            // AsyncPipe-only mode: streaming I/O isolation across all test data.
            serializerMode = "asyncpipe";
            break;

        case "ser":
        case "serialize":
            opMode = "serialize";
            break;

        case "des":
        case "deserialize":
            opMode = "deserialize";
            break;

        default:
            // Known layer keywords (core, comprehensive, edge, all, small, medium, large, repeated, deep)
            // and — for backwards compatibility — any unknown argument are used as the layer keyword.
            layer = arg;
            break;
    }

    return true;
}
/// <summary>
/// Runs the benchmark suite end-to-end for the given configuration: optional process stabilization →
/// global JIT pre-warmup → per-cell warmup + measurement → grouped results print → save to disk.
/// Used by both the CLI and interactive menu paths; the interactive loop calls this repeatedly
/// without restarting the process, so every process-wide change made here is undone before returning.
/// </summary>
/// <param name="layer">Layer keyword used to filter the test data sets.</param>
/// <param name="opMode">Phase selection forwarded to the per-cell run ("all", "serialize", "deserialize").</param>
/// <param name="serializerMode">Selects which serializer rows are created for each cell.</param>
private static void RunBenchmark(string layer, string opMode, string serializerMode)
{
    System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗");
    System.Console.WriteLine("║ COMPREHENSIVE SERIALIZER BENCHMARK SUITE ║");
    System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");

    // Stabilization: pin the entire benchmark process to a single logical CPU and bump priority
    // class. Single-core affinity stops the OS from migrating the bench thread between cores
    // mid-sample (a migration evicts L1/L2 caches and corrupts a measurement); High priority
    // reduces preemption by background tasks (Defender scans, indexer, etc.) that otherwise
    // randomly inflate samples by 5-15%.
    // Try/finally guarantees the original state is restored even if a benchmark throws — leaving
    // a developer machine pinned to one core after a crashed run is a real foot-gun.
    // Skipped on Debug single-sample mode (Configuration.BenchmarkSamples <= 1) where stabilization is moot.
    using var process = Process.GetCurrentProcess();
    var origAffinity = IntPtr.Zero;
    var origPriority = ProcessPriorityClass.Normal;

    // Tracked separately: if the affinity change succeeds but the priority change throws, the
    // affinity must still be restored in the finally block below.
    var affinityChanged = false;
    var priorityChanged = false;

    // ProcessorAffinity is only supported on Windows + Linux (CA1416). On any other platform
    // (e.g. macOS) the whole stabilization step — affinity and priority — is skipped.
    if (Configuration.BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
    {
        try
        {
            origAffinity = process.ProcessorAffinity;
            origPriority = process.PriorityClass;

            // Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one
            // core, consistently" — not which one. If CPU 0 is heavily contended on the host
            // (e.g. dedicated to system-wide IRQs on some Windows configs), the user can tweak
            // the mask here. The benchmark is single-threaded for the in-memory rows so single
            // core is sufficient; round-trip-only NamedPipe rows have a server-drain thread
            // that will share the core (acceptable — the bench measures end-to-end RT anyway).
            process.ProcessorAffinity = (IntPtr)1;
            affinityChanged = true;

            process.PriorityClass = ProcessPriorityClass.High;
            priorityChanged = true;

            System.Console.WriteLine("Stabilization: pinned to CPU 0 (affinity=0x1), priority=High.");
        }
        catch (Exception ex)
        {
            // Affinity/priority changes may fail on locked-down hosts (group policies, containers
            // without CAP_SYS_NICE on Linux, etc.). Surface and continue — the benchmark still
            // works; whatever was changed before the failure is undone in the finally block.
            System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}");
        }
    }

    try
    {
        var allResults = new List<BenchmarkResult>();
        var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
        var testDataSets = BenchmarkLoop.FilterByLayer(allTestDataSets, layer);

        System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {Configuration.GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{Configuration.TargetSampleMs} ms target) | Warmup: {Configuration.WarmupIterations} per phase (Ser/Des isolated) | Samples: {Configuration.BenchmarkSamples} (median) + pilot discard");
        System.Console.WriteLine($"Build: {Configuration.BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
        System.Console.WriteLine();

        // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens.
        // Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup
        // alone doesn't ensure that every Serialize<T>/IBufferWriter overload is fully Tier 1 by the time we
        // start measuring. Symptom: first cell's BufferWriter variants run ~2x slower than the SAME variants
        // on later cells (e.g. Small BufWr reuse 9ms vs Medium BufWr reuse 4ms — even though Medium is bigger).
        // Pre-warmup runs every overload at least once with each data shape so the tiered JIT promotes
        // them all in the background; the per-cell warmup that follows then locks in cache + branch state.
        if (Configuration.BenchmarkSamples > 1) // skip in DEBUG (single-sample fast iteration)
        {
            System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");
            foreach (var testData in testDataSets)
            {
                var preSerializers = CreateSerializers(testData, serializerMode);
                try
                {
                    foreach (var s in preSerializers)
                    {
                        // Light warmup just to trigger Tier 0 → Tier 1 promotion. Phase-isolated:
                        // Ser path first, then Des path — same pattern as the per-cell warmup in
                        // RunBenchmarksForTestData (which still runs afterwards for cache/BTB warming).
                        s.WarmupSerialize(2000);
                        s.WarmupDeserialize(2000);
                    }
                }
                finally
                {
                    // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
                    foreach (var s in preSerializers)
                    {
                        (s as IDisposable)?.Dispose();
                    }
                }
            }

            // Let background tiered-JIT compilation drain before we begin measuring.
            if (Configuration.JitSleep > 0)
            {
                Thread.Sleep(Configuration.JitSleep);
            }

            System.Console.WriteLine("✓ Global pre-warmup complete.\n");
        }

        // 70-character horizontal rule framing each test-data header.
        var rule = new string('═', 70);
        foreach (var testData in testDataSets)
        {
            System.Console.WriteLine($"\n{rule}");
            System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
            System.Console.WriteLine(rule);

            var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
            allResults.AddRange(results);
        }

        // Print grouped results
        Output.PrintGroupedResults(allResults, testDataSets);

        // Save results to file
        Output.SaveResults(allResults, testDataSets);

        System.Console.WriteLine("\n✓ Benchmark complete!");
    }
    finally
    {
        // Restore process state — affinity/priority changes are process-wide and persist across
        // interactive-mode iterations of the menu. Without restore, the second menu run would
        // already be on CPU-0 + High priority before its own try-block applied them, masking
        // any stabilization-disabled comparison. Each setting is restored only if it was changed.
        if (affinityChanged && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
        {
            try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
        }

        if (priorityChanged)
        {
            try { process.PriorityClass = origPriority; } catch { /* best-effort */ }
        }
    }
}
#region Benchmark Execution
/// <summary>
/// Benchmarks every serializer row for one test data set: verifies round-trip correctness first, then
/// runs a phase-isolated warmup → calibrate → timing → allocation cycle per serializer and prints each
/// row as soon as it is complete.
/// </summary>
/// <param name="testData">The test data set (cell) to benchmark.</param>
/// <param name="mode">Phase selection: "all", "serialize"/"ser" or "deserialize"/"des".</param>
/// <param name="serializerMode">Forwarded to <see cref="CreateSerializers"/> to select the serializer rows.</param>
/// <returns>One <see cref="BenchmarkResult"/> per serializer, in creation order.</returns>
/// <remarks>
/// A failed round-trip verification terminates the process via <see cref="Environment.Exit"/>.
/// Disposable serializers are disposed when the method leaves, including when a benchmark throws.
/// </remarks>
private static List<BenchmarkResult> RunBenchmarksForTestData(TestDataSet testData, string mode, string serializerMode)
{
    var results = new List<BenchmarkResult>();
    var serializers = CreateSerializers(testData, serializerMode);
    try
    {
        // Round-trip correctness check — once per (cell × serializer), BEFORE warmup. Aborts the entire benchmark on failure.
        System.Console.WriteLine("Verifying round-trip correctness...");
        foreach (var serializer in serializers)
        {
            if (!serializer.VerifyRoundTrip())
            {
                System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}");
                System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting.");
                Environment.Exit(1);
            }
        }
        System.Console.WriteLine("✓ All serializers passed round-trip verification.");

        // Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: each serializer's Ser-path and
        // Des-path get COMPLETELY ISOLATED warmup→measure rounds, with a GC.Collect at every phase boundary.
        //
        // Why phase-isolation: a combined warmup (Ser+Des interleaved) leaves the CPU I-cache + branch-predictor
        // in a "compromise state" — neither Ser nor Des code-set dominates. The first phase to measure pays a
        // cache-miss penalty as its code-set displaces the leftover-warmup-state. Isolated warmup→measure pairs
        // keep the I-cache HOT for ONLY the measured path, both in the warmup (priming) and the measurement
        // (steady-state). Branch-predictor history also stays clean per path.
        //
        // GC.Collect at every boundary: removes residual allocation pressure from the previous phase (write-buffer
        // pool churn from Ser, deserialized object graph from Des) so the next phase starts with a quiescent
        // heap — GC timing during measurement is then driven only by THAT phase's allocations.
        //
        // Configuration.JitSleep per-phase: tiered JIT background promotion drain after each warmup (mode-aware: 0 ms in AOT).
        // Each phase's freshly-promoted methods settle before its timing starts.
        System.Console.WriteLine($"Running benchmarks (target ~{Configuration.TargetSampleMs} ms/sample × {Configuration.BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n");
        foreach (var serializer in serializers)
        {
            var result = new BenchmarkResult
            {
                TestDataName = testData.DisplayName, // Use DisplayName for IId% info
                Engine = serializer.Engine,
                IoMode = serializer.IoMode,
                DispatchMode = serializer.DispatchMode,
                OptionsPreset = serializer.OptionsPreset,
                OptionsDescription = serializer.OptionsDescription,
                SerializedSize = serializer.SerializedSize,
                SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes,
                SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes,
                IsRoundTripOnly = serializer.IsRoundTripOnly
            };

            // Group label for in-place \r progress. Identifies (cell × serializer) so a stuck benchmark
            // is visibly stuck on a specific row at a specific percentage rather than silently hanging.
            var groupLabel = $"{result.SerializerName}";

            if (serializer.IsRoundTripOnly)
            {
                // Round-trip-only benchmarks (NamedPipe etc.): single phase — Serialize() performs the full RT,
                // Deserialize() is a no-op. We use the Ser-phase entry-points (WarmupSerialize) to warm the
                // entire round-trip path, then record into the RT result columns.
                if (mode is "all" or "serialize" or "ser")
                {
                    BenchmarkLoop.ForceGcCollect();
                    serializer.WarmupSerialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                    var rtIter = BenchmarkLoop.CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
                    var (rtMed, rtMin, rtMax, rtStd) = BenchmarkLoop.RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");
                    result.RoundTripTimeMs = rtMed;
                    result.RoundTripTimeMinMs = rtMin;
                    result.RoundTripTimeMaxMs = rtMax;
                    result.RoundTripTimeStdDevMs = rtStd;
                    result.RoundTripIterations = rtIter;

                    // Process-wide allocation measurement: server-drain-thread allocations (server-side new byte[len])
                    // also show up — otherwise current-thread alloc would only count the client side and look ~halved.
                    result.RoundTripAllocBytesPerOp = BenchmarkLoop.MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]");
                }
                // mode == "deserialize" alone is meaningless for a round-trip-only benchmark; skip silently.
            }
            else
            {
                // ── Ser phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect.
                if (mode is "all" or "serialize" or "ser")
                {
                    BenchmarkLoop.ForceGcCollect();
                    serializer.WarmupSerialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                    var serIter = BenchmarkLoop.CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
                    var (serMed, serMin, serMax, serStd) = BenchmarkLoop.RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");
                    result.SerializeTimeMs = serMed;
                    result.SerializeTimeMinMs = serMin;
                    result.SerializeTimeMaxMs = serMax;
                    result.SerializeTimeStdDevMs = serStd;
                    result.SerializeIterations = serIter;

                    // Dedicated alloc-only sample (separate from timing samples; keeps timing pure)
                    result.SerializeAllocBytesPerOp = BenchmarkLoop.MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
                }

                // ── Des phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect.
                // The GC.Collect here is critical: it discards the Ser-phase's write-buffer pool churn so the
                // Des-phase's allocation measurement reflects ONLY Des-side allocations (deserialized object graph).
                if (mode is "all" or "deserialize" or "des")
                {
                    BenchmarkLoop.ForceGcCollect();
                    serializer.WarmupDeserialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                    var desIter = BenchmarkLoop.CalibrateIterations(() => serializer.Deserialize(), Configuration.TargetSampleMs);
                    var (desMed, desMin, desMax, desStd) = BenchmarkLoop.RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");
                    result.DeserializeTimeMs = desMed;
                    result.DeserializeTimeMinMs = desMin;
                    result.DeserializeTimeMaxMs = desMax;
                    result.DeserializeTimeStdDevMs = desStd;
                    result.DeserializeIterations = desIter;

                    result.DeserializeAllocBytesPerOp = BenchmarkLoop.MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]");
                }

                // Compose RT from Ser+Des. Because Ser and Des may have DIFFERENT iter counts post-calibration,
                // batch-time addition would be misleading. Instead: compute per-op µs (iter-independent),
                // then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that
                // RoundTripTimeMs / RoundTripIterations * 1000 == serPerOp + desPerOp.
                var serPerOp = Output.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations);
                var desPerOp = Output.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations);
                var rtPerOp = serPerOp + desPerOp;
                result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations);
                result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
                result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
            }

            results.Add(result);
            Output.PrintResult(result);
        }

        return results;
    }
    finally
    {
        // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources that must be released
        // before the next test data builds new ones — otherwise pipes / handles leak across test cells). Done in a
        // finally block so a benchmark that throws does not leave them open either.
        foreach (var s in serializers)
        {
            (s as IDisposable)?.Dispose();
        }
    }
}
/// <summary>
/// Creates the benchmark rows for one test data set. <paramref name="serializerMode"/> selects the suite:
/// <list type="bullet">
/// <item><c>"fastestbyte"</c> — focused 1:1 comparison: AcBinary FastMode Byte[] vs MemoryPack Byte[].</item>
/// <item><c>"asyncpipe"</c> — only the pipe / cross-thread streaming rows (chunked and raw, kernel and in-memory).</item>
/// <item>anything else — the standard suite (everything except the kernel NamedPipe rows).</item>
/// </list>
/// Every AcBinary options object gets its wire mode from <see cref="Configuration.SelectedWireMode"/>.
/// </summary>
/// <param name="testData">Test data set whose <c>Order</c> is handed to every benchmark constructor.</param>
/// <param name="serializerMode">Suite selector, see above.</param>
/// <returns>The benchmark rows in display order; some may implement <see cref="IDisposable"/>.</returns>
private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, string serializerMode)
{
    return serializerMode switch
    {
        "fastestbyte" => BuildFastestByteSuite(),
        "asyncpipe" => BuildAsyncPipeSuite(),
        _ => BuildStandardSuite(),
    };

    // FastestByte mode — tight optimization-iteration loop on the "fastest Byte[]" path (~30-45 sec
    // vs the full 2-3 min). Two rows only: AcBinary FastMode Byte[] and MemoryPack Byte[].
    // A second AcBinary row for the FastWire (UTF-16 raw memcpy) variant is disabled for the current
    // optimization sprint (Compact mode is tuned against MemPack directly; FastWire served as a
    // noise-floor reference earlier). Re-enable when revisiting Fast wire-mode performance.
    List<ISerializerBenchmark> BuildFastestByteSuite()
    {
        var fastMode = AcBinarySerializerOptions.FastMode;
        fastMode.WireMode = Configuration.SelectedWireMode;

        var suite = new List<ISerializerBenchmark>();
        suite.Add(new AcBinaryBenchmark(testData.Order, fastMode, "FastMode"));
        //suite.Add(new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"));
        suite.Add(new MemoryPackBenchmark(testData.Order, "Default"));
        return suite;
    }

    // AsyncPipe-only mode — returns ONLY the streaming rows. Streaming I/O has long-lived pipe setup +
    // kernel-buffer overhead that, when interleaved with the standard byte-array / IBufferWriter
    // measurements, masks the steady-state numbers; running it in isolation keeps the timings clean.
    // The four rows form a 2x2 matrix: [chunked, raw] × [kernel NamedPipe, in-memory].
    List<ISerializerBenchmark> BuildAsyncPipeSuite()
    {
        // One options object for all four rows. Its chunk size is the app-level wire chunk size AND
        // (for the NamedPipe rows) the kernel pipe buffer size, so one chunk write fits one kernel
        // pipe-buffer slot. Single source of truth — change ONLY Configuration.PipeChunkSize when tuning.
        var pipeOptions = AcBinarySerializerOptions.FastMode;
        pipeOptions.BufferWriterChunkSize = Configuration.PipeChunkSize;
        pipeOptions.WireMode = Configuration.SelectedWireMode;

        var suite = new List<ISerializerBenchmark>();

        // [chunked × kernel] SerializeChunkedFramed + AsyncPipeReaderInput.DrainFromAsync over a kernel
        // NamedPipe: the FULL streaming-I/O stack (wire framing + drain task + sliding-window buffer +
        // wait-on-byte-shortage).
        suite.Add(new AcBinaryNamedPipeBenchmark(testData.Order, pipeOptions, "FastMode (PipeChunk)"));

        // [raw × kernel] serialize → byte[] → Stream.Write → Stream.Read → Deserialize<T>(byte[]) over the
        // same kernel pipe; no drain task, no AsyncPipeReaderInput, no chunk framing. Against the row above
        // this separates AsyncPipe-framework overhead from kernel-transport overhead.
        suite.Add(new AcBinaryNamedPipeRawByteArrayBenchmark(testData.Order, pipeOptions, "FastMode (PipeRaw)"));

        // [chunked × memory] the same chunked-streaming code path, but over an in-memory
        // System.IO.Pipelines.Pipe instead of a kernel pipe — no per-chunk syscalls. Isolates the pure
        // CPU cost of the chunked-streaming framework from kernel-pipe transport cost.
        suite.Add(new AcBinaryInMemoryPipeBenchmark(testData.Order, pipeOptions, "FastMode (PipeChunk)"));

        // [raw × memory] raw byte[] handed across threads in memory (no transport at all). Apples-to-apples
        // baseline for the in-memory chunked row above.
        suite.Add(new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, pipeOptions, "FastMode (PipeRaw)"));

        return suite;
    }

    // Standard mode — all serializers EXCEPT the kernel NamedPipe AsyncPipe rows (those are opt-in via
    // the AsyncPipe menu / CLI mode and never bundled with the steady-state suite).
    List<ISerializerBenchmark> BuildStandardSuite()
    {
        // The first two option sets are referenced only by the disabled rows further down; they are
        // kept so those rows can be re-enabled by uncommenting a single line.
        var noInternOptions = AcBinarySerializerOptions.Default;
        noInternOptions.UseStringInterning = StringInterningMode.None;
        noInternOptions.WireMode = Configuration.SelectedWireMode;

        var defaultRuntimeOptions = AcBinarySerializerOptions.Default;
        defaultRuntimeOptions.UseGeneratedCode = false;
        defaultRuntimeOptions.WireMode = Configuration.SelectedWireMode;

        var fastModeRuntimeOptions = AcBinarySerializerOptions.FastMode;
        fastModeRuntimeOptions.UseGeneratedCode = false;
        fastModeRuntimeOptions.WireMode = Configuration.SelectedWireMode;

        var fastModeOptions = AcBinarySerializerOptions.FastMode;
        fastModeOptions.WireMode = Configuration.SelectedWireMode;

        // Fresh-ArrayBufferWriter scenario: the chunk size drives the serializer's GetSpan(N) request and
        // thereby the ArrayBufferWriter's allocation per call. A small chunk keeps the per-call allocation
        // small, which suits one-shot serialization with a new writer each iteration.
        // NOTE(review): the row label says "4KB" while the value comes from Configuration.PipeChunkSize —
        // confirm the two agree.
        var freshWriterOptions = AcBinarySerializerOptions.FastMode;
        freshWriterOptions.BufferWriterChunkSize = Configuration.PipeChunkSize;
        freshWriterOptions.WireMode = Configuration.SelectedWireMode;

        // In-memory Pipe variant — same chunk size as the AsyncPipe mode; drives SerializeChunkedFramed
        // through an in-memory System.IO.Pipelines.Pipe.
        var inMemoryPipeOptions = AcBinarySerializerOptions.FastMode;
        inMemoryPipeOptions.BufferWriterChunkSize = Configuration.PipeChunkSize;
        inMemoryPipeOptions.WireMode = Configuration.SelectedWireMode;

        // "Default" row: Default preset with string interning switched off and ReferenceHandling=OnlyId.
        var defaultPresetOptions = AcBinarySerializerOptions.Default;
        defaultPresetOptions.UseStringInterning = StringInterningMode.None;
        defaultPresetOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId;
        defaultPresetOptions.WireMode = Configuration.SelectedWireMode;

        var suite = new List<ISerializerBenchmark>();

        // ============================================================
        // AcBinary — Byte[] API (uncomment the disabled rows to compare option presets side-by-side)
        // ============================================================
        // Fastest Byte[] — SGen path (generated code enabled).
        suite.Add(new AcBinaryBenchmark(testData.Order, fastModeOptions, "FastMode"));

        // Fastest Byte[] — Runtime path (UseGeneratedCode=false). Same preset, no source-generated dispatch;
        // always paired with the SGen row so the SGen speed-up can be compared apples-to-apples.
        suite.Add(new AcBinaryBenchmark(testData.Order, fastModeRuntimeOptions, "FastMode"));

        // Default-preset Byte[] row (ReferenceHandling=OnlyId, interning off) — shows the wire-size / CPU
        // trade-off of reference handling against FastMode.
        suite.Add(new AcBinaryBenchmark(testData.Order, defaultPresetOptions, "Default"));
        //suite.Add(new AcBinaryBenchmark(testData.Order, defaultRuntimeOptions, "Default"));
        //suite.Add(new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"));
        //suite.Add(new AcBinaryBenchmark(testData.Order, noInternOptions, "NoIntern"));

        // AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario).
        suite.Add(new AcBinaryBufferWriterBenchmark(testData.Order, fastModeOptions, "FastMode"));

        // AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario).
        suite.Add(new AcBinaryFreshBufferWriterBenchmark(testData.Order, freshWriterOptions, "FastMode (4KB)"));

        // AcBinary chunked-streaming over an IN-MEMORY Pipe (no kernel transport): the framework's pure CPU
        // cost next to the simpler Byte[] / IBufferWriter rows above.
        suite.Add(new AcBinaryInMemoryPipeBenchmark(testData.Order, inMemoryPipeOptions, "FastMode (PipeChunk)"));

        // Raw byte[] over an in-memory cross-thread handoff — baseline for the in-memory chunked row above.
        suite.Add(new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, inMemoryPipeOptions, "FastMode (PipeRaw)"));

        // The kernel NamedPipe AsyncPipe row (AcBinaryNamedPipeBenchmark) is intentionally OMITTED here —
        // run it via the dedicated AsyncPipe menu / CLI mode for isolated kernel-transport measurements.

        // ============================================================
        // MemoryPack — three I/O modes for apples-to-apples comparison
        // ============================================================
        suite.Add(new MemoryPackBenchmark(testData.Order, "Default"));
        suite.Add(new MemoryPackBufferWriterBenchmark(testData.Order, "Default"));
        suite.Add(new MemoryPackFreshBufferWriterBenchmark(testData.Order, "Default"));

        // ============================================================
        // MessagePack — for legacy comparison
        // ============================================================
#if !AYCODE_NATIVEAOT
        // MessagePack v3's DynamicGenericResolver uses Activator.CreateInstance on trimmed
        // ListFormatter<T> et al. — fails under NativeAOT publish with "No parameterless constructor".
        // Excluded from the AOT build; available for regular JIT runs only.
        suite.Add(new MessagePackBenchmark(testData.Order, "ContractBased"));
#endif

        // System.Text.Json (disabled — JSON serializer for reference; not in active suite)
        //suite.Add(new SystemTextJsonBenchmark(testData.Order, "Default"));

        return suite;
    }
}
#endregion
#region Serializer Implementations
/// <summary>
/// AcBinary benchmark row for the plain <c>byte[]</c> API: every <see cref="Serialize"/> call goes through
/// <c>AcBinarySerializer.Serialize</c>, and <see cref="Deserialize"/> decodes a reference payload that was
/// produced once in the constructor.
/// </summary>
internal sealed class AcBinaryBenchmark : ISerializerBenchmark
{
    private readonly TestOrder _source;
    private readonly AcBinarySerializerOptions _settings;

    // Reference payload captured at construction; backs Deserialize() and SerializedSize.
    private readonly byte[] _payload;

    /// <summary>
    /// Stores the inputs and serializes <paramref name="order"/> once so a payload is available
    /// for the deserialize side and for size reporting.
    /// </summary>
    /// <param name="order">Object graph to benchmark.</param>
    /// <param name="options">AcBinary options used for every serialize / deserialize call of this row.</param>
    /// <param name="optionsPreset">Human-readable label of the options preset, reported as-is.</param>
    public AcBinaryBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
    {
        OptionsPreset = optionsPreset;
        _settings = options;
        _source = order;
        _payload = AcBinarySerializer.Serialize(_source, _settings);
    }

    public string Engine
    {
        get { return Configuration.EngineAcBinary; }
    }

    public string IoMode
    {
        get { return Configuration.IoByteArray; }
    }

    /// <summary>Dispatch label derived from the options' <c>UseGeneratedCode</c> flag.</summary>
    public string DispatchMode
    {
        get
        {
            if (_settings.UseGeneratedCode)
            {
                return Configuration.ModeSGen;
            }

            return Configuration.ModeRuntime;
        }
    }

    public string OptionsPreset { get; }

    /// <summary>Length in bytes of the payload produced in the constructor.</summary>
    public int SerializedSize
    {
        get { return _payload.Length; }
    }

    // This row has no per-instance setup infrastructure, so both setup allocation figures are reported as zero.
    public long SetupSerializeAllocBytes
    {
        get { return 0; }
    }

    public long SetupDeserializeAllocBytes
    {
        get { return 0; }
    }

    public string OptionsDescription
    {
        get { return BuildAcBinaryOptionsDescription(_settings); }
    }

    /// <summary>Timed serialize step; the produced array is intentionally discarded.</summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Serialize()
    {
        AcBinarySerializer.Serialize(_source, _settings);
    }

    /// <summary>Timed deserialize step; decodes the constructor-time payload and discards the result.</summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Deserialize()
    {
        AcBinaryDeserializer.Deserialize<TestOrder>(_payload, _settings);
    }

    /// <summary>
    /// Serializes and deserializes once, then compares the decoded graph with the source
    /// via <c>BenchmarkLoop.DeepEqualsViaJson</c>.
    /// </summary>
    /// <returns><c>true</c> when the comparison reports the two graphs as equal.</returns>
    public bool VerifyRoundTrip()
    {
        var decoded = AcBinaryDeserializer.Deserialize<TestOrder>(
            AcBinarySerializer.Serialize(_source, _settings),
            _settings);

        return BenchmarkLoop.DeepEqualsViaJson(_source, decoded);
    }
}
/// <summary>
/// MemoryPack benchmark row for the plain <c>byte[]</c> API. Options come from <c>GetMemPackOptions()</c>;
/// the reference payload used by <see cref="Deserialize"/> is produced once in the constructor.
/// </summary>
internal sealed class MemoryPackBenchmark : ISerializerBenchmark
{
    private readonly TestOrder _source;
    private readonly MemoryPackSerializerOptions _settings;

    // Reference payload captured at construction; backs Deserialize() and SerializedSize.
    private readonly byte[] _payload;

    /// <summary>Stores the inputs, resolves the MemoryPack options and serializes <paramref name="order"/> once.</summary>
    /// <param name="order">Object graph to benchmark.</param>
    /// <param name="optionsPreset">Human-readable label of the options preset, reported as-is.</param>
    public MemoryPackBenchmark(TestOrder order, string optionsPreset)
    {
        _source = order;
        OptionsPreset = optionsPreset;

        var resolved = GetMemPackOptions();
        _settings = resolved;
        _payload = MemoryPackSerializer.Serialize(_source, resolved);
    }

    public string Engine
    {
        get { return Configuration.EngineMemoryPack; }
    }

    public string IoMode
    {
        get { return Configuration.IoByteArray; }
    }

    // Always labelled SGen: the suite treats MemoryPack as using [MemoryPackable] source-generated formatters.
    public string DispatchMode
    {
        get { return Configuration.ModeSGen; }
    }

    public string OptionsPreset { get; }

    /// <summary>Length in bytes of the payload produced in the constructor.</summary>
    public int SerializedSize
    {
        get { return _payload.Length; }
    }

    // No per-instance setup infrastructure, so both setup allocation figures are reported as zero.
    public long SetupSerializeAllocBytes
    {
        get { return 0; }
    }

    public long SetupDeserializeAllocBytes
    {
        get { return 0; }
    }

    public string? OptionsDescription
    {
        get { return $"StringEncoding={_settings.StringEncoding}"; }
    }

    /// <summary>Timed serialize step; the produced array is intentionally discarded.</summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Serialize()
    {
        MemoryPackSerializer.Serialize(_source, _settings);
    }

    /// <summary>Timed deserialize step; decodes the constructor-time payload and discards the result.</summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Deserialize()
    {
        MemoryPackSerializer.Deserialize<TestOrder>(_payload, _settings);
    }

    /// <summary>
    /// Serializes and deserializes once, then compares the decoded graph with the source
    /// via <c>BenchmarkLoop.DeepEqualsViaJson</c>.
    /// </summary>
    /// <returns><c>true</c> when the comparison reports the two graphs as equal.</returns>
    public bool VerifyRoundTrip()
    {
        var decoded = MemoryPackSerializer.Deserialize<TestOrder>(
            MemoryPackSerializer.Serialize(_source, _settings),
            _settings);

        return BenchmarkLoop.DeepEqualsViaJson(_source, decoded);
    }
}
#if !AYCODE_NATIVEAOT
// MessagePack benchmark — excluded from NativeAOT build because v3's StandardResolver falls back
// to DynamicGenericResolver for closed-generic types (List<TestOrderItem> et al.), which uses
// Activator.CreateInstance on formatter types the AOT trimmer drops → MissingMethodException at runtime.
// Available for regular JIT runs (`dotnet run`) only.
/// <summary>
/// MessagePack benchmark row for the plain <c>byte[]</c> API, using
/// <c>MessagePackSerializerOptions.Standard</c> with compression disabled.
/// </summary>
internal sealed class MessagePackBenchmark : ISerializerBenchmark
{
    private readonly TestOrder _source;
    private readonly MessagePackSerializerOptions _settings;

    // Reference payload captured at construction; backs Deserialize() and SerializedSize.
    private readonly byte[] _payload;

    /// <summary>
    /// Stores the inputs, builds the MessagePack options, derives the options description
    /// and serializes <paramref name="order"/> once.
    /// </summary>
    /// <param name="order">Object graph to benchmark.</param>
    /// <param name="optionsPreset">Human-readable label of the options preset, reported as-is.</param>
    public MessagePackBenchmark(TestOrder order, string optionsPreset)
    {
        _source = order;
        OptionsPreset = optionsPreset;

        // To benchmark other variants, start from ContractlessStandardResolver.Options and/or pass
        // MessagePackCompression.Lz4Block to WithCompression; the description below adapts to the resolver.
        _settings = MessagePackSerializerOptions.Standard.WithCompression(MessagePackCompression.None);

        var resolverMode = _settings.Resolver is ContractlessStandardResolver
            ? "Contractless"
            : "ContractBased";
        OptionsDescription = $"Mode={resolverMode}, Compression={_settings.Compression}";

        _payload = MessagePackSerializer.Serialize(_source, _settings);
    }

    public string Engine
    {
        get { return Configuration.EngineMessagePack; }
    }

    public string IoMode
    {
        get { return Configuration.IoByteArray; }
    }

    // Labelled SGen by this suite ([MessagePackObject] contract formatters resolved through StandardResolver).
    public string DispatchMode
    {
        get { return Configuration.ModeSGen; }
    }

    public string OptionsPreset { get; }

    /// <summary>Length in bytes of the payload produced in the constructor.</summary>
    public int SerializedSize
    {
        get { return _payload.Length; }
    }

    // No per-instance setup infrastructure, so both setup allocation figures are reported as zero.
    public long SetupSerializeAllocBytes
    {
        get { return 0; }
    }

    public long SetupDeserializeAllocBytes
    {
        get { return 0; }
    }

    /// <summary>Resolver mode and compression setting, computed once in the constructor.</summary>
    public string OptionsDescription { get; }

    /// <summary>Timed serialize step; the produced array is intentionally discarded.</summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Serialize()
    {
        MessagePackSerializer.Serialize(_source, _settings);
    }

    /// <summary>Timed deserialize step; decodes the constructor-time payload and discards the result.</summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Deserialize()
    {
        MessagePackSerializer.Deserialize<TestOrder>(_payload, _settings);
    }

    /// <summary>
    /// Serializes and deserializes once, then compares the decoded graph with the source
    /// via <c>BenchmarkLoop.DeepEqualsViaJson</c>.
    /// </summary>
    /// <returns><c>true</c> when the comparison reports the two graphs as equal.</returns>
    public bool VerifyRoundTrip()
    {
        var decoded = MessagePackSerializer.Deserialize<TestOrder>(
            MessagePackSerializer.Serialize(_source, _settings),
            _settings);

        return BenchmarkLoop.DeepEqualsViaJson(_source, decoded);
    }
}
#endif
/// <summary>
/// Benchmarks AcBinary via the <c>IBufferWriter</c> overload, allocating a FRESH
/// <see cref="ArrayBufferWriter{T}"/> on EVERY call — the one-shot scenario for code that does not keep a
/// writer around between calls. (The reused-writer scenario is a separate row,
/// <c>AcBinaryBufferWriterBenchmark</c>.)
/// <para>
/// The chunk size is whatever <c>BufferWriterChunkSize</c> the caller's options carry; the suite passes a
/// 4 KB instance for this row. Rationale given by the suite: with a larger chunk size the serializer would
/// request that much from <c>GetSpan()</c> up front, so a fresh writer would over-allocate for small payloads.
/// NOTE(review): the serializer's default chunk size is not visible from this class — confirm before quoting it.
/// </para>
/// </summary>
internal sealed class AcBinaryFreshBufferWriterBenchmark : ISerializerBenchmark
{
    private readonly TestOrder _source;
    private readonly AcBinarySerializerOptions _settings;

    // Reference payload captured at construction; backs Deserialize() and SerializedSize.
    private readonly byte[] _payload;

    /// <summary>Stores the inputs and serializes <paramref name="order"/> once via the <c>byte[]</c> API.</summary>
    /// <param name="order">Object graph to benchmark.</param>
    /// <param name="options">
    /// AcBinary options for this row. The chunk size is owned by the caller: this class never mutates the
    /// instance, so tune <c>BufferWriterChunkSize</c> where the options are created.
    /// </param>
    /// <param name="optionsPreset">Human-readable label of the options preset, reported as-is.</param>
    public AcBinaryFreshBufferWriterBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
    {
        OptionsPreset = optionsPreset;
        _settings = options;
        _source = order;
        _payload = AcBinarySerializer.Serialize(_source, _settings);
    }

    public string Engine
    {
        get { return Configuration.EngineAcBinary; }
    }

    public string IoMode
    {
        get { return Configuration.IoBufWrNew; }
    }

    /// <summary>Dispatch label derived from the options' <c>UseGeneratedCode</c> flag.</summary>
    public string DispatchMode
    {
        get
        {
            if (_settings.UseGeneratedCode)
            {
                return Configuration.ModeSGen;
            }

            return Configuration.ModeRuntime;
        }
    }

    public string OptionsPreset { get; }

    /// <summary>Length in bytes of the payload produced in the constructor.</summary>
    public int SerializedSize
    {
        get { return _payload.Length; }
    }

    // No per-instance setup infrastructure, so both setup allocation figures are reported as zero.
    public long SetupSerializeAllocBytes
    {
        get { return 0; }
    }

    public long SetupDeserializeAllocBytes
    {
        get { return 0; }
    }

    /// <summary>Common AcBinary options description with the configured chunk size appended.</summary>
    public string OptionsDescription
    {
        get
        {
            var chunkSuffix = $", BufferSize={_settings.BufferWriterChunkSize}B";
            return BuildAcBinaryOptionsDescription(_settings, chunkSuffix);
        }
    }

    /// <summary>Timed serialize step: a brand-new writer per call, which allocates and grows as needed.</summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Serialize()
    {
        AcBinarySerializer.Serialize(_source, new ArrayBufferWriter<byte>(), _settings);
    }

    // Buffer-writer rows deserialize through the ReadOnlySequence<byte> overload rather than the byte[]
    // overload, so this row measures the sequence-input entry point.
    // NOTE(review): the previous comment also said a single-segment array-backed sequence takes a fast path
    // inside AcBinaryDeserializer that redirects to the byte[] overload — confirm which path is actually
    // exercised for this input.
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Deserialize()
    {
        AcBinaryDeserializer.Deserialize<TestOrder>(new ReadOnlySequence<byte>(_payload), _settings);
    }

    /// <summary>
    /// Serializes into a fresh writer, deserializes from the written memory wrapped in a
    /// <see cref="ReadOnlySequence{T}"/>, and compares the result with the source via
    /// <c>BenchmarkLoop.DeepEqualsViaJson</c>.
    /// </summary>
    /// <returns><c>true</c> when the comparison reports the two graphs as equal.</returns>
    public bool VerifyRoundTrip()
    {
        var writer = new ArrayBufferWriter<byte>();
        AcBinarySerializer.Serialize(_source, writer, _settings);

        var written = new ReadOnlySequence<byte>(writer.WrittenMemory);
        var decoded = AcBinaryDeserializer.Deserialize<TestOrder>(written, _settings);

        return BenchmarkLoop.DeepEqualsViaJson(_source, decoded);
    }
}
/// <summary>
/// Benchmarks AcBinary over a long-lived NamedPipe IPC connection using the AcBinary native streaming API
/// (<see cref="AcBinarySerializer.SerializeChunkedFramed{T}(T, System.IO.Pipelines.PipeWriter, AcBinarySerializerOptions)"/>
/// + <see cref="AsyncPipeReaderInput"/> + <see cref="AsyncPipeReaderInputExtensions.DrainFromAsync"/>).
/// Mirrors what a real consumer (e.g. <c>DeserializeFromPipeReaderAsync</c>) does per message:
/// long-lived <see cref="AsyncPipeReaderInput"/> with multi-message wire framing on top of a long-lived NamedPipe.
///
/// <para><b>Architecture</b>:</para>
/// <list type="bullet">
/// <item>Constructor (NOT timed): sets up <see cref="NamedPipeServerStream"/> + <see cref="NamedPipeClientStream"/>,
/// waits for connection, creates one long-lived <see cref="System.IO.Pipelines.PipeWriter"/> /
/// <see cref="System.IO.Pipelines.PipeReader"/> pair, ONE long-lived <see cref="AsyncPipeReaderInput"/>
/// in <c>multiMessage = true</c> mode, ONE drain Task that pumps <see cref="AsyncPipeReaderInputExtensions.DrainFromAsync"/>
/// until cancelled, and ONE consumer Task that loops <c>AcBinaryDeserializer.Deserialize&lt;T&gt;(input, opts)</c>,
/// coordinated with the calling thread through a request/done pair of
/// <see cref="System.Threading.ManualResetEventSlim"/> events.</item>
/// <item>Per-iteration <see cref="Serialize"/> (timed): sender writes via
/// <see cref="AcBinarySerializer.SerializeChunkedFramed{T}(T, System.IO.Pipelines.PipeWriter, AcBinarySerializerOptions)"/>
/// — multi-message wire (<c>[201][UINT16][data]...[202]</c>); the <c>[202]</c> end marker arms the input's
/// <c>_readPos = -1</c> sentinel, so the next message's first <c>AppendToBuffer</c> recycles the buffer to 0.
/// Then the calling thread waits on the "done" event until the consumer's deserialize call has returned.</item>
/// <item><see cref="Deserialize"/> is a no-op (full round-trip captured in <see cref="Serialize"/>);
/// <see cref="IsRoundTripOnly"/>=true → the Ser ms / SerAlloc columns are N/A, RT ms = full round-trip.</item>
/// </list>
///
/// <para><b>Per-iter overhead</b>: 0 new <c>Task.Run</c>, 0 new <c>AsyncPipeReaderInput</c>, 0 new <c>CancellationTokenSource</c>.
/// Pure cost = <c>SerializeChunkedFramed</c> (CPU + per-chunk flush) + kernel write/read syscalls + 1 sync barrier
/// (request/done event handshake) + deserialized graph alloc. The "multi-message reuse" pattern enabled by Q4T8 fix
/// (R5K2 minimum: <c>_readPos = -1</c> sentinel + <c>AppendToBuffer</c> sliding-window cycling).</para>
///
/// <para><b>Approximation note</b>: single-process loopback NamedPipe. Real cross-process / cross-machine SignalR
/// adds further transport latency (TCP, WebSocket framing) on top. The benchmark gives a lower bound.</para>
/// </summary>
internal sealed class AcBinaryNamedPipeBenchmark : ISerializerBenchmark, IDisposable
{
private readonly TestOrder _order;
private readonly AcBinarySerializerOptions _options;
private readonly byte[] _serialized; // for SerializedSize reporting only
// Long-lived pipe lifecycle (set up once in ctor — NOT timed).
private readonly NamedPipeServerStream _pipeServer;
private readonly NamedPipeClientStream _pipeClient;
private readonly PipeWriter _pipeWriter;
private readonly PipeReader _pipeReader;
// Long-lived multi-message receive infrastructure (set up once in ctor).
private readonly AsyncPipeReaderInput _input;
private readonly CancellationTokenSource _cts;
private readonly Task _drainTask; // BG: PipeReader → input.Feed (continuous pump)
private readonly Task _consumerTask; // BG: per-iter Deserialize<T>(input) loop, signaled by calling thread
// Handshake events: the calling thread sets _consumeRequest to start one deserialize; the consumer sets
// _consumeDone when that deserialize call has finished (successfully or not).
private readonly ManualResetEventSlim _consumeRequest = new(false);
private readonly ManualResetEventSlim _consumeDone = new(false);
// NOTE(review): the two fields below are written on one thread and read on another without volatile/locks;
// ordering presumably relies on the Set/Wait handshake above — confirm.
private object? _lastResult; // captured during VerifyRoundTrip; null in benchmark iters
private bool _captureResult; // toggle: when true, ConsumeLoop stores result; otherwise discards
private bool _disposed;
public string Engine => Configuration.EngineAcBinary;
public string IoMode => Configuration.IoNamedPipe;
// Dispatch label derived from the options' UseGeneratedCode flag.
public string DispatchMode => _options.UseGeneratedCode ? Configuration.ModeSGen : Configuration.ModeRuntime;
public string OptionsPreset { get; }
// Size of the byte[] payload produced once in the constructor (not of the chunk-framed wire format).
public int SerializedSize => _serialized.Length;
// Bytes allocated on the constructing thread during the serialize-side / deserialize-side setup sections.
public long SetupSerializeAllocBytes { get; }
public long SetupDeserializeAllocBytes { get; }
// Serialize() performs the full round-trip; Deserialize() is a no-op.
public bool IsRoundTripOnly => true;
public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B, Transport=NamedPipe(long-lived,multiMessage,2-task)");
/// <summary>
/// Builds the whole long-lived pipeline (not timed): connects the pipe pair, wraps it in a
/// PipeWriter / PipeReader, creates the multi-message input and starts the drain and consumer tasks.
/// Allocation on the constructing thread is sampled separately for the serialize-side and
/// deserialize-side setup sections.
/// </summary>
/// <param name="order">Object graph to benchmark.</param>
/// <param name="options">AcBinary options; <c>BufferWriterChunkSize</c> also sizes the kernel pipe buffers and the input buffer.</param>
/// <param name="optionsPreset">Human-readable label of the options preset, reported as-is.</param>
public AcBinaryNamedPipeBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
{
_order = order;
// BufferWriterChunkSize comes from the caller (central source of truth in CreateSerializers
// — the binaryFastMode4KbChunk options instance). Do NOT mutate _options here; tune the chunk
// size in CreateSerializers only.
_options = options;
OptionsPreset = optionsPreset;
_serialized = AcBinarySerializer.Serialize(order, _options);
// 1× pipe setup. Kernel-side pipe buffer (inBufferSize / outBufferSize on the server ctor — the
// client inherits the server-defined buffer size at connect time) matches BufferWriterChunkSize
// exactly: AsyncPipeWriterOutput now treats chunkSize as the chunk-on-wire total size (header +
// data), so one WriteFile(chunkSize) syscall lands in exactly one kernel-page slot — page-aligned,
// no fragmentation, no IRP reordering. _options.BufferWriterChunkSize is the single tunable source.
var pipeName = $"AcBinaryBench-{Guid.NewGuid():N}";
// === SERIALIZE-side setup measurement ===
// pipe-pair (server + client) + connect handshake + writer-side PipeWriter wrapper.
GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
var beforeSer = GC.GetAllocatedBytesForCurrentThread();
_pipeServer = new NamedPipeServerStream(pipeName, PipeDirection.In, 1, PipeTransmissionMode.Byte,
System.IO.Pipes.PipeOptions.Asynchronous,
inBufferSize: _options.BufferWriterChunkSize,
outBufferSize: _options.BufferWriterChunkSize);
_pipeClient = new NamedPipeClientStream(".", pipeName, PipeDirection.Out, System.IO.Pipes.PipeOptions.Asynchronous);
// Start the server-side wait first, then connect the client synchronously, then observe the server wait.
var serverWait = _pipeServer.WaitForConnectionAsync();
_pipeClient.Connect();
serverWait.GetAwaiter().GetResult();
_pipeWriter = PipeWriter.Create(_pipeClient);
var afterSer = GC.GetAllocatedBytesForCurrentThread();
SetupSerializeAllocBytes = afterSer - beforeSer;
// === DESERIALIZE-side setup measurement ===
// PipeReader wrapper + AsyncPipeReaderInput (ArrayPool rent + ManualResetEventSlim) + drain
// task + consumer task scaffolding. Two long-lived BG tasks total: drain pumps bytes from the
// kernel pipe into input; consumer drives Deserialize<T>(input) per iter on signal.
GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
var beforeDes = GC.GetAllocatedBytesForCurrentThread();
_pipeReader = PipeReader.Create(_pipeServer);
_input = new AsyncPipeReaderInput(_options.BufferWriterChunkSize * 2, multiMessage: true);
_cts = new CancellationTokenSource();
// Drain task: pumps PipeReader → input.Feed forever (or until cancel). Single Task.Run for
// the full benchmark lifetime — its overhead is amortised across all messages.
_drainTask = Task.Run(() => _input.DrainFromAsync(_pipeReader, _cts.Token));
// Consumer task: per-iter Deserialize<T>(input) loop. Started here once; signaled per-iter via
// _consumeRequest. Enables Ser↔Des streaming overlap — calling thread runs SerializeChunkedFramed
// while THIS task simultaneously runs Deserialize<T>, both consuming/producing through the
// sliding-window buffer pipelined by the drain task.
_consumerTask = Task.Run(ConsumeLoop);
var afterDes = GC.GetAllocatedBytesForCurrentThread();
SetupDeserializeAllocBytes = afterDes - beforeDes;
}
// BG consumer: parks on _consumeRequest, runs Deserialize<T>(_input) when signaled, signals _consumeDone.
// The Deserialize call internally blocks on the input's MRES whenever the drain hasn't yet fed enough
// bytes for the next read — that's where the streaming-pipeline overlap with the calling thread (Ser)
// happens.
// Exits when the token is cancelled: either Wait(ct) throws OperationCanceledException (caught below)
// or the explicit IsCancellationRequested check returns.
private void ConsumeLoop()
{
var ct = _cts.Token;
try
{
while (true)
{
_consumeRequest.Wait(ct);
if (ct.IsCancellationRequested) return;
// Re-arm the request event before doing the work, so the next Set() from the calling thread is not lost.
_consumeRequest.Reset();
try
{
var result = AcBinaryDeserializer.Deserialize<TestOrder>(_input, _options);
if (_captureResult) _lastResult = result;
}
catch
{
// Swallow — calling thread sees the failure via missing/incorrect _lastResult during VerifyRoundTrip,
// or the benchmark loop just continues (timing impacted). Production teardown handled in Dispose.
}
finally
{
// Always release the calling thread, even when the deserialize call threw.
_consumeDone.Set();
}
}
}
catch (OperationCanceledException)
{
// Cooperative cancel — Dispose path. Swallow.
}
}
/// <summary>
/// Timed full round-trip: signals the consumer, writes one chunk-framed message to the pipe and blocks
/// until the consumer reports that its deserialize call has finished.
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
public void Serialize()
{
// 2-task streaming pipeline:
// 1. Calling thread signals consumer task to begin Deserialize<T>(input). Consumer immediately
// starts; first read blocks on input's MRES because no bytes flowed yet.
// 2. Calling thread starts SerializeChunkedFramed → chunks flow through PipeWriter → kernel pipe →
// drain task (BG) feeds input.Feed → MRES pulses → consumer's Deserialize<T> consumes bytes
// chunk by chunk. Ser↔Des truly overlap here.
// 3. Calling thread waits for _consumeDone (signaling Deserialize<T> returned).
_consumeDone.Reset();
_consumeRequest.Set();
AcBinarySerializer.SerializeChunkedFramed(_order, _pipeWriter, _options);
// NOTE(review): this wait has no timeout and no cancellation token — if the consumer never signals
// (for example because no more bytes arrive), the calling thread blocks indefinitely.
_consumeDone.Wait();
}
/// <summary>Intentionally empty; the deserialize work is measured inside <see cref="Serialize"/>.</summary>
[MethodImpl(MethodImplOptions.NoInlining)]
public void Deserialize()
{
// No-op: per-iter round-trip is captured in Serialize(). See IsRoundTripOnly contract.
}
/// <summary>
/// Runs one round-trip with result capture enabled and compares the deserialized graph with the source
/// via <c>BenchmarkLoop.DeepEqualsViaJson</c>.
/// </summary>
/// <returns>
/// <c>true</c> when a <c>TestOrder</c> was captured and the comparison reports equality; <c>false</c> when
/// nothing was captured (e.g. the consumer's deserialize call threw) or the graphs differ.
/// </returns>
public bool VerifyRoundTrip()
{
// Use the same 2-task streaming path as the benchmark, but capture the result for graph-equality.
_captureResult = true;
try
{
Serialize();
var result = _lastResult as TestOrder;
return result != null && BenchmarkLoop.DeepEqualsViaJson(_order, result);
}
finally
{
// Restore the non-capturing state so timed iterations do not retain the deserialized graph.
_captureResult = false;
_lastResult = null;
}
}
/// <summary>
/// Best-effort teardown, safe to call more than once: cancels the background tasks, waits up to 2 seconds
/// for each, then completes the writer/reader and disposes the pipe streams, input, events and token source.
/// Every step swallows its own exceptions.
/// </summary>
public void Dispose()
{
if (_disposed) return;
_disposed = true;
// Cancel drain + consumer tasks → both exit. Pulse _consumeRequest in case consumer is parked.
try { _cts.Cancel(); } catch { /* swallow on teardown */ }
try { _consumeRequest.Set(); } catch { /* nudge in case consumer Wait is parked */ }
// NOTE(review): Wait(TimeSpan) returns false on timeout rather than throwing, so the disposals below
// can run while a background task is still using these objects if it did not exit within 2 seconds.
try { _drainTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
try { _consumerTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
// Complete writer + dispose pipe lifecycle.
try { _pipeWriter.CompleteAsync().AsTask().Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
try { _pipeReader.Complete(); } catch { /* swallow on teardown */ }
try { _pipeClient.Dispose(); } catch { /* swallow on teardown */ }
try { _pipeServer.Dispose(); } catch { /* swallow on teardown */ }
try { _input.Dispose(); } catch { /* swallow on teardown */ }
try { _consumeRequest.Dispose(); } catch { /* swallow on teardown */ }
try { _consumeDone.Dispose(); } catch { /* swallow on teardown */ }
try { _cts.Dispose(); } catch { /* swallow on teardown */ }
}
}
/// <summary>
/// Same chunked-framed AsyncPipe code path as <see cref="AcBinaryNamedPipeBenchmark"/>, but the transport
/// is an in-memory <see cref="System.IO.Pipelines.Pipe"/> instead of a kernel <c>NamedPipe</c>. The Pipe's
/// <c>Writer</c>/<c>Reader</c> pair is a managed-only zero-copy slab handoff — no syscalls, no kernel
/// buffer copy, no IRP queueing.
///
/// <para><b>Why this benchmark matters</b>: by holding the other variables constant (same SerializeChunkedFramed
/// family, same AsyncPipeReaderInput, same drain task, same consumer task, same multi-message wire format), this
/// row isolates the <b>kernel-NamedPipe transport overhead</b> from the chunked-streaming framework's pure
/// CPU cost. The expected delta vs <see cref="AcBinaryNamedPipeBenchmark"/>: per-chunk overhead drops from
/// ~25-30 µs (kernel-syscall pair + IRP) to ~1-2 µs (managed slab handoff). Multi-chunk Large-message rows
/// should converge dramatically toward <see cref="AcBinaryNamedPipeRawByteArrayBenchmark"/>.</para>
///
/// <para><b>NOTE(review)</b>: <see cref="Serialize"/> currently passes <c>FlushPolicy.Coalesced</c>, while the
/// NamedPipe variant calls the PipeWriter overload (which, per the comment inside <see cref="Serialize"/>,
/// behaves as <c>FlushPolicy.PerChunk</c>). As written, flush policy is therefore a second variable besides
/// the transport — switch to <c>FlushPolicy.PerChunk</c> when a strict transport-only comparison is wanted.</para>
///
/// <para><b>Real-world relevance</b>: in-memory Pipe is the typical primitive used for cross-thread serializer
/// pipelines inside a single process (e.g. SignalR's Kestrel transport adapter, gRPC framework internals,
/// custom message brokers). The numbers from this row reflect that scenario, NOT the kernel-pipe loopback
/// of the NamedPipe benchmark.</para>
/// </summary>
internal sealed class AcBinaryInMemoryPipeBenchmark : ISerializerBenchmark, IDisposable
{
private readonly TestOrder _order;
private readonly AcBinarySerializerOptions _options;
private readonly byte[] _serialized; // for SerializedSize reporting only
// Long-lived in-memory pipe lifecycle (set up once in ctor — NOT timed).
private readonly Pipe _pipe;
// Writer side of _pipe. Serialize() hands the Pipe itself to the serializer; this reference is only
// used by Dispose() to complete the writer.
private readonly PipeWriter _pipeWriter;
private readonly PipeReader _pipeReader;
// Long-lived multi-message receive infrastructure (set up once in ctor) — same pattern as the NamedPipe
// variant: drain pumps reader into AsyncPipeReaderInput, consumer task drives Deserialize<T>(input).
private readonly AsyncPipeReaderInput _input;
private readonly CancellationTokenSource _cts;
private readonly Task _drainTask;
private readonly Task _consumerTask;
// Handshake events: the calling thread sets _consumeRequest to start one deserialize; the consumer sets
// _consumeDone when that deserialize call has finished (successfully or not).
private readonly ManualResetEventSlim _consumeRequest = new(false);
private readonly ManualResetEventSlim _consumeDone = new(false);
// Result capture used only by VerifyRoundTrip(); timed iterations leave _captureResult false.
// NOTE(review): written on one thread and read on another without volatile/locks; ordering presumably
// relies on the Set/Wait handshake above — confirm.
private object? _lastResult;
private bool _captureResult;
private bool _disposed;
public string Engine => Configuration.EngineAcBinary;
public string IoMode => Configuration.IoInMemoryPipe;
// Dispatch label derived from the options' UseGeneratedCode flag.
public string DispatchMode => _options.UseGeneratedCode ? Configuration.ModeSGen : Configuration.ModeRuntime;
public string OptionsPreset { get; }
// Size of the byte[] payload produced once in the constructor (not of the chunk-framed wire format).
public int SerializedSize => _serialized.Length;
// Bytes allocated on the constructing thread during the serialize-side / deserialize-side setup sections.
public long SetupSerializeAllocBytes { get; }
public long SetupDeserializeAllocBytes { get; }
// Serialize() performs the full round-trip; Deserialize() is a no-op.
public bool IsRoundTripOnly => true;
public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B, Transport=Pipe(in-memory,multiMessage,2-task)");
/// <summary>
/// Builds the whole long-lived pipeline (not timed): creates the in-memory Pipe, the multi-message input
/// and starts the drain and consumer tasks. Allocation on the constructing thread is sampled separately
/// for the serialize-side and deserialize-side setup sections.
/// </summary>
/// <param name="order">Object graph to benchmark.</param>
/// <param name="options">AcBinary options; <c>BufferWriterChunkSize</c> also sizes the input buffer (twice the chunk size).</param>
/// <param name="optionsPreset">Human-readable label of the options preset, reported as-is.</param>
public AcBinaryInMemoryPipeBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
{
_order = order;
_options = options;
OptionsPreset = optionsPreset;
_serialized = AcBinarySerializer.Serialize(order, _options);
// === SERIALIZE-side setup measurement ===
// In-memory Pipe construction. NO kernel-pipe pair, NO Connect handshake — just a managed Pipe object
// and a reference to its Writer side. PipeWriterImpl (parallel-flush capable, NOT StreamPipeWriter).
GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
var beforeSer = GC.GetAllocatedBytesForCurrentThread();
// Default-constructed Pipe: no explicit PipeOptions are supplied here.
_pipe = new Pipe();
_pipeWriter = _pipe.Writer;
var afterSer = GC.GetAllocatedBytesForCurrentThread();
SetupSerializeAllocBytes = afterSer - beforeSer;
// === DESERIALIZE-side setup measurement ===
// PipeReader reference + AsyncPipeReaderInput (ArrayPool rent + ManualResetEventSlim) + drain task +
// consumer task scaffolding. Identical to the NamedPipe variant on the receive side.
GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
var beforeDes = GC.GetAllocatedBytesForCurrentThread();
_pipeReader = _pipe.Reader;
_input = new AsyncPipeReaderInput(_options.BufferWriterChunkSize * 2, multiMessage: true);
_cts = new CancellationTokenSource();
// One drain task and one consumer task for the lifetime of the benchmark instance.
_drainTask = Task.Run(() => _input.DrainFromAsync(_pipeReader, _cts.Token));
_consumerTask = Task.Run(ConsumeLoop);
var afterDes = GC.GetAllocatedBytesForCurrentThread();
SetupDeserializeAllocBytes = afterDes - beforeDes;
}
// BG consumer: parks on _consumeRequest, runs Deserialize<T>(_input) when signaled, signals _consumeDone.
// Mirror of AcBinaryNamedPipeBenchmark.ConsumeLoop — same pattern, same MRES protocol.
// Exits when the token is cancelled: either Wait(ct) throws OperationCanceledException (caught below)
// or the explicit IsCancellationRequested check returns.
private void ConsumeLoop()
{
var ct = _cts.Token;
try
{
while (true)
{
_consumeRequest.Wait(ct);
if (ct.IsCancellationRequested) return;
// Re-arm the request event before doing the work, so the next Set() from the calling thread is not lost.
_consumeRequest.Reset();
try
{
var result = AcBinaryDeserializer.Deserialize<TestOrder>(_input, _options);
if (_captureResult) _lastResult = result;
}
catch
{
// Swallow — see ConsumeLoop in NamedPipe variant for rationale.
}
finally
{
// Always release the calling thread, even when the deserialize call threw.
_consumeDone.Set();
}
}
}
catch (OperationCanceledException)
{
// Cooperative cancel — Dispose path. Swallow.
}
}
/// <summary>
/// Timed full round-trip: signals the consumer, writes one chunk-framed message into the in-memory Pipe
/// with <c>FlushPolicy.Coalesced</c> and blocks until the consumer reports that its deserialize call has finished.
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
public void Serialize()
{
// Same 2-task streaming pipeline as NamedPipe variant — only the transport differs (in-memory Pipe
// instead of kernel NamedPipe). Per-chunk SerializeChunkedFramed → PipeWriter slab → drain task
// reads from PipeReader → input.Feed → consumer Deserialize<T> consumes byte-by-byte.
//
// Uses the Pipe-overload (instead of the PipeWriter-overload) so the FlushPolicy parameter is
// exposed for tuning. Toggle between FlushPolicy.PerChunk (bounded peak memory, per-chunk await
// FlushAsync) and FlushPolicy.Coalesced (fire-and-forget per chunk, pipe-coalesced flushes up to
// PauseWriterThreshold ~64 KB) to A/B-test the streaming-pipeline overhead. FlushPolicy.PerChunk
// is functionally equivalent to the PipeWriter-overload (both internally route to
// SerializeToPipeWriterCore with FlushPolicy.PerChunk).
_consumeDone.Reset();
_consumeRequest.Set();
AcBinarySerializer.SerializeChunkedFramed(_order, _pipe, _options, FlushPolicy.Coalesced);
// NOTE(review): this wait has no timeout and no cancellation token — if the consumer never signals
// (for example because no more bytes arrive), the calling thread blocks indefinitely.
_consumeDone.Wait();
}
/// <summary>Intentionally empty; the deserialize work is measured inside <see cref="Serialize"/>.</summary>
[MethodImpl(MethodImplOptions.NoInlining)]
public void Deserialize()
{
// No-op: per-iter round-trip is captured in Serialize(). See IsRoundTripOnly contract.
}
/// <summary>
/// Runs one round-trip with result capture enabled and compares the deserialized graph with the source
/// via <c>BenchmarkLoop.DeepEqualsViaJson</c>.
/// </summary>
/// <returns>
/// <c>true</c> when a <c>TestOrder</c> was captured and the comparison reports equality; <c>false</c> when
/// nothing was captured (e.g. the consumer's deserialize call threw) or the graphs differ.
/// </returns>
public bool VerifyRoundTrip()
{
_captureResult = true;
try
{
Serialize();
var result = _lastResult as TestOrder;
return result != null && BenchmarkLoop.DeepEqualsViaJson(_order, result);
}
finally
{
// Restore the non-capturing state so timed iterations do not retain the deserialized graph.
_captureResult = false;
_lastResult = null;
}
}
/// <summary>
/// Best-effort teardown, safe to call more than once: cancels the background tasks, waits up to 2 seconds
/// for each, then completes the writer/reader and disposes the input, events and token source.
/// Every step swallows its own exceptions.
/// </summary>
public void Dispose()
{
if (_disposed) return;
_disposed = true;
// Cancel drain + consumer tasks → both exit. Pulse _consumeRequest in case consumer is parked.
try { _cts.Cancel(); } catch { /* swallow on teardown */ }
try { _consumeRequest.Set(); } catch { /* nudge in case consumer Wait is parked */ }
// NOTE(review): Wait(TimeSpan) returns false on timeout rather than throwing, so the disposals below
// can run while a background task is still using these objects if it did not exit within 2 seconds.
try { _drainTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
try { _consumerTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
// Complete writer + reader (in-memory Pipe — no underlying stream to dispose).
try { _pipeWriter.CompleteAsync().AsTask().Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
try { _pipeReader.Complete(); } catch { /* swallow on teardown */ }
try { _input.Dispose(); } catch { /* swallow on teardown */ }
try { _consumeRequest.Dispose(); } catch { /* swallow on teardown */ }
try { _consumeDone.Dispose(); } catch { /* swallow on teardown */ }
try { _cts.Dispose(); } catch { /* swallow on teardown */ }
}
}
/// <summary>
/// Raw <c>byte[]</c> over a long-lived NamedPipe — NO chunk-framing, NO <c>AsyncPipeReaderInput</c>,
/// NO sliding-window buffer. Calling thread serialises + writes; a long-lived background consumer task
/// reads and deserialises. Two-task pattern enables Ser↔Read overlap (kernel-pipe-pipelined) AND
/// avoids the kernel-buffer-full deadlock when <c>bytes.Length &gt; inBufferSize</c>.
///
/// Side-by-side with <see cref="AcBinaryNamedPipeBenchmark"/> (chunked-framed AsyncPipe stack) this
/// isolates two cost components on the SAME kernel-pipe transport with the SAME <c>inBufferSize</c>:
/// <list type="bullet">
/// <item><description><b>This row vs <see cref="AcBinaryBenchmark"/> (Byte[])</b> — pure kernel-NamedPipe
/// overhead (WriteFile / ReadFile syscalls + IRP queueing + buffer-copy + thread-handoff).</description></item>
/// <item><description><b>This row vs <see cref="AcBinaryNamedPipeBenchmark"/> (chunked-framed)</b> — pure
/// AsyncPipe-framework overhead (chunk header writes + sliding-window <c>Feed</c> + MRES wait inside
/// <c>AsyncPipeReaderInput</c>) AND the streaming-pipeline benefit of intra-message Ser↔Des overlap (which
/// raw lacks — raw can only Ser↔Read overlap, with Des sequential after Read completes).</description></item>
/// </list>
/// Per-iter <c>byte[]</c> allocation from <c>AcBinarySerializer.Serialize</c> is part of the cost (matches
/// <see cref="AcBinaryBenchmark"/>'s API contract); the receive-side scratch buffer is also allocated per-iter
/// on the consumer-task (counted via <c>GC.GetTotalAllocatedBytes</c> in <c>BenchmarkLoop.MeasureAllocationTotal</c>).
/// </summary>
internal sealed class AcBinaryNamedPipeRawByteArrayBenchmark : ISerializerBenchmark, IDisposable
{
private readonly TestOrder _order;
private readonly AcBinarySerializerOptions _options;
private readonly byte[] _serialized; // for SerializedSize reporting + receive-side size known upfront
// Long-lived pipe lifecycle (set up once in ctor — NOT timed).
private readonly NamedPipeServerStream _pipeServer;
private readonly NamedPipeClientStream _pipeClient;
// Long-lived consumer-task infrastructure (Read + Deserialize on BG thread, signaled per iter).
// Mirrors AcBinaryNamedPipeBenchmark's drain+consumer pair, but raw byte[] doesn't have an
// intermediate sliding-window buffer, so Read+Des happen sequentially in one BG task: Read N bytes
// → Deserialize<T>(bytes) → signal done. Calling thread's Ser↔Write overlaps with this BG Read+Des
// through kernel-pipe pipelining.
private readonly CancellationTokenSource _cts;
private readonly Task _consumerTask;
private readonly ManualResetEventSlim _consumeRequest = new(false);
private readonly ManualResetEventSlim _consumeDone = new(false);
private int _pendingReadSize;
private object? _lastResult; // captured during VerifyRoundTrip; null in benchmark iters
private bool _captureResult; // toggle: when true, ConsumerLoop stores result; otherwise discards
private bool _disposed;
public string Engine => Configuration.EngineAcBinary;
public string IoMode => Configuration.IoNamedPipeRaw;
public string DispatchMode => _options.UseGeneratedCode ? Configuration.ModeSGen : Configuration.ModeRuntime;
public string OptionsPreset { get; }
public int SerializedSize => _serialized.Length;
public long SetupSerializeAllocBytes { get; }
public long SetupDeserializeAllocBytes { get; }
public bool IsRoundTripOnly => true;
public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B, Transport=NamedPipe(raw,2-task)");
/// <summary>
/// Builds the benchmark: serializes <paramref name="order"/> once for size reporting, creates and connects
/// a uniquely named pipe pair, and starts the long-lived background consumer task. The allocations of the
/// pipe setup and of the consumer setup are sampled separately on the constructing thread.
/// </summary>
/// <param name="order">Object graph that every iteration serializes and round-trips.</param>
/// <param name="options">AcBinary options used for serialize and deserialize; its
/// <c>BufferWriterChunkSize</c> also sizes the pipe's in/out buffers.</param>
/// <param name="optionsPreset">Preset label exposed unchanged through <see cref="OptionsPreset"/>.</param>
public AcBinaryNamedPipeRawByteArrayBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
{
_order = order;
// BufferWriterChunkSize comes from the caller — same source-of-truth contract as
// AcBinaryNamedPipeBenchmark. The kernel pipe-buffer (inBufferSize) is wired to it so the
// raw-vs-chunked comparison runs on identical transport conditions.
_options = options;
OptionsPreset = optionsPreset;
// One-off serialization, kept only so SerializedSize can report the payload length.
_serialized = AcBinarySerializer.Serialize(order, _options);
// A fresh GUID per instance keeps concurrently alive benchmark instances on distinct pipe names.
var pipeName = $"AcBinaryBenchRaw-{Guid.NewGuid():N}";
// === SERIALIZE-side setup measurement ===
// pipe-pair (server + client) + connect handshake. NO PipeWriter wrapper — we use the raw
// Stream.Write API directly, matching the no-framing semantics of this benchmark.
// Force a full collection before sampling the per-thread allocation counter.
GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
var beforeSer = GC.GetAllocatedBytesForCurrentThread();
_pipeServer = new NamedPipeServerStream(pipeName, PipeDirection.In, 1, PipeTransmissionMode.Byte,
System.IO.Pipes.PipeOptions.Asynchronous,
inBufferSize: _options.BufferWriterChunkSize,
outBufferSize: _options.BufferWriterChunkSize);
_pipeClient = new NamedPipeClientStream(".", pipeName, PipeDirection.Out, System.IO.Pipes.PipeOptions.Asynchronous);
// Start the server-side accept first, connect the client, then block until the accept completes.
var serverWait = _pipeServer.WaitForConnectionAsync();
_pipeClient.Connect();
serverWait.GetAwaiter().GetResult();
var afterSer = GC.GetAllocatedBytesForCurrentThread();
SetupSerializeAllocBytes = afterSer - beforeSer;
// === DESERIALIZE-side setup measurement ===
// 1× background consumer-task + 2× MRES (request / done) + cancellation source. Matches the
// chunked benchmark's deserialize-side setup cost shape.
// NOTE(review): the two ManualResetEventSlim fields are created by field initializers, i.e. before
// this window opens, and the counter is per-thread, so only allocations made on the constructing
// thread between the two samples below are included.
GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
var beforeDes = GC.GetAllocatedBytesForCurrentThread();
_cts = new CancellationTokenSource();
_consumerTask = Task.Run(ConsumerLoop);
var afterDes = GC.GetAllocatedBytesForCurrentThread();
SetupDeserializeAllocBytes = afterDes - beforeDes;
}
/// <summary>
/// Background consumer loop. Each iteration parks on <c>_consumeRequest</c>, reads exactly
/// <c>_pendingReadSize</c> bytes from the server end of the pipe, deserializes them into a
/// <c>TestOrder</c> and finally signals <c>_consumeDone</c>. The read overlaps with the calling thread's
/// write through the pipe; deserialization only starts once the whole message has arrived, because the
/// raw byte[] overload needs the complete payload.
/// </summary>
/// <remarks>
/// Failures inside an iteration are swallowed on purpose so the calling thread is always released via
/// <c>_consumeDone</c>. If the pipe reports end-of-stream before the full message arrived, the truncated
/// buffer is NOT deserialized: <c>_lastResult</c> stays untouched, so <c>VerifyRoundTrip</c> reports the
/// failure deterministically instead of comparing against a graph decoded from a zero-padded buffer.
/// </remarks>
private void ConsumerLoop()
{
    var ct = _cts.Token;
    try
    {
        while (true)
        {
            // Throws OperationCanceledException when Dispose cancels while we are parked.
            _consumeRequest.Wait(ct);
            if (ct.IsCancellationRequested) return;
            _consumeRequest.Reset();
            try
            {
                var size = _pendingReadSize;
                var bytes = new byte[size]; // fresh buffer per message
                var totalRead = 0;
                while (totalRead < size)
                {
                    var n = _pipeServer.Read(bytes, totalRead, size - totalRead);
                    if (n == 0) break; // pipe closed / EOF before the full message arrived
                    totalRead += n;
                }

                // Only a complete message is worth deserializing; a short read leaves the tail of
                // the buffer zero-filled and would decode to garbage or throw.
                if (totalRead == size)
                {
                    var result = AcBinaryDeserializer.Deserialize<TestOrder>(bytes, _options);
                    if (_captureResult) _lastResult = result;
                }
            }
            catch
            {
                // Swallow — calling thread sees the failure via missing/incorrect _lastResult during VerifyRoundTrip,
                // or the benchmark loop just continues (timing impacted). Teardown is handled in Dispose.
            }
            finally
            {
                // Always release the calling thread, even when the iteration failed.
                _consumeDone.Set();
            }
        }
    }
    catch (OperationCanceledException)
    {
        // Cooperative cancel — Dispose path. Swallow.
    }
}
/// <summary>
/// Runs one full round trip: serialize on the calling thread, push the bytes through the named pipe,
/// and block until the background consumer has read and deserialized them.
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
public void Serialize()
{
    // Stage 1 — produce a fresh byte[] for this iteration on the calling thread.
    var payload = AcBinarySerializer.Serialize(_order, _options);

    // Stage 2 — publish the expected length, re-arm the completion event, then wake the consumer.
    // The consumer starts its Read loop on the background thread while this thread moves on to Write,
    // so Write and Read overlap through the pipe buffer.
    _pendingReadSize = payload.Length;
    _consumeDone.Reset();
    _consumeRequest.Set();

    // Stage 3 — push the whole message through the client end of the pipe.
    _pipeClient.Write(payload, 0, payload.Length);
    _pipeClient.Flush();

    // Stage 4 — wait for the consumer to finish Read + Deserialize.
    // Unlike a chunked transport, deserialization cannot begin before all bytes are available, so only
    // the Write/Read part of the pipeline runs concurrently here.
    _consumeDone.Wait();
}
/// <summary>
/// Intentionally empty. This row reports <c>IsRoundTripOnly == true</c>: the deserialize half of every
/// iteration already runs on the background consumer as part of <c>Serialize()</c>.
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
public void Deserialize()
{
}
/// <summary>
/// Performs one round trip through the same two-thread pipe path the benchmark uses, with result
/// capture switched on, and compares the deserialized graph against the source graph.
/// </summary>
/// <returns>
/// <c>true</c> when the consumer produced a <c>TestOrder</c> that <c>BenchmarkLoop.DeepEqualsViaJson</c>
/// accepts as equal to the original; <c>false</c> when nothing was captured or the comparison fails.
/// </returns>
public bool VerifyRoundTrip()
{
    _captureResult = true;
    try
    {
        Serialize();

        // Nothing captured (consumer failed) → pattern does not match → false.
        return _lastResult is TestOrder roundTripped
               && BenchmarkLoop.DeepEqualsViaJson(_order, roundTripped);
    }
    finally
    {
        // Back to benchmark mode: stop capturing and drop the reference to the captured graph.
        _captureResult = false;
        _lastResult = null;
    }
}
/// <summary>
/// Best-effort teardown: stops the consumer task, then releases the pipe pair, both events and the
/// cancellation source. Every step is isolated so one failing step cannot prevent the following ones.
/// Safe to call more than once.
/// </summary>
public void Dispose()
{
    if (_disposed) return;
    _disposed = true;

    // Stop the consumer: cancelling makes its Wait throw OperationCanceledException; the extra Set is a
    // nudge in case it is parked. Give it up to two seconds to exit.
    Quietly(() => _cts.Cancel());
    Quietly(() => _consumeRequest.Set());
    Quietly(() => _consumerTask.Wait(TimeSpan.FromSeconds(2)));

    // Symmetric transport teardown — writer side (client) first, then the server.
    Quietly(() => _pipeClient.Dispose());
    Quietly(() => _pipeServer.Dispose());

    // Synchronization primitives last.
    Quietly(() => _consumeRequest.Dispose());
    Quietly(() => _consumeDone.Dispose());
    Quietly(() => _cts.Dispose());

    // Runs a single teardown step and swallows whatever it throws.
    static void Quietly(Action step)
    {
        try { step(); } catch { /* swallow on teardown */ }
    }
}
}
/// <summary>
/// AcBinary round trip with a raw <c>byte[]</c> and NO transport at all (no NamedPipe, no Pipe, no
/// Channel&lt;T&gt;). The calling thread serializes into a fresh <c>byte[]</c> and passes the reference to a
/// background consumer task through a single slot guarded by two <see cref="ManualResetEventSlim"/>
/// instances; the consumer deserializes it and signals completion.
///
/// <para><b>Where this row sits</b> in the 2x2 transport × wire-format matrix:</para>
/// <list type="bullet">
/// <item><description><b>NamedPipe + Chunked</b> = <see cref="AcBinaryNamedPipeBenchmark"/></description></item>
/// <item><description><b>NamedPipe + Raw</b> = <see cref="AcBinaryNamedPipeRawByteArrayBenchmark"/></description></item>
/// <item><description><b>In-memory Pipe + Chunked</b> = <see cref="AcBinaryInMemoryPipeBenchmark"/></description></item>
/// <item><description><b>In-memory + Raw</b> = this class — the baseline for the in-memory chunked row</description></item>
/// </list>
/// <para>Compared against <see cref="AcBinaryInMemoryPipeBenchmark"/> it is meant to isolate the CPU cost of
/// the chunked-streaming framework (both rows stay in-process). Compared against
/// <see cref="AcBinaryNamedPipeRawByteArrayBenchmark"/> it is meant to isolate the named-pipe overhead for
/// the raw-<c>byte[]</c> format.</para>
/// </summary>
internal sealed class AcBinaryInMemoryRawByteArrayBenchmark : ISerializerBenchmark, IDisposable
{
    private readonly TestOrder _order;
    private readonly AcBinarySerializerOptions _options;
    private readonly byte[] _serialized; // only used to report SerializedSize

    // Long-lived consumer infrastructure: one background task that deserializes on request.
    // There is no transport; the hand-off is a plain byte[] reference stored in _pendingBytes.
    private readonly CancellationTokenSource _cts;
    private readonly Task _consumerTask;
    private readonly ManualResetEventSlim _consumeRequest = new(false); // caller → consumer: "payload ready"
    private readonly ManualResetEventSlim _consumeDone = new(false);    // consumer → caller: "iteration finished"
    private byte[]? _pendingBytes;  // hand-off slot written by Serialize(), read by ConsumerLoop
    private object? _lastResult;    // filled only while VerifyRoundTrip runs
    private bool _captureResult;    // when false the consumer discards the deserialized graph
    private bool _disposed;

    public string Engine => Configuration.EngineAcBinary;
    public string IoMode => Configuration.IoInMemoryRaw;
    public string DispatchMode => _options.UseGeneratedCode ? Configuration.ModeSGen : Configuration.ModeRuntime;
    public string OptionsPreset { get; }
    public int SerializedSize => _serialized.Length;
    public long SetupSerializeAllocBytes { get; }
    public long SetupDeserializeAllocBytes { get; }

    // Deserialize() is a no-op; the complete round trip happens inside Serialize().
    public bool IsRoundTripOnly => true;

    public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B, Transport=in-memory(raw,2-task)");

    /// <summary>
    /// Serializes <paramref name="order"/> once for size reporting and starts the background consumer,
    /// sampling the constructing thread's allocations around the consumer setup.
    /// </summary>
    /// <param name="order">Object graph that every iteration serializes and round-trips.</param>
    /// <param name="options">AcBinary options used for both serialize and deserialize.</param>
    /// <param name="optionsPreset">Preset label exposed unchanged through <see cref="OptionsPreset"/>.</param>
    public AcBinaryInMemoryRawByteArrayBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
    {
        _order = order;
        _options = options;
        OptionsPreset = optionsPreset;
        _serialized = AcBinarySerializer.Serialize(order, _options);

        // Serialize side: nothing is pre-built; each iteration allocates its own byte[].
        SetupSerializeAllocBytes = 0;

        // Deserialize side: cancellation source + background consumer task, measured on this thread
        // after a forced full collection.
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();
        var allocatedBefore = GC.GetAllocatedBytesForCurrentThread();
        _cts = new CancellationTokenSource();
        _consumerTask = Task.Run(ConsumerLoop);
        SetupDeserializeAllocBytes = GC.GetAllocatedBytesForCurrentThread() - allocatedBefore;
    }

    /// <summary>
    /// Background consumer: waits for a request, deserializes whatever array is parked in
    /// <c>_pendingBytes</c> (the reference is shared, the bytes are not copied), then signals
    /// <c>_consumeDone</c>. Exits when the cancellation token fires.
    /// </summary>
    private void ConsumerLoop()
    {
        var token = _cts.Token;
        try
        {
            for (;;)
            {
                _consumeRequest.Wait(token);
                if (token.IsCancellationRequested)
                {
                    return;
                }

                _consumeRequest.Reset();
                try
                {
                    if (_pendingBytes is { } payload)
                    {
                        var graph = AcBinaryDeserializer.Deserialize<TestOrder>(payload, _options);
                        if (_captureResult)
                        {
                            _lastResult = graph;
                        }
                    }
                }
                catch
                {
                    // Deliberately swallowed: a failed iteration shows up as a missing _lastResult in
                    // VerifyRoundTrip, and the caller must be released in every case.
                }
                finally
                {
                    _consumeDone.Set();
                }
            }
        }
        catch (OperationCanceledException)
        {
            // Raised by Wait(token) when Dispose cancels — normal shutdown.
        }
    }

    /// <summary>
    /// Runs one full round trip: serialize on the calling thread, hand the array to the consumer, and
    /// block until the consumer has deserialized it.
    /// </summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Serialize()
    {
        // Stage 1 — fresh byte[] for this iteration.
        var payload = AcBinarySerializer.Serialize(_order, _options);

        // Stage 2 — park the reference, re-arm the completion event, wake the consumer.
        _pendingBytes = payload;
        _consumeDone.Reset();
        _consumeRequest.Set();

        // Stage 3 — wait until the consumer is done. Deserialization needs the complete array, so the
        // only concurrency here is the hand-off between the two threads.
        _consumeDone.Wait();
    }

    /// <summary>
    /// Intentionally empty — see <see cref="IsRoundTripOnly"/>; the deserialize half runs inside
    /// <see cref="Serialize"/>.
    /// </summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Deserialize()
    {
    }

    /// <summary>
    /// Performs one round trip with result capture enabled and compares the deserialized graph with the
    /// source graph via <c>BenchmarkLoop.DeepEqualsViaJson</c>.
    /// </summary>
    /// <returns><c>true</c> when a <c>TestOrder</c> was captured and it compares equal; otherwise <c>false</c>.</returns>
    public bool VerifyRoundTrip()
    {
        _captureResult = true;
        try
        {
            Serialize();
            return _lastResult is TestOrder roundTripped
                   && BenchmarkLoop.DeepEqualsViaJson(_order, roundTripped);
        }
        finally
        {
            _captureResult = false;
            _lastResult = null;
        }
    }

    /// <summary>
    /// Best-effort teardown: stops the consumer task, then disposes both events and the cancellation
    /// source. Each step is isolated; repeated calls are no-ops.
    /// </summary>
    public void Dispose()
    {
        if (_disposed) return;
        _disposed = true;

        Quietly(() => _cts.Cancel());
        Quietly(() => _consumeRequest.Set()); // nudge in case the consumer is parked
        Quietly(() => _consumerTask.Wait(TimeSpan.FromSeconds(2)));
        Quietly(() => _consumeRequest.Dispose());
        Quietly(() => _consumeDone.Dispose());
        Quietly(() => _cts.Dispose());

        // Runs a single teardown step and swallows whatever it throws.
        static void Quietly(Action step)
        {
            try { step(); } catch { /* swallow on teardown */ }
        }
    }
}
/// <summary>
/// MemoryPack through its <c>IBufferWriter</c> overload, creating a NEW <see cref="ArrayBufferWriter{T}"/>
/// on every <see cref="Serialize"/> call. Intended as the MemoryPack counterpart of
/// AcBinaryFreshBufferWriterBenchmark.
/// </summary>
internal sealed class MemoryPackFreshBufferWriterBenchmark : ISerializerBenchmark
{
    private readonly TestOrder _order;
    private readonly MemoryPackSerializerOptions _options;
    private readonly byte[] _serialized; // pre-serialized payload: Deserialize() input and size reporting

    public string Engine => Configuration.EngineMemoryPack;
    public string IoMode => Configuration.IoBufWrNew;

    // MemoryPack rows are always reported with the source-generated dispatch label.
    public string DispatchMode => Configuration.ModeSGen;

    public string OptionsPreset { get; }
    public int SerializedSize => _serialized.Length;

    // No long-lived infrastructure on either side, hence no setup allocations to report.
    public long SetupSerializeAllocBytes => 0;
    public long SetupDeserializeAllocBytes => 0;

    public string? OptionsDescription => $"StringEncoding={_options.StringEncoding}";

    /// <summary>Captures the graph and preset label and pre-serializes the payload once.</summary>
    /// <param name="order">Object graph to serialize on every iteration.</param>
    /// <param name="optionsPreset">Preset label exposed unchanged through <see cref="OptionsPreset"/>.</param>
    public MemoryPackFreshBufferWriterBenchmark(TestOrder order, string optionsPreset)
    {
        _order = order;
        OptionsPreset = optionsPreset;
        _options = GetMemPackOptions();
        _serialized = MemoryPackSerializer.Serialize(order, _options);
    }

    /// <summary>Serializes the graph into a writer that is allocated for this call only.</summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Serialize()
    {
        var freshWriter = new ArrayBufferWriter<byte>();
        MemoryPackSerializer.Serialize(freshWriter, _order, _options);
    }

    /// <summary>
    /// Deserializes the pre-serialized payload through the <see cref="ReadOnlySequence{T}"/> overload,
    /// mirroring the input surface used by the AcBinary buffer-writer rows.
    /// </summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Deserialize()
    {
        MemoryPackSerializer.Deserialize<TestOrder>(new ReadOnlySequence<byte>(_serialized), _options);
    }

    /// <summary>
    /// Serializes into a fresh writer, reads the written bytes back through the sequence overload and
    /// compares the result with the source graph.
    /// </summary>
    /// <returns>The outcome of <c>BenchmarkLoop.DeepEqualsViaJson</c> for source vs. round-tripped graph.</returns>
    public bool VerifyRoundTrip()
    {
        var freshWriter = new ArrayBufferWriter<byte>();
        MemoryPackSerializer.Serialize(freshWriter, _order, _options);

        var written = new ReadOnlySequence<byte>(freshWriter.WrittenMemory);
        var copy = MemoryPackSerializer.Deserialize<TestOrder>(written, _options);
        return BenchmarkLoop.DeepEqualsViaJson(_order, copy);
    }
}
/// <summary>
/// AcBinary through its <c>IBufferWriter</c> overload with ONE pre-allocated
/// <see cref="ArrayBufferWriter{T}"/> that is rewound and reused on every call.
/// </summary>
internal sealed class AcBinaryBufferWriterBenchmark : ISerializerBenchmark
{
    private readonly TestOrder _order;
    private readonly AcBinarySerializerOptions _options;
    private readonly byte[] _serialized; // pre-serialized payload: Deserialize() input and size reporting
    private readonly ArrayBufferWriter<byte> _bufferWriter;

    public string Engine => Configuration.EngineAcBinary;
    public string IoMode => Configuration.IoBufWrReuse;
    public string DispatchMode => _options.UseGeneratedCode ? Configuration.ModeSGen : Configuration.ModeRuntime;
    public string OptionsPreset { get; }
    public int SerializedSize => _serialized.Length;
    public long SetupSerializeAllocBytes { get; }

    // Deserialize() reads straight from the pre-serialized array; there is nothing to set up.
    public long SetupDeserializeAllocBytes => 0;

    public string OptionsDescription => BuildAcBinaryOptionsDescription(_options);

    /// <summary>
    /// Pre-serializes the payload, then allocates the reusable writer while sampling the constructing
    /// thread's allocations, so that only the writer itself is attributed to serialize-side setup.
    /// </summary>
    /// <param name="order">Object graph to serialize on every iteration.</param>
    /// <param name="options">AcBinary options used for both serialize and deserialize.</param>
    /// <param name="optionsPreset">Preset label exposed unchanged through <see cref="OptionsPreset"/>.</param>
    public AcBinaryBufferWriterBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
    {
        _order = order;
        _options = options;
        OptionsPreset = optionsPreset;
        _serialized = AcBinarySerializer.Serialize(order, options);

        // The helper serialization above stays outside the measurement window opened below.
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();
        var allocatedBefore = GC.GetAllocatedBytesForCurrentThread();
        _bufferWriter = new ArrayBufferWriter<byte>(_serialized.Length * 2); // twice the known payload size
        SetupSerializeAllocBytes = GC.GetAllocatedBytesForCurrentThread() - allocatedBefore;
    }

    /// <summary>Rewinds the shared writer and serializes the graph into it.</summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Serialize()
    {
        // ResetWrittenCount only moves the write position back; the buffer is kept.
        _bufferWriter.ResetWrittenCount();
        AcBinarySerializer.Serialize(_order, _bufferWriter, _options);
    }

    /// <summary>
    /// Deserializes through the <see cref="ReadOnlySequence{T}"/> overload rather than the byte[] one, so
    /// this row enters the deserializer the way a sequence-based consumer would.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the original author's note states that a single-segment, array-backed sequence takes
    /// a fast path inside AcBinaryDeserializer that forwards to the byte[] overload — not visible from
    /// this file, confirm against the deserializer source.
    /// </remarks>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Deserialize()
    {
        AcBinaryDeserializer.Deserialize<TestOrder>(new ReadOnlySequence<byte>(_serialized), _options);
    }

    /// <summary>
    /// Serializes into the shared writer, reads the written bytes back through the sequence overload and
    /// compares the result with the source graph.
    /// </summary>
    /// <returns>The outcome of <c>BenchmarkLoop.DeepEqualsViaJson</c> for source vs. round-tripped graph.</returns>
    public bool VerifyRoundTrip()
    {
        _bufferWriter.ResetWrittenCount();
        AcBinarySerializer.Serialize(_order, _bufferWriter, _options);

        var written = new ReadOnlySequence<byte>(_bufferWriter.WrittenMemory);
        var copy = AcBinaryDeserializer.Deserialize<TestOrder>(written, _options);
        return BenchmarkLoop.DeepEqualsViaJson(_order, copy);
    }
}
/// <summary>
/// MemoryPack through its <c>IBufferWriter</c> overload with ONE pre-allocated
/// <see cref="ArrayBufferWriter{T}"/> that is rewound and reused on every call. Intended as the
/// MemoryPack counterpart of <see cref="AcBinaryBufferWriterBenchmark"/>.
/// </summary>
internal sealed class MemoryPackBufferWriterBenchmark : ISerializerBenchmark
{
    private readonly TestOrder _order;
    private readonly MemoryPackSerializerOptions _options;
    private readonly byte[] _serialized; // pre-serialized payload: Deserialize() input and size reporting
    private readonly ArrayBufferWriter<byte> _bufferWriter;

    public string Engine => Configuration.EngineMemoryPack;
    public string IoMode => Configuration.IoBufWrReuse;

    // MemoryPack rows are always reported with the source-generated dispatch label.
    public string DispatchMode => Configuration.ModeSGen;

    public string OptionsPreset { get; }
    public int SerializedSize => _serialized.Length;
    public long SetupSerializeAllocBytes { get; }

    // Deserialize() reads straight from the pre-serialized array; there is nothing to set up.
    public long SetupDeserializeAllocBytes => 0;

    public string? OptionsDescription => $"StringEncoding={_options.StringEncoding}";

    /// <summary>
    /// Pre-serializes the payload, then allocates the reusable writer while sampling the constructing
    /// thread's allocations, so that only the writer itself is attributed to serialize-side setup.
    /// </summary>
    /// <param name="order">Object graph to serialize on every iteration.</param>
    /// <param name="optionsPreset">Preset label exposed unchanged through <see cref="OptionsPreset"/>.</param>
    public MemoryPackBufferWriterBenchmark(TestOrder order, string optionsPreset)
    {
        _order = order;
        OptionsPreset = optionsPreset;
        _options = GetMemPackOptions();
        _serialized = MemoryPackSerializer.Serialize(order, _options);

        // The helper serialization above stays outside the measurement window opened below.
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();
        var allocatedBefore = GC.GetAllocatedBytesForCurrentThread();
        _bufferWriter = new ArrayBufferWriter<byte>(_serialized.Length * 2); // twice the known payload size
        SetupSerializeAllocBytes = GC.GetAllocatedBytesForCurrentThread() - allocatedBefore;
    }

    /// <summary>Rewinds the shared writer and serializes the graph into it.</summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Serialize()
    {
        _bufferWriter.ResetWrittenCount();
        MemoryPackSerializer.Serialize(_bufferWriter, _order, _options);
    }

    /// <summary>
    /// Deserializes the pre-serialized payload through the <see cref="ReadOnlySequence{T}"/> overload,
    /// mirroring the input surface used by the AcBinary buffer-writer row.
    /// </summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Deserialize()
    {
        MemoryPackSerializer.Deserialize<TestOrder>(new ReadOnlySequence<byte>(_serialized), _options);
    }

    /// <summary>
    /// Serializes into the shared writer, reads the written bytes back through the sequence overload and
    /// compares the result with the source graph.
    /// </summary>
    /// <returns>The outcome of <c>BenchmarkLoop.DeepEqualsViaJson</c> for source vs. round-tripped graph.</returns>
    public bool VerifyRoundTrip()
    {
        _bufferWriter.ResetWrittenCount();
        MemoryPackSerializer.Serialize(_bufferWriter, _order, _options);

        var written = new ReadOnlySequence<byte>(_bufferWriter.WrittenMemory);
        var copy = MemoryPackSerializer.Deserialize<TestOrder>(written, _options);
        return BenchmarkLoop.DeepEqualsViaJson(_order, copy);
    }
}
/// <summary>
/// System.Text.Json benchmark row working on <see cref="string"/> JSON: serializes the graph to a string
/// and deserializes from a pre-serialized string. The reported size is the UTF-8 byte length of that JSON.
/// </summary>
internal sealed class SystemTextJsonBenchmark : ISerializerBenchmark
{
    private readonly TestOrder _order;
    private readonly JsonSerializerOptions _options;
    private readonly string _serialized;     // pre-serialized JSON: Deserialize() input
    private readonly byte[] _serializedUtf8; // same JSON as UTF-8 bytes, used for size reporting only

    public string Engine => Configuration.EngineSystemTextJson;
    public string IoMode => Configuration.IoString;

    // Reported as runtime dispatch: the options built in the constructor configure no source-generated metadata.
    public string DispatchMode => Configuration.ModeRuntime;

    public string OptionsPreset { get; }
    public int SerializedSize => _serializedUtf8.Length;

    // No long-lived infrastructure on either side, hence no setup allocations to report.
    public long SetupSerializeAllocBytes => 0;
    public long SetupDeserializeAllocBytes => 0;

    /// <summary>
    /// Builds the JSON options (compact output, nulls omitted on write, cycles ignored) and pre-serializes
    /// the payload once.
    /// </summary>
    /// <param name="order">Object graph to serialize on every iteration.</param>
    /// <param name="optionsPreset">Preset label exposed unchanged through <see cref="OptionsPreset"/>.</param>
    public SystemTextJsonBenchmark(TestOrder order, string optionsPreset)
    {
        _order = order;
        OptionsPreset = optionsPreset;

        var jsonOptions = new JsonSerializerOptions();
        jsonOptions.WriteIndented = false;
        jsonOptions.DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull;
        jsonOptions.ReferenceHandler = System.Text.Json.Serialization.ReferenceHandler.IgnoreCycles;
        _options = jsonOptions;

        _serialized = JsonSerializer.Serialize(order, _options);
        _serializedUtf8 = Configuration.Utf8NoBom.GetBytes(_serialized);
    }

    /// <summary>Serializes the graph to a JSON string; the string is discarded.</summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Serialize()
    {
        _ = JsonSerializer.Serialize(_order, _options);
    }

    /// <summary>Deserializes the pre-serialized JSON string; the resulting graph is discarded.</summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Deserialize()
    {
        _ = JsonSerializer.Deserialize<TestOrder>(_serialized, _options);
    }

    /// <summary>
    /// Serializes the graph to JSON, deserializes that JSON again and compares the result with the
    /// source graph.
    /// </summary>
    /// <returns>The outcome of <c>BenchmarkLoop.DeepEqualsViaJson</c> for source vs. round-tripped graph.</returns>
    public bool VerifyRoundTrip()
    {
        var text = JsonSerializer.Serialize(_order, _options);
        var copy = JsonSerializer.Deserialize<TestOrder>(text, _options);
        return BenchmarkLoop.DeepEqualsViaJson(_order, copy);
    }
}
#endregion
// Results / output formatters → Output.cs
// BenchmarkResult DTO → BenchmarkResult.cs
}