using AyCode.Core.Compression;
using AyCode.Core.Serializers.Attributes;
using AyCode.Core.Serializers.Binaries;
using AyCode.Core.Tests.Serialization; // DrainFromAsync extension (test-only, used by benchmark)
using AyCode.Core.Tests.TestModels;
using MemoryPack;
#if !AYCODE_NATIVEAOT
using MessagePack;
using MessagePack.Resolvers;
#endif
using Microsoft.Extensions.Options;
using System.Buffers;
using System.Diagnostics;
using System.IO.Pipelines;
using System.IO.Pipes;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Text;
using System.Text.Json;
using AyCode.Core.Serializers.Console.Benchmarks;
namespace AyCode.Core.Serializers.Console;
///
/// Comprehensive benchmark application for all serializers.
/// Compares: AcBinary (all options), MemoryPack, MessagePack, Newtonsoft.Json, System.Text.Json
///
/// Usage:
/// dotnet run # Run all benchmarks
/// dotnet run -- quick # Quick mode (fewer iterations)
/// dotnet run -- serialize # Serialize only
/// dotnet run -- deserialize # Deserialize only
///
public static class Program
{
// Configuration (constants, mutable state, attribute-flag aggregation) → Configuration.cs
///
/// Common Options-column formatter for every AcBinary serializer benchmark row. Renders the
/// configured options-level value AND the effective attribute-level enable flag side-by-side
/// (e.g. Interning=All(opt) | False (attr)) so attribute-suppressed features cannot
/// silently mislead. Pass any benchmark-specific extras (e.g. ", BufferSize=4096B")
/// in — they are appended after the common fields.
///
private static string BuildAcBinaryOptionsDescription(AcBinarySerializerOptions options, string extra = "")
{
// PropertyFilter: opt-side is "Set"/"None" depending on whether a callback is registered (the callback
// itself isn't a meaningful display value); attr-side is the cross-type-aggregated bool (true = every
// tagged type has the feature enabled, false = at least one type opted out via
// [AcBinarySerializable(enablePropertyFilterFeature: false)] → SGen-emit + Runtime hot-loop both gate).
var propFilterOpt = options.PropertyFilter == null ? "None" : "Set";
return $"WireMode={options.WireMode}, " +
$"RefHandling={options.ReferenceHandling}(opt) | {Configuration.AttrFlags.refHandling} (attr), " +
$"Interning={options.UseStringInterning}(opt) | {Configuration.AttrFlags.internString} (attr), " +
$"Metadata={options.UseMetadata}(opt) | {Configuration.AttrFlags.metadata} (attr), " +
$"PropertyFilter={propFilterOpt}(opt) | {Configuration.AttrFlags.propertyFilter} (attr), " +
$"SGen={options.UseGeneratedCode}, " +
$"Compression={options.UseCompression}{extra}";
}
///
/// Returns MemoryPack serializer options aligned with for a fair
/// apples-to-apples wire-format comparison:
///
/// - → (UTF-8) — both
/// engines encode UTF-8, comparison is purely about header / tier / dispatch overhead.
/// - → (UTF-16 raw memcpy) —
/// both engines write UTF-16 raw bytes, so wire-size and CPU comparison reflect the same string-encoding family.
///
/// Without this alignment the FastWire vs MemPack-default comparison conflates two unrelated dimensions
/// (UTF-16 raw vs UTF-8 encoded) and produces a misleading +40% wire-size delta that is structurally
/// the encoding-family difference, NOT an AcBinary-specific overhead.
///
private static MemoryPackSerializerOptions GetMemPackOptions() =>
Configuration.SelectedWireMode == WireMode.Fast
? MemoryPackSerializerOptions.Utf16
: MemoryPackSerializerOptions.Default;
///
/// Converts a total-time (in ms across ) into per-operation microseconds.
/// Formula: totalMs / iterations × 1000. The benchmark stores *TimeMs as the cumulative
/// median over the timing run; the display layer renders per-op µs to make numbers iteration-count
/// independent (e.g. switching Configuration.TestIterations 1000 → 100 leaves the displayed µs/op unchanged
/// — only its sample noise grows). Symmetric with the already-per-op *AllocBytesPerOp fields.
///
[MethodImpl(MethodImplOptions.AggressiveInlining)]
///
/// Converts a total-time (in ms across ) into per-operation microseconds.
/// Per-op µs is the iter-independent unit: 1000 iter and 50000 iter of the same operation should
/// produce the same per-op µs (within noise). Necessary because per-cell adaptive iteration makes
/// iterations a per-row property — there is no longer a single global Configuration.TestIterations to divide by.
///
// Output helpers (PrintResult, SaveResults, OverallStats, FormatMicrosWithRange, etc.) → Output.cs
// BenchmarkResult DTO → BenchmarkResult.cs
public static void Main(string[] args)
{
// Set console encoding to UTF-8 for proper Unicode character display
System.Console.OutputEncoding = Encoding.UTF8;
// Setup validation — abort BEFORE any benchmark logic if MemoryPack baseline is invalid.
// Done early so user is told immediately, not after warmup.
BenchmarkLoop.ValidateMemoryPackSetup();
// CLI mode (args provided): run once, parse args, exit. Backward-compatible behaviour.
if (args.Length > 0)
{
if (!TryParseCliArgs(args, out var layer, out var opMode, out var serializerMode))
return; // invalid args
RunBenchmark(layer, opMode, serializerMode);
return;
}
// Interactive mode (no args): loop the menu so the user doesn't have to restart between runs.
// Q exits the menu (and the application).
while (true)
{
var selection = Menu.ShowInteractiveMenu();
if (selection == null) return; // user pressed Q
RunBenchmark(selection.Value.layer, "all", selection.Value.serializerMode);
System.Console.WriteLine();
System.Console.WriteLine("─────────────────────────────────────────────────────────────────────");
System.Console.WriteLine("Returning to menu — press any key to continue, or Q to quit...");
var key = System.Console.ReadKey(intercept: true);
if (key.Key == ConsoleKey.Q) return;
System.Console.WriteLine();
}
}
///
/// Parses CLI arguments into (layer, opMode, serializerMode). Returns false if the args
/// are invalid; the caller should then exit without running the standard benchmark.
///
private static bool TryParseCliArgs(string[] args, out string layer, out string opMode, out string serializerMode)
{
layer = "all";
opMode = "all";
serializerMode = "standard";
var arg = args[0].ToLower();
// Quick mode: short warmup, few iterations, small sample count
if (arg == "quick")
{
Configuration.WarmupIterations = 5;
Configuration.TestIterations = 100;
Configuration.BenchmarkSamples = 3;
layer = "all";
}
else if (arg is "core" or "comprehensive" or "edge" or "all"
or "small" or "medium" or "large" or "repeated" or "deep")
{
layer = arg;
}
else if (arg is "asyncpipe" or "pipe")
{
// AsyncPipe-only mode: streaming I/O isolation across all test data.
layer = "all";
serializerMode = "asyncpipe";
}
else if (arg is "ser" or "serialize")
{
opMode = "serialize";
layer = "all";
}
else if (arg is "des" or "deserialize")
{
opMode = "deserialize";
layer = "all";
}
else
{
// Backwards compat: unknown arg → treat as layer keyword
layer = arg;
}
return true;
}
///
/// Runs the benchmark suite end-to-end for the given configuration: pre-warmup → per-cell warmup
/// + measurement → grouped results print → save to disk. Used by both the CLI and interactive
/// menu paths; the interactive loop calls this repeatedly without restarting the process.
///
private static void RunBenchmark(string layer, string opMode, string serializerMode)
{
System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗");
System.Console.WriteLine("║ COMPREHENSIVE SERIALIZER BENCHMARK SUITE ║");
System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");
// Stabilization: pin the entire benchmark process to a single logical CPU and bump priority
// class. Single-core affinity stops Windows from migrating the bench thread between cores
// mid-sample (a migration evicts L1/L2 caches and corrupts a measurement); High priority
// reduces preemption by background tasks (Defender scans, indexer, etc.) that otherwise
// randomly inflate samples by 5-15%.
// Try/finally guarantees the original state is restored even if a benchmark throws — leaving
// a developer machine pinned to one core after a crashed run is a real foot-gun.
// Skipped on Debug single-sample mode (Configuration.BenchmarkSamples <= 1) where stabilization is moot.
var process = Process.GetCurrentProcess();
var origAffinity = (IntPtr)0;
var origPriority = ProcessPriorityClass.Normal;
var stabilizationApplied = false;
// ProcessorAffinity is only supported on Windows + Linux (CA1416). macOS would throw at
// runtime; skip the affinity step there but still raise priority class (which IS supported
// on macOS, just less effective for stabilization than affinity pinning).
if (Configuration.BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
{
try
{
origAffinity = process.ProcessorAffinity;
origPriority = process.PriorityClass;
// Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one
// core, consistently" — not which one. If CPU 0 is heavily contended on the host
// (e.g. dedicated to system-wide IRQs on some Windows configs), the user can tweak
// the mask here. The benchmark is single-threaded for the in-memory rows so single
// core is sufficient; round-trip-only NamedPipe rows have a server-drain thread
// that will share the core (acceptable — the bench measures end-to-end RT anyway).
process.ProcessorAffinity = (IntPtr)1;
process.PriorityClass = ProcessPriorityClass.High;
stabilizationApplied = true;
System.Console.WriteLine($"Stabilization: pinned to CPU 0 (affinity=0x1), priority=High.");
}
catch (Exception ex)
{
// Affinity/priority changes may fail on locked-down hosts (group policies, containers
// without CAP_SYS_NICE on Linux, etc.). Surface and continue — the benchmark still
// works, just with the platform default scheduling.
System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}");
}
}
try
{
var allResults = new List();
var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
var testDataSets = BenchmarkLoop.FilterByLayer(allTestDataSets, layer);
System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {Configuration.GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{Configuration.TargetSampleMs} ms target) | Warmup: {Configuration.WarmupIterations} per phase (Ser/Des isolated) | Samples: {Configuration.BenchmarkSamples} (median) + pilot discard");
System.Console.WriteLine($"Build: {Configuration.BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
System.Console.WriteLine();
// Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens.
// Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup
// alone doesn't ensure that every Serialize/IBufferWriter overload is fully Tier 1 by the time we
// start measuring. Symptom: first cell's BufferWriter variants run ~2x slower than the SAME variants
// on later cells (e.g. Small BufWr reuse 9ms vs Medium BufWr reuse 4ms — even though Medium is bigger).
// Pre-warmup runs every overload at least once with each data shape so .NET 9's tiered JIT promotes
// them all in the background; the per-cell warmup that follows then locks in cache + branch state.
if (Configuration.BenchmarkSamples > 1) // skip in DEBUG (single-sample fast iteration)
{
System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");
foreach (var testData in testDataSets)
{
var preSerializers = CreateSerializers(testData, serializerMode);
try
{
foreach (var s in preSerializers)
{
// Light warmup just to trigger Tier 0 → Tier 1 promotion. Phase-isolated:
// Ser path first, then Des path — same pattern as the per-cell warmup in
// RunBenchmarksForTestData (which still runs afterwards for cache/BTB warming).
s.WarmupSerialize(2000);
s.WarmupDeserialize(2000);
}
}
finally
{
// Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
foreach (var s in preSerializers) (s as IDisposable)?.Dispose();
}
}
// Let background tiered-JIT compilation drain before we begin measuring.
if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
System.Console.WriteLine("✓ Global pre-warmup complete.\n");
}
foreach (var testData in testDataSets)
{
System.Console.WriteLine($"\n{'═'.ToString().PadRight(70, '═')}");
System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
System.Console.WriteLine($"{'═'.ToString().PadRight(70, '═')}");
var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
allResults.AddRange(results);
}
// Print grouped results
Output.PrintGroupedResults(allResults, testDataSets);
// Save results to file
Output.SaveResults(allResults, testDataSets);
System.Console.WriteLine("\n✓ Benchmark complete!");
}
finally
{
// Restore process state — affinity/priority changes are process-wide and persist across
// interactive-mode iterations of the menu. Without restore, the second menu run would
// already be on CPU-0 + High priority before its own try-block applied them, masking
// any stabilization-disabled comparison.
if (stabilizationApplied && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
{
try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
try { process.PriorityClass = origPriority; } catch { /* best-effort */ }
}
}
}
#region Benchmark Execution
private static List RunBenchmarksForTestData(TestDataSet testData, string mode, string serializerMode)
{
var results = new List();
var serializers = CreateSerializers(testData, serializerMode);
// Round-trip correctness check — once per (cell × serializer), BEFORE warmup. Aborts the entire benchmark on failure.
System.Console.WriteLine("Verifying round-trip correctness...");
foreach (var serializer in serializers)
{
if (!serializer.VerifyRoundTrip())
{
System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}");
System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting.");
Environment.Exit(1);
}
}
System.Console.WriteLine("✓ All serializers passed round-trip verification.");
// Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: each serializer's Ser-path and
// Des-path get COMPLETELY ISOLATED warmup→measure rounds, with a GC.Collect at every phase boundary.
//
// Why phase-isolation: a combined warmup (Ser+Des interleaved) leaves the CPU I-cache + branch-predictor
// in a "compromise state" — neither Ser nor Des code-set dominates. The first phase to measure pays a
// cache-miss penalty as its code-set displaces the leftover-warmup-state. Isolated warmup→measure pairs
// keep the I-cache HOT for ONLY the measured path, both in the warmup (priming) and the measurement
// (steady-state). Branch-predictor history also stays clean per path.
//
// GC.Collect at every boundary: removes residual allocation pressure from the previous phase (write-buffer
// pool churn from Ser, deserialized object graph from Des) so the next phase starts with a quiescent
// heap — GC tier-promotion timing during measurement is then driven only by THAT phase's allocations.
//
// Configuration.JitSleep per-phase: tiered JIT background promotion drain after each warmup (mode-aware: 0 ms in AOT).
// Each phase's freshly-promoted methods settle before its timing starts.
System.Console.WriteLine($"Running benchmarks (target ~{Configuration.TargetSampleMs} ms/sample × {Configuration.BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n");
foreach (var serializer in serializers)
{
var result = new BenchmarkResult
{
TestDataName = testData.DisplayName, // Use DisplayName for IId% info
Engine = serializer.Engine,
IoMode = serializer.IoMode,
DispatchMode = serializer.DispatchMode,
OptionsPreset = serializer.OptionsPreset,
OptionsDescription = serializer.OptionsDescription,
SerializedSize = serializer.SerializedSize,
SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes,
SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes,
IsRoundTripOnly = serializer.IsRoundTripOnly
};
// Group label for in-place \r progress. Identifies (cell × serializer) so a stuck benchmark
// is visibly stuck on a specific row at a specific %% rather than silently hanging.
var groupLabel = $"{result.SerializerName}";
if (serializer.IsRoundTripOnly)
{
// Round-trip-only benchmarks (NamedPipe etc.): single phase — Serialize() performs the full RT,
// Deserialize() is a no-op. We use the Ser-phase entry-points (WarmupSerialize) to warm the
// entire round-trip path, then record into the RT result columns.
if (mode is "all" or "serialize" or "ser")
{
BenchmarkLoop.ForceGcCollect();
serializer.WarmupSerialize(Configuration.WarmupIterations);
if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
var rtIter = BenchmarkLoop.CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
var (rtMed, rtMin, rtMax, rtStd) = BenchmarkLoop.RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");
result.RoundTripTimeMs = rtMed;
result.RoundTripTimeMinMs = rtMin;
result.RoundTripTimeMaxMs = rtMax;
result.RoundTripTimeStdDevMs = rtStd;
result.RoundTripIterations = rtIter;
// Process-wide allocation measurement: server-drain-thread allocations (server-side new byte[len])
// also show up — otherwise current-thread alloc would only count the client side and look ~halved.
result.RoundTripAllocBytesPerOp = BenchmarkLoop.MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]");
}
// mode == "deserialize" alone is meaningless for a round-trip-only benchmark; skip silently.
}
else
{
// ── Ser phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect.
if (mode is "all" or "serialize" or "ser")
{
BenchmarkLoop.ForceGcCollect();
serializer.WarmupSerialize(Configuration.WarmupIterations);
if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
var serIter = BenchmarkLoop.CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
var (serMed, serMin, serMax, serStd) = BenchmarkLoop.RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");
result.SerializeTimeMs = serMed;
result.SerializeTimeMinMs = serMin;
result.SerializeTimeMaxMs = serMax;
result.SerializeTimeStdDevMs = serStd;
result.SerializeIterations = serIter;
// Dedicated alloc-only sample (separate from timing samples; keeps timing pure)
result.SerializeAllocBytesPerOp = BenchmarkLoop.MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
}
// ── Des phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect.
// The GC.Collect here is critical: it discards the Ser-phase's write-buffer pool churn so the
// Des-phase's allocation measurement reflects ONLY Des-side allocations (deserialized object graph).
if (mode is "all" or "deserialize" or "des")
{
BenchmarkLoop.ForceGcCollect();
serializer.WarmupDeserialize(Configuration.WarmupIterations);
if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
var desIter = BenchmarkLoop.CalibrateIterations(() => serializer.Deserialize(), Configuration.TargetSampleMs);
var (desMed, desMin, desMax, desStd) = BenchmarkLoop.RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");
result.DeserializeTimeMs = desMed;
result.DeserializeTimeMinMs = desMin;
result.DeserializeTimeMaxMs = desMax;
result.DeserializeTimeStdDevMs = desStd;
result.DeserializeIterations = desIter;
result.DeserializeAllocBytesPerOp = BenchmarkLoop.MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]");
}
// Compose RT from Ser+Des. Because Ser and Des may have DIFFERENT iter counts post-calibration,
// batch-time addition would be misleading. Instead: compute per-op µs (iter-independent),
// then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that
// RoundTripTimeMs / RoundTripIterations * 1000 == Output.SerPerOp + Output.DesPerOp.
var serPerOp = Output.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations);
var desPerOp = Output.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations);
var rtPerOp = serPerOp + desPerOp;
result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations);
result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
}
results.Add(result);
Output.PrintResult(result);
}
// Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources that must be released
// before the next test data builds new ones — otherwise pipes / handles leak across test cells).
foreach (var s in serializers) (s as IDisposable)?.Dispose();
return results;
}
private static List CreateSerializers(TestDataSet testData, string serializerMode)
{
// FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path.
// TWO benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + MemoryPack Byte[].
// - Compact: smallest wire, UTF-8 encode/decode CPU cost vs MemPack head-to-head.
// Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min.
//
// FastWire row (UTF-16 raw memcpy) commented out for the current optimization sprint —
// we are tuning Compact mode against MemPack directly; FastWire was used as a noise-floor
// reference earlier. Re-enable when revisiting Fast wire-mode performance.
if (serializerMode == "fastestbyte")
{
var fastestByteOptions = AcBinarySerializerOptions.FastMode;
fastestByteOptions.WireMode = Configuration.SelectedWireMode;
return new List
{
new AcBinaryBenchmark(testData.Order, fastestByteOptions, "FastMode"),
//new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"),
new MemoryPackBenchmark(testData.Order, "Default"),
};
}
// AsyncPipe-only mode — return ONLY the AsyncPipe streaming benchmark (no other serializer).
// Streaming I/O has long-lived pipe setup + kernel-buffer overhead that, when interleaved with
// the standard byte-array / IBufferWriter measurements, masks the steady-state numbers. Run it
// in isolation so the timing numbers reflect ONLY the streaming path.
if (serializerMode == "asyncpipe")
{
// NamedPipe — pipe-aligned chunk size for the long-lived IPC scenario. The chunkSize here
// drives the AsyncPipeWriterOutput's chunk-on-wire size (header + data, page-aligned thanks to
// the AcquireChunk fix) AND the kernel pipe buffer size (inBufferSize/outBufferSize on the
// NamedPipeServerStream ctor). Same value across both layers = one WriteFile(chunkSize) syscall
// fits blocking-free in one kernel pipe-buffer slot. Single source of truth for both app-level
// wire chunk AND kernel transfer unit; change ONLY this line when tuning.
var binaryFastModePipeChunkOnly = AcBinarySerializerOptions.FastMode;
binaryFastModePipeChunkOnly.BufferWriterChunkSize = Configuration.PipeChunkSize;
binaryFastModePipeChunkOnly.WireMode = Configuration.SelectedWireMode;
return new List
{
// Chunked-framed AsyncPipe: SerializeChunkedFramed + AsyncPipeReaderInput.DrainFromAsync.
// Measures the FULL streaming-I/O stack — wire framing + drain task + sliding-window buffer +
// MRES wait-on-byte-shortage — over a kernel NamedPipe.
new AcBinaryNamedPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"),
// Raw byte[] over NamedPipe (sync receive, no chunk-framing). Same kernel-pipe transport,
// same inBufferSize, but: serialize → byte[] → Stream.Write → Stream.Read → Deserialize(byte[]).
// No drain task, no AsyncPipeReaderInput, no [201][UINT16][data]…[202] framing. Side-by-side with
// the chunked-row above this isolates AsyncPipe-framework-overhead (Δ vs raw) from
// kernel-transport-overhead (raw vs in-process Byte[]).
new AcBinaryNamedPipeRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
// Chunked-framed AsyncPipe over an IN-MEMORY System.IO.Pipelines.Pipe (NO NamedPipe, NO kernel).
// Same chunked-streaming code path (SerializeChunkedFramed → AsyncPipeReaderInput) but with the
// kernel-pipe replaced by a managed-only Pipe. Eliminates per-chunk syscall overhead (~30 µs/chunk
// on NamedPipe → ~1-2 µs/chunk on in-memory Pipe). Side-by-side with the NamedPipe row above this
// isolates pure CPU cost of the chunked-streaming framework (vs kernel-pipe transport cost) — the
// in-memory Pipe row should be much closer to the raw-byte[] row, validating that NamedPipe loopback
// is the worst-case benchmark scenario for chunked-streaming and not representative of real network
// / file / cross-thread Pipe scenarios.
new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"),
// Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport at all). Apples-to-apples
// baseline for the in-memory chunked row above: same in-memory transport (zero kernel), but raw
// byte[] vs chunked-streaming wire format. Completes the 2x2 matrix [chunked,raw] × [kernel,memory].
new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
};
}
// Standard mode — all serializers EXCEPT AsyncPipe (the streaming benchmark is opt-in via the
// AsyncPipe menu / CLI mode, never bundled with the steady-state suite).
var binaryNoInternOption = AcBinarySerializerOptions.Default;
binaryNoInternOption.UseStringInterning = StringInterningMode.None;
binaryNoInternOption.WireMode = Configuration.SelectedWireMode;
var binaryDefaultNoSgenOption = AcBinarySerializerOptions.Default;
binaryDefaultNoSgenOption.UseGeneratedCode = false;
binaryDefaultNoSgenOption.WireMode = Configuration.SelectedWireMode;
var binaryFastModeNoSgenOption = AcBinarySerializerOptions.FastMode;
binaryFastModeNoSgenOption.UseGeneratedCode = false;
binaryFastModeNoSgenOption.WireMode = Configuration.SelectedWireMode;
var binaryFastModeOption = AcBinarySerializerOptions.FastMode;
binaryFastModeOption.WireMode = Configuration.SelectedWireMode;
// BufWr new — 4 KB chunk size for the FRESH ArrayBufferWriter scenario. The chunkSize here drives
// the serializer's GetSpan(N) request → the ArrayBufferWriter's internal allocation per call.
// Small chunk = small per-call allocation, optimum for one-shot serialization where each iteration
// allocates a fresh ABW. Independent of the AsyncPipe profile (different mechanism: alloc overhead
// vs syscall count).
var binaryFastModeBufWrChunk = AcBinarySerializerOptions.FastMode;
binaryFastModeBufWrChunk.BufferWriterChunkSize = Configuration.PipeChunkSize;
binaryFastModeBufWrChunk.WireMode = Configuration.SelectedWireMode;
// In-memory Pipe variant — same 4 KB chunkSize as the AsyncPipe mode, no kernel-pipe alignment
// concern (managed slabs are not page-aligned anyway). Drives SerializeChunkedFramed via the in-memory
// System.IO.Pipelines.Pipe (zero-copy slab handoff between producer and drain task).
var binaryFastModePipeChunkInMem = AcBinarySerializerOptions.FastMode;
binaryFastModePipeChunkInMem.BufferWriterChunkSize = Configuration.PipeChunkSize;
binaryFastModePipeChunkInMem.WireMode = Configuration.SelectedWireMode;
var defaultOptions = AcBinarySerializerOptions.Default;
defaultOptions.UseStringInterning = StringInterningMode.None;
defaultOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId;
defaultOptions.WireMode = Configuration.SelectedWireMode;
return new List
{
// ============================================================
// AcBinary — Byte[] API (uncomment to compare option presets side-by-side)
// ============================================================
// Fastest Byte[] — SGen path (UseGeneratedCode=true, default).
new AcBinaryBenchmark(testData.Order, binaryFastModeOption, "FastMode"),
// Fastest Byte[] — Runtime path (UseGeneratedCode=false). Same wire/options, no source-generated dispatch.
// Always paired with the SGen variant so every layer can compare the SGen speed-up apples-to-apples.
// NativeAOT-safe: AcSerializerCommon.Create*Getter/Setter falls back to reflection-based delegates
// when RuntimeFeature.IsDynamicCodeSupported is false (slower but works under AOT publish).
new AcBinaryBenchmark(testData.Order, binaryFastModeNoSgenOption, "FastMode"),
// Default preset Byte[] — RefHandling=OnlyId (deduplicates IId-shared references on the wire) +
// UseStringInterning=All (deduplicates repeated strings). Showcases the Default preset's wire-size
// and CPU trade-off vs FastMode on the ~20% IId-ref / repeated-string test data.
new AcBinaryBenchmark(testData.Order, defaultOptions, "Default"),
//new AcBinaryBenchmark(testData.Order, binaryDefaultNoSgenOption, "Default"),
//new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"),
//new AcBinaryBenchmark(testData.Order, binaryNoInternOption, "NoIntern"),
// AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario)
new AcBinaryBufferWriterBenchmark(testData.Order, binaryFastModeOption, "FastMode"),
// AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario).
// 4 KB chunk size from binaryFastModeBufWrChunk — minimises the per-call ArrayBufferWriter
// allocation. Optimum for this scenario.
new AcBinaryFreshBufferWriterBenchmark(testData.Order, binaryFastModeBufWrChunk, "FastMode (4KB)"),
// AcBinary chunked-streaming over an IN-MEMORY Pipe (no kernel transport). Side-by-side with the
// Byte[] / IBufferWriter rows above this shows the chunked-streaming framework's pure CPU cost
// (no NamedPipe loopback noise) vs the simpler in-process serialize-then-deserialize patterns.
// The IO column shows "Pipe(in-mem)" — distinct from the NamedPipe AsyncPipe rows in [P] mode.
new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeChunk)"),
// Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport, no kernel, no Pipe). Apples-to-
// apples baseline for the in-memory chunked row above: same in-memory pattern, but raw byte[] vs
// chunked-streaming wire format. The IO column shows "Bytes(in-mem)".
new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeRaw)"),
// AsyncPipe streaming over kernel NamedPipe (AcBinaryNamedPipeBenchmark) is intentionally OMITTED
// here — run it via the dedicated AsyncPipe menu [P] / CLI mode for isolated kernel-transport
// measurements.
// ============================================================
// MemoryPack — three I/O modes for apples-to-apples comparison
// ============================================================
new MemoryPackBenchmark(testData.Order, "Default"),
new MemoryPackBufferWriterBenchmark(testData.Order, "Default"),
new MemoryPackFreshBufferWriterBenchmark(testData.Order, "Default"),
// ============================================================
// MessagePack — for legacy comparison
// ============================================================
#if !AYCODE_NATIVEAOT
// MessagePack v3's DynamicGenericResolver uses Activator.CreateInstance on trimmed
// ListFormatter et al. — fails under NativeAOT publish with "No parameterless constructor".
// Excluded from the AOT build; available for regular JIT runs only.
new MessagePackBenchmark(testData.Order, "ContractBased"),
#endif
// System.Text.Json (commented — JSON serializer for reference; not in active suite)
//new SystemTextJsonBenchmark(testData.Order, "Default")
};
}
#endregion
#region Serializer Implementations
internal sealed class AcBinaryBenchmark : ISerializerBenchmark
{
private readonly TestOrder _order;
private readonly AcBinarySerializerOptions _options;
private readonly byte[] _serialized;
public string Engine => Configuration.EngineAcBinary;
public string IoMode => Configuration.IoByteArray;
public string DispatchMode => _options.UseGeneratedCode ? Configuration.ModeSGen : Configuration.ModeRuntime;
public string OptionsPreset { get; }
public int SerializedSize => _serialized.Length;
public long SetupSerializeAllocBytes => 0;
public long SetupDeserializeAllocBytes => 0;
public string OptionsDescription => BuildAcBinaryOptionsDescription(_options);
public AcBinaryBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
{
_order = order;
_options = options;
OptionsPreset = optionsPreset;
_serialized = AcBinarySerializer.Serialize(order, options);
//_options.UseCompression = Lz4CompressionMode.Block;
}
[MethodImpl(MethodImplOptions.NoInlining)]
public void Serialize()
{
AcBinarySerializer.Serialize(_order, _options);
//if (_options.ReferenceHandling != ReferenceHandlingMode.None || _options.UseStringInterning != StringInterningMode.None)
//{
// AcBinarySerializer.ScanOnly(_order, _options);
//}
//else AcBinarySerializer.Serialize(_order, _options);
}
[MethodImpl(MethodImplOptions.NoInlining)]
public void Deserialize() => AcBinaryDeserializer.Deserialize(_serialized, _options);
public bool VerifyRoundTrip()
{
var bytes = AcBinarySerializer.Serialize(_order, _options);
var roundTripped = AcBinaryDeserializer.Deserialize(bytes, _options);
return BenchmarkLoop.DeepEqualsViaJson(_order, roundTripped);
}
}
internal sealed class MemoryPackBenchmark : ISerializerBenchmark
{
private readonly TestOrder _order;
private readonly MemoryPackSerializerOptions _options;
private readonly byte[] _serialized;
public string Engine => Configuration.EngineMemoryPack;
public string IoMode => Configuration.IoByteArray;
public string DispatchMode => Configuration.ModeSGen; // MemoryPack always uses [MemoryPackable] source-generated formatters
public string OptionsPreset { get; }
public int SerializedSize => _serialized.Length;
public long SetupSerializeAllocBytes => 0;
public long SetupDeserializeAllocBytes => 0;
public string? OptionsDescription => $"StringEncoding={_options.StringEncoding}";
public MemoryPackBenchmark(TestOrder order, string optionsPreset)
{
_order = order;
OptionsPreset = optionsPreset;
_options = GetMemPackOptions();
_serialized = MemoryPackSerializer.Serialize(order, _options);
}
[MethodImpl(MethodImplOptions.NoInlining)]
public void Serialize() => MemoryPackSerializer.Serialize(_order, _options);
[MethodImpl(MethodImplOptions.NoInlining)]
public void Deserialize() => MemoryPackSerializer.Deserialize(_serialized, _options);
public bool VerifyRoundTrip()
{
var bytes = MemoryPackSerializer.Serialize(_order, _options);
var roundTripped = MemoryPackSerializer.Deserialize(bytes, _options);
return BenchmarkLoop.DeepEqualsViaJson(_order, roundTripped);
}
}
#if !AYCODE_NATIVEAOT
// MessagePack benchmark — excluded from NativeAOT build because v3's StandardResolver falls back
// to DynamicGenericResolver for closed-generic types (List et al.), which uses
// Activator.CreateInstance on formatter types the AOT trimmer drops → MissingMethodException at runtime.
// Available for regular JIT runs (`dotnet run`) only.
internal sealed class MessagePackBenchmark : ISerializerBenchmark
{
private readonly TestOrder _order;
private readonly MessagePackSerializerOptions _options;
private readonly byte[] _serialized;
public string Engine => Configuration.EngineMessagePack;
public string IoMode => Configuration.IoByteArray;
public string DispatchMode => Configuration.ModeSGen; // MessagePack uses [MessagePackObject] source-generated formatters (StandardResolver)
public string OptionsPreset { get; }
public int SerializedSize => _serialized.Length;
public long SetupSerializeAllocBytes => 0;
public long SetupDeserializeAllocBytes => 0;
public string OptionsDescription { get; }
public MessagePackBenchmark(TestOrder order, string optionsPreset)
{
_order = order;
OptionsPreset = optionsPreset;
//_options = ContractlessStandardResolver.Options.WithCompression(MessagePackCompression.None);
//_options = ContractlessStandardResolver.Options.WithCompression(MessagePackCompression.Lz4Block);
_options = MessagePackSerializerOptions.Standard.WithCompression(MessagePackCompression.None);
var isContractless = _options.Resolver is ContractlessStandardResolver;
OptionsDescription = $"Mode={( isContractless ? "Contractless" : "ContractBased")}, Compression={_options.Compression}";
_serialized = MessagePackSerializer.Serialize(order, _options);
}
[MethodImpl(MethodImplOptions.NoInlining)]
public void Serialize() => MessagePackSerializer.Serialize(_order, _options);
[MethodImpl(MethodImplOptions.NoInlining)]
public void Deserialize() => MessagePackSerializer.Deserialize(_serialized, _options);
public bool VerifyRoundTrip()
{
var bytes = MessagePackSerializer.Serialize(_order, _options);
var roundTripped = MessagePackSerializer.Deserialize(bytes, _options);
return BenchmarkLoop.DeepEqualsViaJson(_order, roundTripped);
}
}
#endif
///
/// Benchmarks AcBinary via the IBufferWriter overload with a pre-allocated, reused ArrayBufferWriter.
/// Realistic IBufferWriter usage pattern: caller owns + reuses the writer (zero alloc per call after warmup).
///
///
/// Benchmarks AcBinary via the IBufferWriter overload, allocating a FRESH ArrayBufferWriter on EVERY call.
/// One-shot scenario — represents code that doesn't reuse a writer across calls.
/// Uses BufferWriterChunkSize=4096 (production-realistic, SignalR-aligned) instead of the 65535 default —
/// otherwise AcBinary would request 64KB upfront via GetSpan(), forcing the fresh ABW to allocate 64KB
/// regardless of payload size (heavy over-allocation for small payloads).
///
internal sealed class AcBinaryFreshBufferWriterBenchmark : ISerializerBenchmark
{
private readonly TestOrder _order;
private readonly AcBinarySerializerOptions _options;
private readonly byte[] _serialized;
public string Engine => Configuration.EngineAcBinary;
public string IoMode => Configuration.IoBufWrNew;
public string DispatchMode => _options.UseGeneratedCode ? Configuration.ModeSGen : Configuration.ModeRuntime;
public string OptionsPreset { get; }
public int SerializedSize => _serialized.Length;
public long SetupSerializeAllocBytes => 0;
public long SetupDeserializeAllocBytes => 0;
public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B");
public AcBinaryFreshBufferWriterBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
{
_order = order;
// BufferWriterChunkSize comes from the caller (central source of truth in CreateSerializers
// — the binaryFastMode4KbChunk options instance). Do NOT mutate _options here; tune the chunk
// size in CreateSerializers only.
_options = options;
OptionsPreset = optionsPreset;
_serialized = AcBinarySerializer.Serialize(order, _options);
}
[MethodImpl(MethodImplOptions.NoInlining)]
public void Serialize()
{
var abw = new ArrayBufferWriter(); // FRESH every call — alloc + grow as needed
AcBinarySerializer.Serialize(_order, abw, _options);
}
// BufWr semantic: read from a ReadOnlySequence (the ROS overload), NOT from byte[] —
// single-segment array-backed sequence triggers the fast-path in AcBinaryDeserializer.cs:298 which
// redirects to the byte[] overload. This means the bench actually exercises the ROS-input path
// (the production-realistic surface for SignalR / Pipe consumers) rather than secretly testing
// byte[] Deser under the BufWr label.
[MethodImpl(MethodImplOptions.NoInlining)]
public void Deserialize() => AcBinaryDeserializer.Deserialize(new ReadOnlySequence(_serialized), _options);
public bool VerifyRoundTrip()
{
var abw = new ArrayBufferWriter();
AcBinarySerializer.Serialize(_order, abw, _options);
var roundTripped = AcBinaryDeserializer.Deserialize(new ReadOnlySequence(abw.WrittenMemory), _options);
return BenchmarkLoop.DeepEqualsViaJson(_order, roundTripped);
}
}
///
/// Benchmarks AcBinary over a long-lived NamedPipe IPC connection using the AcBinary native streaming API
/// (
/// + + ).
/// Mirrors what a real consumer (e.g. DeserializeFromPipeReaderAsync) does per message:
/// long-lived with multi-message wire framing on top of a long-lived NamedPipe.
///
/// Architecture:
///
/// - Constructor (NOT timed): sets up + ,
/// waits for connection, creates one long-lived /
/// pair, ONE long-lived
/// in multiMessage = true mode, ONE drain Task that pumps
/// forever, and ONE deserialize Task that loops AcBinaryDeserializer.Deserialize<T>(input, opts)
/// producing into a .
/// - Per-iteration (timed): sender writes via
///
/// — multi-message wire ([201][UINT16][data]...[202]); the [202] end marker arms the input's
/// _readPos = -1 sentinel, so the next message's first AppendToBuffer recycles the buffer to 0.
/// Then receiver awaits the channel for the deserialized result.
/// - is a no-op (full round-trip captured in );
/// =true → Ser ms / SerAlloc oszlopok N/A, RT ms = full round-trip.
///
///
/// Per-iter overhead: 0 new Task.Run, 0 new AsyncPipeReaderInput, 0 new CancellationTokenSource.
/// Pure cost = SerializeChunkedFramed (CPU + chunk-onkénti flush) + kernel write/read syscalls + 1 sync barrier
/// (channel) + deserialized graph alloc. The "multi-message reuse" pattern enabled by Q4T8 fix (R5K2 minimum: _readPos = -1
/// sentinel + AppendToBuffer sliding-window cycling).
///
/// Approximation note: single-process loopback NamedPipe. Real cross-process / cross-machine SignalR
/// adds further transport latency (TCP, WebSocket framing) on top. The benchmark gives a lower bound.
///
internal sealed class AcBinaryNamedPipeBenchmark : ISerializerBenchmark, IDisposable
{
private readonly TestOrder _order;
private readonly AcBinarySerializerOptions _options;
private readonly byte[] _serialized; // for SerializedSize reporting only
// Long-lived pipe lifecycle (set up once in ctor — NOT timed).
private readonly NamedPipeServerStream _pipeServer;
private readonly NamedPipeClientStream _pipeClient;
private readonly PipeWriter _pipeWriter;
private readonly PipeReader _pipeReader;
// Long-lived multi-message receive infrastructure (set up once in ctor).
private readonly AsyncPipeReaderInput _input;
private readonly CancellationTokenSource _cts;
private readonly Task _drainTask; // BG: PipeReader → input.Feed (continuous pump)
private readonly Task _consumerTask; // BG: per-iter Deserialize(input) loop, signaled by calling thread
private readonly ManualResetEventSlim _consumeRequest = new(false);
private readonly ManualResetEventSlim _consumeDone = new(false);
private object? _lastResult; // captured during VerifyRoundTrip; null in benchmark iters
private bool _captureResult; // toggle: when true, ConsumeLoop stores result; otherwise discards
private bool _disposed;
public string Engine => Configuration.EngineAcBinary;
public string IoMode => Configuration.IoNamedPipe;
public string DispatchMode => _options.UseGeneratedCode ? Configuration.ModeSGen : Configuration.ModeRuntime;
public string OptionsPreset { get; }
public int SerializedSize => _serialized.Length;
public long SetupSerializeAllocBytes { get; }
public long SetupDeserializeAllocBytes { get; }
public bool IsRoundTripOnly => true;
public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B, Transport=NamedPipe(long-lived,multiMessage,2-task)");
public AcBinaryNamedPipeBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
{
_order = order;
// BufferWriterChunkSize comes from the caller (central source of truth in CreateSerializers
// — the binaryFastMode4KbChunk options instance). Do NOT mutate _options here; tune the chunk
// size in CreateSerializers only.
_options = options;
OptionsPreset = optionsPreset;
_serialized = AcBinarySerializer.Serialize(order, _options);
// 1× pipe setup. Kernel-side pipe buffer (inBufferSize / outBufferSize on the server ctor — the
// client inherits the server-defined buffer size at connect time) matches BufferWriterChunkSize
// exactly: AsyncPipeWriterOutput now treats chunkSize as the chunk-on-wire total size (header +
// data), so one WriteFile(chunkSize) syscall lands in exactly one kernel-page slot — page-aligned,
// no fragmentation, no IRP reordering. _options.BufferWriterChunkSize is the single tunable source.
var pipeName = $"AcBinaryBench-{Guid.NewGuid():N}";
// === SERIALIZE-side setup measurement ===
// pipe-pair (server + client) + connect handshake + writer-side PipeWriter wrapper.
GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
var beforeSer = GC.GetAllocatedBytesForCurrentThread();
_pipeServer = new NamedPipeServerStream(pipeName, PipeDirection.In, 1, PipeTransmissionMode.Byte,
System.IO.Pipes.PipeOptions.Asynchronous,
inBufferSize: _options.BufferWriterChunkSize,
outBufferSize: _options.BufferWriterChunkSize);
_pipeClient = new NamedPipeClientStream(".", pipeName, PipeDirection.Out, System.IO.Pipes.PipeOptions.Asynchronous);
var serverWait = _pipeServer.WaitForConnectionAsync();
_pipeClient.Connect();
serverWait.GetAwaiter().GetResult();
_pipeWriter = PipeWriter.Create(_pipeClient);
var afterSer = GC.GetAllocatedBytesForCurrentThread();
SetupSerializeAllocBytes = afterSer - beforeSer;
// === DESERIALIZE-side setup measurement ===
// PipeReader wrapper + AsyncPipeReaderInput (ArrayPool rent + ManualResetEventSlim) + drain
// task + consumer task scaffolding. Two long-lived BG tasks total: drain pumps bytes from the
// kernel pipe into input; consumer drives Deserialize(input) per iter on signal.
GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
var beforeDes = GC.GetAllocatedBytesForCurrentThread();
_pipeReader = PipeReader.Create(_pipeServer);
_input = new AsyncPipeReaderInput(_options.BufferWriterChunkSize * 2, multiMessage: true);
_cts = new CancellationTokenSource();
// Drain task: pumps PipeReader → input.Feed forever (or until cancel). Single Task.Run for
// the full benchmark lifetime — its overhead is amortised across all messages.
_drainTask = Task.Run(() => _input.DrainFromAsync(_pipeReader, _cts.Token));
// Consumer task: per-iter Deserialize(input) loop. Started here once; signaled per-iter via
// _consumeRequest. Enables Ser↔Des streaming overlap — calling thread runs SerializeChunkedFramed
// while THIS task simultaneously runs Deserialize, both consuming/producing through the
// sliding-window buffer pipelined by the drain task.
_consumerTask = Task.Run(ConsumeLoop);
var afterDes = GC.GetAllocatedBytesForCurrentThread();
SetupDeserializeAllocBytes = afterDes - beforeDes;
}
// BG consumer: parks on _consumeRequest, runs Deserialize(_input) when signaled, signals _consumeDone.
// The Deserialize call internally blocks on the input's MRES whenever the drain hasn't yet fed enough
// bytes for the next read — that's where the streaming-pipeline overlap with the calling thread (Ser)
// happens.
private void ConsumeLoop()
{
var ct = _cts.Token;
try
{
while (true)
{
_consumeRequest.Wait(ct);
if (ct.IsCancellationRequested) return;
_consumeRequest.Reset();
try
{
var result = AcBinaryDeserializer.Deserialize(_input, _options);
if (_captureResult) _lastResult = result;
}
catch
{
// Swallow — calling thread sees the failure via missing/incorrect _lastResult during VerifyRoundTrip,
// or the benchmark loop just continues (timing impacted). Production teardown handled in Dispose.
}
finally
{
_consumeDone.Set();
}
}
}
catch (OperationCanceledException)
{
// Cooperative cancel — Dispose path. Swallow.
}
}
[MethodImpl(MethodImplOptions.NoInlining)]
public void Serialize()
{
// 2-task streaming pipeline:
// 1. Calling thread signals consumer task to begin Deserialize(input). Consumer immediately
// starts; first read blocks on input's MRES because no bytes flowed yet.
// 2. Calling thread starts SerializeChunkedFramed → chunks flow through PipeWriter → kernel pipe →
// drain task (BG) feeds input.Feed → MRES pulses → consumer's Deserialize consumes bytes
// chunk by chunk. Ser↔Des truly overlap here.
// 3. Calling thread waits for _consumeDone (signaling Deserialize returned).
_consumeDone.Reset();
_consumeRequest.Set();
AcBinarySerializer.SerializeChunkedFramed(_order, _pipeWriter, _options);
_consumeDone.Wait();
}
[MethodImpl(MethodImplOptions.NoInlining)]
public void Deserialize()
{
// No-op: per-iter round-trip is captured in Serialize(). See IsRoundTripOnly contract.
}
public bool VerifyRoundTrip()
{
// Use the same 2-task streaming path as the benchmark, but capture the result for graph-equality.
_captureResult = true;
try
{
Serialize();
var result = _lastResult as TestOrder;
return result != null && BenchmarkLoop.DeepEqualsViaJson(_order, result);
}
finally
{
_captureResult = false;
_lastResult = null;
}
}
public void Dispose()
{
if (_disposed) return;
_disposed = true;
// Cancel drain + consumer tasks → both exit. Pulse _consumeRequest in case consumer is parked.
try { _cts.Cancel(); } catch { /* swallow on teardown */ }
try { _consumeRequest.Set(); } catch { /* nudge in case consumer Wait is parked */ }
try { _drainTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
try { _consumerTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
// Complete writer + dispose pipe lifecycle.
try { _pipeWriter.CompleteAsync().AsTask().Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
try { _pipeReader.Complete(); } catch { /* swallow on teardown */ }
try { _pipeClient.Dispose(); } catch { /* swallow on teardown */ }
try { _pipeServer.Dispose(); } catch { /* swallow on teardown */ }
try { _input.Dispose(); } catch { /* swallow on teardown */ }
try { _consumeRequest.Dispose(); } catch { /* swallow on teardown */ }
try { _consumeDone.Dispose(); } catch { /* swallow on teardown */ }
try { _cts.Dispose(); } catch { /* swallow on teardown */ }
}
}
///
/// Same chunked-framed AsyncPipe code path as , but the transport
/// is an in-memory instead of a kernel NamedPipe. The Pipe's
/// Writer/Reader pair is a managed-only zero-copy slab handoff — no syscalls, no kernel
/// buffer copy, no IRP queueing.
///
/// Why this benchmark matters: by holding ALL other variables constant (same SerializeChunkedFramed,
/// same AsyncPipeReaderInput, same drain task, same consumer task, same multi-message wire format), this
/// row isolates the kernel-NamedPipe transport overhead from the chunked-streaming framework's pure
/// CPU cost. The expected delta vs : per-chunk overhead drops from
/// ~25-30 µs (kernel-syscall pair + IRP) to ~1-2 µs (managed slab handoff). Multi-chunk Large-message rows
/// should converge dramatically toward .
///
/// Real-world relevance: in-memory Pipe is the typical primitive used for cross-thread serializer
/// pipelines inside a single process (e.g. SignalR's Kestrel transport adapter, gRPC framework internals,
/// custom message brokers). The numbers from this row reflect that scenario, NOT the kernel-pipe loopback
/// of the NamedPipe benchmark.
///
internal sealed class AcBinaryInMemoryPipeBenchmark : ISerializerBenchmark, IDisposable
{
private readonly TestOrder _order;
private readonly AcBinarySerializerOptions _options;
private readonly byte[] _serialized; // for SerializedSize reporting only
// Long-lived in-memory pipe lifecycle (set up once in ctor — NOT timed).
private readonly Pipe _pipe;
private readonly PipeWriter _pipeWriter;
private readonly PipeReader _pipeReader;
// Long-lived multi-message receive infrastructure (set up once in ctor) — same pattern as the NamedPipe
// variant: drain pumps reader into AsyncPipeReaderInput, consumer task drives Deserialize(input).
private readonly AsyncPipeReaderInput _input;
private readonly CancellationTokenSource _cts;
private readonly Task _drainTask;
private readonly Task _consumerTask;
private readonly ManualResetEventSlim _consumeRequest = new(false);
private readonly ManualResetEventSlim _consumeDone = new(false);
private object? _lastResult;
private bool _captureResult;
private bool _disposed;
public string Engine => Configuration.EngineAcBinary;
public string IoMode => Configuration.IoInMemoryPipe;
public string DispatchMode => _options.UseGeneratedCode ? Configuration.ModeSGen : Configuration.ModeRuntime;
public string OptionsPreset { get; }
public int SerializedSize => _serialized.Length;
public long SetupSerializeAllocBytes { get; }
public long SetupDeserializeAllocBytes { get; }
public bool IsRoundTripOnly => true;
public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B, Transport=Pipe(in-memory,multiMessage,2-task)");
public AcBinaryInMemoryPipeBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
{
_order = order;
_options = options;
OptionsPreset = optionsPreset;
_serialized = AcBinarySerializer.Serialize(order, _options);
// === SERIALIZE-side setup measurement ===
// In-memory Pipe construction. NO kernel-pipe pair, NO Connect handshake — just a managed Pipe object
// and a reference to its Writer side. PipeWriterImpl (parallel-flush capable, NOT StreamPipeWriter).
GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
var beforeSer = GC.GetAllocatedBytesForCurrentThread();
_pipe = new Pipe();
_pipeWriter = _pipe.Writer;
var afterSer = GC.GetAllocatedBytesForCurrentThread();
SetupSerializeAllocBytes = afterSer - beforeSer;
// === DESERIALIZE-side setup measurement ===
// PipeReader reference + AsyncPipeReaderInput (ArrayPool rent + ManualResetEventSlim) + drain task +
// consumer task scaffolding. Identical to the NamedPipe variant on the receive side.
GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
var beforeDes = GC.GetAllocatedBytesForCurrentThread();
_pipeReader = _pipe.Reader;
_input = new AsyncPipeReaderInput(_options.BufferWriterChunkSize * 2, multiMessage: true);
_cts = new CancellationTokenSource();
_drainTask = Task.Run(() => _input.DrainFromAsync(_pipeReader, _cts.Token));
_consumerTask = Task.Run(ConsumeLoop);
var afterDes = GC.GetAllocatedBytesForCurrentThread();
SetupDeserializeAllocBytes = afterDes - beforeDes;
}
// BG consumer: parks on _consumeRequest, runs Deserialize(_input) when signaled, signals _consumeDone.
// Mirror of AcBinaryNamedPipeBenchmark.ConsumeLoop — same pattern, same MRES protocol.
private void ConsumeLoop()
{
var ct = _cts.Token;
try
{
while (true)
{
_consumeRequest.Wait(ct);
if (ct.IsCancellationRequested) return;
_consumeRequest.Reset();
try
{
var result = AcBinaryDeserializer.Deserialize(_input, _options);
if (_captureResult) _lastResult = result;
}
catch
{
// Swallow — see ConsumeLoop in NamedPipe variant for rationale.
}
finally
{
_consumeDone.Set();
}
}
}
catch (OperationCanceledException)
{
// Cooperative cancel — Dispose path. Swallow.
}
}
[MethodImpl(MethodImplOptions.NoInlining)]
public void Serialize()
{
// Same 2-task streaming pipeline as NamedPipe variant — only the transport differs (in-memory Pipe
// instead of kernel NamedPipe). Per-chunk SerializeChunkedFramed → PipeWriter slab → drain task
// reads from PipeReader → input.Feed → consumer Deserialize consumes byte-by-byte.
//
// Uses the Pipe-overload (instead of the PipeWriter-overload) so the FlushPolicy parameter is
// exposed for tuning. Toggle between FlushPolicy.PerChunk (bounded peak memory, per-chunk await
// FlushAsync) and FlushPolicy.Coalesced (fire-and-forget per chunk, pipe-coalesced flushes up to
// PauseWriterThreshold ~64 KB) to A/B-test the streaming-pipeline overhead. FlushPolicy.PerChunk
// is functionally equivalent to the PipeWriter-overload (both internally route to
// SerializeToPipeWriterCore with FlushPolicy.PerChunk).
_consumeDone.Reset();
_consumeRequest.Set();
AcBinarySerializer.SerializeChunkedFramed(_order, _pipe, _options, FlushPolicy.Coalesced);
_consumeDone.Wait();
}
[MethodImpl(MethodImplOptions.NoInlining)]
public void Deserialize()
{
// No-op: per-iter round-trip is captured in Serialize(). See IsRoundTripOnly contract.
}
public bool VerifyRoundTrip()
{
_captureResult = true;
try
{
Serialize();
var result = _lastResult as TestOrder;
return result != null && BenchmarkLoop.DeepEqualsViaJson(_order, result);
}
finally
{
_captureResult = false;
_lastResult = null;
}
}
public void Dispose()
{
if (_disposed) return;
_disposed = true;
// Cancel drain + consumer tasks → both exit. Pulse _consumeRequest in case consumer is parked.
try { _cts.Cancel(); } catch { /* swallow on teardown */ }
try { _consumeRequest.Set(); } catch { /* nudge in case consumer Wait is parked */ }
try { _drainTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
try { _consumerTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
// Complete writer + reader (in-memory Pipe — no underlying stream to dispose).
try { _pipeWriter.CompleteAsync().AsTask().Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
try { _pipeReader.Complete(); } catch { /* swallow on teardown */ }
try { _input.Dispose(); } catch { /* swallow on teardown */ }
try { _consumeRequest.Dispose(); } catch { /* swallow on teardown */ }
try { _consumeDone.Dispose(); } catch { /* swallow on teardown */ }
try { _cts.Dispose(); } catch { /* swallow on teardown */ }
}
}
///
/// Raw byte[] over a long-lived NamedPipe — NO chunk-framing, NO AsyncPipeReaderInput,
/// NO sliding-window buffer. Calling thread serialises + writes; a long-lived background consumer task
/// reads and deserialises. Two-task pattern enables Ser↔Read overlap (kernel-pipe-pipelined) AND
/// avoids the kernel-buffer-full deadlock when bytes.Length > inBufferSize.
///
/// Side-by-side with (chunked-framed AsyncPipe stack) this
/// isolates two cost components on the SAME kernel-pipe transport with the SAME inBufferSize:
///
/// - This row vs (Byte[]) — pure kernel-NamedPipe
/// overhead (WriteFile / ReadFile syscalls + IRP queueing + buffer-copy + thread-handoff).
/// - This row vs (chunked-framed) — pure
/// AsyncPipe-framework overhead (chunk header writes + sliding-window Feed + MRES wait inside
/// AsyncPipeReaderInput) AND the streaming-pipeline benefit of intra-message Ser↔Des overlap (which
/// raw lacks — raw can only Ser↔Read overlap, with Des sequential after Read completes).
///
/// Per-iter byte[] allocation from AcBinarySerializer.Serialize is part of the cost (matches
/// 's API contract); the receive-side scratch buffer is also allocated per-iter
/// on the consumer-task (counted via GC.GetTotalAllocatedBytes in BenchmarkLoop.MeasureAllocationTotal).
///
internal sealed class AcBinaryNamedPipeRawByteArrayBenchmark : ISerializerBenchmark, IDisposable
{
private readonly TestOrder _order;
private readonly AcBinarySerializerOptions _options;
private readonly byte[] _serialized; // for SerializedSize reporting + receive-side size known upfront
// Long-lived pipe lifecycle (set up once in ctor — NOT timed).
private readonly NamedPipeServerStream _pipeServer;
private readonly NamedPipeClientStream _pipeClient;
// Long-lived consumer-task infrastructure (Read + Deserialize on BG thread, signaled per iter).
// Mirrors AcBinaryNamedPipeBenchmark's drain+consumer pair, but raw byte[] doesn't have an
// intermediate sliding-window buffer, so Read+Des happen sequentially in one BG task: Read N bytes
// → Deserialize(bytes) → signal done. Calling thread's Ser↔Write overlaps with this BG Read+Des
// through kernel-pipe pipelining.
private readonly CancellationTokenSource _cts;
private readonly Task _consumerTask;
private readonly ManualResetEventSlim _consumeRequest = new(false);
private readonly ManualResetEventSlim _consumeDone = new(false);
private int _pendingReadSize;
private object? _lastResult; // captured during VerifyRoundTrip; null in benchmark iters
private bool _captureResult; // toggle: when true, ConsumerLoop stores result; otherwise discards
private bool _disposed;
public string Engine => Configuration.EngineAcBinary;
public string IoMode => Configuration.IoNamedPipeRaw;
public string DispatchMode => _options.UseGeneratedCode ? Configuration.ModeSGen : Configuration.ModeRuntime;
public string OptionsPreset { get; }
public int SerializedSize => _serialized.Length;
public long SetupSerializeAllocBytes { get; }
public long SetupDeserializeAllocBytes { get; }
public bool IsRoundTripOnly => true;
public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B, Transport=NamedPipe(raw,2-task)");
public AcBinaryNamedPipeRawByteArrayBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
{
_order = order;
// BufferWriterChunkSize comes from the caller — same source-of-truth contract as
// AcBinaryNamedPipeBenchmark. The kernel pipe-buffer (inBufferSize) is wired to it so the
// raw-vs-chunked comparison runs on identical transport conditions.
_options = options;
OptionsPreset = optionsPreset;
_serialized = AcBinarySerializer.Serialize(order, _options);
var pipeName = $"AcBinaryBenchRaw-{Guid.NewGuid():N}";
// === SERIALIZE-side setup measurement ===
// pipe-pair (server + client) + connect handshake. NO PipeWriter wrapper — we use the raw
// Stream.Write API directly, matching the no-framing semantics of this benchmark.
GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
var beforeSer = GC.GetAllocatedBytesForCurrentThread();
_pipeServer = new NamedPipeServerStream(pipeName, PipeDirection.In, 1, PipeTransmissionMode.Byte,
System.IO.Pipes.PipeOptions.Asynchronous,
inBufferSize: _options.BufferWriterChunkSize,
outBufferSize: _options.BufferWriterChunkSize);
_pipeClient = new NamedPipeClientStream(".", pipeName, PipeDirection.Out, System.IO.Pipes.PipeOptions.Asynchronous);
var serverWait = _pipeServer.WaitForConnectionAsync();
_pipeClient.Connect();
serverWait.GetAwaiter().GetResult();
var afterSer = GC.GetAllocatedBytesForCurrentThread();
SetupSerializeAllocBytes = afterSer - beforeSer;
// === DESERIALIZE-side setup measurement ===
// 1× background consumer-task + 2× MRES (request / done) + cancellation source. Matches the
// chunked benchmark's deserialize-side setup cost shape.
GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
var beforeDes = GC.GetAllocatedBytesForCurrentThread();
_cts = new CancellationTokenSource();
_consumerTask = Task.Run(ConsumerLoop);
var afterDes = GC.GetAllocatedBytesForCurrentThread();
SetupDeserializeAllocBytes = afterDes - beforeDes;
}
// BG consumer: parks on _consumeRequest, reads N bytes from pipe, runs Deserialize(bytes), signals
// _consumeDone. The Read overlaps with the calling thread's Write through the kernel-pipe; Des happens
// sequentially after Read completes (raw byte[] needs the full message to deserialize).
private void ConsumerLoop()
{
var ct = _cts.Token;
try
{
while (true)
{
_consumeRequest.Wait(ct);
if (ct.IsCancellationRequested) return;
_consumeRequest.Reset();
try
{
var size = _pendingReadSize;
var bytes = new byte[size]; // per-iter alloc — counted by BenchmarkLoop.MeasureAllocationTotal
var totalRead = 0;
while (totalRead < size)
{
var n = _pipeServer.Read(bytes, totalRead, size - totalRead);
if (n == 0) break; // pipe closed / EOF — partial read swallowed
totalRead += n;
}
var result = AcBinaryDeserializer.Deserialize(bytes, _options);
if (_captureResult) _lastResult = result;
}
catch
{
// Swallow — calling thread sees the failure via missing/incorrect _lastResult during VerifyRoundTrip,
// or the benchmark loop just continues (timing impacted). Production teardown handled in Dispose.
}
finally
{
_consumeDone.Set();
}
}
}
catch (OperationCanceledException)
{
// Cooperative cancel — Dispose path. Swallow.
}
}
[MethodImpl(MethodImplOptions.NoInlining)]
public void Serialize()
{
// 2-task streaming pipeline:
// 1. Calling thread serialises → fresh byte[] (per-iter alloc, matches AcBinaryBenchmark contract).
// 2. Calling thread hands off expected size + signals consumer task. Consumer task starts Read loop
// on the pipe (BG thread). Calling thread proceeds to Write the bytes — Read and Write overlap
// through the kernel-pipe (kernel buffer fills, drains as consumer reads, sender resumes).
// 3. Calling thread waits for _consumeDone (consumer task finished Read+Des).
//
// Note: unlike chunked, raw byte[] cannot do Ser↔Des overlap (Des needs the full bytes before
// starting). Only Write↔Read overlaps here. The Des sequence on BG thread is: Read full bytes →
// Des the full graph → signal done. This is the architectural difference between raw and chunked.
var bytes = AcBinarySerializer.Serialize(_order, _options);
_pendingReadSize = bytes.Length;
_consumeDone.Reset();
_consumeRequest.Set();
_pipeClient.Write(bytes, 0, bytes.Length);
_pipeClient.Flush();
_consumeDone.Wait();
}
[MethodImpl(MethodImplOptions.NoInlining)]
public void Deserialize()
{
// No-op: per-iter round-trip is captured in Serialize(). See IsRoundTripOnly contract.
}
public bool VerifyRoundTrip()
{
// Use the same 2-task streaming path as the benchmark, but capture the result for graph-equality.
_captureResult = true;
try
{
Serialize();
var result = _lastResult as TestOrder;
return result != null && BenchmarkLoop.DeepEqualsViaJson(_order, result);
}
finally
{
_captureResult = false;
_lastResult = null;
}
}
public void Dispose()
{
if (_disposed) return;
_disposed = true;
// Cancel the consumer task → ConsumerLoop exits its Wait via OperationCanceledException.
try { _cts.Cancel(); } catch { /* swallow on teardown */ }
try { _consumeRequest.Set(); } catch { /* nudge in case consumer Wait is parked */ }
try { _consumerTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
// Symmetric teardown — close client first (writer side), then server.
try { _pipeClient.Dispose(); } catch { /* swallow on teardown */ }
try { _pipeServer.Dispose(); } catch { /* swallow on teardown */ }
try { _consumeRequest.Dispose(); } catch { /* swallow on teardown */ }
try { _consumeDone.Dispose(); } catch { /* swallow on teardown */ }
try { _cts.Dispose(); } catch { /* swallow on teardown */ }
}
}
///
/// Raw byte[] over an in-memory cross-thread handoff — NO transport (no NamedPipe, no Pipe, no
/// Channel). Calling thread serialises into a fresh byte[], hands it to a
/// background consumer task via a single byte[] slot + MRES pair; the consumer deserialises and signals done.
///
/// Why this benchmark matters: completes the 2x2 transport × wire-format matrix:
///
/// - NamedPipe + Chunked =
/// - NamedPipe + Raw =
/// - In-memory Pipe + Chunked =
/// - In-memory + Raw = THIS row — apples-to-apples baseline for the in-memory chunked row
///
/// Side-by-side with this isolates the chunked-streaming
/// framework's pure CPU cost, with the same in-memory transport (zero kernel involvement) on both sides.
/// Side-by-side with this isolates the kernel-NamedPipe
/// overhead on the raw-byte[] side.
///
internal sealed class AcBinaryInMemoryRawByteArrayBenchmark : ISerializerBenchmark, IDisposable
{
private readonly TestOrder _order;
private readonly AcBinarySerializerOptions _options;
private readonly byte[] _serialized; // for SerializedSize reporting only
// Long-lived consumer-task infrastructure (Deserialize on BG thread, signaled per iter).
// No transport — just a byte[] slot for handoff between calling thread and consumer task.
private readonly CancellationTokenSource _cts;
private readonly Task _consumerTask;
private readonly ManualResetEventSlim _consumeRequest = new(false);
private readonly ManualResetEventSlim _consumeDone = new(false);
private byte[]? _pendingBytes; // calling thread → consumer task handoff slot
private object? _lastResult; // captured during VerifyRoundTrip; null in benchmark iters
private bool _captureResult;
private bool _disposed;
public string Engine => Configuration.EngineAcBinary;
public string IoMode => Configuration.IoInMemoryRaw;
public string DispatchMode => _options.UseGeneratedCode ? Configuration.ModeSGen : Configuration.ModeRuntime;
public string OptionsPreset { get; }
public int SerializedSize => _serialized.Length;
public long SetupSerializeAllocBytes { get; }
public long SetupDeserializeAllocBytes { get; }
public bool IsRoundTripOnly => true;
public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B, Transport=in-memory(raw,2-task)");
public AcBinaryInMemoryRawByteArrayBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
{
_order = order;
_options = options;
OptionsPreset = optionsPreset;
_serialized = AcBinarySerializer.Serialize(order, _options);
// === SERIALIZE-side setup measurement ===
// Nothing to set up — calling thread allocates byte[] per iter via AcBinarySerializer.Serialize.
SetupSerializeAllocBytes = 0;
// === DESERIALIZE-side setup measurement ===
// 1× background consumer-task + 2× MRES (request / done) + cancellation source.
GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
var beforeDes = GC.GetAllocatedBytesForCurrentThread();
_cts = new CancellationTokenSource();
_consumerTask = Task.Run(ConsumerLoop);
var afterDes = GC.GetAllocatedBytesForCurrentThread();
SetupDeserializeAllocBytes = afterDes - beforeDes;
}
// BG consumer: parks on _consumeRequest, picks up the byte[] from _pendingBytes, runs Deserialize(bytes),
// signals _consumeDone. Direct in-process handoff — no transport syscall, no buffer copy beyond the byte[]
// reference itself (zero-copy by reference).
private void ConsumerLoop()
{
var ct = _cts.Token;
try
{
while (true)
{
_consumeRequest.Wait(ct);
if (ct.IsCancellationRequested) return;
_consumeRequest.Reset();
try
{
var bytes = _pendingBytes;
if (bytes != null)
{
var result = AcBinaryDeserializer.Deserialize(bytes, _options);
if (_captureResult) _lastResult = result;
}
}
catch
{
// Swallow — see ConsumerLoop in NamedPipe variant for rationale.
}
finally
{
_consumeDone.Set();
}
}
}
catch (OperationCanceledException)
{
// Cooperative cancel — Dispose path. Swallow.
}
}
[MethodImpl(MethodImplOptions.NoInlining)]
public void Serialize()
{
// 2-task in-memory pipeline:
// 1. Calling thread serialises → fresh byte[] (per-iter alloc, matches AcBinaryBenchmark contract).
// 2. Calling thread parks the byte[] into _pendingBytes and signals consumer task. Consumer task
// picks up the reference (zero-copy) and runs Deserialize(bytes).
// 3. Calling thread waits for _consumeDone (consumer task finished Des).
//
// Same architectural limitation as the NamedPipe-raw variant: Des cannot start until full bytes
// are available. Only the per-iter Ser↔Des thread-handoff overlaps slightly (calling thread starts
// signalling and waiting while consumer thread takes the byte[]).
var bytes = AcBinarySerializer.Serialize(_order, _options);
_pendingBytes = bytes;
_consumeDone.Reset();
_consumeRequest.Set();
_consumeDone.Wait();
}
[MethodImpl(MethodImplOptions.NoInlining)]
public void Deserialize()
{
// No-op: per-iter round-trip is captured in Serialize(). See IsRoundTripOnly contract.
}
public bool VerifyRoundTrip()
{
_captureResult = true;
try
{
Serialize();
var result = _lastResult as TestOrder;
return result != null && BenchmarkLoop.DeepEqualsViaJson(_order, result);
}
finally
{
_captureResult = false;
_lastResult = null;
}
}
public void Dispose()
{
if (_disposed) return;
_disposed = true;
try { _cts.Cancel(); } catch { /* swallow on teardown */ }
try { _consumeRequest.Set(); } catch { /* nudge in case consumer Wait is parked */ }
try { _consumerTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
try { _consumeRequest.Dispose(); } catch { /* swallow on teardown */ }
try { _consumeDone.Dispose(); } catch { /* swallow on teardown */ }
try { _cts.Dispose(); } catch { /* swallow on teardown */ }
}
}
///
/// Benchmarks MemoryPack via the IBufferWriter overload, allocating a FRESH ArrayBufferWriter on EVERY call.
/// Apples-to-apples counterpart to AcBinaryFreshBufferWriterBenchmark.
///
internal sealed class MemoryPackFreshBufferWriterBenchmark : ISerializerBenchmark
{
private readonly TestOrder _order;
private readonly MemoryPackSerializerOptions _options;
private readonly byte[] _serialized;
public string Engine => Configuration.EngineMemoryPack;
public string IoMode => Configuration.IoBufWrNew;
public string DispatchMode => Configuration.ModeSGen; // MemoryPack always uses [MemoryPackable] source-generated formatters
public string OptionsPreset { get; }
public int SerializedSize => _serialized.Length;
public long SetupSerializeAllocBytes => 0;
public long SetupDeserializeAllocBytes => 0;
public string? OptionsDescription => $"StringEncoding={_options.StringEncoding}";
public MemoryPackFreshBufferWriterBenchmark(TestOrder order, string optionsPreset)
{
_order = order;
OptionsPreset = optionsPreset;
_options = GetMemPackOptions();
_serialized = MemoryPackSerializer.Serialize(order, _options);
}
[MethodImpl(MethodImplOptions.NoInlining)]
public void Serialize()
{
var abw = new ArrayBufferWriter();
MemoryPackSerializer.Serialize(abw, _order, _options);
}
// BufWr semantic: read from a ReadOnlySequence overload (apples-to-apples with AcBinary's
// BufWr Deser path). MemoryPack's ROS overload also single-segment-fast-paths internally.
[MethodImpl(MethodImplOptions.NoInlining)]
public void Deserialize() => MemoryPackSerializer.Deserialize(new ReadOnlySequence(_serialized), _options);
public bool VerifyRoundTrip()
{
var abw = new ArrayBufferWriter();
MemoryPackSerializer.Serialize(abw, _order, _options);
var roundTripped = MemoryPackSerializer.Deserialize(new ReadOnlySequence(abw.WrittenMemory), _options);
return BenchmarkLoop.DeepEqualsViaJson(_order, roundTripped);
}
}
internal sealed class AcBinaryBufferWriterBenchmark : ISerializerBenchmark
{
private readonly TestOrder _order;
private readonly AcBinarySerializerOptions _options;
private readonly byte[] _serialized;
private readonly ArrayBufferWriter _bufferWriter;
public string Engine => Configuration.EngineAcBinary;
public string IoMode => Configuration.IoBufWrReuse;
public string DispatchMode => _options.UseGeneratedCode ? Configuration.ModeSGen : Configuration.ModeRuntime;
public string OptionsPreset { get; }
public int SerializedSize => _serialized.Length;
public long SetupSerializeAllocBytes { get; }
public long SetupDeserializeAllocBytes => 0;
public string OptionsDescription => BuildAcBinaryOptionsDescription(_options);
public AcBinaryBufferWriterBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
{
_order = order;
_options = options;
OptionsPreset = optionsPreset;
_serialized = AcBinarySerializer.Serialize(order, options);
// Measure ONLY the BufferWriter infrastructure setup on the serialize side (excluding the
// helper Serialize above). Deserialize side reads directly from `_serialized` byte[] — no
// dedicated setup allocation, hence SetupDeserializeAllocBytes = 0.
GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
var beforeSetup = GC.GetAllocatedBytesForCurrentThread();
_bufferWriter = new ArrayBufferWriter(_serialized.Length * 2);
var afterSetup = GC.GetAllocatedBytesForCurrentThread();
SetupSerializeAllocBytes = afterSetup - beforeSetup;
}
[MethodImpl(MethodImplOptions.NoInlining)]
public void Serialize()
{
_bufferWriter.ResetWrittenCount(); // reuse — no alloc, no zeroing
AcBinarySerializer.Serialize(_order, _bufferWriter, _options);
}
// BufWr semantic: read from a ReadOnlySequence (the ROS overload), NOT from byte[] —
// single-segment array-backed sequence triggers the fast-path in AcBinaryDeserializer.cs:298 which
// redirects to the byte[] overload. This means the bench actually exercises the ROS-input path
// (the production-realistic surface for SignalR / Pipe consumers) rather than secretly testing
// byte[] Deser under the BufWr label.
[MethodImpl(MethodImplOptions.NoInlining)]
public void Deserialize() => AcBinaryDeserializer.Deserialize(new ReadOnlySequence(_serialized), _options);
public bool VerifyRoundTrip()
{
_bufferWriter.ResetWrittenCount();
AcBinarySerializer.Serialize(_order, _bufferWriter, _options);
var roundTripped = AcBinaryDeserializer.Deserialize(new ReadOnlySequence(_bufferWriter.WrittenMemory), _options);
return BenchmarkLoop.DeepEqualsViaJson(_order, roundTripped);
}
}
///
/// Benchmarks MemoryPack via the IBufferWriter overload with a pre-allocated, reused ArrayBufferWriter.
/// Apples-to-apples counterpart to AcBinaryBufferWriterBenchmark — MemoryPack's IBufferWriter is the path it's designed for.
///
internal sealed class MemoryPackBufferWriterBenchmark : ISerializerBenchmark
{
private readonly TestOrder _order;
private readonly MemoryPackSerializerOptions _options;
private readonly byte[] _serialized;
private readonly ArrayBufferWriter _bufferWriter;
public string Engine => Configuration.EngineMemoryPack;
public string IoMode => Configuration.IoBufWrReuse;
public string DispatchMode => Configuration.ModeSGen; // MemoryPack always uses [MemoryPackable] source-generated formatters
public string OptionsPreset { get; }
public int SerializedSize => _serialized.Length;
public long SetupSerializeAllocBytes { get; }
public long SetupDeserializeAllocBytes => 0;
public string? OptionsDescription => $"StringEncoding={_options.StringEncoding}";
public MemoryPackBufferWriterBenchmark(TestOrder order, string optionsPreset)
{
_order = order;
OptionsPreset = optionsPreset;
_options = GetMemPackOptions();
_serialized = MemoryPackSerializer.Serialize(order, _options);
// Serialize-side setup only — see AcBinaryBufferWriterBenchmark for the full rationale.
GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
var beforeSetup = GC.GetAllocatedBytesForCurrentThread();
_bufferWriter = new ArrayBufferWriter(_serialized.Length * 2);
var afterSetup = GC.GetAllocatedBytesForCurrentThread();
SetupSerializeAllocBytes = afterSetup - beforeSetup;
}
[MethodImpl(MethodImplOptions.NoInlining)]
public void Serialize()
{
_bufferWriter.ResetWrittenCount();
MemoryPackSerializer.Serialize(_bufferWriter, _order, _options);
}
// BufWr semantic: read from a ReadOnlySequence overload (apples-to-apples with AcBinary's
// BufWr Deser path). MemoryPack's ROS overload also single-segment-fast-paths internally.
[MethodImpl(MethodImplOptions.NoInlining)]
public void Deserialize() => MemoryPackSerializer.Deserialize(new ReadOnlySequence(_serialized), _options);
public bool VerifyRoundTrip()
{
_bufferWriter.ResetWrittenCount();
MemoryPackSerializer.Serialize(_bufferWriter, _order, _options);
var roundTripped = MemoryPackSerializer.Deserialize(new ReadOnlySequence(_bufferWriter.WrittenMemory), _options);
return BenchmarkLoop.DeepEqualsViaJson(_order, roundTripped);
}
}
internal sealed class SystemTextJsonBenchmark : ISerializerBenchmark
{
private readonly TestOrder _order;
private readonly JsonSerializerOptions _options;
private readonly string _serialized;
private readonly byte[] _serializedUtf8;
public string Engine => Configuration.EngineSystemTextJson;
public string IoMode => Configuration.IoString;
public string DispatchMode => Configuration.ModeRuntime; // System.Text.Json default uses reflection-based metadata (no source generator opt-in here)
public string OptionsPreset { get; }
public int SerializedSize => _serializedUtf8.Length;
public long SetupSerializeAllocBytes => 0;
public long SetupDeserializeAllocBytes => 0;
public SystemTextJsonBenchmark(TestOrder order, string optionsPreset)
{
_order = order;
OptionsPreset = optionsPreset;
_options = new JsonSerializerOptions
{
WriteIndented = false,
DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull,
ReferenceHandler = System.Text.Json.Serialization.ReferenceHandler.IgnoreCycles
};
_serialized = JsonSerializer.Serialize(order, _options);
_serializedUtf8 = Configuration.Utf8NoBom.GetBytes(_serialized);
}
[MethodImpl(MethodImplOptions.NoInlining)]
public void Serialize() => JsonSerializer.Serialize(_order, _options);
[MethodImpl(MethodImplOptions.NoInlining)]
public void Deserialize() => JsonSerializer.Deserialize(_serialized, _options);
public bool VerifyRoundTrip()
{
var json = JsonSerializer.Serialize(_order, _options);
var roundTripped = JsonSerializer.Deserialize(json, _options);
return BenchmarkLoop.DeepEqualsViaJson(_order, roundTripped);
}
}
#endregion
// Results / output formatters → Output.cs
// BenchmarkResult DTO → BenchmarkResult.cs
}