AyCode.Core/AyCode.Core.Serializers.Con.../Program.cs

585 lines
35 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using AyCode.Core.Compression;
using AyCode.Core.Serializers.Attributes;
using AyCode.Core.Serializers.Binaries;
using AyCode.Core.Tests.Serialization; // DrainFromAsync extension (test-only, used by benchmark)
using AyCode.Core.Tests.TestModels;
using MemoryPack;
#if !AYCODE_NATIVEAOT
using MessagePack;
using MessagePack.Resolvers;
#endif
using Microsoft.Extensions.Options;
using System.Buffers;
using System.Diagnostics;
using System.IO.Pipelines;
using System.IO.Pipes;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Text;
using System.Text.Json;
using AyCode.Core.Serializers.Console.Benchmarks;
namespace AyCode.Core.Serializers.Console;
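/// <summary>
/// Comprehensive benchmark application for all serializers.
/// Compares: AcBinary (all options), MemoryPack, MessagePack, Newtonsoft.Json, System.Text.Json
///
/// Usage (only the first argument is parsed; it is matched case-insensitively):
/// dotnet run # No args: interactive menu (loops until Q is pressed)
/// dotnet run -- quick # Quick mode (short warmup, fewer iterations and samples)
/// dotnet run -- LAYER # Layer filter: core | comprehensive | edge | all | small | medium | large | repeated | deep
/// dotnet run -- asyncpipe # AsyncPipe-only streaming benchmarks (alias: pipe)
/// dotnet run -- serialize # Serialize only (alias: ser)
/// dotnet run -- deserialize # Deserialize only (alias: des)
/// </summary>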
public static class Program
{
// Configuration (constants, mutable state, attribute-flag aggregation) → Configuration.cs
// BuildAcBinary + GetMemPack helpers → Benchmarks/BenchmarkOptions.cs
/// <summary>
/// Application entry point. With arguments: parses them, runs the benchmark once and exits.
/// Without arguments: shows the interactive menu in a loop until the user quits with Q.
/// </summary>
/// <param name="args">Command-line arguments; only inspected by <see cref="TryParseCliArgs"/>.</param>
public static void Main(string[] args)
{
    // UTF-8 output so the box-drawing / check-mark characters used by the reports render correctly.
    System.Console.OutputEncoding = Encoding.UTF8;

    // Validate the MemoryPack baseline up front, before any warmup or benchmark work starts.
    BenchmarkLoop.ValidateMemoryPackSetup();

    // CLI mode: a single run driven by the arguments, then exit (also exits when parsing rejects them).
    if (args.Length > 0)
    {
        if (TryParseCliArgs(args, out var cliLayer, out var cliOpMode, out var cliSerializerMode))
            RunBenchmark(cliLayer, cliOpMode, cliSerializerMode);

        return;
    }

    // Interactive mode: keep returning to the menu so the process need not be restarted between runs.
    for (;;)
    {
        var choice = Menu.ShowInteractiveMenu();
        if (choice is null)
            return; // user pressed Q in the menu

        var picked = choice.Value;
        RunBenchmark(picked.layer, "all", picked.serializerMode);

        System.Console.WriteLine();
        System.Console.WriteLine("─────────────────────────────────────────────────────────────────────");
        System.Console.WriteLine("Returning to menu — press any key to continue, or Q to quit...");

        var pressed = System.Console.ReadKey(intercept: true).Key;
        if (pressed == ConsoleKey.Q)
            return;

        System.Console.WriteLine();
    }
}
/// <summary>
/// Parses the FIRST CLI argument into (layer, opMode, serializerMode); any further arguments are ignored.
/// Matching is case-insensitive and culture-invariant. Recognised keywords:
/// <c>quick</c> (also shrinks the warmup / iteration / sample configuration),
/// <c>asyncpipe</c> / <c>pipe</c>, <c>ser</c> / <c>serialize</c>, <c>des</c> / <c>deserialize</c>.
/// Every other value — including the known layer names (core, comprehensive, edge, all, small, medium,
/// large, repeated, deep) and, for backwards compatibility, unknown keywords — is passed through as the layer.
/// </summary>
/// <param name="args">Command-line arguments. A null or empty array yields the defaults.</param>
/// <param name="layer">Receives the layer keyword; defaults to "all".</param>
/// <param name="opMode">Receives "all", "serialize" or "deserialize"; defaults to "all".</param>
/// <param name="serializerMode">Receives "standard" or "asyncpipe"; defaults to "standard".</param>
/// <returns>
/// Currently always <c>true</c>: no argument is rejected. The <c>bool</c> return is kept so the caller's
/// "exit on invalid args" path stays in place should validation be added later.
/// </returns>
private static bool TryParseCliArgs(string[] args, out string layer, out string opMode, out string serializerMode)
{
    layer = "all";
    opMode = "all";
    serializerMode = "standard";

    // Defensive: the current caller only invokes this with at least one argument, but indexing an
    // empty array would throw — fall back to the defaults instead.
    if (args is null || args.Length == 0)
        return true;

    // Invariant lowering: culture-sensitive ToLower() maps 'I' to dotless 'ı' under e.g. tr-TR, which
    // would make "QUICK" / "SERIALIZE" / "PIPE" miss their keyword and fall through to the layer fallback.
    var arg = args[0].ToLowerInvariant();

    switch (arg)
    {
        case "quick":
            // Quick mode: short warmup, few iterations, small sample count. Layer stays "all".
            Configuration.WarmupIterations = 5;
            Configuration.TestIterations = 100;
            Configuration.BenchmarkSamples = 3;
            break;

        case "asyncpipe" or "pipe":
            // AsyncPipe-only mode: streaming I/O isolation across all test data.
            serializerMode = "asyncpipe";
            break;

        case "ser" or "serialize":
            opMode = "serialize";
            break;

        case "des" or "deserialize":
            opMode = "deserialize";
            break;

        default:
            // Known layer keywords AND unknown arguments both end up here: the value is handed on
            // verbatim as the layer filter (backwards-compatible behaviour for unknown keywords).
            layer = arg;
            break;
    }

    return true;
}
/// <summary>
/// Runs the benchmark suite end-to-end for the given configuration: process stabilization →
/// global JIT pre-warmup → per-cell warmup + measurement → grouped results print → save to disk.
/// Used by both the CLI and interactive menu paths; the interactive loop calls this repeatedly
/// without restarting the process, so every process-wide change made here is undone before returning.
/// </summary>
/// <param name="layer">Layer keyword handed to <c>BenchmarkLoop.FilterByLayer</c>.</param>
/// <param name="opMode">Operation mode forwarded to the per-cell runner ("all", "serialize", "deserialize").</param>
/// <param name="serializerMode">Serializer selection forwarded to <see cref="CreateSerializers"/>.</param>
private static void RunBenchmark(string layer, string opMode, string serializerMode)
{
    System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗");
    System.Console.WriteLine("║ COMPREHENSIVE SERIALIZER BENCHMARK SUITE ║");
    System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");

    // Stabilization: pin the entire benchmark process to a single logical CPU and bump the priority
    // class. Single-core affinity stops the OS from migrating the bench thread between cores mid-sample
    // (a migration evicts L1/L2 caches and corrupts a measurement); High priority reduces preemption by
    // background tasks that otherwise randomly inflate samples.
    // Skipped in single-sample mode (Configuration.BenchmarkSamples <= 1) where stabilization is moot.
    //
    // The Process object wraps an OS handle — dispose it when the run ends instead of leaking one per
    // interactive-menu iteration.
    using var process = Process.GetCurrentProcess();
    var origAffinity = IntPtr.Zero;
    var origPriority = ProcessPriorityClass.Normal;

    // Tracked SEPARATELY: the two setters can fail independently (e.g. on Linux without CAP_SYS_NICE the
    // affinity change succeeds but raising the priority throws). With a single "applied" flag that case
    // left the process pinned to CPU 0 with no restore at the end of the run.
    var affinityApplied = false;
    var priorityApplied = false;

    // ProcessorAffinity is only supported on Windows + Linux (CA1416). On other platforms (macOS) the
    // whole stabilization step — affinity AND priority — is skipped.
    if (Configuration.BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
    {
        try
        {
            origAffinity = process.ProcessorAffinity;
            origPriority = process.PriorityClass;

            // Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one core,
            // consistently" — not which one. If CPU 0 is heavily contended on the host, tweak the mask here.
            // Round-trip-only NamedPipe rows have a server-drain thread that will share the core
            // (acceptable — those rows measure end-to-end round-trip anyway).
            process.ProcessorAffinity = (IntPtr)1;
            affinityApplied = true;

            process.PriorityClass = ProcessPriorityClass.High;
            priorityApplied = true;

            System.Console.WriteLine("Stabilization: pinned to CPU 0 (affinity=0x1), priority=High.");
        }
        catch (Exception ex)
        {
            // Affinity/priority changes may fail on locked-down hosts (group policies, containers without
            // CAP_SYS_NICE on Linux, etc.). Surface and continue — the benchmark still works. Report
            // truthfully whether the pin took effect before the failure.
            System.Console.WriteLine(affinityApplied
                ? $"Stabilization PARTIAL (pinned to CPU 0, priority unchanged): {ex.GetType().Name}: {ex.Message}"
                : $"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}");
        }
    }

    try
    {
        var allResults = new List<BenchmarkResult>();
        var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
        var testDataSets = BenchmarkLoop.FilterByLayer(allTestDataSets, layer);

        System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {Configuration.GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{Configuration.TargetSampleMs} ms target) | Warmup: {Configuration.WarmupIterations} per phase (Ser/Des isolated) | Samples: {Configuration.BenchmarkSamples} (median) + pilot discard");
        System.Console.WriteLine($"Build: {Configuration.BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
        System.Console.WriteLine();

        // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens.
        // Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup
        // alone doesn't ensure that every Serialize<T>/IBufferWriter overload is fully Tier 1 by the time we
        // start measuring. Pre-warmup runs every overload with each data shape so the tiered JIT promotes
        // them in the background; the per-cell warmup that follows then locks in cache + branch state.
        if (Configuration.BenchmarkSamples > 1) // skip in single-sample (fast iteration) mode
        {
            System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");
            foreach (var testData in testDataSets)
            {
                var preSerializers = CreateSerializers(testData, serializerMode);
                try
                {
                    foreach (var s in preSerializers)
                    {
                        // Light, phase-isolated warmup: Ser path first, then Des path — same pattern as
                        // the per-cell warmup in RunBenchmarksForTestData (which still runs afterwards).
                        s.WarmupSerialize(2000);
                        s.WarmupDeserialize(2000);
                    }
                }
                finally
                {
                    // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
                    foreach (var s in preSerializers) (s as IDisposable)?.Dispose();
                }
            }

            // Let background tiered-JIT compilation drain before we begin measuring.
            if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
            System.Console.WriteLine("✓ Global pre-warmup complete.\n");
        }

        var separator = new string('═', 70);
        foreach (var testData in testDataSets)
        {
            System.Console.WriteLine($"\n{separator}");
            System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
            System.Console.WriteLine(separator);

            var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
            allResults.AddRange(results);
        }

        // Print grouped results
        Output.PrintGroupedResults(allResults, testDataSets);

        // Save results to file
        Output.SaveResults(allResults, testDataSets);

        System.Console.WriteLine("\n✓ Benchmark complete!");
    }
    finally
    {
        // Restore process state — affinity/priority changes are process-wide and persist across
        // interactive-mode iterations of the menu. Each setting is restored on its own flag so a
        // partially-applied stabilization is still fully undone. Both restores are best-effort.
        if (affinityApplied && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
        {
            try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
        }

        if (priorityApplied)
        {
            try { process.PriorityClass = origPriority; } catch { /* best-effort */ }
        }
    }
}
#region Benchmark Execution
/// <summary>
/// Benchmarks every serializer produced by <see cref="CreateSerializers"/> against one test-data cell:
/// round-trip verification first, then a phase-isolated warmup → calibrate → time → allocation cycle
/// for the Ser and Des paths (or a single round-trip phase for round-trip-only serializers).
/// Each result is printed as soon as it is complete.
/// </summary>
/// <param name="testData">The test-data cell to benchmark.</param>
/// <param name="mode">"all", "serialize"/"ser" or "deserialize"/"des" — selects which phases run.</param>
/// <param name="serializerMode">Serializer selection forwarded to <see cref="CreateSerializers"/>.</param>
/// <returns>One <see cref="BenchmarkResult"/> per serializer, in creation order.</returns>
/// <remarks>
/// A failed round-trip verification terminates the process via <c>Environment.Exit(1)</c>.
/// </remarks>
private static List<BenchmarkResult> RunBenchmarksForTestData(TestDataSet testData, string mode, string serializerMode)
{
    var results = new List<BenchmarkResult>();
    var serializers = CreateSerializers(testData, serializerMode);

    var runSerializePhase = mode is "all" or "serialize" or "ser";
    var runDeserializePhase = mode is "all" or "deserialize" or "des";

    try
    {
        // Round-trip correctness check — once per (cell × serializer), BEFORE warmup. Aborts the entire benchmark on failure.
        System.Console.WriteLine("Verifying round-trip correctness...");
        foreach (var serializer in serializers)
        {
            if (!serializer.VerifyRoundTrip())
            {
                System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}");
                System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting.");
                // NOTE: Environment.Exit terminates the process without running the finally block below;
                // the OS reclaims the serializers' handles at process exit.
                Environment.Exit(1);
            }
        }
        System.Console.WriteLine("✓ All serializers passed round-trip verification.");

        // Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: each serializer's Ser-path and
        // Des-path get COMPLETELY ISOLATED warmup→measure rounds, with a GC.Collect at every phase boundary.
        //
        // Why phase-isolation: a combined warmup (Ser+Des interleaved) leaves the CPU I-cache + branch-predictor
        // in a "compromise state" — neither Ser nor Des code-set dominates. Isolated warmup→measure pairs
        // keep the I-cache hot for ONLY the measured path, both in the warmup and the measurement.
        //
        // GC.Collect at every boundary: removes residual allocation pressure from the previous phase so the
        // next phase starts with a quiescent heap.
        //
        // Configuration.JitSleep per-phase: lets tiered-JIT background promotion drain after each warmup
        // before that phase's timing starts.
        System.Console.WriteLine($"Running benchmarks (target ~{Configuration.TargetSampleMs} ms/sample × {Configuration.BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n");

        foreach (var serializer in serializers)
        {
            var result = new BenchmarkResult
            {
                TestDataName = testData.DisplayName, // Use DisplayName for IId% info
                Engine = serializer.Engine,
                IoMode = serializer.IoMode,
                DispatchMode = serializer.DispatchMode,
                OptionsPreset = serializer.OptionsPreset,
                OptionsDescription = serializer.OptionsDescription,
                SerializedSize = serializer.SerializedSize,
                SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes,
                SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes,
                IsRoundTripOnly = serializer.IsRoundTripOnly
            };

            // Group label for the progress output. Identifies the row being measured so a stuck benchmark
            // is visibly stuck on a specific row rather than silently hanging.
            var groupLabel = $"{result.SerializerName}";

            if (serializer.IsRoundTripOnly)
            {
                // Round-trip-only benchmarks (NamedPipe etc.): single phase — Serialize() performs the full RT.
                // The Ser-phase entry point (WarmupSerialize) warms the entire round-trip path, and the
                // measurements are recorded into the RT result columns.
                // mode == "deserialize" alone is meaningless for a round-trip-only benchmark; skipped silently.
                if (runSerializePhase)
                {
                    BenchmarkLoop.ForceGcCollect();
                    serializer.WarmupSerialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                    var rtIter = BenchmarkLoop.CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
                    var (rtMed, rtMin, rtMax, rtStd) = BenchmarkLoop.RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");
                    result.RoundTripTimeMs = rtMed;
                    result.RoundTripTimeMinMs = rtMin;
                    result.RoundTripTimeMaxMs = rtMax;
                    result.RoundTripTimeStdDevMs = rtStd;
                    result.RoundTripIterations = rtIter;

                    // Process-wide allocation measurement (MeasureAllocationTotal) so allocations made on the
                    // server-drain thread are counted too, not just the calling thread's.
                    result.RoundTripAllocBytesPerOp = BenchmarkLoop.MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]");
                }
            }
            else
            {
                // ── Ser phase ── GC.Collect → isolated warmup → JitSleep → calibrate → time → alloc.
                if (runSerializePhase)
                {
                    BenchmarkLoop.ForceGcCollect();
                    serializer.WarmupSerialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                    var serIter = BenchmarkLoop.CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
                    var (serMed, serMin, serMax, serStd) = BenchmarkLoop.RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");
                    result.SerializeTimeMs = serMed;
                    result.SerializeTimeMinMs = serMin;
                    result.SerializeTimeMaxMs = serMax;
                    result.SerializeTimeStdDevMs = serStd;
                    result.SerializeIterations = serIter;

                    // Dedicated alloc-only sample (separate from timing samples; keeps timing pure)
                    result.SerializeAllocBytesPerOp = BenchmarkLoop.MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
                }

                // ── Des phase ── GC.Collect → isolated warmup → JitSleep → calibrate → time → alloc.
                // The GC.Collect here discards the Ser-phase's leftovers so the Des-phase's allocation
                // measurement reflects only Des-side allocations.
                if (runDeserializePhase)
                {
                    BenchmarkLoop.ForceGcCollect();
                    serializer.WarmupDeserialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                    var desIter = BenchmarkLoop.CalibrateIterations(() => serializer.Deserialize(), Configuration.TargetSampleMs);
                    var (desMed, desMin, desMax, desStd) = BenchmarkLoop.RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");
                    result.DeserializeTimeMs = desMed;
                    result.DeserializeTimeMinMs = desMin;
                    result.DeserializeTimeMaxMs = desMax;
                    result.DeserializeTimeStdDevMs = desStd;
                    result.DeserializeIterations = desIter;
                    result.DeserializeAllocBytesPerOp = BenchmarkLoop.MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]");
                }

                // Compose RT from Ser+Des. Because Ser and Des may have DIFFERENT iter counts post-calibration,
                // batch-time addition would be misleading. Instead: compute per-op µs (iter-independent),
                // then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that
                // RoundTripTimeMs / RoundTripIterations * 1000 == serPerOp + desPerOp.
                // NOTE(review): when only one phase ran, the other phase's time/iterations are still at their
                // initial values — confirm Output.ToPerOpMicros handles a zero iteration count.
                var serPerOp = Output.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations);
                var desPerOp = Output.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations);
                var rtPerOp = serPerOp + desPerOp;
                result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations);
                result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
                result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
            }

            results.Add(result);
            Output.PrintResult(result);
        }

        return results;
    }
    finally
    {
        // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources that must be
        // released before the next test data builds new ones). Done in a finally block so pipes / handles are
        // also released when verification, warmup or measurement throws — same pattern as the pre-warmup loop.
        foreach (var s in serializers) (s as IDisposable)?.Dispose();
    }
}
/// <summary>
/// Builds the benchmark rows for one test-data cell. The row set depends on
/// <paramref name="serializerMode"/>: "fastestbyte" → a two-row AcBinary-vs-MemoryPack Byte[] comparison,
/// "asyncpipe" → the four pipe / hand-off rows only, any other value → the standard suite.
/// Every AcBinary option set gets its WireMode from <c>Configuration.SelectedWireMode</c>.
/// </summary>
/// <param name="testData">Test-data cell; its <c>Order</c> is handed to every benchmark row.</param>
/// <param name="serializerMode">Row-set selector (compared case-sensitively).</param>
/// <returns>A new list of benchmark rows; some rows may be <see cref="IDisposable"/> and must be disposed by the caller.</returns>
private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, string serializerMode)
{
    var order = testData.Order;
    var wireMode = Configuration.SelectedWireMode;
    var rows = new List<ISerializerBenchmark>();

    switch (serializerMode)
    {
        // FastestByte mode — focused 1:1 comparison on the Byte[] path:
        // AcBinary FastMode Byte[] against MemoryPack Byte[]. Short list = tight optimization-iteration loop.
        case "fastestbyte":
        {
            var headToHeadOptions = AcBinarySerializerOptions.FastMode;
            headToHeadOptions.WireMode = wireMode;

            rows.Add(new AcBinaryBenchmark(order, headToHeadOptions, "FastMode"));
            // FastWire reference row is currently disabled; re-enable when revisiting Fast wire-mode performance:
            //rows.Add(new AcBinaryBenchmark(order, fastWireOptions, "FastMode (FastWire)"));
            rows.Add(new MemoryPackBenchmark(order, "Default"));
            return rows;
        }

        // AsyncPipe-only mode — ONLY the pipe / hand-off rows, so their setup and transport overhead is not
        // interleaved with the standard byte-array / IBufferWriter measurements.
        case "asyncpipe":
        {
            // One option set shared by all four rows. BufferWriterChunkSize comes from
            // Configuration.PipeChunkSize — the single place to change when tuning the chunk size.
            var pipeOptions = AcBinarySerializerOptions.FastMode;
            pipeOptions.BufferWriterChunkSize = Configuration.PipeChunkSize;
            pipeOptions.WireMode = wireMode;

            // The four rows form a 2x2 matrix: [chunked-framed, raw byte[]] × [kernel NamedPipe, in-memory].
            // Chunked-framed streaming over a kernel NamedPipe.
            rows.Add(new AcBinaryNamedPipeBenchmark(order, pipeOptions, "FastMode (PipeChunk)"));
            // Raw byte[] over a kernel NamedPipe (no chunk framing) — isolates framing overhead from transport overhead.
            rows.Add(new AcBinaryNamedPipeRawByteArrayBenchmark(order, pipeOptions, "FastMode (PipeRaw)"));
            // Chunked-framed streaming over an in-memory System.IO.Pipelines.Pipe (no kernel transport).
            rows.Add(new AcBinaryInMemoryPipeBenchmark(order, pipeOptions, "FastMode (PipeChunk)"));
            // Raw byte[] over an in-memory cross-thread hand-off — baseline for the in-memory chunked row.
            rows.Add(new AcBinaryInMemoryRawByteArrayBenchmark(order, pipeOptions, "FastMode (PipeRaw)"));
            return rows;
        }

        // Standard mode — everything EXCEPT the kernel-NamedPipe rows (those are opt-in via "asyncpipe").
        default:
        {
            // Option sets referenced only by the optional (commented-out) rows further down.
            var noInternOptions = AcBinarySerializerOptions.Default;
            noInternOptions.UseStringInterning = StringInterningMode.None;
            noInternOptions.WireMode = wireMode;

            var defaultRuntimeOptions = AcBinarySerializerOptions.Default;
            defaultRuntimeOptions.UseGeneratedCode = false;
            defaultRuntimeOptions.WireMode = wireMode;

            // FastMode without source-generated dispatch (UseGeneratedCode = false).
            var fastRuntimeOptions = AcBinarySerializerOptions.FastMode;
            fastRuntimeOptions.UseGeneratedCode = false;
            fastRuntimeOptions.WireMode = wireMode;

            // FastMode as-is (only the wire mode is applied).
            var fastOptions = AcBinarySerializerOptions.FastMode;
            fastOptions.WireMode = wireMode;

            // FastMode for the fresh-ArrayBufferWriter row; chunk size taken from Configuration.PipeChunkSize.
            var fastFreshWriterOptions = AcBinarySerializerOptions.FastMode;
            fastFreshWriterOptions.BufferWriterChunkSize = Configuration.PipeChunkSize;
            fastFreshWriterOptions.WireMode = wireMode;

            // FastMode for the in-memory pipe / hand-off rows; same chunk size source as above.
            var fastInMemoryPipeOptions = AcBinarySerializerOptions.FastMode;
            fastInMemoryPipeOptions.BufferWriterChunkSize = Configuration.PipeChunkSize;
            fastInMemoryPipeOptions.WireMode = wireMode;

            // "Default" row options: string interning switched OFF, reference handling set to OnlyId.
            var defaultRowOptions = AcBinarySerializerOptions.Default;
            defaultRowOptions.UseStringInterning = StringInterningMode.None;
            defaultRowOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId;
            defaultRowOptions.WireMode = wireMode;

            // ============================================================
            // AcBinary — Byte[] API (uncomment optional rows to compare option presets side-by-side)
            // ============================================================
            // FastMode Byte[] with generated code left at the preset's setting.
            rows.Add(new AcBinaryBenchmark(order, fastOptions, "FastMode"));
            // FastMode Byte[] with UseGeneratedCode = false — paired with the row above so the two
            // dispatch paths can be compared on the same wire/options.
            rows.Add(new AcBinaryBenchmark(order, fastRuntimeOptions, "FastMode"));
            // Default preset Byte[] with ReferenceHandling = OnlyId and UseStringInterning = None.
            rows.Add(new AcBinaryBenchmark(order, defaultRowOptions, "Default"));
            //rows.Add(new AcBinaryBenchmark(order, defaultRuntimeOptions, "Default"));
            //rows.Add(new AcBinaryBenchmark(order, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"));
            //rows.Add(new AcBinaryBenchmark(order, noInternOptions, "NoIntern"));

            // AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario).
            rows.Add(new AcBinaryBufferWriterBenchmark(order, fastOptions, "FastMode"));
            // AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario).
            // The "(4KB)" label is a fixed string; the actual chunk size is Configuration.PipeChunkSize.
            rows.Add(new AcBinaryFreshBufferWriterBenchmark(order, fastFreshWriterOptions, "FastMode (4KB)"));
            // AcBinary chunked streaming over an IN-MEMORY Pipe (no kernel transport).
            rows.Add(new AcBinaryInMemoryPipeBenchmark(order, fastInMemoryPipeOptions, "FastMode (PipeChunk)"));
            // Raw byte[] over an in-memory cross-thread hand-off — baseline for the in-memory chunked row above.
            rows.Add(new AcBinaryInMemoryRawByteArrayBenchmark(order, fastInMemoryPipeOptions, "FastMode (PipeRaw)"));
            // The kernel-NamedPipe rows (AcBinaryNamedPipeBenchmark etc.) are intentionally OMITTED here —
            // run them via the dedicated "asyncpipe" mode for isolated kernel-transport measurements.

            // ============================================================
            // MemoryPack — three I/O modes for apples-to-apples comparison
            // ============================================================
            rows.Add(new MemoryPackBenchmark(order, "Default"));
            rows.Add(new MemoryPackBufferWriterBenchmark(order, "Default"));
            rows.Add(new MemoryPackFreshBufferWriterBenchmark(order, "Default"));

            // ============================================================
            // MessagePack — for legacy comparison
            // ============================================================
#if !AYCODE_NATIVEAOT
            // Excluded from the NativeAOT build (MessagePack's dynamic resolver fails under AOT publish);
            // available for regular JIT runs only.
            rows.Add(new MessagePackBenchmark(order, "ContractBased"));
#endif

            // System.Text.Json (commented — JSON serializer for reference; not in active suite)
            //rows.Add(new SystemTextJsonBenchmark(order, "Default"));
            return rows;
        }
    }
}
#endregion
// Serializer implementations (ISerializerBenchmark + 12 concrete benchmark classes) → Benchmarks/
// Results / output formatters → Output.cs
// BenchmarkResult DTO → BenchmarkResult.cs
}