diff --git a/AyCode.Core.Serializers.Console/BenchmarkLoop.cs b/AyCode.Core.Serializers.Console/BenchmarkLoop.cs
index 4e41e66..13fc83b 100644
--- a/AyCode.Core.Serializers.Console/BenchmarkLoop.cs
+++ b/AyCode.Core.Serializers.Console/BenchmarkLoop.cs
@@ -1,3 +1,5 @@
+using AyCode.Core.Serializers.Binaries;
+using AyCode.Core.Serializers.Console.Benchmarks;
using AyCode.Core.Tests.TestModels;
using MemoryPack;
using System.Diagnostics;
@@ -7,24 +9,469 @@ using System.Text.Json;
namespace AyCode.Core.Serializers.Console;
///
-/// Benchmark execution helpers: timing (), per-cell adaptive iteration
-/// calibration (), allocation measurement
-/// ( + ), in-place
-/// \r-progress reporting, full-GC phase-boundary helper (),
-/// startup validation (), and per-cell round-trip equality
-/// (). Pure benchmark-execution infrastructure — no display
-/// formatting (that lives in Output) and no per-engine glue (which lives with the
-/// individual ISerializerBenchmark implementations).
+ /// Benchmark execution: end-to-end orchestration (<see cref="RunBenchmark"/>), per-cell loop
+ /// (<see cref="RunBenchmarksForTestData"/>), serializer factory (<see cref="CreateSerializers"/>),
+ /// and the timing / calibration / allocation helpers. Pure benchmark-execution infrastructure —
+ /// no display formatting (that lives in Output) and no UX-flow (that lives in Program
+ /// + Menu).
///
internal static class BenchmarkLoop
{
+    /// <summary>
+    /// Runs the benchmark suite end-to-end for the given configuration: pre-warmup → per-cell warmup
+    /// + measurement → grouped results print → save to disk. Used by both the CLI and interactive
+    /// menu paths; the interactive loop calls this repeatedly without restarting the process.
+    /// </summary>
+    /// <param name="layer">Layer filter handed to <c>FilterByLayer</c> to select which test cells run.</param>
+    /// <param name="opMode">Operation mode forwarded to <see cref="RunBenchmarksForTestData"/>
+    /// ("all", "serialize"/"ser" or "deserialize"/"des").</param>
+    /// <param name="serializerMode">Serializer-set selector forwarded to <see cref="CreateSerializers"/>
+    /// ("fastestbyte", "asyncpipe"; any other value selects the standard suite).</param>
+    internal static void RunBenchmark(string layer, string opMode, string serializerMode)
+    {
+        System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗");
+        System.Console.WriteLine("║ COMPREHENSIVE SERIALIZER BENCHMARK SUITE ║");
+        System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");
+
+        // Stabilization: pin the entire benchmark process to a single logical CPU and bump priority
+        // class. Single-core affinity stops Windows from migrating the bench thread between cores
+        // mid-sample (a migration evicts L1/L2 caches and corrupts a measurement); High priority
+        // reduces preemption by background tasks (Defender scans, indexer, etc.) that otherwise
+        // randomly inflate samples by 5-15%.
+        // Try/finally guarantees the original state is restored even if a benchmark throws — leaving
+        // a developer machine pinned to one core after a crashed run is a real foot-gun.
+        // Skipped on Debug single-sample mode (Configuration.BenchmarkSamples <= 1) where stabilization is moot.
+        //
+        // Process.GetCurrentProcess() hands back a NEW disposable Process component on every call, and
+        // this method is re-entered by the interactive menu loop — dispose it at method exit (after the
+        // restore in the finally block below has run) instead of leaving it to the finalizer.
+        using var process = Process.GetCurrentProcess();
+        var origAffinity = (IntPtr)0;
+        var origPriority = ProcessPriorityClass.Normal;
+        var stabilizationApplied = false;
+
+        // ProcessorAffinity is only supported on Windows + Linux (CA1416); macOS would throw at
+        // runtime. The OS guard below gates the WHOLE stabilization step — affinity AND priority
+        // class — so on any other platform neither is changed and the benchmark runs with the
+        // platform default scheduling. The restore in the finally block mirrors the same guard.
+        if (Configuration.BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
+        {
+            try
+            {
+                origAffinity = process.ProcessorAffinity;
+                origPriority = process.PriorityClass;
+                // Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one
+                // core, consistently" — not which one. If CPU 0 is heavily contended on the host
+                // (e.g. dedicated to system-wide IRQs on some Windows configs), the user can tweak
+                // the mask here. The benchmark is single-threaded for the in-memory rows so single
+                // core is sufficient; round-trip-only NamedPipe rows have a server-drain thread
+                // that will share the core (acceptable — the bench measures end-to-end RT anyway).
+                process.ProcessorAffinity = (IntPtr)1;
+                process.PriorityClass = ProcessPriorityClass.High;
+                stabilizationApplied = true;
+                System.Console.WriteLine("Stabilization: pinned to CPU 0 (affinity=0x1), priority=High.");
+            }
+            catch (Exception ex)
+            {
+                // Affinity/priority changes may fail on locked-down hosts (group policies, containers
+                // without CAP_SYS_NICE on Linux, etc.). Surface and continue — the benchmark still
+                // works, just with the platform default scheduling.
+                System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}");
+            }
+        }
+
+        try
+        {
+            var allResults = new List<BenchmarkResult>();
+            var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
+            var testDataSets = FilterByLayer(allTestDataSets, layer);
+
+            System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {Configuration.GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{Configuration.TargetSampleMs} ms target) | Warmup: {Configuration.WarmupIterations} per phase (Ser/Des isolated) | Samples: {Configuration.BenchmarkSamples} (median) + pilot discard");
+            System.Console.WriteLine($"Build: {Configuration.BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
+            System.Console.WriteLine();
+
+            // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens.
+            // Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup
+            // alone doesn't ensure that every Serialize/IBufferWriter overload is fully Tier 1 by the time we
+            // start measuring. Symptom: first cell's BufferWriter variants run ~2x slower than the SAME variants
+            // on later cells (e.g. Small BufWr reuse 9ms vs Medium BufWr reuse 4ms — even though Medium is bigger).
+            // Pre-warmup runs every overload at least once with each data shape so .NET 9's tiered JIT promotes
+            // them all in the background; the per-cell warmup that follows then locks in cache + branch state.
+            if (Configuration.BenchmarkSamples > 1) // skip in DEBUG (single-sample fast iteration)
+            {
+                System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");
+
+                foreach (var testData in testDataSets)
+                {
+                    var preSerializers = CreateSerializers(testData, serializerMode);
+                    try
+                    {
+                        foreach (var s in preSerializers)
+                        {
+                            // Light warmup just to trigger Tier 0 → Tier 1 promotion. Phase-isolated:
+                            // Ser path first, then Des path — same pattern as the per-cell warmup in
+                            // RunBenchmarksForTestData (which still runs afterwards for cache/BTB warming).
+                            s.WarmupSerialize(2000);
+                            s.WarmupDeserialize(2000);
+                        }
+                    }
+                    finally
+                    {
+                        // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
+                        foreach (var s in preSerializers) (s as IDisposable)?.Dispose();
+                    }
+                }
+
+                // Let background tiered-JIT compilation drain before we begin measuring.
+                if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
+                System.Console.WriteLine("✓ Global pre-warmup complete.\n");
+            }
+
+            // 70-character '═' rule framing each test-data header.
+            var headerRule = new string('═', 70);
+
+            foreach (var testData in testDataSets)
+            {
+                System.Console.WriteLine($"\n{headerRule}");
+                System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
+                System.Console.WriteLine($"{headerRule}");
+
+                var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
+                allResults.AddRange(results);
+            }
+
+            // Print grouped results
+            Output.PrintGroupedResults(allResults, testDataSets);
+
+            // Save results to file
+            Output.SaveResults(allResults, testDataSets);
+
+            System.Console.WriteLine("\n✓ Benchmark complete!");
+        }
+        finally
+        {
+            // Restore process state — affinity/priority changes are process-wide and persist across
+            // interactive-mode iterations of the menu. Without restore, the second menu run would
+            // already be on CPU-0 + High priority before its own try-block applied them, masking
+            // any stabilization-disabled comparison.
+            if (stabilizationApplied && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
+            {
+                try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
+                try { process.PriorityClass = origPriority; } catch { /* best-effort */ }
+            }
+        }
+    }
+
+    /// <summary>
+    /// Runs every serializer returned by <see cref="CreateSerializers"/> against a single test cell:
+    /// round-trip verification first, then — per serializer — phase-isolated warmup → calibration →
+    /// timing → allocation measurement, printing each row as it completes.
+    /// </summary>
+    /// <param name="testData">The test cell to benchmark.</param>
+    /// <param name="mode">"all", "serialize"/"ser" or "deserialize"/"des" — selects which phases run.
+    /// Any other value runs no phase; the row is still recorded with its default (unmeasured) values.</param>
+    /// <param name="serializerMode">Serializer-set selector forwarded to <see cref="CreateSerializers"/>.</param>
+    /// <returns>One <c>BenchmarkResult</c> per serializer, in serializer order.</returns>
+    /// <remarks>
+    /// Terminates the process via <c>Environment.Exit(1)</c> if any serializer fails round-trip verification.
+    /// </remarks>
+    private static List<BenchmarkResult> RunBenchmarksForTestData(TestDataSet testData, string mode, string serializerMode)
+    {
+        var results = new List<BenchmarkResult>();
+        var serializers = CreateSerializers(testData, serializerMode);
+
+        // try/finally mirrors the pre-warmup loop in RunBenchmark: serializers that own OS resources
+        // (NamedPipe / FileStream variants) are released even when verification, warmup or a
+        // measurement throws part-way through the cell.
+        try
+        {
+            // Round-trip correctness check — once per (cell × serializer), BEFORE warmup. Aborts the entire benchmark on failure.
+            System.Console.WriteLine("Verifying round-trip correctness...");
+
+            foreach (var serializer in serializers)
+            {
+                if (!serializer.VerifyRoundTrip())
+                {
+                    System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}");
+                    System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting.");
+
+                    Environment.Exit(1);
+                }
+            }
+
+            System.Console.WriteLine("✓ All serializers passed round-trip verification.");
+
+            // Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: each serializer's Ser-path and
+            // Des-path get COMPLETELY ISOLATED warmup→measure rounds, with a GC.Collect at every phase boundary.
+            //
+            // Why phase-isolation: a combined warmup (Ser+Des interleaved) leaves the CPU I-cache + branch-predictor
+            // in a "compromise state" — neither Ser nor Des code-set dominates. The first phase to measure pays a
+            // cache-miss penalty as its code-set displaces the leftover-warmup-state. Isolated warmup→measure pairs
+            // keep the I-cache HOT for ONLY the measured path, both in the warmup (priming) and the measurement
+            // (steady-state). Branch-predictor history also stays clean per path.
+            //
+            // GC.Collect at every boundary: removes residual allocation pressure from the previous phase (write-buffer
+            // pool churn from Ser, deserialized object graph from Des) so the next phase starts with a quiescent
+            // heap — GC tier-promotion timing during measurement is then driven only by THAT phase's allocations.
+            //
+            // Configuration.JitSleep per-phase: tiered JIT background promotion drain after each warmup (mode-aware: 0 ms in AOT).
+            // Each phase's freshly-promoted methods settle before its timing starts.
+            System.Console.WriteLine($"Running benchmarks (target ~{Configuration.TargetSampleMs} ms/sample × {Configuration.BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n");
+
+            foreach (var serializer in serializers)
+            {
+                var result = new BenchmarkResult
+                {
+                    TestDataName = testData.DisplayName, // Use DisplayName for IId% info
+                    Engine = serializer.Engine,
+                    IoMode = serializer.IoMode,
+                    DispatchMode = serializer.DispatchMode,
+                    OptionsPreset = serializer.OptionsPreset,
+                    OptionsDescription = serializer.OptionsDescription,
+                    SerializedSize = serializer.SerializedSize,
+                    SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes,
+                    SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes,
+                    IsRoundTripOnly = serializer.IsRoundTripOnly
+                };
+
+                // Group label for in-place \r progress. Identifies (cell × serializer) so a stuck benchmark
+                // is visibly stuck on a specific row at a specific % rather than silently hanging.
+                var groupLabel = result.SerializerName;
+
+                if (serializer.IsRoundTripOnly)
+                {
+                    // Round-trip-only benchmarks (NamedPipe etc.): single phase — Serialize() performs the full RT,
+                    // Deserialize() is a no-op. We use the Ser-phase entry-points (WarmupSerialize) to warm the
+                    // entire round-trip path, then record into the RT result columns.
+                    if (mode is "all" or "serialize" or "ser")
+                    {
+                        ForceGcCollect();
+                        serializer.WarmupSerialize(Configuration.WarmupIterations);
+                        if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
+
+                        var rtIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
+                        var (rtMed, rtMin, rtMax, rtStd) = RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");
+                        result.RoundTripTimeMs = rtMed;
+                        result.RoundTripTimeMinMs = rtMin;
+                        result.RoundTripTimeMaxMs = rtMax;
+                        result.RoundTripTimeStdDevMs = rtStd;
+                        result.RoundTripIterations = rtIter;
+                        // Process-wide allocation measurement: server-drain-thread allocations (server-side new byte[len])
+                        // also show up — otherwise current-thread alloc would only count the client side and look ~halved.
+                        result.RoundTripAllocBytesPerOp = MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]");
+                    }
+                    // mode == "deserialize" alone is meaningless for a round-trip-only benchmark; no phase is
+                    // measured, but the (unmeasured) row is still added and printed below.
+                }
+                else
+                {
+                    // ── Ser phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect.
+                    if (mode is "all" or "serialize" or "ser")
+                    {
+                        ForceGcCollect();
+                        serializer.WarmupSerialize(Configuration.WarmupIterations);
+                        if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
+
+                        var serIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
+                        var (serMed, serMin, serMax, serStd) = RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");
+                        result.SerializeTimeMs = serMed;
+                        result.SerializeTimeMinMs = serMin;
+                        result.SerializeTimeMaxMs = serMax;
+                        result.SerializeTimeStdDevMs = serStd;
+                        result.SerializeIterations = serIter;
+                        // Dedicated alloc-only sample (separate from timing samples; keeps timing pure)
+                        result.SerializeAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
+                    }
+
+                    // ── Des phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect.
+                    // The GC.Collect here is critical: it discards the Ser-phase's write-buffer pool churn so the
+                    // Des-phase's allocation measurement reflects ONLY Des-side allocations (deserialized object graph).
+                    if (mode is "all" or "deserialize" or "des")
+                    {
+                        ForceGcCollect();
+                        serializer.WarmupDeserialize(Configuration.WarmupIterations);
+                        if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
+
+                        var desIter = CalibrateIterations(() => serializer.Deserialize(), Configuration.TargetSampleMs);
+                        var (desMed, desMin, desMax, desStd) = RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");
+                        result.DeserializeTimeMs = desMed;
+                        result.DeserializeTimeMinMs = desMin;
+                        result.DeserializeTimeMaxMs = desMax;
+                        result.DeserializeTimeStdDevMs = desStd;
+                        result.DeserializeIterations = desIter;
+                        result.DeserializeAllocBytesPerOp = MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]");
+                    }
+
+                    // Compose RT from Ser+Des. Because Ser and Des may have DIFFERENT iter counts post-calibration,
+                    // batch-time addition would be misleading. Instead: compute per-op µs (iter-independent),
+                    // then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that
+                    // RoundTripTimeMs / RoundTripIterations * 1000 == Output.SerPerOp + Output.DesPerOp.
+                    // NOTE(review): when only one phase ran, the other phase's time/iterations keep their
+                    // defaults — confirm Output.ToPerOpMicros handles an iteration count of 0.
+                    var serPerOp = Output.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations);
+                    var desPerOp = Output.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations);
+                    var rtPerOp = serPerOp + desPerOp;
+                    result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations);
+                    result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
+                    result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
+                }
+
+                results.Add(result);
+                Output.PrintResult(result);
+            }
+        }
+        finally
+        {
+            // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources that must be released
+            // before the next test data builds new ones — otherwise pipes / handles leak across test cells).
+            foreach (var s in serializers) (s as IDisposable)?.Dispose();
+        }
+
+        return results;
+    }
+
+    /// <summary>
+    /// Builds the benchmark rows for one test cell. Three serializer sets exist:
+    /// "fastestbyte" (AcBinary FastMode Byte[] vs MemoryPack Byte[]), "asyncpipe" (the four
+    /// chunked/raw × NamedPipe/in-memory streaming rows), and — for any other value — the standard suite.
+    /// </summary>
+    /// <param name="testData">Test cell whose <c>Order</c> payload every benchmark row is constructed with.</param>
+    /// <param name="serializerMode">Serializer-set selector; compared case-sensitively against
+    /// "fastestbyte" and "asyncpipe".</param>
+    /// <returns>A new list of benchmark rows. Some rows may implement <see cref="IDisposable"/>; the callers
+    /// (<see cref="RunBenchmark"/> pre-warmup and <see cref="RunBenchmarksForTestData"/>) dispose them.</returns>
+    /// <remarks>
+    /// NOTE(review): every options local below is obtained from <c>AcBinarySerializerOptions.FastMode</c> /
+    /// <c>.Default</c> and then mutated. This presumably relies on those members returning an independent
+    /// instance (or a value type) on each access — confirm, otherwise the presets would bleed into each other.
+    /// </remarks>
+    private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, string serializerMode)
+    {
+        // FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path.
+        // TWO benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + MemoryPack Byte[].
+        // - Compact: smallest wire, UTF-8 encode/decode CPU cost vs MemPack head-to-head.
+        // Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min.
+        //
+        // FastWire row (UTF-16 raw memcpy) commented out for the current optimization sprint —
+        // we are tuning Compact mode against MemPack directly; FastWire was used as a noise-floor
+        // reference earlier. Re-enable when revisiting Fast wire-mode performance.
+        if (serializerMode == "fastestbyte")
+        {
+            var fastestByteOptions = AcBinarySerializerOptions.FastMode;
+            fastestByteOptions.WireMode = Configuration.SelectedWireMode;
+
+            return new List<ISerializerBenchmark>
+            {
+                new AcBinaryBenchmark(testData.Order, fastestByteOptions, "FastMode"),
+                //new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"),
+                new MemoryPackBenchmark(testData.Order, "Default"),
+            };
+        }
+
+        // AsyncPipe-only mode — return ONLY the AsyncPipe streaming benchmark (no other serializer).
+        // Streaming I/O has long-lived pipe setup + kernel-buffer overhead that, when interleaved with
+        // the standard byte-array / IBufferWriter measurements, masks the steady-state numbers. Run it
+        // in isolation so the timing numbers reflect ONLY the streaming path.
+        if (serializerMode == "asyncpipe")
+        {
+            // NamedPipe — pipe-aligned chunk size for the long-lived IPC scenario. The chunkSize here
+            // drives the AsyncPipeWriterOutput's chunk-on-wire size (header + data, page-aligned thanks to
+            // the AcquireChunk fix) AND the kernel pipe buffer size (inBufferSize/outBufferSize on the
+            // NamedPipeServerStream ctor). Same value across both layers = one WriteFile(chunkSize) syscall
+            // fits blocking-free in one kernel pipe-buffer slot. Single source of truth for both app-level
+            // wire chunk AND kernel transfer unit; change ONLY this line when tuning.
+            var binaryFastModePipeChunkOnly = AcBinarySerializerOptions.FastMode;
+            binaryFastModePipeChunkOnly.BufferWriterChunkSize = Configuration.PipeChunkSize;
+            binaryFastModePipeChunkOnly.WireMode = Configuration.SelectedWireMode;
+
+            return new List<ISerializerBenchmark>
+            {
+                // Chunked-framed AsyncPipe: SerializeChunkedFramed + AsyncPipeReaderInput.DrainFromAsync.
+                // Measures the FULL streaming-I/O stack — wire framing + drain task + sliding-window buffer +
+                // MRES wait-on-byte-shortage — over a kernel NamedPipe.
+                new AcBinaryNamedPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"),
+                // Raw byte[] over NamedPipe (sync receive, no chunk-framing). Same kernel-pipe transport,
+                // same inBufferSize, but: serialize → byte[] → Stream.Write → Stream.Read → Deserialize(byte[]).
+                // No drain task, no AsyncPipeReaderInput, no [201][UINT16][data]…[202] framing. Side-by-side with
+                // the chunked-row above this isolates AsyncPipe-framework-overhead (Δ vs raw) from
+                // kernel-transport-overhead (raw vs in-process Byte[]).
+                new AcBinaryNamedPipeRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
+                // Chunked-framed AsyncPipe over an IN-MEMORY System.IO.Pipelines.Pipe (NO NamedPipe, NO kernel).
+                // Same chunked-streaming code path (SerializeChunkedFramed → AsyncPipeReaderInput) but with the
+                // kernel-pipe replaced by a managed-only Pipe. Eliminates per-chunk syscall overhead (~30 µs/chunk
+                // on NamedPipe → ~1-2 µs/chunk on in-memory Pipe). Side-by-side with the NamedPipe row above this
+                // isolates pure CPU cost of the chunked-streaming framework (vs kernel-pipe transport cost) — the
+                // in-memory Pipe row should be much closer to the raw-byte[] row, validating that NamedPipe loopback
+                // is the worst-case benchmark scenario for chunked-streaming and not representative of real network
+                // / file / cross-thread Pipe scenarios.
+                new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"),
+                // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport at all). Apples-to-apples
+                // baseline for the in-memory chunked row above: same in-memory transport (zero kernel), but raw
+                // byte[] vs chunked-streaming wire format. Completes the 2x2 matrix [chunked,raw] × [kernel,memory].
+                new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
+            };
+        }
+
+        // Standard mode — all serializers EXCEPT AsyncPipe (the streaming benchmark is opt-in via the
+        // AsyncPipe menu / CLI mode, never bundled with the steady-state suite).
+
+        // binaryNoInternOption and binaryDefaultNoSgenOption are referenced only by the commented-out
+        // rows in the list below; they are kept so those rows can be re-enabled by uncommenting one line.
+        var binaryNoInternOption = AcBinarySerializerOptions.Default;
+        binaryNoInternOption.UseStringInterning = StringInterningMode.None;
+        binaryNoInternOption.WireMode = Configuration.SelectedWireMode;
+
+        var binaryDefaultNoSgenOption = AcBinarySerializerOptions.Default;
+        binaryDefaultNoSgenOption.UseGeneratedCode = false;
+        binaryDefaultNoSgenOption.WireMode = Configuration.SelectedWireMode;
+
+        var binaryFastModeNoSgenOption = AcBinarySerializerOptions.FastMode;
+        binaryFastModeNoSgenOption.UseGeneratedCode = false;
+        binaryFastModeNoSgenOption.WireMode = Configuration.SelectedWireMode;
+
+        var binaryFastModeOption = AcBinarySerializerOptions.FastMode;
+        binaryFastModeOption.WireMode = Configuration.SelectedWireMode;
+
+        // BufWr new — 4 KB chunk size for the FRESH ArrayBufferWriter scenario. The chunkSize here drives
+        // the serializer's GetSpan(N) request → the ArrayBufferWriter's internal allocation per call.
+        // Small chunk = small per-call allocation, optimum for one-shot serialization where each iteration
+        // allocates a fresh ABW. Independent of the AsyncPipe profile (different mechanism: alloc overhead
+        // vs syscall count).
+        // NOTE(review): the actual value comes from Configuration.PipeChunkSize; the "4 KB" wording here and
+        // the "FastMode (4KB)" row label below assume that constant is 4096 — confirm if it is ever tuned.
+        var binaryFastModeBufWrChunk = AcBinarySerializerOptions.FastMode;
+        binaryFastModeBufWrChunk.BufferWriterChunkSize = Configuration.PipeChunkSize;
+        binaryFastModeBufWrChunk.WireMode = Configuration.SelectedWireMode;
+
+        // In-memory Pipe variant — same Configuration.PipeChunkSize chunkSize as the AsyncPipe mode, no
+        // kernel-pipe alignment concern (managed slabs are not page-aligned anyway). Drives SerializeChunkedFramed
+        // via the in-memory System.IO.Pipelines.Pipe (zero-copy slab handoff between producer and drain task).
+        var binaryFastModePipeChunkInMem = AcBinarySerializerOptions.FastMode;
+        binaryFastModePipeChunkInMem.BufferWriterChunkSize = Configuration.PipeChunkSize;
+        binaryFastModePipeChunkInMem.WireMode = Configuration.SelectedWireMode;
+
+        // "Default" row options: the Default preset with string interning switched OFF and reference
+        // handling set to OnlyId (see the row comment in the list below).
+        var defaultOptions = AcBinarySerializerOptions.Default;
+        defaultOptions.UseStringInterning = StringInterningMode.None;
+        defaultOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId;
+        defaultOptions.WireMode = Configuration.SelectedWireMode;
+
+        return new List<ISerializerBenchmark>
+        {
+            // ============================================================
+            // AcBinary — Byte[] API (uncomment to compare option presets side-by-side)
+            // ============================================================
+            // Fastest Byte[] — SGen path (UseGeneratedCode=true, default).
+            new AcBinaryBenchmark(testData.Order, binaryFastModeOption, "FastMode"),
+            // Fastest Byte[] — Runtime path (UseGeneratedCode=false). Same wire/options, no source-generated dispatch.
+            // Always paired with the SGen variant so every layer can compare the SGen speed-up apples-to-apples.
+            // NativeAOT-safe: AcSerializerCommon.Create*Getter/Setter falls back to reflection-based delegates
+            // when RuntimeFeature.IsDynamicCodeSupported is false (slower but works under AOT publish).
+            new AcBinaryBenchmark(testData.Order, binaryFastModeNoSgenOption, "FastMode"),
+            // Default preset Byte[] — RefHandling=OnlyId (deduplicates IId-shared references on the wire) with
+            // UseStringInterning=None (string interning is explicitly disabled on defaultOptions above, so repeated
+            // strings are NOT deduplicated in this row). Showcases the reference-handling wire-size and CPU
+            // trade-off vs FastMode on the ~20% IId-ref test data.
+            // NOTE(review): this comment previously advertised UseStringInterning=All — confirm that None is the
+            // intended setting for the "Default" row.
+
+            new AcBinaryBenchmark(testData.Order, defaultOptions, "Default"),
+            //new AcBinaryBenchmark(testData.Order, binaryDefaultNoSgenOption, "Default"),
+            //new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"),
+            //new AcBinaryBenchmark(testData.Order, binaryNoInternOption, "NoIntern"),
+
+            // AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario)
+            new AcBinaryBufferWriterBenchmark(testData.Order, binaryFastModeOption, "FastMode"),
+
+            // AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario).
+            // 4 KB chunk size from binaryFastModeBufWrChunk — minimises the per-call ArrayBufferWriter
+            // allocation. Optimum for this scenario.
+            new AcBinaryFreshBufferWriterBenchmark(testData.Order, binaryFastModeBufWrChunk, "FastMode (4KB)"),
+
+            // AcBinary chunked-streaming over an IN-MEMORY Pipe (no kernel transport). Side-by-side with the
+            // Byte[] / IBufferWriter rows above this shows the chunked-streaming framework's pure CPU cost
+            // (no NamedPipe loopback noise) vs the simpler in-process serialize-then-deserialize patterns.
+            // The IO column shows "Pipe(in-mem)" — distinct from the NamedPipe AsyncPipe rows in [P] mode.
+            new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeChunk)"),
+
+            // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport, no kernel, no Pipe). Apples-to-
+            // apples baseline for the in-memory chunked row above: same in-memory pattern, but raw byte[] vs
+            // chunked-streaming wire format. The IO column shows "Bytes(in-mem)".
+            new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeRaw)"),
+
+            // AsyncPipe streaming over kernel NamedPipe (AcBinaryNamedPipeBenchmark) is intentionally OMITTED
+            // here — run it via the dedicated AsyncPipe menu [P] / CLI mode for isolated kernel-transport
+            // measurements.
+
+            // ============================================================
+            // MemoryPack — three I/O modes for apples-to-apples comparison
+            // ============================================================
+            new MemoryPackBenchmark(testData.Order, "Default"),
+            new MemoryPackBufferWriterBenchmark(testData.Order, "Default"),
+            new MemoryPackFreshBufferWriterBenchmark(testData.Order, "Default"),
+
+            // ============================================================
+            // MessagePack — for legacy comparison
+            // ============================================================
+#if !AYCODE_NATIVEAOT
+            // MessagePack v3's DynamicGenericResolver uses Activator.CreateInstance on trimmed
+            // ListFormatter et al. — fails under NativeAOT publish with "No parameterless constructor".
+            // Excluded from the AOT build; available for regular JIT runs only.
+            new MessagePackBenchmark(testData.Order, "ContractBased"),
+#endif
+
+            // System.Text.Json (commented — JSON serializer for reference; not in active suite)
+            //new SystemTextJsonBenchmark(testData.Order, "Default")
+        };
+    }
+
///
/// Forces a full GC cycle at a phase boundary in the benchmark loop. Two-pass collect with finalizer drain
/// in between: the first pass moves managed garbage to the finalization queue, WaitForPendingFinalizers
/// runs the finalizers, the second pass reclaims any objects the finalizers released. After this returns the
/// heap is in a known-quiescent state — the next warmup/measurement phase starts on a clean slate, isolated
/// from the previous phase's residual allocations (write-buffer pools, intern cache, write-plan arrays, etc.).
- /// Called between every Ser-phase / Des-phase boundary in RunBenchmarksForTestData.
+ /// Called between every Ser-phase / Des-phase boundary in <see cref="RunBenchmarksForTestData"/>.
///
[MethodImpl(MethodImplOptions.NoInlining)]
internal static void ForceGcCollect()
diff --git a/AyCode.Core.Serializers.Console/Program.cs b/AyCode.Core.Serializers.Console/Program.cs
index 77d86d6..1287dd1 100644
--- a/AyCode.Core.Serializers.Console/Program.cs
+++ b/AyCode.Core.Serializers.Console/Program.cs
@@ -52,7 +52,7 @@ public static class Program
if (!TryParseCliArgs(args, out var layer, out var opMode, out var serializerMode))
return; // invalid args
- RunBenchmark(layer, opMode, serializerMode);
+ BenchmarkLoop.RunBenchmark(layer, opMode, serializerMode);
return;
}
@@ -63,7 +63,7 @@ public static class Program
var selection = Menu.ShowInteractiveMenu();
if (selection == null) return; // user pressed Q
- RunBenchmark(selection.Value.layer, "all", selection.Value.serializerMode);
+ BenchmarkLoop.RunBenchmark(selection.Value.layer, "all", selection.Value.serializerMode);
System.Console.WriteLine();
System.Console.WriteLine("─────────────────────────────────────────────────────────────────────");
@@ -124,457 +124,7 @@ public static class Program
return true;
}
- /// <summary>
- /// Runs the benchmark suite end-to-end for the given configuration: pre-warmup → per-cell warmup
- /// + measurement → grouped results print → save to disk. Used by both the CLI and interactive
- /// menu paths; the interactive loop calls this repeatedly without restarting the process.
- /// </summary>
- private static void RunBenchmark(string layer, string opMode, string serializerMode)
- {
- System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗");
- System.Console.WriteLine("║ COMPREHENSIVE SERIALIZER BENCHMARK SUITE ║");
- System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");
-
- // Stabilization: pin the entire benchmark process to a single logical CPU and bump priority
- // class. Single-core affinity stops Windows from migrating the bench thread between cores
- // mid-sample (a migration evicts L1/L2 caches and corrupts a measurement); High priority
- // reduces preemption by background tasks (Defender scans, indexer, etc.) that otherwise
- // randomly inflate samples by 5-15%.
- // Try/finally guarantees the original state is restored even if a benchmark throws — leaving
- // a developer machine pinned to one core after a crashed run is a real foot-gun.
- // Skipped on Debug single-sample mode (Configuration.BenchmarkSamples <= 1) where stabilization is moot.
- var process = Process.GetCurrentProcess();
- var origAffinity = (IntPtr)0;
- var origPriority = ProcessPriorityClass.Normal;
- var stabilizationApplied = false;
-
- // ProcessorAffinity is only supported on Windows + Linux (CA1416). macOS would throw at
- // runtime; skip the affinity step there but still raise priority class (which IS supported
- // on macOS, just less effective for stabilization than affinity pinning).
- if (Configuration.BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
- {
- try
- {
- origAffinity = process.ProcessorAffinity;
- origPriority = process.PriorityClass;
- // Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one
- // core, consistently" — not which one. If CPU 0 is heavily contended on the host
- // (e.g. dedicated to system-wide IRQs on some Windows configs), the user can tweak
- // the mask here. The benchmark is single-threaded for the in-memory rows so single
- // core is sufficient; round-trip-only NamedPipe rows have a server-drain thread
- // that will share the core (acceptable — the bench measures end-to-end RT anyway).
- process.ProcessorAffinity = (IntPtr)1;
- process.PriorityClass = ProcessPriorityClass.High;
- stabilizationApplied = true;
- System.Console.WriteLine($"Stabilization: pinned to CPU 0 (affinity=0x1), priority=High.");
- }
- catch (Exception ex)
- {
- // Affinity/priority changes may fail on locked-down hosts (group policies, containers
- // without CAP_SYS_NICE on Linux, etc.). Surface and continue — the benchmark still
- // works, just with the platform default scheduling.
- System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}");
- }
- }
-
- try
- {
- var allResults = new List();
- var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
- var testDataSets = BenchmarkLoop.FilterByLayer(allTestDataSets, layer);
-
- System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {Configuration.GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{Configuration.TargetSampleMs} ms target) | Warmup: {Configuration.WarmupIterations} per phase (Ser/Des isolated) | Samples: {Configuration.BenchmarkSamples} (median) + pilot discard");
- System.Console.WriteLine($"Build: {Configuration.BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
- System.Console.WriteLine();
-
- // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens.
- // Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup
- // alone doesn't ensure that every Serialize/IBufferWriter overload is fully Tier 1 by the time we
- // start measuring. Symptom: first cell's BufferWriter variants run ~2x slower than the SAME variants
- // on later cells (e.g. Small BufWr reuse 9ms vs Medium BufWr reuse 4ms — even though Medium is bigger).
- // Pre-warmup runs every overload at least once with each data shape so .NET 9's tiered JIT promotes
- // them all in the background; the per-cell warmup that follows then locks in cache + branch state.
- if (Configuration.BenchmarkSamples > 1) // skip in DEBUG (single-sample fast iteration)
- {
- System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");
-
- foreach (var testData in testDataSets)
- {
- var preSerializers = CreateSerializers(testData, serializerMode);
- try
- {
- foreach (var s in preSerializers)
- {
- // Light warmup just to trigger Tier 0 → Tier 1 promotion. Phase-isolated:
- // Ser path first, then Des path — same pattern as the per-cell warmup in
- // RunBenchmarksForTestData (which still runs afterwards for cache/BTB warming).
- s.WarmupSerialize(2000);
- s.WarmupDeserialize(2000);
- }
- }
- finally
- {
- // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
- foreach (var s in preSerializers) (s as IDisposable)?.Dispose();
- }
- }
-
- // Let background tiered-JIT compilation drain before we begin measuring.
- if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
- System.Console.WriteLine("✓ Global pre-warmup complete.\n");
- }
-
- foreach (var testData in testDataSets)
- {
- System.Console.WriteLine($"\n{'═'.ToString().PadRight(70, '═')}");
- System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
- System.Console.WriteLine($"{'═'.ToString().PadRight(70, '═')}");
-
- var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
- allResults.AddRange(results);
- }
-
- // Print grouped results
- Output.PrintGroupedResults(allResults, testDataSets);
-
- // Save results to file
- Output.SaveResults(allResults, testDataSets);
-
- System.Console.WriteLine("\n✓ Benchmark complete!");
- }
- finally
- {
- // Restore process state — affinity/priority changes are process-wide and persist across
- // interactive-mode iterations of the menu. Without restore, the second menu run would
- // already be on CPU-0 + High priority before its own try-block applied them, masking
- // any stabilization-disabled comparison.
- if (stabilizationApplied && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
- {
- try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
- try { process.PriorityClass = origPriority; } catch { /* best-effort */ }
- }
- }
- }
-
- #region Benchmark Execution
-
-private static List RunBenchmarksForTestData(TestDataSet testData, string mode, string serializerMode)
- {
- var results = new List();
- var serializers = CreateSerializers(testData, serializerMode);
-
- // Round-trip correctness check — once per (cell × serializer), BEFORE warmup. Aborts the entire benchmark on failure.
- System.Console.WriteLine("Verifying round-trip correctness...");
-
- foreach (var serializer in serializers)
- {
- if (!serializer.VerifyRoundTrip())
- {
- System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}");
- System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting.");
-
- Environment.Exit(1);
- }
- }
-
- System.Console.WriteLine("✓ All serializers passed round-trip verification.");
-
- // Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: each serializer's Ser-path and
- // Des-path get COMPLETELY ISOLATED warmup→measure rounds, with a GC.Collect at every phase boundary.
- //
- // Why phase-isolation: a combined warmup (Ser+Des interleaved) leaves the CPU I-cache + branch-predictor
- // in a "compromise state" — neither Ser nor Des code-set dominates. The first phase to measure pays a
- // cache-miss penalty as its code-set displaces the leftover-warmup-state. Isolated warmup→measure pairs
- // keep the I-cache HOT for ONLY the measured path, both in the warmup (priming) and the measurement
- // (steady-state). Branch-predictor history also stays clean per path.
- //
- // GC.Collect at every boundary: removes residual allocation pressure from the previous phase (write-buffer
- // pool churn from Ser, deserialized object graph from Des) so the next phase starts with a quiescent
- // heap — GC tier-promotion timing during measurement is then driven only by THAT phase's allocations.
- //
- // Configuration.JitSleep per-phase: tiered JIT background promotion drain after each warmup (mode-aware: 0 ms in AOT).
- // Each phase's freshly-promoted methods settle before its timing starts.
- System.Console.WriteLine($"Running benchmarks (target ~{Configuration.TargetSampleMs} ms/sample × {Configuration.BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n");
-
- foreach (var serializer in serializers)
- {
- var result = new BenchmarkResult
- {
- TestDataName = testData.DisplayName, // Use DisplayName for IId% info
- Engine = serializer.Engine,
- IoMode = serializer.IoMode,
- DispatchMode = serializer.DispatchMode,
- OptionsPreset = serializer.OptionsPreset,
- OptionsDescription = serializer.OptionsDescription,
- SerializedSize = serializer.SerializedSize,
- SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes,
- SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes,
- IsRoundTripOnly = serializer.IsRoundTripOnly
- };
-
- // Group label for in-place \r progress. Identifies (cell × serializer) so a stuck benchmark
- // is visibly stuck on a specific row at a specific %% rather than silently hanging.
- var groupLabel = $"{result.SerializerName}";
-
- if (serializer.IsRoundTripOnly)
- {
- // Round-trip-only benchmarks (NamedPipe etc.): single phase — Serialize() performs the full RT,
- // Deserialize() is a no-op. We use the Ser-phase entry-points (WarmupSerialize) to warm the
- // entire round-trip path, then record into the RT result columns.
- if (mode is "all" or "serialize" or "ser")
- {
- BenchmarkLoop.ForceGcCollect();
- serializer.WarmupSerialize(Configuration.WarmupIterations);
- if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
-
- var rtIter = BenchmarkLoop.CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
- var (rtMed, rtMin, rtMax, rtStd) = BenchmarkLoop.RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");
- result.RoundTripTimeMs = rtMed;
- result.RoundTripTimeMinMs = rtMin;
- result.RoundTripTimeMaxMs = rtMax;
- result.RoundTripTimeStdDevMs = rtStd;
- result.RoundTripIterations = rtIter;
- // Process-wide allocation measurement: server-drain-thread allocations (server-side new byte[len])
- // also show up — otherwise current-thread alloc would only count the client side and look ~halved.
- result.RoundTripAllocBytesPerOp = BenchmarkLoop.MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]");
- }
- // mode == "deserialize" alone is meaningless for a round-trip-only benchmark; skip silently.
- }
- else
- {
- // ── Ser phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect.
- if (mode is "all" or "serialize" or "ser")
- {
- BenchmarkLoop.ForceGcCollect();
- serializer.WarmupSerialize(Configuration.WarmupIterations);
- if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
-
- var serIter = BenchmarkLoop.CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
- var (serMed, serMin, serMax, serStd) = BenchmarkLoop.RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");
- result.SerializeTimeMs = serMed;
- result.SerializeTimeMinMs = serMin;
- result.SerializeTimeMaxMs = serMax;
- result.SerializeTimeStdDevMs = serStd;
- result.SerializeIterations = serIter;
- // Dedicated alloc-only sample (separate from timing samples; keeps timing pure)
- result.SerializeAllocBytesPerOp = BenchmarkLoop.MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
- }
-
- // ── Des phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect.
- // The GC.Collect here is critical: it discards the Ser-phase's write-buffer pool churn so the
- // Des-phase's allocation measurement reflects ONLY Des-side allocations (deserialized object graph).
- if (mode is "all" or "deserialize" or "des")
- {
- BenchmarkLoop.ForceGcCollect();
- serializer.WarmupDeserialize(Configuration.WarmupIterations);
- if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
-
- var desIter = BenchmarkLoop.CalibrateIterations(() => serializer.Deserialize(), Configuration.TargetSampleMs);
- var (desMed, desMin, desMax, desStd) = BenchmarkLoop.RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");
- result.DeserializeTimeMs = desMed;
- result.DeserializeTimeMinMs = desMin;
- result.DeserializeTimeMaxMs = desMax;
- result.DeserializeTimeStdDevMs = desStd;
- result.DeserializeIterations = desIter;
- result.DeserializeAllocBytesPerOp = BenchmarkLoop.MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]");
- }
-
- // Compose RT from Ser+Des. Because Ser and Des may have DIFFERENT iter counts post-calibration,
- // batch-time addition would be misleading. Instead: compute per-op µs (iter-independent),
- // then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that
- // RoundTripTimeMs / RoundTripIterations * 1000 == Output.SerPerOp + Output.DesPerOp.
- var serPerOp = Output.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations);
- var desPerOp = Output.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations);
- var rtPerOp = serPerOp + desPerOp;
- result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations);
- result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
- result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
- }
-
- results.Add(result);
- Output.PrintResult(result);
- }
-
- // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources that must be released
- // before the next test data builds new ones — otherwise pipes / handles leak across test cells).
- foreach (var s in serializers) (s as IDisposable)?.Dispose();
-
- return results;
- }
-
- private static List CreateSerializers(TestDataSet testData, string serializerMode)
- {
- // FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path.
- // TWO benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + MemoryPack Byte[].
- // - Compact: smallest wire, UTF-8 encode/decode CPU cost vs MemPack head-to-head.
- // Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min.
- //
- // FastWire row (UTF-16 raw memcpy) commented out for the current optimization sprint —
- // we are tuning Compact mode against MemPack directly; FastWire was used as a noise-floor
- // reference earlier. Re-enable when revisiting Fast wire-mode performance.
- if (serializerMode == "fastestbyte")
- {
- var fastestByteOptions = AcBinarySerializerOptions.FastMode;
- fastestByteOptions.WireMode = Configuration.SelectedWireMode;
-
- return new List
- {
- new AcBinaryBenchmark(testData.Order, fastestByteOptions, "FastMode"),
- //new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"),
- new MemoryPackBenchmark(testData.Order, "Default"),
- };
- }
-
- // AsyncPipe-only mode — return ONLY the AsyncPipe streaming benchmark (no other serializer).
- // Streaming I/O has long-lived pipe setup + kernel-buffer overhead that, when interleaved with
- // the standard byte-array / IBufferWriter measurements, masks the steady-state numbers. Run it
- // in isolation so the timing numbers reflect ONLY the streaming path.
- if (serializerMode == "asyncpipe")
- {
- // NamedPipe — pipe-aligned chunk size for the long-lived IPC scenario. The chunkSize here
- // drives the AsyncPipeWriterOutput's chunk-on-wire size (header + data, page-aligned thanks to
- // the AcquireChunk fix) AND the kernel pipe buffer size (inBufferSize/outBufferSize on the
- // NamedPipeServerStream ctor). Same value across both layers = one WriteFile(chunkSize) syscall
- // fits blocking-free in one kernel pipe-buffer slot. Single source of truth for both app-level
- // wire chunk AND kernel transfer unit; change ONLY this line when tuning.
- var binaryFastModePipeChunkOnly = AcBinarySerializerOptions.FastMode;
- binaryFastModePipeChunkOnly.BufferWriterChunkSize = Configuration.PipeChunkSize;
- binaryFastModePipeChunkOnly.WireMode = Configuration.SelectedWireMode;
-
- return new List
- {
- // Chunked-framed AsyncPipe: SerializeChunkedFramed + AsyncPipeReaderInput.DrainFromAsync.
- // Measures the FULL streaming-I/O stack — wire framing + drain task + sliding-window buffer +
- // MRES wait-on-byte-shortage — over a kernel NamedPipe.
- new AcBinaryNamedPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"),
- // Raw byte[] over NamedPipe (sync receive, no chunk-framing). Same kernel-pipe transport,
- // same inBufferSize, but: serialize → byte[] → Stream.Write → Stream.Read → Deserialize(byte[]).
- // No drain task, no AsyncPipeReaderInput, no [201][UINT16][data]…[202] framing. Side-by-side with
- // the chunked-row above this isolates AsyncPipe-framework-overhead (Δ vs raw) from
- // kernel-transport-overhead (raw vs in-process Byte[]).
- new AcBinaryNamedPipeRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
- // Chunked-framed AsyncPipe over an IN-MEMORY System.IO.Pipelines.Pipe (NO NamedPipe, NO kernel).
- // Same chunked-streaming code path (SerializeChunkedFramed → AsyncPipeReaderInput) but with the
- // kernel-pipe replaced by a managed-only Pipe. Eliminates per-chunk syscall overhead (~30 µs/chunk
- // on NamedPipe → ~1-2 µs/chunk on in-memory Pipe). Side-by-side with the NamedPipe row above this
- // isolates pure CPU cost of the chunked-streaming framework (vs kernel-pipe transport cost) — the
- // in-memory Pipe row should be much closer to the raw-byte[] row, validating that NamedPipe loopback
- // is the worst-case benchmark scenario for chunked-streaming and not representative of real network
- // / file / cross-thread Pipe scenarios.
- new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"),
- // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport at all). Apples-to-apples
- // baseline for the in-memory chunked row above: same in-memory transport (zero kernel), but raw
- // byte[] vs chunked-streaming wire format. Completes the 2x2 matrix [chunked,raw] × [kernel,memory].
- new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
- };
- }
-
- // Standard mode — all serializers EXCEPT AsyncPipe (the streaming benchmark is opt-in via the
- // AsyncPipe menu / CLI mode, never bundled with the steady-state suite).
-
- var binaryNoInternOption = AcBinarySerializerOptions.Default;
- binaryNoInternOption.UseStringInterning = StringInterningMode.None;
- binaryNoInternOption.WireMode = Configuration.SelectedWireMode;
-
- var binaryDefaultNoSgenOption = AcBinarySerializerOptions.Default;
- binaryDefaultNoSgenOption.UseGeneratedCode = false;
- binaryDefaultNoSgenOption.WireMode = Configuration.SelectedWireMode;
-
- var binaryFastModeNoSgenOption = AcBinarySerializerOptions.FastMode;
- binaryFastModeNoSgenOption.UseGeneratedCode = false;
- binaryFastModeNoSgenOption.WireMode = Configuration.SelectedWireMode;
-
- var binaryFastModeOption = AcBinarySerializerOptions.FastMode;
- binaryFastModeOption.WireMode = Configuration.SelectedWireMode;
-
- // BufWr new — 4 KB chunk size for the FRESH ArrayBufferWriter scenario. The chunkSize here drives
- // the serializer's GetSpan(N) request → the ArrayBufferWriter's internal allocation per call.
- // Small chunk = small per-call allocation, optimum for one-shot serialization where each iteration
- // allocates a fresh ABW. Independent of the AsyncPipe profile (different mechanism: alloc overhead
- // vs syscall count).
- var binaryFastModeBufWrChunk = AcBinarySerializerOptions.FastMode;
- binaryFastModeBufWrChunk.BufferWriterChunkSize = Configuration.PipeChunkSize;
- binaryFastModeBufWrChunk.WireMode = Configuration.SelectedWireMode;
-
- // In-memory Pipe variant — same 4 KB chunkSize as the AsyncPipe mode, no kernel-pipe alignment
- // concern (managed slabs are not page-aligned anyway). Drives SerializeChunkedFramed via the in-memory
- // System.IO.Pipelines.Pipe (zero-copy slab handoff between producer and drain task).
- var binaryFastModePipeChunkInMem = AcBinarySerializerOptions.FastMode;
- binaryFastModePipeChunkInMem.BufferWriterChunkSize = Configuration.PipeChunkSize;
- binaryFastModePipeChunkInMem.WireMode = Configuration.SelectedWireMode;
-
- var defaultOptions = AcBinarySerializerOptions.Default;
- defaultOptions.UseStringInterning = StringInterningMode.None;
- defaultOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId;
- defaultOptions.WireMode = Configuration.SelectedWireMode;
-
- return new List
- {
- // ============================================================
- // AcBinary — Byte[] API (uncomment to compare option presets side-by-side)
- // ============================================================
- // Fastest Byte[] — SGen path (UseGeneratedCode=true, default).
- new AcBinaryBenchmark(testData.Order, binaryFastModeOption, "FastMode"),
- // Fastest Byte[] — Runtime path (UseGeneratedCode=false). Same wire/options, no source-generated dispatch.
- // Always paired with the SGen variant so every layer can compare the SGen speed-up apples-to-apples.
- // NativeAOT-safe: AcSerializerCommon.Create*Getter/Setter falls back to reflection-based delegates
- // when RuntimeFeature.IsDynamicCodeSupported is false (slower but works under AOT publish).
- new AcBinaryBenchmark(testData.Order, binaryFastModeNoSgenOption, "FastMode"),
- // Default preset Byte[] — RefHandling=OnlyId (deduplicates IId-shared references on the wire) +
- // UseStringInterning=All (deduplicates repeated strings). Showcases the Default preset's wire-size
- // and CPU trade-off vs FastMode on the ~20% IId-ref / repeated-string test data.
-
- new AcBinaryBenchmark(testData.Order, defaultOptions, "Default"),
- //new AcBinaryBenchmark(testData.Order, binaryDefaultNoSgenOption, "Default"),
- //new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"),
- //new AcBinaryBenchmark(testData.Order, binaryNoInternOption, "NoIntern"),
-
- // AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario)
- new AcBinaryBufferWriterBenchmark(testData.Order, binaryFastModeOption, "FastMode"),
-
- // AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario).
- // 4 KB chunk size from binaryFastModeBufWrChunk — minimises the per-call ArrayBufferWriter
- // allocation. Optimum for this scenario.
- new AcBinaryFreshBufferWriterBenchmark(testData.Order, binaryFastModeBufWrChunk, "FastMode (4KB)"),
-
- // AcBinary chunked-streaming over an IN-MEMORY Pipe (no kernel transport). Side-by-side with the
- // Byte[] / IBufferWriter rows above this shows the chunked-streaming framework's pure CPU cost
- // (no NamedPipe loopback noise) vs the simpler in-process serialize-then-deserialize patterns.
- // The IO column shows "Pipe(in-mem)" — distinct from the NamedPipe AsyncPipe rows in [P] mode.
- new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeChunk)"),
-
- // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport, no kernel, no Pipe). Apples-to-
- // apples baseline for the in-memory chunked row above: same in-memory pattern, but raw byte[] vs
- // chunked-streaming wire format. The IO column shows "Bytes(in-mem)".
- new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeRaw)"),
-
- // AsyncPipe streaming over kernel NamedPipe (AcBinaryNamedPipeBenchmark) is intentionally OMITTED
- // here — run it via the dedicated AsyncPipe menu [P] / CLI mode for isolated kernel-transport
- // measurements.
-
- // ============================================================
- // MemoryPack — three I/O modes for apples-to-apples comparison
- // ============================================================
- new MemoryPackBenchmark(testData.Order, "Default"),
- new MemoryPackBufferWriterBenchmark(testData.Order, "Default"),
- new MemoryPackFreshBufferWriterBenchmark(testData.Order, "Default"),
-
- // ============================================================
- // MessagePack — for legacy comparison
- // ============================================================
-#if !AYCODE_NATIVEAOT
- // MessagePack v3's DynamicGenericResolver uses Activator.CreateInstance on trimmed
- // ListFormatter et al. — fails under NativeAOT publish with "No parameterless constructor".
- // Excluded from the AOT build; available for regular JIT runs only.
- new MessagePackBenchmark(testData.Order, "ContractBased"),
-#endif
-
- // System.Text.Json (commented — JSON serializer for reference; not in active suite)
- //new SystemTextJsonBenchmark(testData.Order, "Default")
- };
- }
-
- #endregion
+ // RunBenchmark + RunBenchmarksForTestData + CreateSerializers → BenchmarkLoop.cs
// Serializer implementations (ISerializerBenchmark + 12 concrete benchmark classes) → Benchmarks/