[LOADED_DOCS: 2 files, no new loads]

Refactor: move benchmark logic to BenchmarkLoop.cs. Moved all benchmark execution logic (RunBenchmark, RunBenchmarksForTestData, CreateSerializers) from Program.cs into the existing static class BenchmarkLoop in BenchmarkLoop.cs, alongside the timing / calibration / allocation helpers it already contained. Program.cs now delegates benchmark runs to BenchmarkLoop, improving separation of concerns. No changes to benchmark functionality.
2026-05-12 14:09:43 +02:00 · 2026-05-12 14:09:43 +02:00 · ad9e05413c
parent c722f775f6
commit ad9e05413c
2 changed files with 459 additions and 462 deletions
--- a/AyCode.Core.Serializers.Console/BenchmarkLoop.cs
+++ b/AyCode.Core.Serializers.Console/BenchmarkLoop.cs
@ -1,3 +1,5 @@
+using AyCode.Core.Serializers.Binaries;
+using AyCode.Core.Serializers.Console.Benchmarks;
 using AyCode.Core.Tests.TestModels;
 using MemoryPack;
 using System.Diagnostics;
@ -7,24 +9,469 @@ using System.Text.Json;
 namespace AyCode.Core.Serializers.Console;

 /// <summary>
-/// Benchmark execution helpers: timing (<see cref="RunTimed"/>), per-cell adaptive iteration
-/// calibration (<see cref="CalibrateIterations"/>), allocation measurement
-/// (<see cref="MeasureAllocation"/> + <see cref="MeasureAllocationTotal"/>), in-place
-/// <c>\r</c>-progress reporting, full-GC phase-boundary helper (<see cref="ForceGcCollect"/>),
-/// startup validation (<see cref="ValidateMemoryPackSetup"/>), and per-cell round-trip equality
-/// (<see cref="DeepEqualsViaJson"/>). Pure benchmark-execution infrastructure — no display
-/// formatting (that lives in <c>Output</c>) and no per-engine glue (which lives with the
-/// individual <c>ISerializerBenchmark</c> implementations).
+/// Benchmark execution: end-to-end orchestration (<see cref="RunBenchmark"/>), per-cell loop
+/// (<see cref="RunBenchmarksForTestData"/>), serializer factory (<see cref="CreateSerializers"/>),
+/// and the timing / calibration / allocation helpers. Pure benchmark-execution infrastructure —
+/// no display formatting (that lives in <c>Output</c>) and no UX-flow (that lives in <c>Program</c>
+/// + <c>Menu</c>).
 /// </summary>
 internal static class BenchmarkLoop
 {
+    /// <summary>
+    /// Runs the benchmark suite end-to-end for the given configuration: optional process
+    /// stabilization → global JIT pre-warmup → per-cell warmup + measurement → grouped results
+    /// print → save to disk. Used by both the CLI and interactive menu paths; the interactive
+    /// loop calls this repeatedly without restarting the process.
+    /// </summary>
+    /// <param name="layer">Layer filter handed to <c>FilterByLayer</c> to select which test cells run.</param>
+    /// <param name="opMode">Operation mode forwarded unchanged to <see cref="RunBenchmarksForTestData"/>.</param>
+    /// <param name="serializerMode">Serializer-set selector forwarded unchanged to <see cref="CreateSerializers"/>.</param>
+    internal static void RunBenchmark(string layer, string opMode, string serializerMode)
+    {
+        System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗");
+        System.Console.WriteLine("║          COMPREHENSIVE SERIALIZER BENCHMARK SUITE                    ║");
+        System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");
+
+        // Stabilization: pin the entire benchmark process to a single logical CPU and bump priority
+        // class. Single-core affinity stops the OS from migrating the bench thread between cores
+        // mid-sample (a migration evicts L1/L2 caches and corrupts a measurement); High priority
+        // reduces preemption by background tasks (Defender scans, indexer, etc.) that otherwise
+        // randomly inflate samples by 5-15%.
+        // Try/finally guarantees the original state is restored even if a benchmark throws — leaving
+        // a developer machine pinned to one core after a crashed run is a real foot-gun.
+        // Skipped on Debug single-sample mode (Configuration.BenchmarkSamples <= 1) where stabilization is moot.
+        // The Process object wraps an OS handle, so it is disposed when this method returns.
+        using var process = Process.GetCurrentProcess();
+        var origAffinity = (IntPtr)0;
+        var origPriority = ProcessPriorityClass.Normal;
+        var stabilizationApplied = false;
+
+        // ProcessorAffinity is only supported on Windows + Linux (CA1416). On every other OS the
+        // guard below skips the WHOLE stabilization step — affinity and priority alike — and the
+        // benchmark runs with the platform's default scheduling.
+        if (Configuration.BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
+        {
+            // Tracks a half-applied state: affinity already pinned, priority not (yet) raised.
+            var affinityPinned = false;
+
+            try
+            {
+                origAffinity = process.ProcessorAffinity;
+                origPriority = process.PriorityClass;
+                // Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one
+                // core, consistently" — not which one. If CPU 0 is heavily contended on the host
+                // (e.g. dedicated to system-wide IRQs on some Windows configs), the user can tweak
+                // the mask here. The benchmark is single-threaded for the in-memory rows so single
+                // core is sufficient; round-trip-only NamedPipe rows have a server-drain thread
+                // that will share the core (acceptable — the bench measures end-to-end RT anyway).
+                process.ProcessorAffinity = (IntPtr)1;
+                affinityPinned = true;
+                process.PriorityClass = ProcessPriorityClass.High;
+                stabilizationApplied = true;
+                System.Console.WriteLine("Stabilization: pinned to CPU 0 (affinity=0x1), priority=High.");
+            }
+            catch (Exception ex)
+            {
+                // If the affinity pin succeeded but raising the priority failed, undo the pin right
+                // away. Otherwise the run would stay pinned while reporting "SKIPPED", and — because
+                // stabilizationApplied is false — the finally block below would never restore it.
+                if (affinityPinned)
+                {
+                    try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
+                }
+
+                // Affinity/priority changes may fail on locked-down hosts (group policies, containers
+                // without CAP_SYS_NICE on Linux, etc.). Surface and continue — the benchmark still
+                // works, just with the platform default scheduling.
+                System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}");
+            }
+        }
+
+        try
+        {
+            var allResults = new List<BenchmarkResult>();
+            var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
+            var testDataSets = FilterByLayer(allTestDataSets, layer);
+
+            System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {Configuration.GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{Configuration.TargetSampleMs} ms target) | Warmup: {Configuration.WarmupIterations} per phase (Ser/Des isolated) | Samples: {Configuration.BenchmarkSamples} (median) + pilot discard");
+            System.Console.WriteLine($"Build: {Configuration.BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
+            System.Console.WriteLine();
+
+            // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens.
+            // Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup
+            // alone doesn't ensure that every Serialize<T>/IBufferWriter overload is fully Tier 1 by the time we
+            // start measuring. Symptom: first cell's BufferWriter variants run ~2x slower than the SAME variants
+            // on later cells (e.g. Small BufWr reuse 9ms vs Medium BufWr reuse 4ms — even though Medium is bigger).
+            // Pre-warmup runs every overload at least once with each data shape so the tiered JIT promotes
+            // them all in the background; the per-cell warmup that follows then locks in cache + branch state.
+            if (Configuration.BenchmarkSamples > 1) // skip in DEBUG (single-sample fast iteration)
+            {
+                System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");
+
+                foreach (var testData in testDataSets)
+                {
+                    var preSerializers = CreateSerializers(testData, serializerMode);
+                    try
+                    {
+                        foreach (var s in preSerializers)
+                        {
+                            // Light warmup just to trigger Tier 0 → Tier 1 promotion. Phase-isolated:
+                            // Ser path first, then Des path — same pattern as the per-cell warmup in
+                            // RunBenchmarksForTestData (which still runs afterwards for cache/BTB warming).
+                            s.WarmupSerialize(2000);
+                            s.WarmupDeserialize(2000);
+                        }
+                    }
+                    finally
+                    {
+                        // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
+                        foreach (var s in preSerializers) (s as IDisposable)?.Dispose();
+                    }
+                }
+
+                // Let background tiered-JIT compilation drain before we begin measuring.
+                if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
+                System.Console.WriteLine("✓ Global pre-warmup complete.\n");
+            }
+
+            // 70-character rule printed above and below each test-data banner.
+            var separator = new string('═', 70);
+
+            foreach (var testData in testDataSets)
+            {
+                System.Console.WriteLine($"\n{separator}");
+                System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
+                System.Console.WriteLine(separator);
+
+                var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
+                allResults.AddRange(results);
+            }
+
+            // Print grouped results
+            Output.PrintGroupedResults(allResults, testDataSets);
+
+            // Save results to file
+            Output.SaveResults(allResults, testDataSets);
+
+            System.Console.WriteLine("\n✓ Benchmark complete!");
+        }
+        finally
+        {
+            // Restore process state — affinity/priority changes are process-wide and persist across
+            // interactive-mode iterations of the menu. Without restore, the second menu run would
+            // already be on CPU-0 + High priority before its own try-block applied them, masking
+            // any stabilization-disabled comparison.
+            // The OS check repeats the guard used when applying, which keeps the CA1416 analyzer satisfied.
+            if (stabilizationApplied && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
+            {
+                try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
+                try { process.PriorityClass = origPriority; } catch { /* best-effort */ }
+            }
+        }
+    }
+
+    /// <summary>
+    /// Benchmarks every serializer produced by <see cref="CreateSerializers"/> against one test cell:
+    /// verifies round-trip correctness first, then runs the phase-isolated warmup → calibrate →
+    /// time → allocation cycle per serializer and prints each row as it completes.
+    /// </summary>
+    /// <param name="testData">The test cell to benchmark.</param>
+    /// <param name="mode">
+    /// Which phases run: <c>"all"</c>, <c>"serialize"</c>/<c>"ser"</c>, or <c>"deserialize"</c>/<c>"des"</c>.
+    /// Any other value runs neither phase.
+    /// </param>
+    /// <param name="serializerMode">Serializer-set selector forwarded to <see cref="CreateSerializers"/>.</param>
+    /// <returns>One <see cref="BenchmarkResult"/> per serializer, in creation order.</returns>
+    /// <remarks>
+    /// A failed round-trip check terminates the process via <see cref="Environment.Exit(int)"/>.
+    /// </remarks>
+    private static List<BenchmarkResult> RunBenchmarksForTestData(TestDataSet testData, string mode, string serializerMode)
+    {
+        var results = new List<BenchmarkResult>();
+        var serializers = CreateSerializers(testData, serializerMode);
+
+        // try/finally: serializers may own OS resources (pipes / file handles). They must be released
+        // even when verification, warmup or measurement throws — same pattern as the pre-warmup loop
+        // in RunBenchmark.
+        try
+        {
+            // Round-trip correctness check — once per (cell × serializer), BEFORE warmup. Aborts the entire benchmark on failure.
+            System.Console.WriteLine("Verifying round-trip correctness...");
+
+            foreach (var serializer in serializers)
+            {
+                if (!serializer.VerifyRoundTrip())
+                {
+                    System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}");
+                    System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting.");
+
+                    // Environment.Exit ends the process here without unwinding through the finally
+                    // blocks of this thread; the OS reclaims the handles on process exit.
+                    Environment.Exit(1);
+                }
+            }
+
+            System.Console.WriteLine("✓ All serializers passed round-trip verification.");
+
+            // Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: each serializer's Ser-path and
+            // Des-path get COMPLETELY ISOLATED warmup→measure rounds, with a GC.Collect at every phase boundary.
+            //
+            // Why phase-isolation: a combined warmup (Ser+Des interleaved) leaves the CPU I-cache + branch-predictor
+            // in a "compromise state" — neither Ser nor Des code-set dominates. The first phase to measure pays a
+            // cache-miss penalty as its code-set displaces the leftover-warmup-state. Isolated warmup→measure pairs
+            // keep the I-cache HOT for ONLY the measured path, both in the warmup (priming) and the measurement
+            // (steady-state). Branch-predictor history also stays clean per path.
+            //
+            // GC.Collect at every boundary: removes residual allocation pressure from the previous phase (write-buffer
+            // pool churn from Ser, deserialized object graph from Des) so the next phase starts with a quiescent
+            // heap — GC tier-promotion timing during measurement is then driven only by THAT phase's allocations.
+            //
+            // Configuration.JitSleep per-phase: tiered JIT background promotion drain after each warmup (mode-aware: 0 ms in AOT).
+            // Each phase's freshly-promoted methods settle before its timing starts.
+            System.Console.WriteLine($"Running benchmarks (target ~{Configuration.TargetSampleMs} ms/sample × {Configuration.BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n");
+
+            foreach (var serializer in serializers)
+            {
+                var result = new BenchmarkResult
+                {
+                    TestDataName = testData.DisplayName,  // Use DisplayName for IId% info
+                    Engine = serializer.Engine,
+                    IoMode = serializer.IoMode,
+                    DispatchMode = serializer.DispatchMode,
+                    OptionsPreset = serializer.OptionsPreset,
+                    OptionsDescription = serializer.OptionsDescription,
+                    SerializedSize = serializer.SerializedSize,
+                    SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes,
+                    SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes,
+                    IsRoundTripOnly = serializer.IsRoundTripOnly
+                };
+
+                // Group label for in-place \r progress. Identifies (cell × serializer) so a stuck benchmark
+                // is visibly stuck on a specific row at a specific % rather than silently hanging.
+                var groupLabel = $"{result.SerializerName}";
+
+                if (serializer.IsRoundTripOnly)
+                {
+                    // Round-trip-only benchmarks (NamedPipe etc.): single phase — Serialize() performs the full RT,
+                    // Deserialize() is a no-op. We use the Ser-phase entry-points (WarmupSerialize) to warm the
+                    // entire round-trip path, then record into the RT result columns.
+                    if (mode is "all" or "serialize" or "ser")
+                    {
+                        ForceGcCollect();
+                        serializer.WarmupSerialize(Configuration.WarmupIterations);
+                        if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
+
+                        var rtIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
+                        var (rtMed, rtMin, rtMax, rtStd) = RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");
+                        result.RoundTripTimeMs = rtMed;
+                        result.RoundTripTimeMinMs = rtMin;
+                        result.RoundTripTimeMaxMs = rtMax;
+                        result.RoundTripTimeStdDevMs = rtStd;
+                        result.RoundTripIterations = rtIter;
+                        // Process-wide allocation measurement: server-drain-thread allocations (server-side new byte[len])
+                        // also show up — otherwise current-thread alloc would only count the client side and look ~halved.
+                        result.RoundTripAllocBytesPerOp = MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]");
+                    }
+                    // mode == "deserialize" alone is meaningless for a round-trip-only benchmark; skip silently.
+                }
+                else
+                {
+                    // ── Ser phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect.
+                    if (mode is "all" or "serialize" or "ser")
+                    {
+                        ForceGcCollect();
+                        serializer.WarmupSerialize(Configuration.WarmupIterations);
+                        if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
+
+                        var serIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
+                        var (serMed, serMin, serMax, serStd) = RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");
+                        result.SerializeTimeMs = serMed;
+                        result.SerializeTimeMinMs = serMin;
+                        result.SerializeTimeMaxMs = serMax;
+                        result.SerializeTimeStdDevMs = serStd;
+                        result.SerializeIterations = serIter;
+                        // Dedicated alloc-only sample (separate from timing samples; keeps timing pure)
+                        result.SerializeAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
+                    }
+
+                    // ── Des phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect.
+                    // The GC.Collect here is critical: it discards the Ser-phase's write-buffer pool churn so the
+                    // Des-phase's allocation measurement reflects ONLY Des-side allocations (deserialized object graph).
+                    if (mode is "all" or "deserialize" or "des")
+                    {
+                        ForceGcCollect();
+                        serializer.WarmupDeserialize(Configuration.WarmupIterations);
+                        if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
+
+                        var desIter = CalibrateIterations(() => serializer.Deserialize(), Configuration.TargetSampleMs);
+                        var (desMed, desMin, desMax, desStd) = RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");
+                        result.DeserializeTimeMs = desMed;
+                        result.DeserializeTimeMinMs = desMin;
+                        result.DeserializeTimeMaxMs = desMax;
+                        result.DeserializeTimeStdDevMs = desStd;
+                        result.DeserializeIterations = desIter;
+                        result.DeserializeAllocBytesPerOp = MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]");
+                    }
+
+                    // Compose RT from Ser+Des. Because Ser and Des may have DIFFERENT iter counts post-calibration,
+                    // batch-time addition would be misleading. Instead: compute per-op µs (iter-independent),
+                    // then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that
+                    // RoundTripTimeMs / RoundTripIterations * 1000 == Output.SerPerOp + Output.DesPerOp.
+                    // NOTE(review): when only one phase ran (mode "ser" or "des"), the skipped phase's
+                    // time/iteration fields are never assigned here — presumably still 0. Confirm that
+                    // Output.ToPerOpMicros tolerates a zero iteration count.
+                    var serPerOp = Output.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations);
+                    var desPerOp = Output.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations);
+                    var rtPerOp = serPerOp + desPerOp;
+                    result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations);
+                    result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
+                    result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
+                }
+
+                results.Add(result);
+                Output.PrintResult(result);
+            }
+        }
+        finally
+        {
+            // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources that must be released
+            // before the next test data builds new ones — otherwise pipes / handles leak across test cells).
+            foreach (var s in serializers) (s as IDisposable)?.Dispose();
+        }
+
+        return results;
+    }
+
+    /// <summary>
+    /// Builds the ordered list of benchmark rows for one test cell. <paramref name="serializerMode"/>
+    /// selects the set: <c>"fastestbyte"</c> → AcBinary Byte[] + MemoryPack Byte[];
+    /// <c>"asyncpipe"</c> → the four pipe / cross-thread rows; any other value → the standard suite.
+    /// Every row is constructed over <c>testData.Order</c>, and every AcBinary options object gets
+    /// <c>WireMode = Configuration.SelectedWireMode</c>.
+    /// </summary>
+    /// <param name="testData">Test cell whose <c>Order</c> payload each benchmark row wraps.</param>
+    /// <param name="serializerMode">Serializer-set selector (compared with ordinal <c>==</c>, so case-sensitive).</param>
+    /// <returns>
+    /// A new list of rows in the order they are measured. Callers in this file dispose any item
+    /// that implements <see cref="IDisposable"/> once they are done with the list.
+    /// </returns>
+    private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, string serializerMode)
+    {
+        // FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path.
+        // TWO benchmarks: AcBinary FastMode Byte[] + MemoryPack Byte[].
+        //   - The AcBinary wire mode is whatever Configuration.SelectedWireMode holds; the earlier
+        //     "Compact UTF-8" description only applies when that setting is Compact — TODO confirm.
+        // Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min.
+        //
+        // FastWire row (UTF-16 raw memcpy) commented out for the current optimization sprint —
+        // we are tuning Compact mode against MemPack directly; FastWire was used as a noise-floor
+        // reference earlier. Re-enabling it requires re-introducing a `fastWireOptions` local:
+        // the commented-out row below references it, but it is not declared in this method.
+        if (serializerMode == "fastestbyte")
+        {
+            var fastestByteOptions = AcBinarySerializerOptions.FastMode;
+            fastestByteOptions.WireMode = Configuration.SelectedWireMode;
+
+            return new List<ISerializerBenchmark>
+            {
+                new AcBinaryBenchmark(testData.Order, fastestByteOptions, "FastMode"),
+                //new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"),
+                new MemoryPackBenchmark(testData.Order, "Default"),
+            };
+        }
+
+        // AsyncPipe-only mode — return ONLY the pipe / cross-thread streaming rows (no other serializer).
+        // Streaming I/O has long-lived pipe setup + kernel-buffer overhead that, when interleaved with
+        // the standard byte-array / IBufferWriter measurements, masks the steady-state numbers. Run it
+        // in isolation so the timing numbers reflect ONLY the streaming path.
+        if (serializerMode == "asyncpipe")
+        {
+            // NamedPipe — pipe-aligned chunk size for the long-lived IPC scenario. The chunkSize here
+            // drives the AsyncPipeWriterOutput's chunk-on-wire size (header + data, page-aligned thanks to
+            // the AcquireChunk fix) AND the kernel pipe buffer size (inBufferSize/outBufferSize on the
+            // NamedPipeServerStream ctor). Same value across both layers = one WriteFile(chunkSize) syscall
+            // fits blocking-free in one kernel pipe-buffer slot. Single source of truth for both app-level
+            // wire chunk AND kernel transfer unit; change ONLY Configuration.PipeChunkSize when tuning.
+            // NOTE(review): the kernel-buffer wiring lives in the benchmark classes, not here — confirm there.
+            var binaryFastModePipeChunkOnly = AcBinarySerializerOptions.FastMode;
+            binaryFastModePipeChunkOnly.BufferWriterChunkSize = Configuration.PipeChunkSize;
+            binaryFastModePipeChunkOnly.WireMode = Configuration.SelectedWireMode;
+
+            return new List<ISerializerBenchmark>
+            {
+                // Chunked-framed AsyncPipe: SerializeChunkedFramed + AsyncPipeReaderInput.DrainFromAsync.
+                // Measures the FULL streaming-I/O stack — wire framing + drain task + sliding-window buffer +
+                // MRES wait-on-byte-shortage — over a kernel NamedPipe.
+                new AcBinaryNamedPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"),
+                // Raw byte[] over NamedPipe (sync receive, no chunk-framing). Same kernel-pipe transport,
+                // same inBufferSize, but: serialize → byte[] → Stream.Write → Stream.Read → Deserialize<T>(byte[]).
+                // No drain task, no AsyncPipeReaderInput, no [201][UINT16][data]…[202] framing. Side-by-side with
+                // the chunked-row above this isolates AsyncPipe-framework-overhead (Δ vs raw) from
+                // kernel-transport-overhead (raw vs in-process Byte[]).
+                new AcBinaryNamedPipeRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
+                // Chunked-framed AsyncPipe over an IN-MEMORY System.IO.Pipelines.Pipe (NO NamedPipe, NO kernel).
+                // Same chunked-streaming code path (SerializeChunkedFramed → AsyncPipeReaderInput) but with the
+                // kernel-pipe replaced by a managed-only Pipe. Eliminates per-chunk syscall overhead (~30 µs/chunk
+                // on NamedPipe → ~1-2 µs/chunk on in-memory Pipe). Side-by-side with the NamedPipe row above this
+                // isolates pure CPU cost of the chunked-streaming framework (vs kernel-pipe transport cost) — the
+                // in-memory Pipe row should be much closer to the raw-byte[] row, validating that NamedPipe loopback
+                // is the worst-case benchmark scenario for chunked-streaming and not representative of real network
+                // / file / cross-thread Pipe scenarios.
+                new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"),
+                // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport at all). Apples-to-apples
+                // baseline for the in-memory chunked row above: same in-memory transport (zero kernel), but raw
+                // byte[] vs chunked-streaming wire format. Completes the 2x2 matrix [chunked,raw] × [kernel,memory].
+                new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
+            };
+        }
+
+        // Standard mode — all serializers EXCEPT the kernel-NamedPipe AsyncPipe rows (those are opt-in via
+        // the AsyncPipe menu / CLI mode, never bundled with the steady-state suite). The in-memory pipe
+        // rows ARE part of this suite (see the list below).
+
+        // Only referenced by the commented-out "NoIntern" row below; kept so that row can be re-enabled.
+        var binaryNoInternOption = AcBinarySerializerOptions.Default;
+        binaryNoInternOption.UseStringInterning = StringInterningMode.None;
+        binaryNoInternOption.WireMode = Configuration.SelectedWireMode;
+
+        // Only referenced by the commented-out runtime-path "Default" row below; kept for re-enabling.
+        var binaryDefaultNoSgenOption = AcBinarySerializerOptions.Default;
+        binaryDefaultNoSgenOption.UseGeneratedCode = false;
+        binaryDefaultNoSgenOption.WireMode = Configuration.SelectedWireMode;
+
+        var binaryFastModeNoSgenOption = AcBinarySerializerOptions.FastMode;
+        binaryFastModeNoSgenOption.UseGeneratedCode = false;
+        binaryFastModeNoSgenOption.WireMode = Configuration.SelectedWireMode;
+
+        var binaryFastModeOption = AcBinarySerializerOptions.FastMode;
+        binaryFastModeOption.WireMode = Configuration.SelectedWireMode;
+
+        // BufWr new — small chunk size for the FRESH ArrayBufferWriter scenario. The chunkSize here drives
+        // the serializer's GetSpan(N) request → the ArrayBufferWriter's internal allocation per call.
+        // Small chunk = small per-call allocation, optimum for one-shot serialization where each iteration
+        // allocates a fresh ABW. Independent of the AsyncPipe profile (different mechanism: alloc overhead
+        // vs syscall count).
+        // NOTE(review): the value actually comes from Configuration.PipeChunkSize; the "4 KB" wording and
+        // the "FastMode (4KB)" row label assume that setting is 4096 — confirm against Configuration.
+        var binaryFastModeBufWrChunk = AcBinarySerializerOptions.FastMode;
+        binaryFastModeBufWrChunk.BufferWriterChunkSize = Configuration.PipeChunkSize;
+        binaryFastModeBufWrChunk.WireMode = Configuration.SelectedWireMode;
+
+        // In-memory Pipe variant — same Configuration.PipeChunkSize as the AsyncPipe mode, no kernel-pipe
+        // alignment concern (managed slabs are not page-aligned anyway). Drives SerializeChunkedFramed via the
+        // in-memory System.IO.Pipelines.Pipe (zero-copy slab handoff between producer and drain task).
+        var binaryFastModePipeChunkInMem = AcBinarySerializerOptions.FastMode;
+        binaryFastModePipeChunkInMem.BufferWriterChunkSize = Configuration.PipeChunkSize;
+        binaryFastModePipeChunkInMem.WireMode = Configuration.SelectedWireMode;
+
+        // Options for the active "Default" row: Default preset with string interning switched OFF and
+        // reference handling set to OnlyId.
+        var defaultOptions = AcBinarySerializerOptions.Default;
+        defaultOptions.UseStringInterning = StringInterningMode.None;
+        defaultOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId;
+        defaultOptions.WireMode = Configuration.SelectedWireMode;
+
+        return new List<ISerializerBenchmark>
+        {
+            // ============================================================
+            // AcBinary — Byte[] API (uncomment to compare option presets side-by-side)
+            // ============================================================
+            // Fastest Byte[] — SGen path (UseGeneratedCode left at the FastMode preset's value; presumably true).
+            new AcBinaryBenchmark(testData.Order, binaryFastModeOption, "FastMode"),
+            // Fastest Byte[] — Runtime path (UseGeneratedCode=false). Same wire/options, no source-generated dispatch.
+            // Always paired with the SGen variant so every layer can compare the SGen speed-up apples-to-apples.
+            // NativeAOT-safe: AcSerializerCommon.Create*Getter/Setter falls back to reflection-based delegates
+            // when RuntimeFeature.IsDynamicCodeSupported is false (slower but works under AOT publish).
+            // NOTE(review): this row carries the same "FastMode" label as the SGen row above — presumably the
+            // two are told apart by the DispatchMode column; confirm in the output.
+            new AcBinaryBenchmark(testData.Order, binaryFastModeNoSgenOption, "FastMode"),
+            // Default preset Byte[] — RefHandling=OnlyId (deduplicates IId-shared references on the wire) with
+            // UseStringInterning=None (defaultOptions turns interning OFF, so repeated strings are NOT deduplicated
+            // in this row). Showcases the reference-handling wire-size and CPU trade-off vs FastMode on the
+            // ~20% IId-ref test data.
+
+            new AcBinaryBenchmark(testData.Order, defaultOptions, "Default"),
+            //new AcBinaryBenchmark(testData.Order, binaryDefaultNoSgenOption, "Default"),
+            //new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"),
+            //new AcBinaryBenchmark(testData.Order, binaryNoInternOption, "NoIntern"),
+
+            // AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario)
+            new AcBinaryBufferWriterBenchmark(testData.Order, binaryFastModeOption, "FastMode"),
+
+            // AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario).
+            // Chunk size from binaryFastModeBufWrChunk (Configuration.PipeChunkSize) — minimises the per-call
+            // ArrayBufferWriter allocation. Optimum for this scenario.
+            new AcBinaryFreshBufferWriterBenchmark(testData.Order, binaryFastModeBufWrChunk, "FastMode (4KB)"),
+
+            // AcBinary chunked-streaming over an IN-MEMORY Pipe (no kernel transport). Side-by-side with the
+            // Byte[] / IBufferWriter rows above this shows the chunked-streaming framework's pure CPU cost
+            // (no NamedPipe loopback noise) vs the simpler in-process serialize-then-deserialize patterns.
+            // The IO column shows "Pipe(in-mem)" — distinct from the NamedPipe AsyncPipe rows in [P] mode.
+            new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeChunk)"),
+
+            // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport, no kernel, no Pipe). Apples-to-
+            // apples baseline for the in-memory chunked row above: same in-memory pattern, but raw byte[] vs
+            // chunked-streaming wire format. The IO column shows "Bytes(in-mem)".
+            new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeRaw)"),
+
+            // AsyncPipe streaming over kernel NamedPipe (AcBinaryNamedPipeBenchmark) is intentionally OMITTED
+            // here — run it via the dedicated AsyncPipe menu [P] / CLI mode for isolated kernel-transport
+            // measurements.
+
+            // ============================================================
+            // MemoryPack — three I/O modes for apples-to-apples comparison
+            // ============================================================
+            new MemoryPackBenchmark(testData.Order, "Default"),
+            new MemoryPackBufferWriterBenchmark(testData.Order, "Default"),
+            new MemoryPackFreshBufferWriterBenchmark(testData.Order, "Default"),
+
+            // ============================================================
+            // MessagePack — for legacy comparison
+            // ============================================================
+#if !AYCODE_NATIVEAOT
+            // MessagePack v3's DynamicGenericResolver uses Activator.CreateInstance on trimmed
+            // ListFormatter<T> et al. — fails under NativeAOT publish with "No parameterless constructor".
+            // Excluded from the AOT build; available for regular JIT runs only.
+            new MessagePackBenchmark(testData.Order, "ContractBased"),
+#endif
+
+            // System.Text.Json (commented — JSON serializer for reference; not in active suite)
+            //new SystemTextJsonBenchmark(testData.Order, "Default")
+        };
+    }
+
    /// <summary>
    /// Forces a full GC cycle at a phase boundary in the benchmark loop. Two-pass collect with finalizer drain
    /// in between: the first pass moves managed garbage to the finalization queue, <c>WaitForPendingFinalizers</c>
    /// runs the finalizers, the second pass reclaims any objects the finalizers released. After this returns the
    /// heap is in a known-quiescent state — the next warmup/measurement phase starts on a clean slate, isolated
    /// from the previous phase's residual allocations (write-buffer pools, intern cache, write-plan arrays, etc.).
-    /// Called between every Ser-phase / Des-phase boundary in <c>RunBenchmarksForTestData</c>.
+    /// Called between every Ser-phase / Des-phase boundary in <see cref="RunBenchmarksForTestData"/>.
    /// </summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    internal static void ForceGcCollect()
--- a/AyCode.Core.Serializers.Console/Program.cs
+++ b/AyCode.Core.Serializers.Console/Program.cs
@ -52,7 +52,7 @@ public static class Program
            if (!TryParseCliArgs(args, out var layer, out var opMode, out var serializerMode))
                return;  // invalid args

-            RunBenchmark(layer, opMode, serializerMode);
+            BenchmarkLoop.RunBenchmark(layer, opMode, serializerMode);
            return;
        }

@ -63,7 +63,7 @@ public static class Program
            var selection = Menu.ShowInteractiveMenu();
            if (selection == null) return;  // user pressed Q

-            RunBenchmark(selection.Value.layer, "all", selection.Value.serializerMode);
+            BenchmarkLoop.RunBenchmark(selection.Value.layer, "all", selection.Value.serializerMode);

            System.Console.WriteLine();
            System.Console.WriteLine("─────────────────────────────────────────────────────────────────────");
@ -124,457 +124,7 @@ public static class Program
        return true;
    }

-    /// <summary>
-    /// Runs the benchmark suite end-to-end for the given configuration: pre-warmup → per-cell warmup
-    /// + measurement → grouped results print → save to disk. Used by both the CLI and interactive
-    /// menu paths; the interactive loop calls this repeatedly without restarting the process.
-    /// </summary>
-    private static void RunBenchmark(string layer, string opMode, string serializerMode)
-    {
-        System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗");
-        System.Console.WriteLine("║          COMPREHENSIVE SERIALIZER BENCHMARK SUITE                    ║");
-        System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");
-
-        // Stabilization: pin the entire benchmark process to a single logical CPU and bump priority
-        // class. Single-core affinity stops Windows from migrating the bench thread between cores
-        // mid-sample (a migration evicts L1/L2 caches and corrupts a measurement); High priority
-        // reduces preemption by background tasks (Defender scans, indexer, etc.) that otherwise
-        // randomly inflate samples by 5-15%.
-        // Try/finally guarantees the original state is restored even if a benchmark throws — leaving
-        // a developer machine pinned to one core after a crashed run is a real foot-gun.
-        // Skipped on Debug single-sample mode (Configuration.BenchmarkSamples <= 1) where stabilization is moot.
-        var process = Process.GetCurrentProcess();
-        var origAffinity = (IntPtr)0;
-        var origPriority = ProcessPriorityClass.Normal;
-        var stabilizationApplied = false;
-
-        // ProcessorAffinity is only supported on Windows + Linux (CA1416). macOS would throw at
-        // runtime; skip the affinity step there but still raise priority class (which IS supported
-        // on macOS, just less effective for stabilization than affinity pinning).
-        if (Configuration.BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
-        {
-            try
-            {
-                origAffinity = process.ProcessorAffinity;
-                origPriority = process.PriorityClass;
-                // Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one
-                // core, consistently" — not which one. If CPU 0 is heavily contended on the host
-                // (e.g. dedicated to system-wide IRQs on some Windows configs), the user can tweak
-                // the mask here. The benchmark is single-threaded for the in-memory rows so single
-                // core is sufficient; round-trip-only NamedPipe rows have a server-drain thread
-                // that will share the core (acceptable — the bench measures end-to-end RT anyway).
-                process.ProcessorAffinity = (IntPtr)1;
-                process.PriorityClass = ProcessPriorityClass.High;
-                stabilizationApplied = true;
-                System.Console.WriteLine($"Stabilization: pinned to CPU 0 (affinity=0x1), priority=High.");
-            }
-            catch (Exception ex)
-            {
-                // Affinity/priority changes may fail on locked-down hosts (group policies, containers
-                // without CAP_SYS_NICE on Linux, etc.). Surface and continue — the benchmark still
-                // works, just with the platform default scheduling.
-                System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}");
-            }
-        }
-
-        try
-        {
-            var allResults = new List<BenchmarkResult>();
-            var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
-            var testDataSets = BenchmarkLoop.FilterByLayer(allTestDataSets, layer);
-
-            System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {Configuration.GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{Configuration.TargetSampleMs} ms target) | Warmup: {Configuration.WarmupIterations} per phase (Ser/Des isolated) | Samples: {Configuration.BenchmarkSamples} (median) + pilot discard");
-            System.Console.WriteLine($"Build: {Configuration.BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
-            System.Console.WriteLine();
-
-            // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens.
-            // Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup
-            // alone doesn't ensure that every Serialize<T>/IBufferWriter overload is fully Tier 1 by the time we
-            // start measuring. Symptom: first cell's BufferWriter variants run ~2x slower than the SAME variants
-            // on later cells (e.g. Small BufWr reuse 9ms vs Medium BufWr reuse 4ms — even though Medium is bigger).
-            // Pre-warmup runs every overload at least once with each data shape so .NET 9's tiered JIT promotes
-            // them all in the background; the per-cell warmup that follows then locks in cache + branch state.
-            if (Configuration.BenchmarkSamples > 1) // skip in DEBUG (single-sample fast iteration)
-            {
-                System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");
-
-                foreach (var testData in testDataSets)
-                {
-                    var preSerializers = CreateSerializers(testData, serializerMode);
-                    try
-                    {
-                        foreach (var s in preSerializers)
-                        {
-                            // Light warmup just to trigger Tier 0 → Tier 1 promotion. Phase-isolated:
-                            // Ser path first, then Des path — same pattern as the per-cell warmup in
-                            // RunBenchmarksForTestData (which still runs afterwards for cache/BTB warming).
-                            s.WarmupSerialize(2000);
-                            s.WarmupDeserialize(2000);
-                        }
-                    }
-                    finally
-                    {
-                        // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
-                        foreach (var s in preSerializers) (s as IDisposable)?.Dispose();
-                    }
-                }
-
-                // Let background tiered-JIT compilation drain before we begin measuring.
-                if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
-                System.Console.WriteLine("✓ Global pre-warmup complete.\n");
-            }
-
-            foreach (var testData in testDataSets)
-            {
-                System.Console.WriteLine($"\n{'═'.ToString().PadRight(70, '═')}");
-                System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
-                System.Console.WriteLine($"{'═'.ToString().PadRight(70, '═')}");
-
-                var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
-                allResults.AddRange(results);
-            }
-
-            // Print grouped results
-            Output.PrintGroupedResults(allResults, testDataSets);
-
-            // Save results to file
-            Output.SaveResults(allResults, testDataSets);
-
-            System.Console.WriteLine("\n✓ Benchmark complete!");
-        }
-        finally
-        {
-            // Restore process state — affinity/priority changes are process-wide and persist across
-            // interactive-mode iterations of the menu. Without restore, the second menu run would
-            // already be on CPU-0 + High priority before its own try-block applied them, masking
-            // any stabilization-disabled comparison.
-            if (stabilizationApplied && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
-            {
-                try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
-                try { process.PriorityClass = origPriority; } catch { /* best-effort */ }
-            }
-        }
-    }
-
-    #region Benchmark Execution
-
-private static List<BenchmarkResult> RunBenchmarksForTestData(TestDataSet testData, string mode, string serializerMode)
-    {
-        var results = new List<BenchmarkResult>();
-        var serializers = CreateSerializers(testData, serializerMode);
-
-        // Round-trip correctness check — once per (cell × serializer), BEFORE warmup. Aborts the entire benchmark on failure.
-        System.Console.WriteLine("Verifying round-trip correctness...");
-
-        foreach (var serializer in serializers)
-        {
-            if (!serializer.VerifyRoundTrip())
-            {
-                System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}");
-                System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting.");
-
-                Environment.Exit(1);
-            }
-        }
-
-        System.Console.WriteLine("✓ All serializers passed round-trip verification.");
-
-        // Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: each serializer's Ser-path and
-        // Des-path get COMPLETELY ISOLATED warmup→measure rounds, with a GC.Collect at every phase boundary.
-        //
-        // Why phase-isolation: a combined warmup (Ser+Des interleaved) leaves the CPU I-cache + branch-predictor
-        // in a "compromise state" — neither Ser nor Des code-set dominates. The first phase to measure pays a
-        // cache-miss penalty as its code-set displaces the leftover-warmup-state. Isolated warmup→measure pairs
-        // keep the I-cache HOT for ONLY the measured path, both in the warmup (priming) and the measurement
-        // (steady-state). Branch-predictor history also stays clean per path.
-        //
-        // GC.Collect at every boundary: removes residual allocation pressure from the previous phase (write-buffer
-        // pool churn from Ser, deserialized object graph from Des) so the next phase starts with a quiescent
-        // heap — GC tier-promotion timing during measurement is then driven only by THAT phase's allocations.
-        //
-        // Configuration.JitSleep per-phase: tiered JIT background promotion drain after each warmup (mode-aware: 0 ms in AOT).
-        // Each phase's freshly-promoted methods settle before its timing starts.
-        System.Console.WriteLine($"Running benchmarks (target ~{Configuration.TargetSampleMs} ms/sample × {Configuration.BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n");
-
-        foreach (var serializer in serializers)
-        {
-            var result = new BenchmarkResult
-            {
-                TestDataName = testData.DisplayName,  // Use DisplayName for IId% info
-                Engine = serializer.Engine,
-                IoMode = serializer.IoMode,
-                DispatchMode = serializer.DispatchMode,
-                OptionsPreset = serializer.OptionsPreset,
-                OptionsDescription = serializer.OptionsDescription,
-                SerializedSize = serializer.SerializedSize,
-                SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes,
-                SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes,
-                IsRoundTripOnly = serializer.IsRoundTripOnly
-            };
-
-            // Group label for in-place \r progress. Identifies (cell × serializer) so a stuck benchmark
-            // is visibly stuck on a specific row at a specific %% rather than silently hanging.
-            var groupLabel = $"{result.SerializerName}";
-
-            if (serializer.IsRoundTripOnly)
-            {
-                // Round-trip-only benchmarks (NamedPipe etc.): single phase — Serialize() performs the full RT,
-                // Deserialize() is a no-op. We use the Ser-phase entry-points (WarmupSerialize) to warm the
-                // entire round-trip path, then record into the RT result columns.
-                if (mode is "all" or "serialize" or "ser")
-                {
-                    BenchmarkLoop.ForceGcCollect();
-                    serializer.WarmupSerialize(Configuration.WarmupIterations);
-                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
-
-                    var rtIter = BenchmarkLoop.CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
-                    var (rtMed, rtMin, rtMax, rtStd) = BenchmarkLoop.RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");
-                    result.RoundTripTimeMs = rtMed;
-                    result.RoundTripTimeMinMs = rtMin;
-                    result.RoundTripTimeMaxMs = rtMax;
-                    result.RoundTripTimeStdDevMs = rtStd;
-                    result.RoundTripIterations = rtIter;
-                    // Process-wide allocation measurement: server-drain-thread allocations (server-side new byte[len])
-                    // also show up — otherwise current-thread alloc would only count the client side and look ~halved.
-                    result.RoundTripAllocBytesPerOp = BenchmarkLoop.MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]");
-                }
-                // mode == "deserialize" alone is meaningless for a round-trip-only benchmark; skip silently.
-            }
-            else
-            {
-                // ── Ser phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect.
-                if (mode is "all" or "serialize" or "ser")
-                {
-                    BenchmarkLoop.ForceGcCollect();
-                    serializer.WarmupSerialize(Configuration.WarmupIterations);
-                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
-
-                    var serIter = BenchmarkLoop.CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
-                    var (serMed, serMin, serMax, serStd) = BenchmarkLoop.RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");
-                    result.SerializeTimeMs = serMed;
-                    result.SerializeTimeMinMs = serMin;
-                    result.SerializeTimeMaxMs = serMax;
-                    result.SerializeTimeStdDevMs = serStd;
-                    result.SerializeIterations = serIter;
-                    // Dedicated alloc-only sample (separate from timing samples; keeps timing pure)
-                    result.SerializeAllocBytesPerOp = BenchmarkLoop.MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
-                }
-
-                // ── Des phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect.
-                // The GC.Collect here is critical: it discards the Ser-phase's write-buffer pool churn so the
-                // Des-phase's allocation measurement reflects ONLY Des-side allocations (deserialized object graph).
-                if (mode is "all" or "deserialize" or "des")
-                {
-                    BenchmarkLoop.ForceGcCollect();
-                    serializer.WarmupDeserialize(Configuration.WarmupIterations);
-                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
-
-                    var desIter = BenchmarkLoop.CalibrateIterations(() => serializer.Deserialize(), Configuration.TargetSampleMs);
-                    var (desMed, desMin, desMax, desStd) = BenchmarkLoop.RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");
-                    result.DeserializeTimeMs = desMed;
-                    result.DeserializeTimeMinMs = desMin;
-                    result.DeserializeTimeMaxMs = desMax;
-                    result.DeserializeTimeStdDevMs = desStd;
-                    result.DeserializeIterations = desIter;
-                    result.DeserializeAllocBytesPerOp = BenchmarkLoop.MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]");
-                }
-
-                // Compose RT from Ser+Des. Because Ser and Des may have DIFFERENT iter counts post-calibration,
-                // batch-time addition would be misleading. Instead: compute per-op µs (iter-independent),
-                // then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that
-                // RoundTripTimeMs / RoundTripIterations * 1000 == Output.SerPerOp + Output.DesPerOp.
-                var serPerOp = Output.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations);
-                var desPerOp = Output.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations);
-                var rtPerOp = serPerOp + desPerOp;
-                result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations);
-                result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
-                result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
-            }
-
-            results.Add(result);
-            Output.PrintResult(result);
-        }
-
-        // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources that must be released
-        // before the next test data builds new ones — otherwise pipes / handles leak across test cells).
-        foreach (var s in serializers) (s as IDisposable)?.Dispose();
-
-        return results;
-    }
-
-    private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, string serializerMode)
-    {
-        // FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path.
-        // TWO benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + MemoryPack Byte[].
-        //   - Compact: smallest wire, UTF-8 encode/decode CPU cost vs MemPack head-to-head.
-        // Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min.
-        //
-        // FastWire row (UTF-16 raw memcpy) commented out for the current optimization sprint —
-        // we are tuning Compact mode against MemPack directly; FastWire was used as a noise-floor
-        // reference earlier. Re-enable when revisiting Fast wire-mode performance.
-        if (serializerMode == "fastestbyte")
-        {
-            var fastestByteOptions = AcBinarySerializerOptions.FastMode;
-            fastestByteOptions.WireMode = Configuration.SelectedWireMode;
-
-            return new List<ISerializerBenchmark>
-            {
-                new AcBinaryBenchmark(testData.Order, fastestByteOptions, "FastMode"),
-                //new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"),
-                new MemoryPackBenchmark(testData.Order, "Default"),
-            };
-        }
-
-        // AsyncPipe-only mode — return ONLY the AsyncPipe streaming benchmark (no other serializer).
-        // Streaming I/O has long-lived pipe setup + kernel-buffer overhead that, when interleaved with
-        // the standard byte-array / IBufferWriter measurements, masks the steady-state numbers. Run it
-        // in isolation so the timing numbers reflect ONLY the streaming path.
-        if (serializerMode == "asyncpipe")
-        {
-            // NamedPipe — pipe-aligned chunk size for the long-lived IPC scenario. The chunkSize here
-            // drives the AsyncPipeWriterOutput's chunk-on-wire size (header + data, page-aligned thanks to
-            // the AcquireChunk fix) AND the kernel pipe buffer size (inBufferSize/outBufferSize on the
-            // NamedPipeServerStream ctor). Same value across both layers = one WriteFile(chunkSize) syscall
-            // fits blocking-free in one kernel pipe-buffer slot. Single source of truth for both app-level
-            // wire chunk AND kernel transfer unit; change ONLY this line when tuning.
-            var binaryFastModePipeChunkOnly = AcBinarySerializerOptions.FastMode;
-            binaryFastModePipeChunkOnly.BufferWriterChunkSize = Configuration.PipeChunkSize;
-            binaryFastModePipeChunkOnly.WireMode = Configuration.SelectedWireMode;
-
-            return new List<ISerializerBenchmark>
-            {
-                // Chunked-framed AsyncPipe: SerializeChunkedFramed + AsyncPipeReaderInput.DrainFromAsync.
-                // Measures the FULL streaming-I/O stack — wire framing + drain task + sliding-window buffer +
-                // MRES wait-on-byte-shortage — over a kernel NamedPipe.
-                new AcBinaryNamedPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"),
-                // Raw byte[] over NamedPipe (sync receive, no chunk-framing). Same kernel-pipe transport,
-                // same inBufferSize, but: serialize → byte[] → Stream.Write → Stream.Read → Deserialize<T>(byte[]).
-                // No drain task, no AsyncPipeReaderInput, no [201][UINT16][data]…[202] framing. Side-by-side with
-                // the chunked-row above this isolates AsyncPipe-framework-overhead (Δ vs raw) from
-                // kernel-transport-overhead (raw vs in-process Byte[]).
-                new AcBinaryNamedPipeRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
-                // Chunked-framed AsyncPipe over an IN-MEMORY System.IO.Pipelines.Pipe (NO NamedPipe, NO kernel).
-                // Same chunked-streaming code path (SerializeChunkedFramed → AsyncPipeReaderInput) but with the
-                // kernel-pipe replaced by a managed-only Pipe. Eliminates per-chunk syscall overhead (~30 µs/chunk
-                // on NamedPipe → ~1-2 µs/chunk on in-memory Pipe). Side-by-side with the NamedPipe row above this
-                // isolates pure CPU cost of the chunked-streaming framework (vs kernel-pipe transport cost) — the
-                // in-memory Pipe row should be much closer to the raw-byte[] row, validating that NamedPipe loopback
-                // is the worst-case benchmark scenario for chunked-streaming and not representative of real network
-                // / file / cross-thread Pipe scenarios.
-                new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"),
-                // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport at all). Apples-to-apples
-                // baseline for the in-memory chunked row above: same in-memory transport (zero kernel), but raw
-                // byte[] vs chunked-streaming wire format. Completes the 2x2 matrix [chunked,raw] × [kernel,memory].
-                new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
-            };
-        }
-
-        // Standard mode — all serializers EXCEPT AsyncPipe (the streaming benchmark is opt-in via the
-        // AsyncPipe menu / CLI mode, never bundled with the steady-state suite).
-
-        var binaryNoInternOption = AcBinarySerializerOptions.Default;
-        binaryNoInternOption.UseStringInterning = StringInterningMode.None;
-        binaryNoInternOption.WireMode = Configuration.SelectedWireMode;
-
-        var binaryDefaultNoSgenOption = AcBinarySerializerOptions.Default;
-        binaryDefaultNoSgenOption.UseGeneratedCode = false;
-        binaryDefaultNoSgenOption.WireMode = Configuration.SelectedWireMode;
-
-        var binaryFastModeNoSgenOption = AcBinarySerializerOptions.FastMode;
-        binaryFastModeNoSgenOption.UseGeneratedCode = false;
-        binaryFastModeNoSgenOption.WireMode = Configuration.SelectedWireMode;
-
-        var binaryFastModeOption = AcBinarySerializerOptions.FastMode;
-        binaryFastModeOption.WireMode = Configuration.SelectedWireMode;
-
-        // BufWr new — 4 KB chunk size for the FRESH ArrayBufferWriter scenario. The chunkSize here drives
-        // the serializer's GetSpan(N) request → the ArrayBufferWriter's internal allocation per call.
-        // Small chunk = small per-call allocation, optimum for one-shot serialization where each iteration
-        // allocates a fresh ABW. Independent of the AsyncPipe profile (different mechanism: alloc overhead
-        // vs syscall count).
-        var binaryFastModeBufWrChunk = AcBinarySerializerOptions.FastMode;
-        binaryFastModeBufWrChunk.BufferWriterChunkSize = Configuration.PipeChunkSize;
-        binaryFastModeBufWrChunk.WireMode = Configuration.SelectedWireMode;
-
-        // In-memory Pipe variant — same 4 KB chunkSize as the AsyncPipe mode, no kernel-pipe alignment
-        // concern (managed slabs are not page-aligned anyway). Drives SerializeChunkedFramed via the in-memory
-        // System.IO.Pipelines.Pipe (zero-copy slab handoff between producer and drain task).
-        var binaryFastModePipeChunkInMem = AcBinarySerializerOptions.FastMode;
-        binaryFastModePipeChunkInMem.BufferWriterChunkSize = Configuration.PipeChunkSize;
-        binaryFastModePipeChunkInMem.WireMode = Configuration.SelectedWireMode;
-
-        var defaultOptions = AcBinarySerializerOptions.Default;
-        defaultOptions.UseStringInterning = StringInterningMode.None;
-        defaultOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId;
-        defaultOptions.WireMode = Configuration.SelectedWireMode;
-
-        return new List<ISerializerBenchmark>
-        {
-            // ============================================================
-            // AcBinary — Byte[] API (uncomment to compare option presets side-by-side)
-            // ============================================================
-            // Fastest Byte[] — SGen path (UseGeneratedCode=true, default).
-            new AcBinaryBenchmark(testData.Order, binaryFastModeOption, "FastMode"),
-            // Fastest Byte[] — Runtime path (UseGeneratedCode=false). Same wire/options, no source-generated dispatch.
-            // Always paired with the SGen variant so every layer can compare the SGen speed-up apples-to-apples.
-            // NativeAOT-safe: AcSerializerCommon.Create*Getter/Setter falls back to reflection-based delegates
-            // when RuntimeFeature.IsDynamicCodeSupported is false (slower but works under AOT publish).
-            new AcBinaryBenchmark(testData.Order, binaryFastModeNoSgenOption, "FastMode"),
-            // Default preset Byte[] — RefHandling=OnlyId (deduplicates IId-shared references on the wire) +
-            // UseStringInterning=All (deduplicates repeated strings). Showcases the Default preset's wire-size
-            // and CPU trade-off vs FastMode on the ~20% IId-ref / repeated-string test data.
-            
-            new AcBinaryBenchmark(testData.Order, defaultOptions, "Default"),
-            //new AcBinaryBenchmark(testData.Order, binaryDefaultNoSgenOption, "Default"),
-            //new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"),
-            //new AcBinaryBenchmark(testData.Order, binaryNoInternOption, "NoIntern"),
-
-            // AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario)
-            new AcBinaryBufferWriterBenchmark(testData.Order, binaryFastModeOption, "FastMode"),
-
-            // AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario).
-            // 4 KB chunk size from binaryFastModeBufWrChunk — minimises the per-call ArrayBufferWriter
-            // allocation. Optimum for this scenario.
-            new AcBinaryFreshBufferWriterBenchmark(testData.Order, binaryFastModeBufWrChunk, "FastMode (4KB)"),
-
-            // AcBinary chunked-streaming over an IN-MEMORY Pipe (no kernel transport). Side-by-side with the
-            // Byte[] / IBufferWriter rows above this shows the chunked-streaming framework's pure CPU cost
-            // (no NamedPipe loopback noise) vs the simpler in-process serialize-then-deserialize patterns.
-            // The IO column shows "Pipe(in-mem)" — distinct from the NamedPipe AsyncPipe rows in [P] mode.
-            new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeChunk)"),
-
-            // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport, no kernel, no Pipe). Apples-to-
-            // apples baseline for the in-memory chunked row above: same in-memory pattern, but raw byte[] vs
-            // chunked-streaming wire format. The IO column shows "Bytes(in-mem)".
-            new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeRaw)"),
-
-            // AsyncPipe streaming over kernel NamedPipe (AcBinaryNamedPipeBenchmark) is intentionally OMITTED
-            // here — run it via the dedicated AsyncPipe menu [P] / CLI mode for isolated kernel-transport
-            // measurements.
-
-            // ============================================================
-            // MemoryPack — three I/O modes for apples-to-apples comparison
-            // ============================================================
-            new MemoryPackBenchmark(testData.Order, "Default"),
-            new MemoryPackBufferWriterBenchmark(testData.Order, "Default"),
-            new MemoryPackFreshBufferWriterBenchmark(testData.Order, "Default"),
-
-            // ============================================================
-            // MessagePack — for legacy comparison
-            // ============================================================
-#if !AYCODE_NATIVEAOT
-            // MessagePack v3's DynamicGenericResolver uses Activator.CreateInstance on trimmed
-            // ListFormatter<T> et al. — fails under NativeAOT publish with "No parameterless constructor".
-            // Excluded from the AOT build; available for regular JIT runs only.
-            new MessagePackBenchmark(testData.Order, "ContractBased"),
-#endif
-
-            // System.Text.Json (commented — JSON serializer for reference; not in active suite)
-            //new SystemTextJsonBenchmark(testData.Order, "Default")
-        };
-    }
-
-    #endregion
+    // RunBenchmark + RunBenchmarksForTestData + CreateSerializers → BenchmarkLoop.cs

    // Serializer implementations (ISerializerBenchmark + 12 concrete benchmark classes) → Benchmarks/