using AyCode.Core.Compression;
using AyCode.Core.Serializers.Attributes;
using AyCode.Core.Serializers.Binaries;
using AyCode.Core.Tests.Serialization;   // DrainFromAsync extension (test-only, used by benchmark)
using AyCode.Core.Tests.TestModels;
using MemoryPack;
#if !AYCODE_NATIVEAOT
using MessagePack;
using MessagePack.Resolvers;
#endif
using Microsoft.Extensions.Options;
using System.Buffers;
using System.Diagnostics;
using System.IO.Pipelines;
using System.IO.Pipes;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Text;
using System.Text.Json;

namespace AyCode.Core.Serializers.Console;

/// <summary>
/// Comprehensive benchmark application for all serializers.
/// Compares: AcBinary (all options), MemoryPack, MessagePack, Newtonsoft.Json, System.Text.Json
/// 
/// Usage:
///   dotnet run                    # Run all benchmarks
///   dotnet run -- quick           # Quick mode (fewer iterations)
///   dotnet run -- serialize       # Serialize only
///   dotnet run -- deserialize     # Deserialize only
/// </summary>
public static class Program
{
    // Configuration (constants, mutable state, attribute-flag aggregation) → Configuration.cs

    /// <summary>
    /// Builds the Options-column text shared by every AcBinary serializer benchmark row. Each feature is
    /// rendered as its options-level value next to the attribute-level flag read from
    /// <c>Configuration.AttrFlags</c> (e.g. <c>Interning=All(opt) | False (attr)</c>), so a feature that
    /// is configured in the options but suppressed at attribute level is visible in the output.
    /// </summary>
    /// <param name="options">The serializer options whose values are rendered.</param>
    /// <param name="extra">
    /// Optional benchmark-specific suffix (e.g. <c>", BufferSize=4096B"</c>); appended verbatim after
    /// the common fields, so it must carry its own leading separator.
    /// </param>
    /// <returns>A single-line, comma-separated description of the effective configuration.</returns>
    private static string BuildAcBinaryOptionsDescription(AcBinarySerializerOptions options, string extra = "")
    {
        // The filter callback itself has no meaningful display value, so the options side only reports
        // whether one is registered ("Set") or not ("None"). The attribute side is the aggregated flag
        // taken from Configuration.AttrFlags.
        string filterState;
        if (options.PropertyFilter == null)
        {
            filterState = "None";
        }
        else
        {
            filterState = "Set";
        }

        // One entry per displayed field, in display order; joined with ", " below.
        var segments = new[]
        {
            $"WireMode={options.WireMode}",
            $"RefHandling={options.ReferenceHandling}(opt) | {Configuration.AttrFlags.refHandling} (attr)",
            $"Interning={options.UseStringInterning}(opt) | {Configuration.AttrFlags.internString} (attr)",
            $"Metadata={options.UseMetadata}(opt) | {Configuration.AttrFlags.metadata} (attr)",
            $"PropertyFilter={filterState}(opt) | {Configuration.AttrFlags.propertyFilter} (attr)",
            $"SGen={options.UseGeneratedCode}",
            $"Compression={options.UseCompression}",
        };

        return string.Join(", ", segments) + extra;
    }

    /// <summary>
    /// Picks the MemoryPack serializer options that match <see cref="Configuration.SelectedWireMode"/>,
    /// so that AcBinary and MemoryPack are compared within the same string-encoding family:
    /// <list type="bullet">
    ///   <item><see cref="WireMode.Fast"/> → <see cref="MemoryPackSerializerOptions.Utf16"/>
    ///   (both sides write raw UTF-16 string bytes).</item>
    ///   <item>Any other wire mode (e.g. <see cref="WireMode.Compact"/>) →
    ///   <see cref="MemoryPackSerializerOptions.Default"/> (both sides encode UTF-8).</item>
    /// </list>
    /// Without this alignment a Fast-wire vs MemoryPack-default comparison would mix the UTF-16 vs UTF-8
    /// encoding difference into the wire-size and CPU numbers, making the delta look like serializer
    /// overhead when it is really the encoding family.
    /// </summary>
    /// <returns>The MemoryPack options instance to use for the currently selected wire mode.</returns>
    private static MemoryPackSerializerOptions GetMemPackOptions()
    {
        if (Configuration.SelectedWireMode == WireMode.Fast)
        {
            return MemoryPackSerializerOptions.Utf16;
        }

        return MemoryPackSerializerOptions.Default;
    }

    // Per-op timing conversion (total ms over a row's iteration count → per-operation µs, i.e.
    // totalMs / iterations × 1000; the iteration count is a per-row value) is provided by
    // Output.ToPerOpMicros → Output.cs
    // Output helpers (PrintResult, SaveResults, OverallStats, FormatMicrosWithRange, etc.) → Output.cs
    // BenchmarkResult DTO → BenchmarkResult.cs

    /// <summary>
    /// Application entry point. Validates the MemoryPack setup first, then either:
    /// <list type="bullet">
    ///   <item>CLI mode (at least one argument): parses the arguments, runs the benchmark once and returns;</item>
    ///   <item>Interactive mode (no arguments): shows the menu in a loop, running one benchmark per
    ///   selection, until the user quits with Q.</item>
    /// </list>
    /// </summary>
    /// <param name="args">Command-line arguments; only the first one is interpreted (see <see cref="TryParseCliArgs"/>).</param>
    public static void Main(string[] args)
    {
        // Set console encoding to UTF-8 for proper Unicode character display
        System.Console.OutputEncoding = Encoding.UTF8;

        // Setup validation — abort BEFORE any benchmark logic if MemoryPack baseline is invalid.
        // Done early so user is told immediately, not after warmup.
        ValidateMemoryPackSetup();

        // CLI mode (args provided): run once, parse args, exit. Backward-compatible behaviour.
        if (args.Length > 0)
        {
            if (!TryParseCliArgs(args, out var layer, out var opMode, out var serializerMode))
            {
                return;  // invalid args
            }

            RunBenchmark(layer, opMode, serializerMode);
            return;
        }

        // Interactive mode (no args): loop the menu so the user doesn't have to restart between runs.
        // Q exits the menu (and the application).
        while (true)
        {
            var selection = Menu.ShowInteractiveMenu();
            if (selection == null)
            {
                return;  // user pressed Q
            }

            // The interactive menu only selects layer + serializer mode; both Ser and Des phases always run.
            RunBenchmark(selection.Value.layer, "all", selection.Value.serializerMode);

            System.Console.WriteLine();
            System.Console.WriteLine("─────────────────────────────────────────────────────────────────────");
            System.Console.WriteLine("Returning to menu — press any key to continue, or Q to quit...");
            var key = System.Console.ReadKey(intercept: true);
            if (key.Key == ConsoleKey.Q)
            {
                return;
            }

            System.Console.WriteLine();
        }
    }

    /// <summary>
    /// Parses CLI arguments into (layer, opMode, serializerMode). Only the first argument is
    /// interpreted, case-insensitively (culture-invariant).
    /// </summary>
    /// <param name="args">Raw command-line arguments. An empty array yields the defaults.</param>
    /// <param name="layer">Test-data layer keyword; defaults to <c>"all"</c>.</param>
    /// <param name="opMode"><c>"all"</c>, <c>"serialize"</c> or <c>"deserialize"</c>; defaults to <c>"all"</c>.</param>
    /// <param name="serializerMode"><c>"standard"</c> or <c>"asyncpipe"</c>; defaults to <c>"standard"</c>.</param>
    /// <returns>
    /// <c>false</c> is reserved for invalid arguments (the caller then exits without running the
    /// benchmark). The current implementation always returns <c>true</c>: an unrecognized argument is
    /// passed through as a layer keyword for backwards compatibility.
    /// </returns>
    /// <remarks>
    /// The <c>quick</c> keyword has a side effect: it overwrites the warmup / iteration / sample
    /// settings on <c>Configuration</c>.
    /// </remarks>
    private static bool TryParseCliArgs(string[] args, out string layer, out string opMode, out string serializerMode)
    {
        layer = "all";
        opMode = "all";
        serializerMode = "standard";

        // Defensive: with no argument there is nothing to parse — keep the "run everything" defaults.
        if (args is not { Length: > 0 })
        {
            return true;
        }

        // Invariant lower-casing: the keywords are machine identifiers, not linguistic text. The
        // culture-sensitive ToLower() would, for example, map 'I' to a dotless 'ı' under a Turkish
        // culture, so "QUICK" / "SERIALIZE" would silently fall through to the unknown-argument branch.
        var arg = args[0].ToLowerInvariant();

        switch (arg)
        {
            // Quick mode: short warmup, few iterations, small sample count
            case "quick":
                Configuration.WarmupIterations = 5;
                Configuration.TestIterations = 100;
                Configuration.BenchmarkSamples = 3;
                layer = "all";
                break;

            // Explicit layer keywords.
            case "core" or "comprehensive" or "edge" or "all"
                 or "small" or "medium" or "large" or "repeated" or "deep":
                layer = arg;
                break;

            // AsyncPipe-only mode: streaming I/O isolation across all test data.
            case "asyncpipe" or "pipe":
                layer = "all";
                serializerMode = "asyncpipe";
                break;

            case "ser" or "serialize":
                opMode = "serialize";
                layer = "all";
                break;

            case "des" or "deserialize":
                opMode = "deserialize";
                layer = "all";
                break;

            // Backwards compat: unknown arg → treat as layer keyword
            default:
                layer = arg;
                break;
        }

        return true;
    }

    /// <summary>
    /// Runs the benchmark suite end-to-end for the given configuration: optional process stabilization
    /// (CPU pinning + raised priority) → global JIT pre-warmup → per-cell warmup + measurement →
    /// grouped results print → save to disk. Used by both the CLI and interactive menu paths; the
    /// interactive loop calls this repeatedly without restarting the process, so every process-wide
    /// change made here is undone before returning.
    /// </summary>
    /// <param name="layer">Test-data layer keyword, passed to <c>FilterByLayer</c> to select the test cells.</param>
    /// <param name="opMode">Operation mode forwarded to <see cref="RunBenchmarksForTestData"/>.</param>
    /// <param name="serializerMode">Serializer-set selector forwarded to <see cref="CreateSerializers"/>.</param>
    private static void RunBenchmark(string layer, string opMode, string serializerMode)
    {
        System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗");
        System.Console.WriteLine("║          COMPREHENSIVE SERIALIZER BENCHMARK SUITE                    ║");
        System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");

        // Stabilization: pin the entire benchmark process to a single logical CPU and bump priority
        // class. Single-core affinity stops Windows from migrating the bench thread between cores
        // mid-sample (a migration evicts L1/L2 caches and corrupts a measurement); High priority
        // reduces preemption by background tasks (Defender scans, indexer, etc.) that otherwise
        // randomly inflate samples by 5-15%.
        // Try/finally guarantees the original state is restored even if a benchmark throws — leaving
        // a developer machine pinned to one core after a crashed run is a real foot-gun.
        // Skipped on Debug single-sample mode (Configuration.BenchmarkSamples <= 1) where stabilization is moot.
        //
        // The Process component is disposable; the interactive menu calls this method repeatedly, so
        // release it when the run ends instead of accumulating undisposed instances.
        using var process = Process.GetCurrentProcess();
        var origAffinity = IntPtr.Zero;
        var origPriority = ProcessPriorityClass.Normal;

        // Tracked separately: either setter can fail independently (e.g. on Linux the priority bump can
        // be refused while the affinity change succeeded), and only what was actually changed may be
        // restored.
        var affinityPinned = false;
        var priorityRaised = false;

        // Undoes whichever scheduling changes are currently in effect. Safe to call more than once.
        void RestoreScheduling()
        {
            if (OperatingSystem.IsWindows() || OperatingSystem.IsLinux())
            {
                if (affinityPinned)
                {
                    try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
                    affinityPinned = false;
                }

                if (priorityRaised)
                {
                    try { process.PriorityClass = origPriority; } catch { /* best-effort */ }
                    priorityRaised = false;
                }
            }
        }

        // ProcessorAffinity is only supported on Windows + Linux (CA1416). On any other OS (e.g. macOS)
        // the whole stabilization step — affinity AND priority — is skipped and the benchmark runs with
        // the platform default scheduling.
        if (Configuration.BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
        {
            try
            {
                origAffinity = process.ProcessorAffinity;
                origPriority = process.PriorityClass;
                // Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one
                // core, consistently" — not which one. If CPU 0 is heavily contended on the host
                // (e.g. dedicated to system-wide IRQs on some Windows configs), the user can tweak
                // the mask here. The benchmark is single-threaded for the in-memory rows so single
                // core is sufficient; round-trip-only NamedPipe rows have a server-drain thread
                // that will share the core (acceptable — the bench measures end-to-end RT anyway).
                process.ProcessorAffinity = (IntPtr)1;
                affinityPinned = true;
                process.PriorityClass = ProcessPriorityClass.High;
                priorityRaised = true;
                System.Console.WriteLine("Stabilization: pinned to CPU 0 (affinity=0x1), priority=High.");
            }
            catch (Exception ex)
            {
                // Affinity/priority changes may fail on locked-down hosts (group policies, containers
                // without CAP_SYS_NICE on Linux, etc.). Surface and continue — the benchmark still
                // works, just with the platform default scheduling.
                // Roll back a half-applied change first: if pinning succeeded but the priority bump
                // threw, the process would otherwise stay pinned to CPU 0 for the whole run (and for
                // every later menu run) while the message below claims stabilization was skipped.
                RestoreScheduling();
                System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}");
            }
        }

        try
        {
            var allResults = new List<BenchmarkResult>();
            var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
            var testDataSets = FilterByLayer(allTestDataSets, layer);

            System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {Configuration.GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{Configuration.TargetSampleMs} ms target) | Warmup: {Configuration.WarmupIterations} per phase (Ser/Des isolated) | Samples: {Configuration.BenchmarkSamples} (median) + pilot discard");
            System.Console.WriteLine($"Build: {Configuration.BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
            System.Console.WriteLine();

            // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens.
            // Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup
            // alone doesn't ensure that every Serialize<T>/IBufferWriter overload is fully Tier 1 by the time we
            // start measuring. Symptom: first cell's BufferWriter variants run ~2x slower than the SAME variants
            // on later cells (e.g. Small BufWr reuse 9ms vs Medium BufWr reuse 4ms — even though Medium is bigger).
            // Pre-warmup runs every overload at least once with each data shape so .NET 9's tiered JIT promotes
            // them all in the background; the per-cell warmup that follows then locks in cache + branch state.
            if (Configuration.BenchmarkSamples > 1) // skip in DEBUG (single-sample fast iteration)
            {
                System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");

                foreach (var testData in testDataSets)
                {
                    var preSerializers = CreateSerializers(testData, serializerMode);
                    try
                    {
                        foreach (var s in preSerializers)
                        {
                            // Light warmup just to trigger Tier 0 → Tier 1 promotion. Phase-isolated:
                            // Ser path first, then Des path — same pattern as the per-cell warmup in
                            // RunBenchmarksForTestData (which still runs afterwards for cache/BTB warming).
                            s.WarmupSerialize(2000);
                            s.WarmupDeserialize(2000);
                        }
                    }
                    finally
                    {
                        // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
                        foreach (var s in preSerializers) (s as IDisposable)?.Dispose();
                    }
                }

                // Let background tiered-JIT compilation drain before we begin measuring.
                if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
                System.Console.WriteLine("✓ Global pre-warmup complete.\n");
            }

            // 70-character horizontal rule framing each test-data header.
            var rule = new string('═', 70);

            foreach (var testData in testDataSets)
            {
                System.Console.WriteLine($"\n{rule}");
                System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
                System.Console.WriteLine(rule);

                var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
                allResults.AddRange(results);
            }

            // Print grouped results
            Output.PrintGroupedResults(allResults, testDataSets);

            // Save results to file
            Output.SaveResults(allResults, testDataSets);

            System.Console.WriteLine("\n✓ Benchmark complete!");
        }
        finally
        {
            // Restore process state — affinity/priority changes are process-wide and persist across
            // interactive-mode iterations of the menu. Without restore, the second menu run would
            // already be on CPU-0 + High priority before its own try-block applied them, masking
            // any stabilization-disabled comparison.
            RestoreScheduling();
        }
    }

    #region Benchmark Execution

    /// <summary>
    /// Runs a full garbage-collection cycle at a benchmark phase boundary: a forced, blocking gen-2
    /// collection, then a wait for pending finalizers, then a second forced, blocking gen-2 collection
    /// to reclaim whatever the finalizers released. The intent is that the next warmup/measurement
    /// phase starts from a settled heap rather than inheriting the previous phase's leftover
    /// allocations. Invoked before each Ser / Des / round-trip phase in
    /// <see cref="RunBenchmarksForTestData"/>.
    /// </summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    private static void ForceGcCollect()
    {
        // Pass 1: collect garbage; finalizable objects are queued for finalization.
        FullBlockingCollect();

        // Let the finalizer thread run everything that pass 1 queued.
        GC.WaitForPendingFinalizers();

        // Pass 2: reclaim the objects whose finalizers have now run.
        FullBlockingCollect();

        static void FullBlockingCollect() =>
            GC.Collect(generation: 2, mode: GCCollectionMode.Forced, blocking: true);
    }

    /// <summary>
    /// Benchmarks every serializer created for one test-data cell and returns one
    /// <see cref="BenchmarkResult"/> per serializer. Steps: round-trip correctness check for all
    /// serializers (a failure terminates the process with exit code 1), then for each serializer a
    /// phase-isolated GC → warmup → calibrate → timing → allocation measurement for the selected phases.
    /// Serializers are disposed before returning, including when a measurement throws.
    /// </summary>
    /// <param name="testData">The test-data cell to benchmark.</param>
    /// <param name="mode">
    /// Which phases to run: <c>"all"</c>, <c>"serialize"</c>/<c>"ser"</c> or <c>"deserialize"</c>/<c>"des"</c>.
    /// Round-trip-only serializers are measured only when the Ser phase is selected.
    /// </param>
    /// <param name="serializerMode">Serializer-set selector forwarded to <see cref="CreateSerializers"/>.</param>
    /// <returns>The results in the order the serializers were created.</returns>
    private static List<BenchmarkResult> RunBenchmarksForTestData(TestDataSet testData, string mode, string serializerMode)
    {
        var results = new List<BenchmarkResult>();
        var serializers = CreateSerializers(testData, serializerMode);

        try
        {
            // Round-trip correctness check — once per (cell × serializer), BEFORE warmup. Aborts the entire benchmark on failure.
            System.Console.WriteLine("Verifying round-trip correctness...");

            foreach (var serializer in serializers)
            {
                if (!serializer.VerifyRoundTrip())
                {
                    System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}");
                    System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting.");

                    // Environment.Exit terminates the process immediately; the finally block below does
                    // not run on this path, the OS reclaims the serializers' resources.
                    Environment.Exit(1);
                }
            }

            System.Console.WriteLine("✓ All serializers passed round-trip verification.");

            // Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: each serializer's Ser-path and
            // Des-path get COMPLETELY ISOLATED warmup→measure rounds, with a GC.Collect at every phase boundary.
            //
            // Why phase-isolation: a combined warmup (Ser+Des interleaved) leaves the CPU I-cache + branch-predictor
            // in a "compromise state" — neither Ser nor Des code-set dominates. The first phase to measure pays a
            // cache-miss penalty as its code-set displaces the leftover-warmup-state. Isolated warmup→measure pairs
            // keep the I-cache HOT for ONLY the measured path, both in the warmup (priming) and the measurement
            // (steady-state). Branch-predictor history also stays clean per path.
            //
            // GC.Collect at every boundary: removes residual allocation pressure from the previous phase (write-buffer
            // pool churn from Ser, deserialized object graph from Des) so the next phase starts with a quiescent
            // heap — GC tier-promotion timing during measurement is then driven only by THAT phase's allocations.
            //
            // Configuration.JitSleep per-phase: tiered JIT background promotion drain after each warmup (mode-aware: 0 ms in AOT).
            // Each phase's freshly-promoted methods settle before its timing starts.
            System.Console.WriteLine($"Running benchmarks (target ~{Configuration.TargetSampleMs} ms/sample × {Configuration.BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n");

            foreach (var serializer in serializers)
            {
                var result = new BenchmarkResult
                {
                    TestDataName = testData.DisplayName,  // Use DisplayName for IId% info
                    Engine = serializer.Engine,
                    IoMode = serializer.IoMode,
                    DispatchMode = serializer.DispatchMode,
                    OptionsPreset = serializer.OptionsPreset,
                    OptionsDescription = serializer.OptionsDescription,
                    SerializedSize = serializer.SerializedSize,
                    SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes,
                    SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes,
                    IsRoundTripOnly = serializer.IsRoundTripOnly
                };

                // Group label for in-place \r progress. Identifies (cell × serializer) so a stuck benchmark
                // is visibly stuck on a specific row at a specific % rather than silently hanging.
                var groupLabel = $"{result.SerializerName}";

                if (serializer.IsRoundTripOnly)
                {
                    // Round-trip-only benchmarks (NamedPipe etc.): single phase — Serialize() performs the full RT,
                    // Deserialize() is a no-op. We use the Ser-phase entry-points (WarmupSerialize) to warm the
                    // entire round-trip path, then record into the RT result columns.
                    if (mode is "all" or "serialize" or "ser")
                    {
                        ForceGcCollect();
                        serializer.WarmupSerialize(Configuration.WarmupIterations);
                        if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                        var rtIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
                        var (rtMed, rtMin, rtMax, rtStd) = RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");
                        result.RoundTripTimeMs = rtMed;
                        result.RoundTripTimeMinMs = rtMin;
                        result.RoundTripTimeMaxMs = rtMax;
                        result.RoundTripTimeStdDevMs = rtStd;
                        result.RoundTripIterations = rtIter;
                        // Process-wide allocation measurement: server-drain-thread allocations (server-side new byte[len])
                        // also show up — otherwise current-thread alloc would only count the client side and look ~halved.
                        result.RoundTripAllocBytesPerOp = MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]");
                    }
                    // mode == "deserialize" alone is meaningless for a round-trip-only benchmark; skip silently.
                }
                else
                {
                    // ── Ser phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect.
                    if (mode is "all" or "serialize" or "ser")
                    {
                        ForceGcCollect();
                        serializer.WarmupSerialize(Configuration.WarmupIterations);
                        if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                        var serIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
                        var (serMed, serMin, serMax, serStd) = RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");
                        result.SerializeTimeMs = serMed;
                        result.SerializeTimeMinMs = serMin;
                        result.SerializeTimeMaxMs = serMax;
                        result.SerializeTimeStdDevMs = serStd;
                        result.SerializeIterations = serIter;
                        // Dedicated alloc-only sample (separate from timing samples; keeps timing pure)
                        result.SerializeAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
                    }

                    // ── Des phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect.
                    // The GC.Collect here is critical: it discards the Ser-phase's write-buffer pool churn so the
                    // Des-phase's allocation measurement reflects ONLY Des-side allocations (deserialized object graph).
                    if (mode is "all" or "deserialize" or "des")
                    {
                        ForceGcCollect();
                        serializer.WarmupDeserialize(Configuration.WarmupIterations);
                        if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                        var desIter = CalibrateIterations(() => serializer.Deserialize(), Configuration.TargetSampleMs);
                        var (desMed, desMin, desMax, desStd) = RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");
                        result.DeserializeTimeMs = desMed;
                        result.DeserializeTimeMinMs = desMin;
                        result.DeserializeTimeMaxMs = desMax;
                        result.DeserializeTimeStdDevMs = desStd;
                        result.DeserializeIterations = desIter;
                        result.DeserializeAllocBytesPerOp = MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]");
                    }

                    // Compose RT from Ser+Des. Because Ser and Des may have DIFFERENT iter counts post-calibration,
                    // batch-time addition would be misleading. Instead: compute per-op µs (iter-independent),
                    // then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that
                    // RoundTripTimeMs / RoundTripIterations * 1000 == Output.SerPerOp + Output.DesPerOp.
                    // NOTE(review): in a single-phase mode ("serialize" / "deserialize") the skipped phase's time
                    // and iteration count are never assigned here — this presumably relies on
                    // Output.ToPerOpMicros tolerating a zero iteration count; TODO confirm.
                    var serPerOp = Output.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations);
                    var desPerOp = Output.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations);
                    var rtPerOp = serPerOp + desPerOp;
                    result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations);
                    result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
                    result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
                }

                results.Add(result);
                Output.PrintResult(result);
            }
        }
        finally
        {
            // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources that must be released
            // before the next test data builds new ones — otherwise pipes / handles leak across test cells).
            // Done in a finally so the resources are also released when warmup or measurement throws.
            foreach (var s in serializers) (s as IDisposable)?.Dispose();
        }

        return results;
    }

    private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, string serializerMode)
    {
        // FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path.
        // TWO benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + MemoryPack Byte[].
        //   - Compact: smallest wire, UTF-8 encode/decode CPU cost vs MemPack head-to-head.
        // Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min.
        //
        // FastWire row (UTF-16 raw memcpy) commented out for the current optimization sprint —
        // we are tuning Compact mode against MemPack directly; FastWire was used as a noise-floor
        // reference earlier. Re-enable when revisiting Fast wire-mode performance.
        if (serializerMode == "fastestbyte")
        {
            var fastestByteOptions = AcBinarySerializerOptions.FastMode;
            fastestByteOptions.WireMode = Configuration.SelectedWireMode;

            return new List<ISerializerBenchmark>
            {
                new AcBinaryBenchmark(testData.Order, fastestByteOptions, "FastMode"),
                //new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"),
                new MemoryPackBenchmark(testData.Order, "Default"),
            };
        }

        // AsyncPipe-only mode — return ONLY the AsyncPipe streaming benchmark (no other serializer).
        // Streaming I/O has long-lived pipe setup + kernel-buffer overhead that, when interleaved with
        // the standard byte-array / IBufferWriter measurements, masks the steady-state numbers. Run it
        // in isolation so the timing numbers reflect ONLY the streaming path.
        if (serializerMode == "asyncpipe")
        {
            // NamedPipe — pipe-aligned chunk size for the long-lived IPC scenario. The chunkSize here
            // drives the AsyncPipeWriterOutput's chunk-on-wire size (header + data, page-aligned thanks to
            // the AcquireChunk fix) AND the kernel pipe buffer size (inBufferSize/outBufferSize on the
            // NamedPipeServerStream ctor). Same value across both layers = one WriteFile(chunkSize) syscall
            // fits blocking-free in one kernel pipe-buffer slot. Single source of truth for both app-level
            // wire chunk AND kernel transfer unit; change ONLY this line when tuning.
            var binaryFastModePipeChunkOnly = AcBinarySerializerOptions.FastMode;
            binaryFastModePipeChunkOnly.BufferWriterChunkSize = Configuration.PipeChunkSize;
            binaryFastModePipeChunkOnly.WireMode = Configuration.SelectedWireMode;

            return new List<ISerializerBenchmark>
            {
                // Chunked-framed AsyncPipe: SerializeChunkedFramed + AsyncPipeReaderInput.DrainFromAsync.
                // Measures the FULL streaming-I/O stack — wire framing + drain task + sliding-window buffer +
                // MRES wait-on-byte-shortage — over a kernel NamedPipe.
                new AcBinaryNamedPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"),
                // Raw byte[] over NamedPipe (sync receive, no chunk-framing). Same kernel-pipe transport,
                // same inBufferSize, but: serialize → byte[] → Stream.Write → Stream.Read → Deserialize<T>(byte[]).
                // No drain task, no AsyncPipeReaderInput, no [201][UINT16][data]…[202] framing. Side-by-side with
                // the chunked-row above this isolates AsyncPipe-framework-overhead (Δ vs raw) from
                // kernel-transport-overhead (raw vs in-process Byte[]).
                new AcBinaryNamedPipeRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
                // Chunked-framed AsyncPipe over an IN-MEMORY System.IO.Pipelines.Pipe (NO NamedPipe, NO kernel).
                // Same chunked-streaming code path (SerializeChunkedFramed → AsyncPipeReaderInput) but with the
                // kernel-pipe replaced by a managed-only Pipe. Eliminates per-chunk syscall overhead (~30 µs/chunk
                // on NamedPipe → ~1-2 µs/chunk on in-memory Pipe). Side-by-side with the NamedPipe row above this
                // isolates pure CPU cost of the chunked-streaming framework (vs kernel-pipe transport cost) — the
                // in-memory Pipe row should be much closer to the raw-byte[] row, validating that NamedPipe loopback
                // is the worst-case benchmark scenario for chunked-streaming and not representative of real network
                // / file / cross-thread Pipe scenarios.
                new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"),
                // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport at all). Apples-to-apples
                // baseline for the in-memory chunked row above: same in-memory transport (zero kernel), but raw
                // byte[] vs chunked-streaming wire format. Completes the 2x2 matrix [chunked,raw] × [kernel,memory].
                new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
            };
        }

        // Standard mode — all serializers EXCEPT AsyncPipe (the streaming benchmark is opt-in via the
        // AsyncPipe menu / CLI mode, never bundled with the steady-state suite).

        var binaryNoInternOption = AcBinarySerializerOptions.Default;
        binaryNoInternOption.UseStringInterning = StringInterningMode.None;
        binaryNoInternOption.WireMode = Configuration.SelectedWireMode;

        var binaryDefaultNoSgenOption = AcBinarySerializerOptions.Default;
        binaryDefaultNoSgenOption.UseGeneratedCode = false;
        binaryDefaultNoSgenOption.WireMode = Configuration.SelectedWireMode;

        var binaryFastModeNoSgenOption = AcBinarySerializerOptions.FastMode;
        binaryFastModeNoSgenOption.UseGeneratedCode = false;
        binaryFastModeNoSgenOption.WireMode = Configuration.SelectedWireMode;

        var binaryFastModeOption = AcBinarySerializerOptions.FastMode;
        binaryFastModeOption.WireMode = Configuration.SelectedWireMode;

        // BufWr new — 4 KB chunk size for the FRESH ArrayBufferWriter scenario. The chunkSize here drives
        // the serializer's GetSpan(N) request → the ArrayBufferWriter's internal allocation per call.
        // Small chunk = small per-call allocation, optimum for one-shot serialization where each iteration
        // allocates a fresh ABW. Independent of the AsyncPipe profile (different mechanism: alloc overhead
        // vs syscall count).
        var binaryFastModeBufWrChunk = AcBinarySerializerOptions.FastMode;
        binaryFastModeBufWrChunk.BufferWriterChunkSize = Configuration.PipeChunkSize;
        binaryFastModeBufWrChunk.WireMode = Configuration.SelectedWireMode;

        // In-memory Pipe variant — same 4 KB chunkSize as the AsyncPipe mode, no kernel-pipe alignment
        // concern (managed slabs are not page-aligned anyway). Drives SerializeChunkedFramed via the in-memory
        // System.IO.Pipelines.Pipe (zero-copy slab handoff between producer and drain task).
        var binaryFastModePipeChunkInMem = AcBinarySerializerOptions.FastMode;
        binaryFastModePipeChunkInMem.BufferWriterChunkSize = Configuration.PipeChunkSize;
        binaryFastModePipeChunkInMem.WireMode = Configuration.SelectedWireMode;

        var defaultOptions = AcBinarySerializerOptions.Default;
        defaultOptions.UseStringInterning = StringInterningMode.None;
        defaultOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId;
        defaultOptions.WireMode = Configuration.SelectedWireMode;

        return new List<ISerializerBenchmark>
        {
            // ============================================================
            // AcBinary — Byte[] API (uncomment to compare option presets side-by-side)
            // ============================================================
            // Fastest Byte[] — SGen path (UseGeneratedCode=true, default).
            new AcBinaryBenchmark(testData.Order, binaryFastModeOption, "FastMode"),
            // Fastest Byte[] — Runtime path (UseGeneratedCode=false). Same wire/options, no source-generated dispatch.
            // Always paired with the SGen variant so every layer can compare the SGen speed-up apples-to-apples.
            // NativeAOT-safe: AcSerializerCommon.Create*Getter/Setter falls back to reflection-based delegates
            // when RuntimeFeature.IsDynamicCodeSupported is false (slower but works under AOT publish).
            new AcBinaryBenchmark(testData.Order, binaryFastModeNoSgenOption, "FastMode"),
            // Default preset Byte[] — RefHandling=OnlyId (deduplicates IId-shared references on the wire) with
            // UseStringInterning=None (string interning is switched off on `defaultOptions` for this row). Showcases
            // the Default preset's wire-size and CPU trade-off vs FastMode on the ~20% IId-ref / repeated-string test data.
            
            new AcBinaryBenchmark(testData.Order, defaultOptions, "Default"),
            //new AcBinaryBenchmark(testData.Order, binaryDefaultNoSgenOption, "Default"),
            //new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"),
            //new AcBinaryBenchmark(testData.Order, binaryNoInternOption, "NoIntern"),

            // AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario)
            new AcBinaryBufferWriterBenchmark(testData.Order, binaryFastModeOption, "FastMode"),

            // AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario).
            // 4 KB chunk size from binaryFastModeBufWrChunk — minimises the per-call ArrayBufferWriter
            // allocation. Optimum for this scenario.
            new AcBinaryFreshBufferWriterBenchmark(testData.Order, binaryFastModeBufWrChunk, "FastMode (4KB)"),

            // AcBinary chunked-streaming over an IN-MEMORY Pipe (no kernel transport). Side-by-side with the
            // Byte[] / IBufferWriter rows above this shows the chunked-streaming framework's pure CPU cost
            // (no NamedPipe loopback noise) vs the simpler in-process serialize-then-deserialize patterns.
            // The IO column shows "Pipe(in-mem)" — distinct from the NamedPipe AsyncPipe rows in [P] mode.
            new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeChunk)"),

            // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport, no kernel, no Pipe). Apples-to-
            // apples baseline for the in-memory chunked row above: same in-memory pattern, but raw byte[] vs
            // chunked-streaming wire format. The IO column shows "Bytes(in-mem)".
            new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeRaw)"),

            // AsyncPipe streaming over kernel NamedPipe (AcBinaryNamedPipeBenchmark) is intentionally OMITTED
            // here — run it via the dedicated AsyncPipe menu [P] / CLI mode for isolated kernel-transport
            // measurements.

            // ============================================================
            // MemoryPack — three I/O modes for apples-to-apples comparison
            // ============================================================
            new MemoryPackBenchmark(testData.Order, "Default"),
            new MemoryPackBufferWriterBenchmark(testData.Order, "Default"),
            new MemoryPackFreshBufferWriterBenchmark(testData.Order, "Default"),

            // ============================================================
            // MessagePack — for legacy comparison
            // ============================================================
#if !AYCODE_NATIVEAOT
            // MessagePack v3's DynamicGenericResolver uses Activator.CreateInstance on trimmed
            // ListFormatter<T> et al. — fails under NativeAOT publish with "No parameterless constructor".
            // Excluded from the AOT build; available for regular JIT runs only.
            new MessagePackBenchmark(testData.Order, "ContractBased"),
#endif

            // System.Text.Json (commented — JSON serializer for reference; not in active suite)
            //new SystemTextJsonBenchmark(testData.Order, "Default")
        };
    }

    /// <summary>
    /// Runs the action <paramref name="iterations"/> times for <see cref="Configuration.BenchmarkSamples"/> independent samples,
    /// returning the median, min, max and population standard deviation of the per-sample elapsed time (all in ms).
    /// Multi-sample design reduces single-run variance from ~±15% to ~±5% by smoothing transient effects
    /// (background activity, thermal/turbo state).
    /// When <see cref="Configuration.BenchmarkSamples"/> &lt;= 1, falls back to single-sample timing (Debug / quick mode);
    /// median, min and max are then the same value and the standard deviation is 0.
    /// When <paramref name="progressLabel"/> is non-null, emits in-place <c>\r</c> progress updates so a
    /// stuck benchmark (e.g. deadlocked NamedPipe row) is visibly stuck at a specific % rather than
    /// silently hanging.
    ///
    /// Stabilization (added 2026-05-07):
    ///   1) Pilot sample is run BEFORE the recorded loop and discarded. The first measurement after
    ///      warmup tends to absorb residual JIT bookkeeping and GC bookkeeping; dropping it tightens
    ///      the min/max range without throwing away signal (the median is the SAME data as before).
    ///   2) GC.Collect / WaitForPendingFinalizers / GC.Collect runs BEFORE every recorded sample.
    ///      Without this, GC pressure from sample N occasionally triggered a Gen-2 pause inside
    ///      sample N+1, painting it as an outlier; collecting up-front gives every sample the
    ///      same starting heap shape.
    ///   3) Returns (median, min, max, stdDev) so the caller can surface the inter-sample range and
    ///      spread — visible noise floor for the row, replacing the previous "median only" view.
    /// </summary>
    /// <param name="action">The operation being timed; invoked <paramref name="iterations"/> times per sample.</param>
    /// <param name="iterations">Number of invocations of <paramref name="action"/> inside one sample.</param>
    /// <param name="progressLabel">Label for the in-place progress line; <c>null</c> disables progress output.</param>
    /// <returns>Median, minimum, maximum and population standard deviation of the recorded sample times, in milliseconds.</returns>
    private static (double medianMs, double minMs, double maxMs, double stdDevMs) RunTimed(Action action, int iterations, string? progressLabel = null)
    {
        var samples = Configuration.BenchmarkSamples;
        if (samples <= 1)
        {
            // Single-sample fast path (Debug or trivial run) — no allocation, no sort, no stddev.
            var sw = Stopwatch.StartNew();
            RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
            sw.Stop();
            var ms = sw.Elapsed.TotalMilliseconds;
            EndProgress(progressLabel, ms);
            return (ms, ms, ms, 0);
        }

        // Pilot sample (discarded, so it is not timed at all). Counts as sample index 0 of (samples + 1)
        // for progress display so the user sees an extra "warmup-ish" tick before the recorded samples start.
        SettleHeap();
        RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: 0);

        var times = new double[samples];
        for (var s = 0; s < samples; s++)
        {
            // Per-sample GC settle. Forces every sample to start from the same heap state, so
            // a Gen-2 pause caused by the previous sample doesn't bleed into the next sample's
            // timing. Cost is paid OUTSIDE the Stopwatch window — no impact on the measurement.
            SettleHeap();

            var sw = Stopwatch.StartNew();
            RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: s + 1);
            sw.Stop();
            times[s] = sw.Elapsed.TotalMilliseconds;
        }

        // Population stddev (not sample-stddev — we treat the captured samples as the population for
        // CV computation). Two-pass form: mean first, then the squared deviations from it. Unlike
        // E[X²] - E[X]² this cannot go negative and does not lose precision through cancellation
        // when the samples are nearly identical.
        var sum = 0.0;
        foreach (var t in times) sum += t;
        var mean = sum / samples;

        var squaredDeviationSum = 0.0;
        foreach (var t in times)
        {
            var deviation = t - mean;
            squaredDeviationSum += deviation * deviation;
        }
        var stdDevMs = Math.Sqrt(squaredDeviationSum / samples);

        // Ascending sort puts min and max at the two ends and the median in the middle.
        Array.Sort(times);
        var minMs = times[0];
        var maxMs = times[samples - 1];

        // Median: middle value for odd sample counts, average of two middles for even counts.
        var mid = samples / 2;
        var medianMs = samples % 2 == 1 ? times[mid] : (times[mid - 1] + times[mid]) / 2.0;
        EndProgress(progressLabel, medianMs);

        return (medianMs, minMs, maxMs, stdDevMs);

        // Full blocking collection incl. finalizers, so each sample starts from the same heap shape.
        static void SettleHeap()
        {
            GC.Collect();
            GC.WaitForPendingFinalizers();
            GC.Collect();
        }
    }

    /// <summary>
    /// Per-cell adaptive iteration calibration. Runs a 100-iter measurement after warmup and computes
    /// how many iterations are needed to reach <paramref name="targetMs"/> wall-clock per sample
    /// (see <see cref="Configuration.TargetSampleMs"/>).
    /// Returns iter rounded UP to the nearest 1000, floored at 1000 (the prior fixed minimum) and
    /// ceiling-capped at 200_000 (sanity bound for pathologically fast ops). In Debug single-sample mode
    /// (<c>Configuration.BenchmarkSamples &lt;= 1</c>) returns the global <see cref="Configuration.TestIterations"/> unchanged —
    /// calibration overhead is unjustified there. Calibration runs OUTSIDE the timed sample loop and
    /// does NOT count toward warmup; its sole purpose is to measure per-op cost.
    /// </summary>
    /// <param name="action">The operation whose per-call cost is being estimated.</param>
    /// <param name="targetMs">Desired wall-clock duration of one sample, in milliseconds.</param>
    /// <returns>Iteration count: a multiple of 1000 in the range [1000, 200_000], or <see cref="Configuration.TestIterations"/> in single-sample mode.</returns>
    private static int CalibrateIterations(Action action, int targetMs)
    {
        if (Configuration.BenchmarkSamples <= 1) return Configuration.TestIterations; // Debug fast path

        const int calibIter = 100;
        const int minIterations = 1000;
        const int maxIterations = 200_000;
        const int roundTo = 1000;

        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();

        var sw = Stopwatch.StartNew();
        for (var i = 0; i < calibIter; i++) action();
        sw.Stop();
        var ms = sw.Elapsed.TotalMilliseconds;

        // Pathologically-fast op below Stopwatch resolution — cap at ceiling (further calibration won't help).
        if (ms <= 0.0001) return maxIterations;

        var iterPerMs = calibIter / ms;
        var required = Math.Ceiling(targetMs * iterPerMs);

        // Clamp while still in double. A very fast op combined with a large targetMs can push the
        // estimate past int.MaxValue, and an out-of-range double→int cast is unspecified — it can
        // come out negative and would then be mistaken for "below the floor".
        if (required >= maxIterations) return maxIterations;
        if (required <= minIterations) return minIterations;

        // Here minIterations < required < maxIterations, so the cast and the round-up are safe.
        // Round UP to nearest 1000 — keeps numbers human-readable in the markdown output.
        var raw = (int)required;
        return ((raw + roundTo - 1) / roundTo) * roundTo;
    }

    /// <summary>
    /// Average bytes allocated on the CALLING thread per invocation of <paramref name="action"/>, taken
    /// right after a forced full GC. One dedicated pass (no median) so the timing samples stay pure.
    /// </summary>
    /// <param name="action">Operation whose allocations are measured.</param>
    /// <param name="iterations">Number of invocations; also the divisor of the total byte delta.</param>
    /// <param name="progressLabel">Label for the in-place progress line; <c>null</c> disables progress output.</param>
    /// <returns>Allocated-bytes delta of the current thread divided by <paramref name="iterations"/> (integer division).</returns>
    private static long MeasureAllocation(Action action, int iterations, string? progressLabel = null)
    {
        // Start from a settled heap.
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();

        // The stopwatch only feeds the "done in … ms" progress line; it is created before the
        // first counter read so its own allocation is not part of the delta.
        var timer = Stopwatch.StartNew();
        var allocatedAtStart = GC.GetAllocatedBytesForCurrentThread();

        RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);

        var allocatedAtEnd = GC.GetAllocatedBytesForCurrentThread();
        timer.Stop();

        EndProgress(progressLabel, timer.Elapsed.TotalMilliseconds);

        var totalBytes = allocatedAtEnd - allocatedAtStart;
        return totalBytes / iterations;
    }

    /// <summary>
    /// Process-wide counterpart of the per-thread allocation measurement — required for round-trip-only
    /// benchmarks (NamedPipe etc.) whose work is spread over several threads.
    /// <see cref="GC.GetAllocatedBytesForCurrentThread"/> would see only the caller thread and miss the
    /// server-side <c>new byte[len]</c> buffers and any drain-pump-thread allocations, whereas
    /// <see cref="GC.GetTotalAllocatedBytes"/> covers the whole process. The price is a little extra noise
    /// (background threads / GC bookkeeping leak in), but over 1000 iterations the signal dominates.
    /// </summary>
    /// <param name="action">Operation whose allocations are measured.</param>
    /// <param name="iterations">Number of invocations; also the divisor of the total byte delta.</param>
    /// <param name="progressLabel">Label for the in-place progress line; <c>null</c> disables progress output.</param>
    /// <returns>Process-wide allocated-bytes delta divided by <paramref name="iterations"/> (integer division).</returns>
    private static long MeasureAllocationTotal(Action action, int iterations, string? progressLabel = null)
    {
        // Start from a settled heap.
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();

        // Stopwatch is for the progress "done" line only; created before the first counter read.
        var timer = Stopwatch.StartNew();
        var processBytesAtStart = GC.GetTotalAllocatedBytes(precise: true);

        RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);

        var processBytesAtEnd = GC.GetTotalAllocatedBytes(precise: true);
        timer.Stop();

        EndProgress(progressLabel, timer.Elapsed.TotalMilliseconds);

        var totalBytes = processBytesAtEnd - processBytesAtStart;
        return totalBytes / iterations;
    }

    // ============================================================================================
    // Progress reporting — \r-driven in-place updates so a stuck benchmark surfaces the exact phase
    // and % where it stopped, instead of appearing as a silent hang. Used by RunTimed and the
    // MeasureAllocation* helpers when the caller passes a non-null progressLabel.
    // ============================================================================================

    // Length of the most recent progress line written in the current progress session (overwritten on
    // every emit, reset to 0 by EndProgress). The next writer pads with spaces up to this length so a
    // shorter line fully covers a prior longer one (avoids "ghost" trailing chars after \r).
    private static int _progressLastLineLen;

    /// <summary>
    /// Invokes <paramref name="action"/> exactly <paramref name="iterations"/> times. With a non-null
    /// <paramref name="label"/> it rewrites a single console line (via \r) roughly every 10% of the run
    /// and once more on the final iteration; with a null label it is a plain loop with no output.
    /// </summary>
    /// <param name="action">Operation to invoke.</param>
    /// <param name="iterations">How many times to invoke <paramref name="action"/>.</param>
    /// <param name="label">Text shown at the start of the progress line; <c>null</c> = silent.</param>
    /// <param name="samples">Total number of sample runs; values above 1 add a "sample i/n" segment to the line.</param>
    /// <param name="sampleIndex">Zero-based index of the current sample (displayed one-based).</param>
    private static void RunWithProgress(Action action, int iterations, string? label, int samples, int sampleIndex)
    {
        // Silent mode: nothing but the calls themselves.
        if (label is null)
        {
            for (var n = 0; n < iterations; n++) action();
            return;
        }

        // Report about ten times per run — Console.Write is costly enough to distort very fast
        // operations if it ran on every iteration.
        var reportEvery = Math.Max(1, iterations / 10);
        var multiSample = samples > 1;

        for (var index = 0; index < iterations; index++)
        {
            action();

            var completed = index + 1;
            var atCheckpoint = completed % reportEvery == 0;
            var atLastIteration = index == iterations - 1;
            if (!atCheckpoint && !atLastIteration) continue;

            var pct = (int)(completed * 100L / iterations);

            string text;
            if (multiSample)
                text = $"    > {label}  sample {sampleIndex + 1}/{samples}  {pct,3}%  ({completed}/{iterations})";
            else
                text = $"    > {label}  {pct,3}%  ({completed}/{iterations})";

            System.Console.Write('\r');
            System.Console.Write(text);

            // Blank out whatever is left of a longer previous line.
            var leftover = _progressLastLineLen - text.Length;
            if (leftover > 0)
                System.Console.Write(new string(' ', leftover));

            _progressLastLineLen = text.Length;
        }
    }

    /// <summary>
    /// Finishes a progress line: overwrites it in place with a final "done in … ms" message, pads away
    /// any remainder of a longer previous line, then emits a newline so later <c>WriteLine</c> output
    /// starts on a fresh row. Does nothing when <paramref name="label"/> is null.
    /// </summary>
    /// <param name="label">Label of the progress session being closed; <c>null</c> = no progress was shown.</param>
    /// <param name="elapsedMs">Elapsed time to display, in milliseconds.</param>
    private static void EndProgress(string? label, double elapsedMs)
    {
        if (label is not null)
        {
            var summary = $"    > {label}  done in {elapsedMs,7:F1} ms";

            System.Console.Write('\r');
            System.Console.Write(summary);

            // Erase trailing characters left over from a longer progress line.
            var leftover = _progressLastLineLen - summary.Length;
            if (leftover > 0)
                System.Console.Write(new string(' ', leftover));

            System.Console.WriteLine();

            // The session is over — the next one starts with nothing to clear.
            _progressLastLineLen = 0;
        }
    }

#if !AYCODE_NATIVEAOT
    /// <summary>
    /// System.Text.Json settings used only by the round-trip verification: compact output, null-valued
    /// members omitted, reference cycles ignored.
    /// </summary>
    private static readonly JsonSerializerOptions VerifyJsonOpts = new JsonSerializerOptions
    {
        ReferenceHandler = System.Text.Json.Serialization.ReferenceHandler.IgnoreCycles,
        DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull,
        WriteIndented = false
    };
#endif

    /// <summary>
    /// Round-trip equality check: both objects are rendered to JSON with System.Text.Json (used as a
    /// canonical form) and the two strings are compared. Slower than a property-by-property comparison,
    /// but universal — it needs no custom comparer for the object graph.
    /// </summary>
    /// <remarks>
    /// AOT publish skip: <c>System.Text.Json</c>'s reflection path uses runtime closed-generic instantiation
    /// (<c>JsonPropertyInfo&lt;TestStatus&gt;</c> et al.) that the trimmer drops, causing
    /// <c>NotSupportedException: missing native code or metadata</c>. The validation is JIT-only — the actual
    /// benchmark Serialize/Deserialize loops don't touch this path. Under AOT this method returns <c>true</c>
    /// so all <c>VerifyRoundTrip()</c> calls pass without running the cross-format validation.
    /// </remarks>
    /// <param name="a">First object, may be null.</param>
    /// <param name="b">Second object, may be null.</param>
    /// <returns><c>true</c> when both are null or both produce the same JSON text; <c>false</c> when exactly one is null or the JSON differs.</returns>
    private static bool DeepEqualsViaJson(object? a, object? b)
    {
#if AYCODE_NATIVEAOT
        // Cross-format validation is bypassed under AOT — the STJ reflection path is incompatible. The
        // caller-side Serialize+Deserialize still runs; only the JSON-canonical compare is skipped.
        return true;
#else
        // Null handling: equal only when BOTH sides are null.
        if (a is null || b is null)
        {
            return a is null && b is null;
        }

        var left = JsonSerializer.Serialize(a, VerifyJsonOpts);
        var right = JsonSerializer.Serialize(b, VerifyJsonOpts);

        return string.Equals(left, right, StringComparison.Ordinal);
#endif
    }

    /// <summary>
    /// Startup guard for the MemoryPack rows: terminates the process (exit code 1) when a checked type —
    /// currently only <c>TestOrder</c> — does not carry <c>[MemoryPackable]</c>.
    /// Without this attribute, MemoryPack falls back to runtime resolver (slower) — comparison would be INVALID.
    /// </summary>
    private static void ValidateMemoryPackSetup()
    {
        var requiredTypes = new Type[] { typeof(TestOrder) };

        foreach (var candidate in requiredTypes)
        {
            // inherit: true — an attribute declared on a base type counts as well.
            var matches = candidate.GetCustomAttributes(typeof(MemoryPackableAttribute), inherit: true);
            if (matches.Length > 0) continue;

            System.Console.Error.WriteLine($"❌ FATAL: {candidate.FullName} is not [MemoryPackable] — MemoryPack would fall back to runtime resolver, comparison is INVALID for SGen-vs-SGen claim.");
            System.Console.Error.WriteLine("Add [MemoryPackable] to the type and any nested types referenced from it.");

            Environment.Exit(1);
        }
    }

    /// <summary>
    /// Filters test data sets by layer keyword. Layered approach lets you run only what's needed for the iteration cadence.
    /// P1: only "Core" data exists (Small/Medium/Large/Repeated/Deep). Comprehensive and Edge layers will be expanded in P2.
    /// </summary>
    /// <param name="all">Every available test data set.</param>
    /// <param name="layer">
    /// Layer keyword: <c>"all"</c>, <c>"core"</c>, <c>"comprehensive"</c>, <c>"edge"</c>, or one of the single-cell
    /// keywords <c>"small"</c>, <c>"medium"</c>, <c>"large"</c>, <c>"repeated"</c>, <c>"deep"</c>. Any other value
    /// selects everything.
    /// </param>
    /// <returns>A new list with the matching data sets (always a copy, never <paramref name="all"/> itself).</returns>
    private static List<TestDataSet> FilterByLayer(List<TestDataSet> all, string layer)
    {
        if (layer == "all") return all.ToList();

        var coreNames = new[] { "Small", "Medium", "Large", "Repeated", "Deep" };
        // P2 will add: "Flat", "Polymorphic", "Collection", "Numeric", "NonAscii", etc.
        var comprehensiveExtras = new string[] { /* P2 */ };
        // P3 will add: "ColdStart", "VeryLarge", "PathologicalString", etc.
        var edgeExtras = new string[] { /* P3 */ };

        return layer switch
        {
            "core" => all.Where(t => StartsWithAny(t.Name, coreNames)).ToList(),
            "comprehensive" => all.Where(t => StartsWithAny(t.Name, coreNames) || StartsWithAny(t.Name, comprehensiveExtras)).ToList(),
            "edge" => all.Where(t => StartsWithAny(t.Name, coreNames) || StartsWithAny(t.Name, comprehensiveExtras) || StartsWithAny(t.Name, edgeExtras)).ToList(),
            // Single-cell A/B mini-suite filters — match by case-insensitive prefix on Name; the keyword
            // itself is the prefix ("small" matches names starting with "Small").
            // Use case: tight optimization-iteration loop on one specific cell (e.g. `dotnet run -- repeated`
            // or interactive menu shortcut), avoiding the full ~110 sec suite when only one cell is in scope.
            "small" or "medium" or "large" or "repeated" or "deep"
                => all.Where(t => t.Name.StartsWith(layer, StringComparison.OrdinalIgnoreCase)).ToList(),
            _ => all.ToList()
        };

        // Ordinal (culture-independent) prefix test — data-set names are identifiers, not linguistic
        // text, so the match must not vary with the current culture.
        static bool StartsWithAny(string name, string[] prefixes) =>
            prefixes.Any(prefix => name.StartsWith(prefix, StringComparison.Ordinal));
    }

    #endregion

    #region Serializer Implementations

    /// <summary>
    /// Contract for one benchmark row: descriptive labels for the report plus the Serialize /
    /// Deserialize operations that get timed and a one-off round-trip correctness check.
    /// </summary>
    private interface ISerializerBenchmark
    {
        /// <summary>Serializer engine label — e.g. "AcBinary", "MemoryPack", "MessagePack".</summary>
        string Engine { get; }

        /// <summary>I/O mode label — e.g. "Byte[]", "BufWr reuse", "BufWr new", "NamedPipe", "FileStream".</summary>
        string IoMode { get; }

        /// <summary>
        /// Dispatch mode label — "SGen", "Runtime", or "Hybrid". AcBinary derives it from
        /// <c>UseGeneratedCode</c> + child-type SGen coverage; non-AcBinary engines report their own
        /// native dispatch model.
        /// </summary>
        string DispatchMode { get; }

        /// <summary>Options preset name — e.g. "FastMode", "Default", "NoIntern", "WithCompression".</summary>
        string OptionsPreset { get; }

        /// <summary>Display name composed from <see cref="Engine"/>, <see cref="IoMode"/> and <see cref="OptionsPreset"/>.</summary>
        string Name
        {
            get { return $"{Engine} ({IoMode}, {OptionsPreset})"; }
        }

        /// <summary>Size of the serialized payload (the implementations in this file return the length of their pre-serialized byte[]).</summary>
        int SerializedSize { get; }

        /// <summary>Optional free-text description of the effective options; <c>null</c> unless the implementor overrides it.</summary>
        string? OptionsDescription
        {
            get { return null; }
        }

        /// <summary>
        /// One-time SERIALIZER-side setup allocation cost (e.g., pre-allocated ArrayBufferWriter with
        /// internal buffer). Captured in constructor; 0 for byte[] API and Fresh-BufWriter variants.
        /// </summary>
        long SetupSerializeAllocBytes { get; }

        /// <summary>
        /// One-time DESERIALIZER-side setup allocation cost (e.g., long-lived AsyncPipeReaderInput's
        /// ArrayPool rent + ManualResetEventSlim, drain-task scaffolding). Captured in constructor;
        /// 0 for byte[] API and any setup-free deserialize path.
        /// </summary>
        long SetupDeserializeAllocBytes { get; }

        /// <summary>
        /// True when <see cref="Serialize"/> does a full round-trip (e.g. NamedPipe) and
        /// <see cref="Deserialize"/> is a no-op. The SUMMARY: WINNERS section uses it to leave such cells
        /// out of the "Fastest Serialize" and "Fastest Deserialize" rankings (both metrics are misleading
        /// there) — they still take part in "Fastest Round-trip". Defaults to false, which suits
        /// in-memory IO modes that measure Ser and Des separately.
        /// </summary>
        bool IsRoundTripOnly
        {
            get { return false; }
        }

        /// <summary>
        /// Warms only the Serialize path; the default simply calls <see cref="Serialize"/>
        /// <paramref name="iterations"/> times. Override only when Ser-specific warmup state is needed
        /// (e.g. pre-allocating buffers). On <see cref="IsRoundTripOnly"/> benchmarks (NamedPipe-style)
        /// <see cref="Serialize"/> performs the full RT, so this warms the entire round-trip path.
        /// </summary>
        void WarmupSerialize(int iterations)
        {
            for (var remaining = iterations; remaining > 0; remaining--)
            {
                Serialize();
            }
        }

        /// <summary>
        /// Warms only the Deserialize path; the default simply calls <see cref="Deserialize"/>
        /// <paramref name="iterations"/> times. On <see cref="IsRoundTripOnly"/> benchmarks
        /// <see cref="Deserialize"/> is a no-op, so the bench loop skips the Des-phase entirely for those cells.
        /// </summary>
        void WarmupDeserialize(int iterations)
        {
            for (var remaining = iterations; remaining > 0; remaining--)
            {
                Deserialize();
            }
        }

        /// <summary>The serialize operation being timed.</summary>
        void Serialize();

        /// <summary>The deserialize operation being timed.</summary>
        void Deserialize();

        /// <summary>
        /// Round-trip correctness check — called once per cell before warmup. Returns true if
        /// Serialize+Deserialize preserves data.
        /// </summary>
        bool VerifyRoundTrip();
    }

    /// <summary>
    /// AcBinary benchmark row for the byte[] API: <see cref="Serialize"/> produces a new byte[] on every
    /// call, <see cref="Deserialize"/> reads the payload that was serialized once in the constructor.
    /// </summary>
    private sealed class AcBinaryBenchmark : ISerializerBenchmark
    {
        private readonly TestOrder _source;
        private readonly AcBinarySerializerOptions _options;
        // Payload captured at construction; input of Deserialize() and source of SerializedSize.
        private readonly byte[] _payload;

        public AcBinaryBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
        {
            OptionsPreset = optionsPreset;
            _source = order;
            _options = options;
            _payload = AcBinarySerializer.Serialize(order, options);

            //_options.UseCompression = Lz4CompressionMode.Block;
        }

        public string Engine => Configuration.EngineAcBinary;

        public string IoMode => Configuration.IoByteArray;

        // SGen vs Runtime label follows the UseGeneratedCode switch of the supplied options.
        public string DispatchMode
        {
            get { return _options.UseGeneratedCode ? Configuration.ModeSGen : Configuration.ModeRuntime; }
        }

        public string OptionsPreset { get; }

        public int SerializedSize => _payload.Length;

        // byte[] API: no one-time setup allocations on either side.
        public long SetupSerializeAllocBytes => 0;

        public long SetupDeserializeAllocBytes => 0;

        public string OptionsDescription => BuildAcBinaryOptionsDescription(_options);

        // Disabled experiment, kept for reference:
        //if (_options.ReferenceHandling != ReferenceHandlingMode.None || _options.UseStringInterning != StringInterningMode.None)
        //{
        //    AcBinarySerializer.ScanOnly(_order, _options);
        //}
        //else AcBinarySerializer.Serialize(_order, _options);
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize() => AcBinarySerializer.Serialize(_source, _options);

        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            AcBinaryDeserializer.Deserialize<TestOrder>(_payload, _options);
        }

        /// <summary>Serializes and deserializes the order once and compares the result with the original via JSON.</summary>
        public bool VerifyRoundTrip()
        {
            var encoded = AcBinarySerializer.Serialize(_source, _options);
            var decoded = AcBinaryDeserializer.Deserialize<TestOrder>(encoded, _options);

            return DeepEqualsViaJson(_source, decoded);
        }
    }

    /// <summary>
    /// MemoryPack benchmark row for the byte[] API: <see cref="Serialize"/> produces a new byte[] on
    /// every call, <see cref="Deserialize"/> reads the payload that was serialized once in the constructor.
    /// </summary>
    private sealed class MemoryPackBenchmark : ISerializerBenchmark
    {
        private readonly TestOrder _source;
        private readonly MemoryPackSerializerOptions _options;
        // Payload captured at construction; input of Deserialize() and source of SerializedSize.
        private readonly byte[] _payload;

        public MemoryPackBenchmark(TestOrder order, string optionsPreset)
        {
            OptionsPreset = optionsPreset;
            _source = order;

            // Options must be resolved before the payload is produced — the payload is built with them.
            _options = GetMemPackOptions();
            _payload = MemoryPackSerializer.Serialize(order, _options);
        }

        public string Engine => Configuration.EngineMemoryPack;

        public string IoMode => Configuration.IoByteArray;

        // MemoryPack always uses [MemoryPackable] source-generated formatters.
        public string DispatchMode => Configuration.ModeSGen;

        public string OptionsPreset { get; }

        public int SerializedSize => _payload.Length;

        // byte[] API: no one-time setup allocations on either side.
        public long SetupSerializeAllocBytes => 0;

        public long SetupDeserializeAllocBytes => 0;

        public string? OptionsDescription
        {
            get { return $"StringEncoding={_options.StringEncoding}"; }
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize()
        {
            MemoryPackSerializer.Serialize(_source, _options);
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            MemoryPackSerializer.Deserialize<TestOrder>(_payload, _options);
        }

        /// <summary>Serializes and deserializes the order once and compares the result with the original via JSON.</summary>
        public bool VerifyRoundTrip()
        {
            var encoded = MemoryPackSerializer.Serialize(_source, _options);
            var decoded = MemoryPackSerializer.Deserialize<TestOrder>(encoded, _options);

            return DeepEqualsViaJson(_source, decoded);
        }
    }

#if !AYCODE_NATIVEAOT
    // MessagePack benchmark — excluded from NativeAOT build because v3's StandardResolver falls back
    // to DynamicGenericResolver for closed-generic types (List<TestOrderItem> et al.), which uses
    // Activator.CreateInstance on formatter types the AOT trimmer drops → MissingMethodException at runtime.
    // Available for regular JIT runs (`dotnet run`) only.
    /// <summary>
    /// MessagePack row of the comparison, measured through the plain <c>byte[]</c> API with the
    /// <c>MessagePackSerializerOptions.Standard</c> options and compression disabled.
    /// </summary>
    private sealed class MessagePackBenchmark : ISerializerBenchmark
    {
        private readonly TestOrder _order;
        private readonly MessagePackSerializerOptions _options;
        private readonly byte[] _serialized;

        /// <summary>
        /// Captures the graph and preset label, builds the MessagePack options, derives the options
        /// description from them, and pre-serializes the graph once.
        /// </summary>
        /// <param name="order">Object graph that every call serializes.</param>
        /// <param name="optionsPreset">Label exposed through <see cref="OptionsPreset"/>.</param>
        public MessagePackBenchmark(TestOrder order, string optionsPreset)
        {
            _order = order;
            OptionsPreset = optionsPreset;

            // Alternative configurations for manual experiments: start from ContractlessStandardResolver.Options
            // instead of MessagePackSerializerOptions.Standard, and/or pass MessagePackCompression.Lz4Block.
            _options = MessagePackSerializerOptions.Standard.WithCompression(MessagePackCompression.None);

            // The description is derived from the live options so it stays truthful when the line above is edited.
            var resolverMode = _options.Resolver is ContractlessStandardResolver ? "Contractless" : "ContractBased";
            OptionsDescription = $"Mode={resolverMode}, Compression={_options.Compression}";

            _serialized = MessagePackSerializer.Serialize(order, _options);
        }

        public string Engine => Configuration.EngineMessagePack;
        public string IoMode => Configuration.IoByteArray;

        // Reported as source-generated: [MessagePackObject] formatters resolved through the StandardResolver.
        public string DispatchMode => Configuration.ModeSGen;

        public string OptionsPreset { get; }
        public int SerializedSize => _serialized.Length;

        // This row reports no setup allocations.
        public long SetupSerializeAllocBytes => 0;
        public long SetupDeserializeAllocBytes => 0;

        public string OptionsDescription { get; }

        /// <summary>Timed operation: serializes the captured graph; the produced array is discarded.</summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize()
        {
            MessagePackSerializer.Serialize(_order, _options);
        }

        /// <summary>Timed operation: deserializes the payload captured in the constructor; the result is discarded.</summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            MessagePackSerializer.Deserialize<TestOrder>(_serialized, _options);
        }

        /// <summary>
        /// Runs one serialize → deserialize cycle on a fresh payload and compares the restored graph with the
        /// original via <c>DeepEqualsViaJson</c>.
        /// </summary>
        public bool VerifyRoundTrip()
        {
            var payload = MessagePackSerializer.Serialize(_order, _options);
            var restored = MessagePackSerializer.Deserialize<TestOrder>(payload, _options);

            return DeepEqualsViaJson(_order, restored);
        }
    }
#endif

    /// <summary>
    /// Benchmarks AcBinary through the <c>IBufferWriter</c> overload with a brand-new
    /// <see cref="ArrayBufferWriter{T}"/> allocated on EVERY <see cref="Serialize"/> call — the one-shot
    /// scenario of code that does not keep a writer around between calls.
    /// <para>
    /// The options instance (including <c>BufferWriterChunkSize</c>) is supplied by the caller and is never
    /// modified here. The original note for this row recommends a 4096-byte chunk size
    /// (production-realistic, SignalR-aligned) rather than the 65535 default, because with the default
    /// AcBinary would request 64KB upfront via <c>GetSpan()</c> and force the fresh writer to allocate
    /// 64KB regardless of payload size.
    /// </para>
    /// </summary>
    private sealed class AcBinaryFreshBufferWriterBenchmark : ISerializerBenchmark
    {
        private readonly TestOrder _order;
        private readonly AcBinarySerializerOptions _options;
        private readonly byte[] _serialized;

        /// <summary>Stores the graph, options and preset label, then pre-serializes the graph once.</summary>
        /// <param name="order">Object graph that every call serializes.</param>
        /// <param name="options">
        /// Caller-owned options. The chunk size is tuned centrally in CreateSerializers (the
        /// binaryFastMode4KbChunk instance); this class must not mutate it.
        /// </param>
        /// <param name="optionsPreset">Label exposed through <see cref="OptionsPreset"/>.</param>
        public AcBinaryFreshBufferWriterBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
        {
            OptionsPreset = optionsPreset;
            _order = order;
            _options = options;
            _serialized = AcBinarySerializer.Serialize(order, _options);
        }

        public string Engine => Configuration.EngineAcBinary;
        public string IoMode => Configuration.IoBufWrNew;
        public string DispatchMode => _options.UseGeneratedCode ? Configuration.ModeSGen : Configuration.ModeRuntime;
        public string OptionsPreset { get; }
        public int SerializedSize => _serialized.Length;

        // This row reports no setup allocations.
        public long SetupSerializeAllocBytes => 0;
        public long SetupDeserializeAllocBytes => 0;

        public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B");

        /// <summary>
        /// Timed operation: serializes into a writer created for this call only, so the writer allocation and
        /// any growth are part of the measured cost.
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize()
        {
            var freshWriter = new ArrayBufferWriter<byte>();
            AcBinarySerializer.Serialize(_order, freshWriter, _options);
        }

        /// <summary>
        /// Timed operation: deserializes from a <see cref="ReadOnlySequence{T}"/> wrapped around the
        /// pre-serialized array, i.e. the call is made with a sequence argument rather than a <c>byte[]</c>.
        /// </summary>
        // NOTE(review): the previous comment said that a single-segment, array-backed sequence hits a fast
        // path in AcBinaryDeserializer that redirects to the byte[] overload, and also that this row exercises
        // the ReadOnlySequence input path instead of byte[] deserialization. Those two statements pull in
        // opposite directions — confirm against AcBinaryDeserializer which path is actually measured.
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            var sequence = new ReadOnlySequence<byte>(_serialized);
            AcBinaryDeserializer.Deserialize<TestOrder>(sequence, _options);
        }

        /// <summary>
        /// Serializes into a fresh writer, deserializes from a sequence over the written memory, and compares
        /// the restored graph with the original via <c>DeepEqualsViaJson</c>.
        /// </summary>
        public bool VerifyRoundTrip()
        {
            var writer = new ArrayBufferWriter<byte>();
            AcBinarySerializer.Serialize(_order, writer, _options);

            var written = new ReadOnlySequence<byte>(writer.WrittenMemory);
            var restored = AcBinaryDeserializer.Deserialize<TestOrder>(written, _options);

            return DeepEqualsViaJson(_order, restored);
        }
    }

    /// <summary>
    /// Benchmarks AcBinary over a long-lived NamedPipe IPC connection using the AcBinary native streaming API
    /// (<see cref="AcBinarySerializer.SerializeChunkedFramed{T}(T, System.IO.Pipelines.PipeWriter, AcBinarySerializerOptions)"/>
    /// + <see cref="AsyncPipeReaderInput"/> + <see cref="AsyncPipeReaderInputExtensions.DrainFromAsync"/>).
    /// Mirrors what a real consumer (e.g. <c>DeserializeFromPipeReaderAsync</c>) does per message:
    /// long-lived <see cref="AsyncPipeReaderInput"/> with multi-message wire framing on top of a long-lived NamedPipe.
    ///
    /// <para><b>Architecture</b>:</para>
    /// <list type="bullet">
    ///   <item>Constructor (NOT timed): sets up <see cref="NamedPipeServerStream"/> + <see cref="NamedPipeClientStream"/>,
    ///     waits for connection, creates one long-lived <see cref="System.IO.Pipelines.PipeWriter"/> /
    ///     <see cref="System.IO.Pipelines.PipeReader"/> pair, ONE long-lived <see cref="AsyncPipeReaderInput"/>
    ///     in <c>multiMessage = true</c> mode, ONE drain Task that pumps <see cref="AsyncPipeReaderInputExtensions.DrainFromAsync"/>
    ///     until cancelled, and ONE consumer Task (<c>ConsumeLoop</c>) that runs
    ///     <c>AcBinaryDeserializer.Deserialize&lt;T&gt;(input, opts)</c> once per request. The consumer is
    ///     coordinated with the calling thread through two <see cref="ManualResetEventSlim"/> signals
    ///     (<c>_consumeRequest</c> / <c>_consumeDone</c>).</item>
    ///   <item>Per-iteration <see cref="Serialize"/> (timed): the calling thread signals the consumer, then writes via
    ///     <see cref="AcBinarySerializer.SerializeChunkedFramed{T}(T, System.IO.Pipelines.PipeWriter, AcBinarySerializerOptions)"/>
    ///     — multi-message wire (<c>[201][UINT16][data]...[202]</c>); the <c>[202]</c> end marker arms the input's
    ///     <c>_readPos = -1</c> sentinel, so the next message's first <c>AppendToBuffer</c> recycles the buffer to 0.
    ///     The calling thread then blocks on <c>_consumeDone</c> until the consumer's deserialize call has returned.</item>
    ///   <item><see cref="Deserialize"/> is a no-op (full round-trip captured in <see cref="Serialize"/>);
    ///     <see cref="IsRoundTripOnly"/>=true → the Ser ms / SerAlloc columns are N/A, RT ms = full round-trip.</item>
    /// </list>
    ///
    /// <para><b>Per-iter overhead</b>: 0 new <c>Task.Run</c>, 0 new <c>AsyncPipeReaderInput</c>, 0 new <c>CancellationTokenSource</c>.
    /// Pure cost = <c>SerializeChunkedFramed</c> (CPU + per-chunk flush) + kernel write/read syscalls + 1 sync barrier
    /// (request/done event handshake) + deserialized graph alloc. The "multi-message reuse" pattern enabled by Q4T8 fix (R5K2 minimum: <c>_readPos = -1</c>
    /// sentinel + <c>AppendToBuffer</c> sliding-window cycling).</para>
    ///
    /// <para><b>Approximation note</b>: single-process loopback NamedPipe. Real cross-process / cross-machine SignalR
    /// adds further transport latency (TCP, WebSocket framing) on top. The benchmark gives a lower bound.</para>
    /// </summary>
    private sealed class AcBinaryNamedPipeBenchmark : ISerializerBenchmark, IDisposable
    {
        private readonly TestOrder _order;
        private readonly AcBinarySerializerOptions _options;
        private readonly byte[] _serialized; // for SerializedSize reporting only

        // Long-lived pipe lifecycle (set up once in ctor — NOT timed).
        private readonly NamedPipeServerStream _pipeServer;
        private readonly NamedPipeClientStream _pipeClient;
        private readonly PipeWriter _pipeWriter;
        private readonly PipeReader _pipeReader;

        // Long-lived multi-message receive infrastructure (set up once in ctor).
        private readonly AsyncPipeReaderInput _input;
        private readonly CancellationTokenSource _cts;
        private readonly Task _drainTask;       // BG: PipeReader → input.Feed (continuous pump)
        private readonly Task _consumerTask;    // BG: per-iter Deserialize<T>(input) loop, signaled by calling thread
        // Handshake pair: the calling thread sets _consumeRequest to start one deserialize; the consumer sets
        // _consumeDone when that deserialize has finished (successfully or not).
        private readonly ManualResetEventSlim _consumeRequest = new(false);
        private readonly ManualResetEventSlim _consumeDone = new(false);
        private object? _lastResult;            // captured during VerifyRoundTrip; null in benchmark iters
        private bool _captureResult;            // toggle: when true, ConsumeLoop stores result; otherwise discards
        private bool _disposed;

        public string Engine => Configuration.EngineAcBinary;
        public string IoMode => Configuration.IoNamedPipe;
        public string DispatchMode => _options.UseGeneratedCode ? Configuration.ModeSGen : Configuration.ModeRuntime;
        public string OptionsPreset { get; }
        public int SerializedSize => _serialized.Length;
        // Allocation deltas measured around the two setup phases in the constructor.
        public long SetupSerializeAllocBytes { get; }
        public long SetupDeserializeAllocBytes { get; }
        public bool IsRoundTripOnly => true;
        public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B, Transport=NamedPipe(long-lived,multiMessage,2-task)");

        /// <summary>
        /// Builds the whole long-lived pipeline: pipe pair + connection, PipeWriter/PipeReader wrappers,
        /// the multi-message input, and the two background tasks. Also records how many bytes the
        /// constructing thread allocated during the send-side and receive-side setup phases.
        /// </summary>
        /// <param name="order">Object graph sent on every iteration.</param>
        /// <param name="options">Caller-owned options; also the source of the pipe and input buffer sizes.</param>
        /// <param name="optionsPreset">Label exposed through <see cref="OptionsPreset"/>.</param>
        public AcBinaryNamedPipeBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
        {
            _order = order;
            // BufferWriterChunkSize comes from the caller (central source of truth in CreateSerializers
            // — the binaryFastMode4KbChunk options instance). Do NOT mutate _options here; tune the chunk
            // size in CreateSerializers only.
            _options = options;
            OptionsPreset = optionsPreset;

            _serialized = AcBinarySerializer.Serialize(order, _options);

            // 1× pipe setup. Kernel-side pipe buffer (inBufferSize / outBufferSize on the server ctor — the
            // client inherits the server-defined buffer size at connect time) matches BufferWriterChunkSize
            // exactly: AsyncPipeWriterOutput now treats chunkSize as the chunk-on-wire total size (header +
            // data), so one WriteFile(chunkSize) syscall lands in exactly one kernel-page slot — page-aligned,
            // no fragmentation, no IRP reordering. _options.BufferWriterChunkSize is the single tunable source.
            var pipeName = $"AcBinaryBench-{Guid.NewGuid():N}";

            // === SERIALIZE-side setup measurement ===
            // pipe-pair (server + client) + connect handshake + writer-side PipeWriter wrapper.
            // The counter used below is per-thread: only allocations made by the constructing thread are included.
            GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
            var beforeSer = GC.GetAllocatedBytesForCurrentThread();

            _pipeServer = new NamedPipeServerStream(pipeName, PipeDirection.In, 1, PipeTransmissionMode.Byte,
                System.IO.Pipes.PipeOptions.Asynchronous,
                inBufferSize:  _options.BufferWriterChunkSize,
                outBufferSize: _options.BufferWriterChunkSize);

            _pipeClient = new NamedPipeClientStream(".", pipeName, PipeDirection.Out, System.IO.Pipes.PipeOptions.Asynchronous);

            // Start the server-side accept first, connect the client synchronously, then block until the
            // accept completes.
            var serverWait = _pipeServer.WaitForConnectionAsync();
            _pipeClient.Connect();
            serverWait.GetAwaiter().GetResult();

            _pipeWriter = PipeWriter.Create(_pipeClient);
            var afterSer = GC.GetAllocatedBytesForCurrentThread();
            SetupSerializeAllocBytes = afterSer - beforeSer;

            // === DESERIALIZE-side setup measurement ===
            // PipeReader wrapper + AsyncPipeReaderInput (ArrayPool rent + ManualResetEventSlim) + drain
            // task + consumer task scaffolding. Two long-lived BG tasks total: drain pumps bytes from the
            // kernel pipe into input; consumer drives Deserialize<T>(input) per iter on signal.
            GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
            var beforeDes = GC.GetAllocatedBytesForCurrentThread();

            _pipeReader = PipeReader.Create(_pipeServer);
            // The input is constructed with twice the chunk size as its first argument.
            _input = new AsyncPipeReaderInput(_options.BufferWriterChunkSize * 2, multiMessage: true);
            _cts = new CancellationTokenSource();

            // Drain task: pumps PipeReader → input.Feed forever (or until cancel). Single Task.Run for
            // the full benchmark lifetime — its overhead is amortised across all messages.
            _drainTask = Task.Run(() => _input.DrainFromAsync(_pipeReader, _cts.Token));

            // Consumer task: per-iter Deserialize<T>(input) loop. Started here once; signaled per-iter via
            // _consumeRequest. Enables Ser↔Des streaming overlap — calling thread runs SerializeChunkedFramed
            // while THIS task simultaneously runs Deserialize<T>, both consuming/producing through the
            // sliding-window buffer pipelined by the drain task.
            _consumerTask = Task.Run(ConsumeLoop);

            var afterDes = GC.GetAllocatedBytesForCurrentThread();
            SetupDeserializeAllocBytes = afterDes - beforeDes;
        }

        // BG consumer: parks on _consumeRequest, runs Deserialize<T>(_input) when signaled, signals _consumeDone.
        // The Deserialize call internally blocks on the input's MRES whenever the drain hasn't yet fed enough
        // bytes for the next read — that's where the streaming-pipeline overlap with the calling thread (Ser)
        // happens.
        private void ConsumeLoop()
        {
            var ct = _cts.Token;
            try
            {
                while (true)
                {
                    // Wait(ct) throws OperationCanceledException on cancel (handled by the outer catch); the
                    // explicit check covers a wake-up through the Set() nudge issued by Dispose.
                    _consumeRequest.Wait(ct);
                    if (ct.IsCancellationRequested) return;
                    // Re-arm for the next iteration before doing the work.
                    _consumeRequest.Reset();

                    try
                    {
                        var result = AcBinaryDeserializer.Deserialize<TestOrder>(_input, _options);
                        if (_captureResult) _lastResult = result;
                    }
                    catch
                    {
                        // Swallow — calling thread sees the failure via missing/incorrect _lastResult during VerifyRoundTrip,
                        // or the benchmark loop just continues (timing impacted). Production teardown handled in Dispose.
                    }
                    finally
                    {
                        // Always release the calling thread, even when the deserialize failed.
                        _consumeDone.Set();
                    }
                }
            }
            catch (OperationCanceledException)
            {
                // Cooperative cancel — Dispose path. Swallow.
            }
        }

        /// <summary>
        /// Timed operation: one full round-trip. Signals the consumer, writes the framed message to the pipe,
        /// and blocks until the consumer reports that its deserialize call has returned.
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize()
        {
            // 2-task streaming pipeline:
            // 1. Calling thread signals consumer task to begin Deserialize<T>(input). Consumer immediately
            //    starts; first read blocks on input's MRES because no bytes flowed yet.
            // 2. Calling thread starts SerializeChunkedFramed → chunks flow through PipeWriter → kernel pipe →
            //    drain task (BG) feeds input.Feed → MRES pulses → consumer's Deserialize<T> consumes bytes
            //    chunk by chunk. Ser↔Des truly overlap here.
            // 3. Calling thread waits for _consumeDone (signaling Deserialize<T> returned).
            // _consumeDone is cleared BEFORE the request is raised, so a completion signal from this
            // iteration cannot be erased by the reset.
            _consumeDone.Reset();
            _consumeRequest.Set();

            AcBinarySerializer.SerializeChunkedFramed(_order, _pipeWriter, _options);

            _consumeDone.Wait();
        }

        /// <summary>Intentionally empty; the round-trip is measured entirely inside <see cref="Serialize"/>.</summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            // No-op: per-iter round-trip is captured in Serialize(). See IsRoundTripOnly contract.
        }

        /// <summary>
        /// Runs one round-trip with result capture enabled and compares the deserialized graph with the
        /// original via <c>DeepEqualsViaJson</c>. Returns <c>false</c> when no <c>TestOrder</c> was captured.
        /// </summary>
        public bool VerifyRoundTrip()
        {
            // Use the same 2-task streaming path as the benchmark, but capture the result for graph-equality.
            _captureResult = true;
            try
            {
                Serialize();
                var result = _lastResult as TestOrder;
                return result != null && DeepEqualsViaJson(_order, result);
            }
            finally
            {
                // Restore benchmark mode and drop the captured reference.
                _captureResult = false;
                _lastResult = null;
            }
        }

        /// <summary>
        /// Best-effort teardown: cancels both background tasks, waits up to two seconds for each, then
        /// completes the pipe endpoints and disposes every owned resource. Each step swallows its own
        /// exception; repeated calls return immediately.
        /// </summary>
        public void Dispose()
        {
            if (_disposed) return;
            _disposed = true;

            // Cancel drain + consumer tasks → both exit. Pulse _consumeRequest in case consumer is parked.
            try { _cts.Cancel(); } catch { /* swallow on teardown */ }
            try { _consumeRequest.Set(); } catch { /* nudge in case consumer Wait is parked */ }
            try { _drainTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
            try { _consumerTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }

            // Complete writer + dispose pipe lifecycle.
            try { _pipeWriter.CompleteAsync().AsTask().Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
            try { _pipeReader.Complete(); } catch { /* swallow on teardown */ }
            try { _pipeClient.Dispose(); } catch { /* swallow on teardown */ }
            try { _pipeServer.Dispose(); } catch { /* swallow on teardown */ }
            try { _input.Dispose(); } catch { /* swallow on teardown */ }
            try { _consumeRequest.Dispose(); } catch { /* swallow on teardown */ }
            try { _consumeDone.Dispose(); } catch { /* swallow on teardown */ }
            try { _cts.Dispose(); } catch { /* swallow on teardown */ }
        }
    }

    /// <summary>
    /// Same chunked-framed AsyncPipe code path as <see cref="AcBinaryNamedPipeBenchmark"/>, but the transport
    /// is an in-memory <see cref="System.IO.Pipelines.Pipe"/> instead of a kernel <c>NamedPipe</c>. The Pipe's
    /// <c>Writer</c>/<c>Reader</c> pair is a managed-only zero-copy slab handoff — no syscalls, no kernel
    /// buffer copy, no IRP queueing.
    ///
    /// <para><b>Why this benchmark matters</b>: by holding ALL other variables constant (same SerializeChunkedFramed,
    /// same AsyncPipeReaderInput, same drain task, same consumer task, same multi-message wire format), this
    /// row isolates the <b>kernel-NamedPipe transport overhead</b> from the chunked-streaming framework's pure
    /// CPU cost. The expected delta vs <see cref="AcBinaryNamedPipeBenchmark"/>: per-chunk overhead drops from
    /// ~25-30 µs (kernel-syscall pair + IRP) to ~1-2 µs (managed slab handoff). Multi-chunk Large-message rows
    /// should converge dramatically toward <see cref="AcBinaryNamedPipeRawByteArrayBenchmark"/>.</para>
    ///
    /// <para><b>Review note</b>: <see cref="Serialize"/> currently passes <c>FlushPolicy.Coalesced</c>, while the
    /// NamedPipe variant calls the PipeWriter overload (described in <see cref="Serialize"/>'s comments as
    /// equivalent to <c>FlushPolicy.PerChunk</c>). The flush policy is therefore a second variable in the
    /// comparison — confirm that this is intended before attributing the whole delta to the transport.</para>
    ///
    /// <para><b>Real-world relevance</b>: in-memory Pipe is the typical primitive used for cross-thread serializer
    /// pipelines inside a single process (e.g. SignalR's Kestrel transport adapter, gRPC framework internals,
    /// custom message brokers). The numbers from this row reflect that scenario, NOT the kernel-pipe loopback
    /// of the NamedPipe benchmark.</para>
    /// </summary>
    private sealed class AcBinaryInMemoryPipeBenchmark : ISerializerBenchmark, IDisposable
    {
        private readonly TestOrder _order;
        private readonly AcBinarySerializerOptions _options;
        private readonly byte[] _serialized; // for SerializedSize reporting only

        // Long-lived in-memory pipe lifecycle (set up once in ctor — NOT timed).
        // _pipe itself is what Serialize() hands to the serializer; _pipeWriter is only completed in Dispose.
        private readonly Pipe _pipe;
        private readonly PipeWriter _pipeWriter;
        private readonly PipeReader _pipeReader;

        // Long-lived multi-message receive infrastructure (set up once in ctor) — same pattern as the NamedPipe
        // variant: drain pumps reader into AsyncPipeReaderInput, consumer task drives Deserialize<T>(input).
        private readonly AsyncPipeReaderInput _input;
        private readonly CancellationTokenSource _cts;
        private readonly Task _drainTask;
        private readonly Task _consumerTask;
        // Handshake pair: the calling thread sets _consumeRequest to start one deserialize; the consumer sets
        // _consumeDone when that deserialize has finished (successfully or not).
        private readonly ManualResetEventSlim _consumeRequest = new(false);
        private readonly ManualResetEventSlim _consumeDone = new(false);
        private object? _lastResult;            // written by ConsumeLoop only while _captureResult is true
        private bool _captureResult;            // enabled by VerifyRoundTrip for the duration of one round-trip
        private bool _disposed;

        public string Engine => Configuration.EngineAcBinary;
        public string IoMode => Configuration.IoInMemoryPipe;
        public string DispatchMode => _options.UseGeneratedCode ? Configuration.ModeSGen : Configuration.ModeRuntime;
        public string OptionsPreset { get; }
        public int SerializedSize => _serialized.Length;
        // Allocation deltas measured around the two setup phases in the constructor.
        public long SetupSerializeAllocBytes { get; }
        public long SetupDeserializeAllocBytes { get; }
        public bool IsRoundTripOnly => true;
        public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B, Transport=Pipe(in-memory,multiMessage,2-task)");

        /// <summary>
        /// Builds the whole long-lived pipeline: the in-memory pipe, the multi-message input, and the two
        /// background tasks. Also records how many bytes the constructing thread allocated during the
        /// send-side and receive-side setup phases.
        /// </summary>
        /// <param name="order">Object graph sent on every iteration.</param>
        /// <param name="options">Caller-owned options; also the source of the input buffer size.</param>
        /// <param name="optionsPreset">Label exposed through <see cref="OptionsPreset"/>.</param>
        public AcBinaryInMemoryPipeBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
        {
            _order = order;
            _options = options;
            OptionsPreset = optionsPreset;

            _serialized = AcBinarySerializer.Serialize(order, _options);

            // === SERIALIZE-side setup measurement ===
            // In-memory Pipe construction. NO kernel-pipe pair, NO Connect handshake — just a managed Pipe object
            // and a reference to its Writer side. PipeWriterImpl (parallel-flush capable, NOT StreamPipeWriter).
            // The counter used below is per-thread: only allocations made by the constructing thread are included.
            GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
            var beforeSer = GC.GetAllocatedBytesForCurrentThread();
            _pipe = new Pipe();
            _pipeWriter = _pipe.Writer;
            var afterSer = GC.GetAllocatedBytesForCurrentThread();
            SetupSerializeAllocBytes = afterSer - beforeSer;

            // === DESERIALIZE-side setup measurement ===
            // PipeReader reference + AsyncPipeReaderInput (ArrayPool rent + ManualResetEventSlim) + drain task +
            // consumer task scaffolding. Identical to the NamedPipe variant on the receive side.
            GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
            var beforeDes = GC.GetAllocatedBytesForCurrentThread();

            _pipeReader = _pipe.Reader;
            // The input is constructed with twice the chunk size as its first argument.
            _input = new AsyncPipeReaderInput(_options.BufferWriterChunkSize * 2, multiMessage: true);
            _cts = new CancellationTokenSource();
            _drainTask = Task.Run(() => _input.DrainFromAsync(_pipeReader, _cts.Token));
            _consumerTask = Task.Run(ConsumeLoop);

            var afterDes = GC.GetAllocatedBytesForCurrentThread();
            SetupDeserializeAllocBytes = afterDes - beforeDes;
        }

        // BG consumer: parks on _consumeRequest, runs Deserialize<T>(_input) when signaled, signals _consumeDone.
        // Mirror of AcBinaryNamedPipeBenchmark.ConsumeLoop — same pattern, same MRES protocol.
        private void ConsumeLoop()
        {
            var ct = _cts.Token;
            try
            {
                while (true)
                {
                    // Wait(ct) throws OperationCanceledException on cancel (handled by the outer catch); the
                    // explicit check covers a wake-up through the Set() nudge issued by Dispose.
                    _consumeRequest.Wait(ct);
                    if (ct.IsCancellationRequested) return;
                    // Re-arm for the next iteration before doing the work.
                    _consumeRequest.Reset();

                    try
                    {
                        var result = AcBinaryDeserializer.Deserialize<TestOrder>(_input, _options);
                        if (_captureResult) _lastResult = result;
                    }
                    catch
                    {
                        // Swallow — a failed deserialize shows up as a missing _lastResult in VerifyRoundTrip;
                        // during benchmark iterations the loop simply continues.
                    }
                    finally
                    {
                        // Always release the calling thread, even when the deserialize failed.
                        _consumeDone.Set();
                    }
                }
            }
            catch (OperationCanceledException)
            {
                // Cooperative cancel — Dispose path. Swallow.
            }
        }

        /// <summary>
        /// Timed operation: one full round-trip. Signals the consumer, writes the framed message into the
        /// in-memory pipe, and blocks until the consumer reports that its deserialize call has returned.
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize()
        {
            // Same 2-task streaming pipeline as NamedPipe variant — only the transport differs (in-memory Pipe
            // instead of kernel NamedPipe). Per-chunk SerializeChunkedFramed → PipeWriter slab → drain task
            // reads from PipeReader → input.Feed → consumer Deserialize<T> consumes the fed bytes.
            //
            // Uses the Pipe-overload (instead of the PipeWriter-overload) so the FlushPolicy parameter is
            // exposed for tuning. Toggle between FlushPolicy.PerChunk (bounded peak memory, per-chunk await
            // FlushAsync) and FlushPolicy.Coalesced (fire-and-forget per chunk, pipe-coalesced flushes up to
            // PauseWriterThreshold ~64 KB) to A/B-test the streaming-pipeline overhead. FlushPolicy.PerChunk
            // is functionally equivalent to the PipeWriter-overload (both internally route to
            // SerializeToPipeWriterCore with FlushPolicy.PerChunk).
            // _consumeDone is cleared BEFORE the request is raised, so a completion signal from this
            // iteration cannot be erased by the reset.
            _consumeDone.Reset();
            _consumeRequest.Set();

            AcBinarySerializer.SerializeChunkedFramed(_order, _pipe, _options, FlushPolicy.Coalesced);

            _consumeDone.Wait();
        }

        /// <summary>Intentionally empty; the round-trip is measured entirely inside <see cref="Serialize"/>.</summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            // No-op: per-iter round-trip is captured in Serialize(). See IsRoundTripOnly contract.
        }

        /// <summary>
        /// Runs one round-trip with result capture enabled and compares the deserialized graph with the
        /// original via <c>DeepEqualsViaJson</c>. Returns <c>false</c> when no <c>TestOrder</c> was captured.
        /// </summary>
        public bool VerifyRoundTrip()
        {
            // Same streaming path as the timed iterations, with result capture switched on.
            _captureResult = true;
            try
            {
                Serialize();
                var result = _lastResult as TestOrder;
                return result != null && DeepEqualsViaJson(_order, result);
            }
            finally
            {
                // Restore benchmark mode and drop the captured reference.
                _captureResult = false;
                _lastResult = null;
            }
        }

        /// <summary>
        /// Best-effort teardown: cancels both background tasks, waits up to two seconds for each, then
        /// completes the pipe endpoints and disposes every owned resource. Each step swallows its own
        /// exception; repeated calls return immediately.
        /// </summary>
        public void Dispose()
        {
            if (_disposed) return;
            _disposed = true;

            // Cancel drain + consumer tasks → both exit. Pulse _consumeRequest in case consumer is parked.
            try { _cts.Cancel(); } catch { /* swallow on teardown */ }
            try { _consumeRequest.Set(); } catch { /* nudge in case consumer Wait is parked */ }
            try { _drainTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
            try { _consumerTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }

            // Complete writer + reader (in-memory Pipe — no underlying stream to dispose).
            try { _pipeWriter.CompleteAsync().AsTask().Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
            try { _pipeReader.Complete(); } catch { /* swallow on teardown */ }
            try { _input.Dispose(); } catch { /* swallow on teardown */ }
            try { _consumeRequest.Dispose(); } catch { /* swallow on teardown */ }
            try { _consumeDone.Dispose(); } catch { /* swallow on teardown */ }
            try { _cts.Dispose(); } catch { /* swallow on teardown */ }
        }
    }

    /// <summary>
    /// Raw <c>byte[]</c> over a long-lived NamedPipe — NO chunk-framing, NO <c>AsyncPipeReaderInput</c>,
    /// NO sliding-window buffer. Calling thread serialises + writes; a long-lived background consumer task
    /// reads and deserialises. Two-task pattern enables Ser↔Read overlap (kernel-pipe-pipelined) AND
    /// avoids the kernel-buffer-full deadlock when <c>bytes.Length &gt; inBufferSize</c>.
    ///
    /// Side-by-side with <see cref="AcBinaryNamedPipeBenchmark"/> (chunked-framed AsyncPipe stack) this
    /// isolates two cost components on the SAME kernel-pipe transport with the SAME <c>inBufferSize</c>:
    /// <list type="bullet">
    ///   <item><description><b>This row vs <see cref="AcBinaryBenchmark"/> (Byte[])</b> — pure kernel-NamedPipe
    ///     overhead (WriteFile / ReadFile syscalls + IRP queueing + buffer-copy + thread-handoff).</description></item>
    ///   <item><description><b>This row vs <see cref="AcBinaryNamedPipeBenchmark"/> (chunked-framed)</b> — pure
    ///     AsyncPipe-framework overhead (chunk header writes + sliding-window <c>Feed</c> + MRES wait inside
    ///     <c>AsyncPipeReaderInput</c>) AND the streaming-pipeline benefit of intra-message Ser↔Des overlap (which
    ///     raw lacks — raw can only Ser↔Read overlap, with Des sequential after Read completes).</description></item>
    /// </list>
    /// Per-iter <c>byte[]</c> allocation from <c>AcBinarySerializer.Serialize</c> is part of the cost (matches
    /// <see cref="AcBinaryBenchmark"/>'s API contract); the receive-side scratch buffer is also allocated per-iter
    /// on the consumer-task (counted via <c>GC.GetTotalAllocatedBytes</c> in <c>MeasureAllocationTotal</c>).
    /// </summary>
    private sealed class AcBinaryNamedPipeRawByteArrayBenchmark : ISerializerBenchmark, IDisposable
    {
        private readonly TestOrder _order;
        private readonly AcBinarySerializerOptions _options;
        private readonly byte[] _serialized; // reference payload produced once in the ctor; only its Length is read (SerializedSize)

        // Long-lived pipe lifecycle (set up once in ctor — NOT timed).
        private readonly NamedPipeServerStream _pipeServer;
        private readonly NamedPipeClientStream _pipeClient;

        // Long-lived consumer-task infrastructure (Read + Deserialize on a background task, signaled per iter).
        // Raw byte[] has no intermediate sliding-window buffer, so Read and Deserialize run back-to-back on
        // the one background task: Read N bytes → Deserialize<T>(bytes) → signal done. The calling thread's
        // Write overlaps with the background Read through the pipe.
        //
        // Cross-thread visibility of the plain fields below relies on the Set()/Wait() pairs of the two
        // events: each field is written before the matching Set() and read after the matching Wait().
        private readonly CancellationTokenSource _cts;
        private readonly Task _consumerTask;
        private readonly ManualResetEventSlim _consumeRequest = new(false);
        private readonly ManualResetEventSlim _consumeDone = new(false);
        private int _pendingReadSize;           // byte count the consumer must read for the current iteration
        private TestOrder? _lastResult;         // captured during VerifyRoundTrip; null in benchmark iters
        private bool _captureResult;            // toggle: when true, ConsumerLoop stores result; otherwise discards
        private bool _disposed;

        public string Engine => Configuration.EngineAcBinary;
        public string IoMode => Configuration.IoNamedPipeRaw;
        public string DispatchMode => _options.UseGeneratedCode ? Configuration.ModeSGen : Configuration.ModeRuntime;
        public string OptionsPreset { get; }
        public int SerializedSize => _serialized.Length;
        public long SetupSerializeAllocBytes { get; }
        public long SetupDeserializeAllocBytes { get; }

        // The whole round-trip (serialise → write → read → deserialise) is performed by Serialize();
        // Deserialize() is a no-op.
        public bool IsRoundTripOnly => true;
        public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B, Transport=NamedPipe(raw,2-task)");

        /// <summary>
        /// Serialises <paramref name="order"/> once for size reporting, creates and connects the named-pipe
        /// pair, and starts the background consumer task. Two allocation snapshots are recorded for the
        /// constructing thread: pipe setup (serialize side) and consumer setup (deserialize side).
        /// </summary>
        /// <param name="order">Object graph that every iteration serialises.</param>
        /// <param name="options">Serializer options; <c>BufferWriterChunkSize</c> is also used as the pipe buffer size.</param>
        /// <param name="optionsPreset">Display label exposed through <see cref="OptionsPreset"/>.</param>
        public AcBinaryNamedPipeRawByteArrayBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
        {
            _order = order;
            // BufferWriterChunkSize comes from the caller. The pipe's in/out buffer sizes below are wired
            // to the same value.
            _options = options;
            OptionsPreset = optionsPreset;

            _serialized = AcBinarySerializer.Serialize(order, _options);

            var pipeName = $"AcBinaryBenchRaw-{Guid.NewGuid():N}";

            // === SERIALIZE-side setup measurement ===
            // pipe-pair (server + client) + connect handshake. NO PipeWriter wrapper — the raw
            // Stream.Write API is used directly, matching the no-framing semantics of this benchmark.
            GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
            var beforeSer = GC.GetAllocatedBytesForCurrentThread();
            _pipeServer = new NamedPipeServerStream(pipeName, PipeDirection.In, 1, PipeTransmissionMode.Byte,
                System.IO.Pipes.PipeOptions.Asynchronous,
                inBufferSize:  _options.BufferWriterChunkSize,
                outBufferSize: _options.BufferWriterChunkSize);
            _pipeClient = new NamedPipeClientStream(".", pipeName, PipeDirection.Out, System.IO.Pipes.PipeOptions.Asynchronous);

            var serverWait = _pipeServer.WaitForConnectionAsync();
            _pipeClient.Connect();
            serverWait.GetAwaiter().GetResult();
            var afterSer = GC.GetAllocatedBytesForCurrentThread();
            SetupSerializeAllocBytes = afterSer - beforeSer;

            // === DESERIALIZE-side setup measurement ===
            // The window covers the cancellation source and the Task.Run call, as seen by the constructing
            // thread only. The two ManualResetEventSlim instances are field initialisers, which run before
            // this constructor body, so they are NOT inside either measurement window.
            GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
            var beforeDes = GC.GetAllocatedBytesForCurrentThread();
            _cts = new CancellationTokenSource();
            _consumerTask = Task.Run(ConsumerLoop);
            var afterDes = GC.GetAllocatedBytesForCurrentThread();
            SetupDeserializeAllocBytes = afterDes - beforeDes;
        }

        // Background consumer: parks on _consumeRequest, reads exactly _pendingReadSize bytes from the pipe,
        // runs Deserialize<T>(bytes), then signals _consumeDone. Deserialisation starts only after the full
        // message has been read (raw byte[] needs the complete payload).
        private void ConsumerLoop()
        {
            var ct = _cts.Token;
            try
            {
                while (true)
                {
                    _consumeRequest.Wait(ct);
                    if (ct.IsCancellationRequested) return;
                    _consumeRequest.Reset();

                    try
                    {
                        var size = _pendingReadSize;
                        var bytes = new byte[size]; // per-iter alloc on the consumer thread
                        var totalRead = 0;

                        // Explicit loop over the byte[] Read overload, so the measured read path stays
                        // exactly what it was.
                        while (totalRead < size)
                        {
                            var n = _pipeServer.Read(bytes, totalRead, size - totalRead);
                            if (n == 0)
                            {
                                // End of stream before the full message arrived. Deserialising the
                                // zero-padded remainder could yield a bogus graph, so treat it as a
                                // failed iteration instead (absorbed by the catch below).
                                throw new System.IO.EndOfStreamException(
                                    $"Pipe closed after {totalRead} of {size} bytes.");
                            }
                            totalRead += n;
                        }

                        var result = AcBinaryDeserializer.Deserialize<TestOrder>(bytes, _options);
                        if (_captureResult) _lastResult = result;
                    }
                    catch
                    {
                        // Deliberate best-effort: a failed iteration (read error, truncated message,
                        // deserialisation error) leaves _lastResult unset, which VerifyRoundTrip reports
                        // as a failed round-trip. Benchmark iterations simply continue.
                    }
                    finally
                    {
                        // Always release the calling thread, even on failure, so Serialize() cannot hang.
                        _consumeDone.Set();
                    }
                }
            }
            catch (OperationCanceledException)
            {
                // Cooperative cancel — Dispose path. Swallow.
            }
        }

        /// <summary>
        /// Runs one full round-trip: serialises to a fresh <c>byte[]</c>, publishes its length to the
        /// consumer, writes the bytes to the pipe and blocks until the consumer has read and deserialised them.
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize()
        {
            // 2-task pipeline:
            // 1. Calling thread serialises → fresh byte[] (per-iter alloc).
            // 2. Calling thread publishes the expected size and signals the consumer, which starts its Read
            //    loop on the pipe. The calling thread then writes the bytes — Read and Write overlap through
            //    the pipe.
            // 3. Calling thread waits for _consumeDone (consumer finished Read + Deserialize).
            //
            // Ordering matters: the size must be stored and _consumeDone reset BEFORE _consumeRequest is set,
            // otherwise the consumer could read a stale size or its completion signal could be wiped.
            var bytes = AcBinarySerializer.Serialize(_order, _options);

            _pendingReadSize = bytes.Length;
            _consumeDone.Reset();
            _consumeRequest.Set();

            _pipeClient.Write(bytes, 0, bytes.Length);
            _pipeClient.Flush();

            _consumeDone.Wait();
        }

        /// <summary>No-op: the per-iteration round-trip is performed in <see cref="Serialize"/>.</summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            // Intentionally empty. See IsRoundTripOnly.
        }

        /// <summary>
        /// Runs one iteration through the same pipeline with result capture enabled and compares the
        /// deserialised graph against the source graph.
        /// </summary>
        /// <returns><c>true</c> when a result was captured and <c>DeepEqualsViaJson</c> accepts it; otherwise <c>false</c>.</returns>
        public bool VerifyRoundTrip()
        {
            _captureResult = true;
            try
            {
                Serialize();
                var result = _lastResult;
                return result != null && DeepEqualsViaJson(_order, result);
            }
            finally
            {
                // Restore benchmark mode and drop the captured graph.
                _captureResult = false;
                _lastResult = null;
            }
        }

        /// <summary>
        /// Stops the consumer task and releases the pipes and synchronisation primitives. Safe to call
        /// more than once; teardown failures are swallowed.
        /// </summary>
        public void Dispose()
        {
            if (_disposed) return;
            _disposed = true;

            // Cancel the consumer task → ConsumerLoop exits its Wait via OperationCanceledException.
            try { _cts.Cancel(); } catch { /* swallow on teardown */ }
            try { _consumeRequest.Set(); } catch { /* nudge in case consumer Wait is parked */ }

            // Close the writer side BEFORE waiting for the consumer: a consumer still parked in Read is
            // expected to observe end-of-stream and unwind, instead of stalling the wait below and then
            // having the events disposed underneath it.
            try { _pipeClient.Dispose(); } catch { /* swallow on teardown */ }
            try { _consumerTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }

            try { _pipeServer.Dispose(); } catch { /* swallow on teardown */ }
            try { _consumeRequest.Dispose(); } catch { /* swallow on teardown */ }
            try { _consumeDone.Dispose(); } catch { /* swallow on teardown */ }
            try { _cts.Dispose(); } catch { /* swallow on teardown */ }
        }
    }

    /// <summary>
    /// Raw <c>byte[]</c> round-trip with a purely in-process, cross-thread handoff — there is no transport
    /// at all (no NamedPipe, no Pipe, no <c>Channel&lt;T&gt;</c>). The calling thread serialises into a
    /// fresh <c>byte[]</c> and passes the reference to a background consumer task through a single slot
    /// guarded by a pair of events; the consumer deserialises and signals completion.
    ///
    /// <para><b>Role in the benchmark set</b>: fills the last cell of the transport × wire-format matrix:</para>
    /// <list type="bullet">
    ///   <item><description><b>NamedPipe + Chunked</b> = <see cref="AcBinaryNamedPipeBenchmark"/></description></item>
    ///   <item><description><b>NamedPipe + Raw</b> = <see cref="AcBinaryNamedPipeRawByteArrayBenchmark"/></description></item>
    ///   <item><description><b>In-memory Pipe + Chunked</b> = <see cref="AcBinaryInMemoryPipeBenchmark"/></description></item>
    ///   <item><description><b>In-memory + Raw</b> = this class — the baseline for the in-memory chunked row</description></item>
    /// </list>
    /// <para>Compared with <see cref="AcBinaryInMemoryPipeBenchmark"/> it is meant to isolate the cost of the
    /// chunked-streaming framework on an in-memory transport; compared with
    /// <see cref="AcBinaryNamedPipeRawByteArrayBenchmark"/> it is meant to isolate the NamedPipe overhead on
    /// the raw-<c>byte[]</c> side.</para>
    /// </summary>
    private sealed class AcBinaryInMemoryRawByteArrayBenchmark : ISerializerBenchmark, IDisposable
    {
        private readonly TestOrder _order;
        private readonly AcBinarySerializerOptions _options;

        // Reference payload produced once in the constructor; only its Length is read (SerializedSize).
        private readonly byte[] _serialized;

        // Background-consumer plumbing, created once and reused for every iteration.
        private readonly CancellationTokenSource _cts;
        private readonly Task _consumerTask;
        private readonly ManualResetEventSlim _consumeRequest = new ManualResetEventSlim(false);
        private readonly ManualResetEventSlim _consumeDone = new ManualResetEventSlim(false);

        // Single-slot handoff: written by the calling thread, read by the consumer task.
        private byte[]? _pendingBytes;

        // Only populated while VerifyRoundTrip has switched capturing on.
        private object? _lastResult;
        private bool _captureResult;
        private bool _disposed;

        /// <summary>
        /// Serialises <paramref name="order"/> once for size reporting and starts the background consumer task.
        /// </summary>
        /// <param name="order">Object graph that every iteration serialises.</param>
        /// <param name="options">Serializer options used for both directions.</param>
        /// <param name="optionsPreset">Display label exposed through <see cref="OptionsPreset"/>.</param>
        public AcBinaryInMemoryRawByteArrayBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
        {
            _order = order;
            _options = options;
            OptionsPreset = optionsPreset;

            _serialized = AcBinarySerializer.Serialize(order, _options);

            // Serialize side: no infrastructure is created up front, so the setup cost is reported as zero.
            SetupSerializeAllocBytes = 0;

            // Deserialize side: the measured window spans the cancellation source and the Task.Run call,
            // as observed on the constructing thread. The two events above are field initialisers and
            // therefore run before this constructor body, i.e. outside the window.
            GC.Collect();
            GC.WaitForPendingFinalizers();
            GC.Collect();
            var allocatedBefore = GC.GetAllocatedBytesForCurrentThread();
            _cts = new CancellationTokenSource();
            _consumerTask = Task.Run(ConsumerLoop);
            var allocatedAfter = GC.GetAllocatedBytesForCurrentThread();
            SetupDeserializeAllocBytes = allocatedAfter - allocatedBefore;
        }

        public string Engine { get => Configuration.EngineAcBinary; }

        public string IoMode { get => Configuration.IoInMemoryRaw; }

        public string DispatchMode
        {
            get
            {
                if (_options.UseGeneratedCode)
                {
                    return Configuration.ModeSGen;
                }

                return Configuration.ModeRuntime;
            }
        }

        public string OptionsPreset { get; }

        public int SerializedSize { get => _serialized.Length; }

        public long SetupSerializeAllocBytes { get; }

        public long SetupDeserializeAllocBytes { get; }

        // The complete round-trip happens inside Serialize(); Deserialize() does nothing.
        public bool IsRoundTripOnly { get => true; }

        public string OptionsDescription
        {
            get
            {
                var transportSuffix = $", BufferSize={_options.BufferWriterChunkSize}B, Transport=in-memory(raw,2-task)";
                return BuildAcBinaryOptionsDescription(_options, transportSuffix);
            }
        }

        // Consumer side of the handoff. Each cycle: wait for a request, take the byte[] reference out of
        // the slot, deserialise it, and release the calling thread. Only the reference crosses threads;
        // the payload itself is not copied here.
        private void ConsumerLoop()
        {
            var token = _cts.Token;
            try
            {
                for (;;)
                {
                    _consumeRequest.Wait(token);
                    if (token.IsCancellationRequested)
                    {
                        break;
                    }

                    _consumeRequest.Reset();

                    try
                    {
                        if (_pendingBytes is { } payload)
                        {
                            var graph = AcBinaryDeserializer.Deserialize<TestOrder>(payload, _options);
                            if (_captureResult)
                            {
                                _lastResult = graph;
                            }
                        }
                    }
                    catch
                    {
                        // Best-effort: a failed iteration leaves _lastResult untouched, which
                        // VerifyRoundTrip reports as a failed round-trip.
                    }
                    finally
                    {
                        // Release the calling thread whether or not deserialisation succeeded.
                        _consumeDone.Set();
                    }
                }
            }
            catch (OperationCanceledException)
            {
                // Raised by Wait(token) once Dispose cancels the token — normal shutdown.
            }
        }

        /// <summary>
        /// Runs one full round-trip: serialises into a fresh <c>byte[]</c>, hands the reference to the
        /// consumer task and blocks until the consumer reports completion.
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize()
        {
            // Order is significant: fill the slot and clear the completion event first, and only then
            // raise the request, so the consumer never sees a stale slot and its completion signal is
            // never wiped.
            _pendingBytes = AcBinarySerializer.Serialize(_order, _options);
            _consumeDone.Reset();
            _consumeRequest.Set();

            // Deserialisation cannot begin before the whole payload exists, so the calling thread simply
            // waits for the consumer to finish.
            _consumeDone.Wait();
        }

        /// <summary>No-op: the per-iteration round-trip is performed in <see cref="Serialize"/>.</summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            // Intentionally empty. See IsRoundTripOnly.
        }

        /// <summary>
        /// Runs one iteration with result capture enabled and compares the deserialised graph with the source.
        /// </summary>
        /// <returns><c>true</c> when a <c>TestOrder</c> was captured and <c>DeepEqualsViaJson</c> accepts it.</returns>
        public bool VerifyRoundTrip()
        {
            _captureResult = true;
            try
            {
                Serialize();
                return _lastResult is TestOrder roundTripped && DeepEqualsViaJson(_order, roundTripped);
            }
            finally
            {
                // Back to benchmark mode; drop the captured graph.
                _captureResult = false;
                _lastResult = null;
            }
        }

        /// <summary>
        /// Stops the consumer task and releases the synchronisation primitives. Repeated calls are ignored;
        /// teardown failures are swallowed.
        /// </summary>
        public void Dispose()
        {
            if (_disposed)
            {
                return;
            }

            _disposed = true;

            // Ask the consumer to stop, wake it in case it is parked, and give it up to two seconds to leave.
            try
            {
                _cts.Cancel();
            }
            catch
            {
                /* swallow on teardown */
            }

            try
            {
                _consumeRequest.Set();
            }
            catch
            {
                /* nudge in case consumer Wait is parked */
            }

            try
            {
                _consumerTask.Wait(TimeSpan.FromSeconds(2));
            }
            catch
            {
                /* swallow on teardown */
            }

            // Release the primitives.
            try
            {
                _consumeRequest.Dispose();
            }
            catch
            {
                /* swallow on teardown */
            }

            try
            {
                _consumeDone.Dispose();
            }
            catch
            {
                /* swallow on teardown */
            }

            try
            {
                _cts.Dispose();
            }
            catch
            {
                /* swallow on teardown */
            }
        }
    }

    /// <summary>
    /// MemoryPack through its <c>IBufferWriter</c> overload, constructing a brand-new
    /// <see cref="ArrayBufferWriter{T}"/> on every <see cref="Serialize"/> call.
    /// Intended as the like-for-like counterpart of AcBinaryFreshBufferWriterBenchmark.
    /// </summary>
    private sealed class MemoryPackFreshBufferWriterBenchmark : ISerializerBenchmark
    {
        private readonly TestOrder _order;
        private readonly MemoryPackSerializerOptions _options;

        // Reference payload: input for Deserialize() and source of SerializedSize.
        private readonly byte[] _serialized;

        /// <summary>Captures the graph and pre-serialises it once with the shared MemoryPack options.</summary>
        /// <param name="order">Object graph used by every iteration.</param>
        /// <param name="optionsPreset">Display label exposed through <see cref="OptionsPreset"/>.</param>
        public MemoryPackFreshBufferWriterBenchmark(TestOrder order, string optionsPreset)
        {
            _order = order;
            OptionsPreset = optionsPreset;
            _options = GetMemPackOptions();
            _serialized = MemoryPackSerializer.Serialize(order, _options);
        }

        public string Engine { get => Configuration.EngineMemoryPack; }

        public string IoMode { get => Configuration.IoBufWrNew; }

        // Reported as source-generated: MemoryPack works from [MemoryPackable] generated formatters.
        public string DispatchMode { get => Configuration.ModeSGen; }

        public string OptionsPreset { get; }

        public int SerializedSize { get => _serialized.Length; }

        // No long-lived infrastructure on either side.
        public long SetupSerializeAllocBytes { get => 0; }

        public long SetupDeserializeAllocBytes { get => 0; }

        public string? OptionsDescription { get => $"StringEncoding={_options.StringEncoding}"; }

        /// <summary>Serialises the graph into a freshly allocated writer; the writer is discarded afterwards.</summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize()
        {
            var freshWriter = new ArrayBufferWriter<byte>();
            MemoryPackSerializer.Serialize(freshWriter, _order, _options);
        }

        /// <summary>
        /// Deserialises the reference payload through the <see cref="ReadOnlySequence{T}"/> overload, so the
        /// input surface matches the buffer-writer rows of the other engines.
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            MemoryPackSerializer.Deserialize<TestOrder>(new ReadOnlySequence<byte>(_serialized), _options);
        }

        /// <summary>Serialises into a new writer, reads the written bytes back and compares the two graphs.</summary>
        /// <returns>The verdict of <c>DeepEqualsViaJson</c> for the source and the round-tripped graph.</returns>
        public bool VerifyRoundTrip()
        {
            var writer = new ArrayBufferWriter<byte>();
            MemoryPackSerializer.Serialize(writer, _order, _options);

            var restored = MemoryPackSerializer.Deserialize<TestOrder>(new ReadOnlySequence<byte>(writer.WrittenMemory), _options);
            return DeepEqualsViaJson(_order, restored);
        }
    }

    /// <summary>
    /// AcBinary through its <c>IBufferWriter</c> overload with one pre-allocated
    /// <see cref="ArrayBufferWriter{T}"/> that is reset and reused on every <see cref="Serialize"/> call.
    /// </summary>
    private sealed class AcBinaryBufferWriterBenchmark : ISerializerBenchmark
    {
        private readonly TestOrder _order;
        private readonly AcBinarySerializerOptions _options;

        // Reference payload: input for Deserialize(), source of SerializedSize and of the writer capacity.
        private readonly byte[] _serialized;

        // The reused writer; created once in the constructor.
        private readonly ArrayBufferWriter<byte> _bufferWriter;

        /// <summary>
        /// Pre-serialises the graph once, then creates the reusable writer while recording how many bytes
        /// the constructing thread allocated for it.
        /// </summary>
        /// <param name="order">Object graph used by every iteration.</param>
        /// <param name="options">Serializer options used for both directions.</param>
        /// <param name="optionsPreset">Display label exposed through <see cref="OptionsPreset"/>.</param>
        public AcBinaryBufferWriterBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
        {
            _order = order;
            _options = options;
            OptionsPreset = optionsPreset;
            _serialized = AcBinarySerializer.Serialize(order, options);

            // Only the writer construction sits inside the measured window; the helper Serialize call
            // above is excluded. Deserialize reads straight from _serialized, so that side reports zero.
            GC.Collect();
            GC.WaitForPendingFinalizers();
            GC.Collect();
            var allocatedBefore = GC.GetAllocatedBytesForCurrentThread();
            _bufferWriter = new ArrayBufferWriter<byte>(_serialized.Length * 2);
            var allocatedAfter = GC.GetAllocatedBytesForCurrentThread();
            SetupSerializeAllocBytes = allocatedAfter - allocatedBefore;
        }

        public string Engine { get => Configuration.EngineAcBinary; }

        public string IoMode { get => Configuration.IoBufWrReuse; }

        public string DispatchMode
        {
            get
            {
                if (_options.UseGeneratedCode)
                {
                    return Configuration.ModeSGen;
                }

                return Configuration.ModeRuntime;
            }
        }

        public string OptionsPreset { get; }

        public int SerializedSize { get => _serialized.Length; }

        public long SetupSerializeAllocBytes { get; }

        public long SetupDeserializeAllocBytes { get => 0; }

        public string OptionsDescription { get => BuildAcBinaryOptionsDescription(_options); }

        /// <summary>Resets the shared writer's written count and serialises the graph into it.</summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize()
        {
            // Rewind rather than reallocate.
            _bufferWriter.ResetWrittenCount();
            AcBinarySerializer.Serialize(_order, _bufferWriter, _options);
        }

        /// <summary>
        /// Deserialises the reference payload wrapped in a <see cref="ReadOnlySequence{T}"/>, so the call goes
        /// through the sequence overload rather than the <c>byte[]</c> overload.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the original author's note says AcBinaryDeserializer redirects a single-segment,
        /// array-backed sequence to its <c>byte[]</c> path; that implementation is not visible from this
        /// file — confirm there.
        /// </remarks>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            AcBinaryDeserializer.Deserialize<TestOrder>(new ReadOnlySequence<byte>(_serialized), _options);
        }

        /// <summary>Serialises into the shared writer, reads the written bytes back and compares the graphs.</summary>
        /// <returns>The verdict of <c>DeepEqualsViaJson</c> for the source and the round-tripped graph.</returns>
        public bool VerifyRoundTrip()
        {
            _bufferWriter.ResetWrittenCount();
            AcBinarySerializer.Serialize(_order, _bufferWriter, _options);

            var written = new ReadOnlySequence<byte>(_bufferWriter.WrittenMemory);
            var restored = AcBinaryDeserializer.Deserialize<TestOrder>(written, _options);
            return DeepEqualsViaJson(_order, restored);
        }
    }

    /// <summary>
    /// MemoryPack through its <c>IBufferWriter</c> overload with one pre-allocated
    /// <see cref="ArrayBufferWriter{T}"/> that is reset and reused on every <see cref="Serialize"/> call.
    /// Like-for-like counterpart of <see cref="AcBinaryBufferWriterBenchmark"/>.
    /// </summary>
    private sealed class MemoryPackBufferWriterBenchmark : ISerializerBenchmark
    {
        private readonly TestOrder _order;
        private readonly MemoryPackSerializerOptions _options;

        // Reference payload: input for Deserialize(), source of SerializedSize and of the writer capacity.
        private readonly byte[] _serialized;

        // The reused writer; created once in the constructor.
        private readonly ArrayBufferWriter<byte> _bufferWriter;

        /// <summary>
        /// Pre-serialises the graph once, then creates the reusable writer while recording how many bytes
        /// the constructing thread allocated for it.
        /// </summary>
        /// <param name="order">Object graph used by every iteration.</param>
        /// <param name="optionsPreset">Display label exposed through <see cref="OptionsPreset"/>.</param>
        public MemoryPackBufferWriterBenchmark(TestOrder order, string optionsPreset)
        {
            _order = order;
            OptionsPreset = optionsPreset;
            _options = GetMemPackOptions();
            _serialized = MemoryPackSerializer.Serialize(order, _options);

            // Only the writer construction is inside the measured window (serialize side).
            // Deserialize reads from _serialized directly, so that side reports zero.
            GC.Collect();
            GC.WaitForPendingFinalizers();
            GC.Collect();
            var allocatedBefore = GC.GetAllocatedBytesForCurrentThread();
            _bufferWriter = new ArrayBufferWriter<byte>(_serialized.Length * 2);
            var allocatedAfter = GC.GetAllocatedBytesForCurrentThread();
            SetupSerializeAllocBytes = allocatedAfter - allocatedBefore;
        }

        public string Engine { get => Configuration.EngineMemoryPack; }

        public string IoMode { get => Configuration.IoBufWrReuse; }

        // Reported as source-generated: MemoryPack works from [MemoryPackable] generated formatters.
        public string DispatchMode { get => Configuration.ModeSGen; }

        public string OptionsPreset { get; }

        public int SerializedSize { get => _serialized.Length; }

        public long SetupSerializeAllocBytes { get; }

        public long SetupDeserializeAllocBytes { get => 0; }

        public string? OptionsDescription { get => $"StringEncoding={_options.StringEncoding}"; }

        /// <summary>Resets the shared writer's written count and serialises the graph into it.</summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize()
        {
            // Rewind rather than reallocate.
            _bufferWriter.ResetWrittenCount();
            MemoryPackSerializer.Serialize(_bufferWriter, _order, _options);
        }

        /// <summary>
        /// Deserialises the reference payload through the <see cref="ReadOnlySequence{T}"/> overload, matching
        /// the input surface used by <see cref="AcBinaryBufferWriterBenchmark"/>.
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            MemoryPackSerializer.Deserialize<TestOrder>(new ReadOnlySequence<byte>(_serialized), _options);
        }

        /// <summary>Serialises into the shared writer, reads the written bytes back and compares the graphs.</summary>
        /// <returns>The verdict of <c>DeepEqualsViaJson</c> for the source and the round-tripped graph.</returns>
        public bool VerifyRoundTrip()
        {
            _bufferWriter.ResetWrittenCount();
            MemoryPackSerializer.Serialize(_bufferWriter, _order, _options);

            var restored = MemoryPackSerializer.Deserialize<TestOrder>(new ReadOnlySequence<byte>(_bufferWriter.WrittenMemory), _options);
            return DeepEqualsViaJson(_order, restored);
        }
    }

    /// <summary>
    /// System.Text.Json through its string API: serialises the graph to a JSON string and deserialises
    /// from a JSON string. The reported size is the UTF-8 byte length of that string.
    /// </summary>
    private sealed class SystemTextJsonBenchmark : ISerializerBenchmark
    {
        private readonly TestOrder _order;
        private readonly JsonSerializerOptions _options;

        // Reference JSON text: input for Deserialize().
        private readonly string _serialized;

        // UTF-8 encoding of the reference JSON; only its Length is read (SerializedSize).
        private readonly byte[] _serializedUtf8;

        /// <summary>Builds the serializer options and pre-serialises the graph once.</summary>
        /// <param name="order">Object graph used by every iteration.</param>
        /// <param name="optionsPreset">Display label exposed through <see cref="OptionsPreset"/>.</param>
        public SystemTextJsonBenchmark(TestOrder order, string optionsPreset)
        {
            _order = order;
            OptionsPreset = optionsPreset;

            // Compact output, nulls omitted on write, reference cycles ignored.
            var jsonOptions = new JsonSerializerOptions();
            jsonOptions.WriteIndented = false;
            jsonOptions.DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull;
            jsonOptions.ReferenceHandler = System.Text.Json.Serialization.ReferenceHandler.IgnoreCycles;
            _options = jsonOptions;

            _serialized = JsonSerializer.Serialize(order, _options);
            _serializedUtf8 = Configuration.Utf8NoBom.GetBytes(_serialized);
        }

        public string Engine { get => Configuration.EngineSystemTextJson; }

        public string IoMode { get => Configuration.IoString; }

        // Reported as runtime dispatch: no source-generated JSON context is supplied to the options here.
        public string DispatchMode { get => Configuration.ModeRuntime; }

        public string OptionsPreset { get; }

        public int SerializedSize { get => _serializedUtf8.Length; }

        // No long-lived infrastructure on either side.
        public long SetupSerializeAllocBytes { get => 0; }

        public long SetupDeserializeAllocBytes { get => 0; }

        /// <summary>Serialises the graph to a JSON string; the string is discarded.</summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize()
        {
            JsonSerializer.Serialize(_order, _options);
        }

        /// <summary>Deserialises the reference JSON string; the resulting graph is discarded.</summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            JsonSerializer.Deserialize<TestOrder>(_serialized, _options);
        }

        /// <summary>Serialises to a fresh JSON string, parses it back and compares the two graphs.</summary>
        /// <returns>The verdict of <c>DeepEqualsViaJson</c> for the source and the round-tripped graph.</returns>
        public bool VerifyRoundTrip()
        {
            var text = JsonSerializer.Serialize(_order, _options);
            var restored = JsonSerializer.Deserialize<TestOrder>(text, _options);
            return DeepEqualsViaJson(_order, restored);
        }
    }

    #endregion

    // Results / output formatters → Output.cs
    // BenchmarkResult DTO → BenchmarkResult.cs

}