using AyCode.Core.Compression;
using AyCode.Core.Serializers.Attributes;
using AyCode.Core.Serializers.Binaries;
using AyCode.Core.Tests.Serialization;   // DrainFromAsync extension (test-only, used by benchmark)
using AyCode.Core.Tests.TestModels;
using MemoryPack;
#if !AYCODE_NATIVEAOT
using MessagePack;
using MessagePack.Resolvers;
#endif
using Microsoft.Extensions.Options;
using System.Buffers;
using System.Diagnostics;
using System.IO.Pipelines;
using System.IO.Pipes;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Text;
using System.Text.Json;

using AyCode.Core.Serializers.Console.Benchmarks;

namespace AyCode.Core.Serializers.Console;

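/// <summary>
/// Comprehensive benchmark application for all serializers.
/// Compares: AcBinary (several option presets and I/O modes), MemoryPack and MessagePack
/// (MessagePack is compiled out of NativeAOT builds). A System.Text.Json row exists only as a
/// commented-out entry in the standard suite, and no Newtonsoft.Json row is created in this file.
/// 
/// Usage (only the first argument is inspected; matching is case-insensitive):
///   dotnet run                    # Interactive menu (loops until Q is pressed)
///   dotnet run -- quick           # Quick mode (fewer iterations), all layers
///   dotnet run -- all             # Run all layers
///   dotnet run -- core            # One layer: core | comprehensive | edge | small | medium | large | repeated | deep
///   dotnet run -- asyncpipe       # AsyncPipe-only streaming benchmarks (alias: pipe)
///   dotnet run -- serialize       # Serialize only (alias: ser)
///   dotnet run -- deserialize     # Deserialize only (alias: des)
/// </summary>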
public static class Program
{
    // Configuration (constants, mutable state, attribute-flag aggregation) → Configuration.cs
    // BuildAcBinary + GetMemPack helpers → Benchmarks/BenchmarkOptions.cs

    /// <summary>
    /// Application entry point. With arguments it performs a single benchmark run and exits;
    /// without arguments it shows the interactive menu in a loop until the user quits.
    /// </summary>
    /// <param name="args">Command-line arguments; only their presence and the parsed result matter here.</param>
    public static void Main(string[] args)
    {
        // UTF-8 output so the box-drawing / check-mark characters in the reports render correctly.
        System.Console.OutputEncoding = Encoding.UTF8;

        // Validate the MemoryPack baseline first, so a broken setup is reported before any
        // warmup or benchmark work has been spent.
        BenchmarkLoop.ValidateMemoryPackSetup();

        // One-shot CLI path: parse, run once, leave.
        if (args.Length != 0)
        {
            if (TryParseCliArgs(args, out var cliLayer, out var cliOpMode, out var cliSerializerMode))
            {
                RunBenchmark(cliLayer, cliOpMode, cliSerializerMode);
            }

            return;
        }

        // Interactive path: keep offering the menu so consecutive runs need no process restart.
        for (;;)
        {
            var menuChoice = Menu.ShowInteractiveMenu();
            if (menuChoice == null)
            {
                break;  // Q pressed inside the menu
            }

            // The menu only chooses layer + serializer set; the operation mode is always "all".
            RunBenchmark(menuChoice.Value.layer, "all", menuChoice.Value.serializerMode);

            System.Console.WriteLine();
            System.Console.WriteLine("─────────────────────────────────────────────────────────────────────");
            System.Console.WriteLine("Returning to menu — press any key to continue, or Q to quit...");

            // intercept: true keeps the pressed key from being echoed to the console.
            var pressedKey = System.Console.ReadKey(intercept: true).Key;
            if (pressedKey == ConsoleKey.Q)
            {
                break;
            }

            System.Console.WriteLine();
        }
    }

    /// <summary>
    /// Maps the first CLI argument onto the (layer, opMode, serializerMode) triple consumed by
    /// <c>RunBenchmark</c>. Only <c>args[0]</c> is inspected; any further arguments are ignored.
    /// </summary>
    /// <remarks>
    /// Matching is case-insensitive and culture-invariant, so keywords such as "QUICK" or
    /// "SERIALIZE" are recognised regardless of the current culture (a culture-sensitive lower-casing
    /// would turn 'I' into a dotless 'ı' under Turkish cultures and miss the keyword).
    /// The "quick" keyword additionally mutates the shared <c>Configuration</c> iteration settings.
    /// </remarks>
    /// <param name="args">Command-line arguments. An empty or null array yields the defaults.</param>
    /// <param name="layer">Receives the test-data layer keyword; defaults to "all".</param>
    /// <param name="opMode">Receives "all", "serialize" or "deserialize"; defaults to "all".</param>
    /// <param name="serializerMode">Receives "standard" or "asyncpipe"; defaults to "standard".</param>
    /// <returns>
    /// <c>true</c> when the benchmark should run with the returned values; <c>false</c> is reserved for
    /// invalid arguments. The current implementation always returns <c>true</c>, because unknown
    /// keywords are deliberately passed through as a layer name for backwards compatibility.
    /// </returns>
    private static bool TryParseCliArgs(string[] args, out string layer, out string opMode, out string serializerMode)
    {
        layer = "all";
        opMode = "all";
        serializerMode = "standard";

        // Nothing to parse: keep the defaults instead of indexing into an empty array.
        if (args is null || args.Length == 0)
        {
            return true;
        }

        // Invariant lower-casing: CLI keywords are identifiers, not linguistic text.
        var arg = args[0].ToLowerInvariant();

        switch (arg)
        {
            case "quick":
                // Quick mode: short warmup, few iterations, small sample count (layer stays "all").
                Configuration.WarmupIterations = 5;
                Configuration.TestIterations = 100;
                Configuration.BenchmarkSamples = 3;
                break;

            case "core" or "comprehensive" or "edge" or "all"
                 or "small" or "medium" or "large" or "repeated" or "deep":
                // Known layer keywords.
                layer = arg;
                break;

            case "asyncpipe" or "pipe":
                // AsyncPipe-only mode: streaming I/O isolation across all test data.
                serializerMode = "asyncpipe";
                break;

            case "ser" or "serialize":
                opMode = "serialize";
                break;

            case "des" or "deserialize":
                opMode = "deserialize";
                break;

            default:
                // Backwards compat: unknown arg → treat as layer keyword.
                layer = arg;
                break;
        }

        return true;
    }

    /// <summary>
    /// Runs the benchmark suite end-to-end for the given configuration: process stabilization →
    /// global JIT pre-warmup → per-cell warmup + measurement → grouped results print → save to disk.
    /// Used by both the CLI and interactive menu paths; the interactive loop calls this repeatedly
    /// without restarting the process, so every process-wide change made here is undone before returning.
    /// </summary>
    /// <param name="layer">Test-data layer keyword handed to <c>BenchmarkLoop.FilterByLayer</c>.</param>
    /// <param name="opMode">Operation filter forwarded to <c>RunBenchmarksForTestData</c>.</param>
    /// <param name="serializerMode">Serializer-set selector forwarded to <c>CreateSerializers</c>.</param>
    private static void RunBenchmark(string layer, string opMode, string serializerMode)
    {
        System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗");
        System.Console.WriteLine("║          COMPREHENSIVE SERIALIZER BENCHMARK SUITE                    ║");
        System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");

        // Stabilization: pin the entire benchmark process to a single logical CPU and bump priority
        // class. Single-core affinity stops the OS from migrating the bench thread between cores
        // mid-sample (a migration evicts L1/L2 caches and corrupts a measurement); High priority
        // reduces preemption by background tasks (Defender scans, indexer, etc.) that otherwise
        // randomly inflate samples.
        // The try/finally below guarantees the original state is restored even if a benchmark
        // throws — leaving a developer machine pinned to one core after a crashed run is a real foot-gun.
        // Skipped in single-sample mode (Configuration.BenchmarkSamples <= 1) where stabilization is moot.
        //
        // The Process instance is disposed when the method returns (after the restore in finally).
        using var process = Process.GetCurrentProcess();
        var origAffinity = IntPtr.Zero;
        var origPriority = ProcessPriorityClass.Normal;

        // Tracked separately: if the affinity change succeeds but the priority change throws, the
        // affinity must still be restored. A single "everything applied" flag would leave the process
        // pinned to CPU 0 for every later interactive-menu run.
        var affinityChanged = false;
        var priorityChanged = false;

        // ProcessorAffinity is only supported on Windows + Linux (CA1416), so the whole stabilization
        // step — affinity AND priority — is skipped on every other OS.
        // NOTE(review): an earlier comment said the priority class should still be raised on macOS,
        // but the code never did that; confirm which behaviour is intended.
        if (Configuration.BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
        {
            try
            {
                origAffinity = process.ProcessorAffinity;
                origPriority = process.PriorityClass;

                // Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one
                // core, consistently" — not which one. If CPU 0 is heavily contended on the host,
                // tweak the mask here. The in-memory rows are single-threaded so one core suffices;
                // round-trip-only NamedPipe rows have a server-drain thread that will share the core
                // (acceptable — the bench measures end-to-end RT anyway).
                process.ProcessorAffinity = (IntPtr)1;
                affinityChanged = true;

                process.PriorityClass = ProcessPriorityClass.High;
                priorityChanged = true;

                System.Console.WriteLine("Stabilization: pinned to CPU 0 (affinity=0x1), priority=High.");
            }
            catch (Exception ex)
            {
                // Affinity/priority changes may fail on locked-down hosts (group policies, containers
                // without CAP_SYS_NICE on Linux, etc.). Surface and continue — the benchmark still
                // works. Anything that WAS changed before the failure is undone in the finally block.
                System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}");
            }
        }

        try
        {
            var allResults = new List<BenchmarkResult>();
            var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
            var testDataSets = BenchmarkLoop.FilterByLayer(allTestDataSets, layer);

            System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {Configuration.GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{Configuration.TargetSampleMs} ms target) | Warmup: {Configuration.WarmupIterations} per phase (Ser/Des isolated) | Samples: {Configuration.BenchmarkSamples} (median) + pilot discard");
            System.Console.WriteLine($"Build: {Configuration.BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
            System.Console.WriteLine();

            // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens.
            // Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup
            // alone doesn't ensure that every Serialize<T>/IBufferWriter overload is fully Tier 1 by the time we
            // start measuring. Symptom: the first cell's BufferWriter variants run noticeably slower than the SAME
            // variants on later cells, even when the later cell is bigger.
            // Pre-warmup runs every overload with each data shape so the tiered JIT can promote them in the
            // background; the per-cell warmup that follows then locks in cache + branch state.
            if (Configuration.BenchmarkSamples > 1) // skip in single-sample fast iteration
            {
                System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");

                foreach (var testData in testDataSets)
                {
                    var preSerializers = CreateSerializers(testData, serializerMode);
                    try
                    {
                        foreach (var s in preSerializers)
                        {
                            // Light warmup just to trigger tier promotion. Phase-isolated: Ser path first,
                            // then Des path — same pattern as the per-cell warmup in RunBenchmarksForTestData
                            // (which still runs afterwards for cache / branch-predictor warming).
                            s.WarmupSerialize(2000);
                            s.WarmupDeserialize(2000);
                        }
                    }
                    finally
                    {
                        // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
                        foreach (var s in preSerializers) (s as IDisposable)?.Dispose();
                    }
                }

                // Let background tiered-JIT compilation drain before we begin measuring.
                if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
                System.Console.WriteLine("✓ Global pre-warmup complete.\n");
            }

            // 70-character horizontal rule framing each test-data header.
            var rule = new string('═', 70);

            foreach (var testData in testDataSets)
            {
                System.Console.WriteLine($"\n{rule}");
                System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
                System.Console.WriteLine($"{rule}");

                var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
                allResults.AddRange(results);
            }

            // Print grouped results
            Output.PrintGroupedResults(allResults, testDataSets);

            // Save results to file
            Output.SaveResults(allResults, testDataSets);

            System.Console.WriteLine("\n✓ Benchmark complete!");
        }
        finally
        {
            // Restore process state — affinity/priority changes are process-wide and persist across
            // interactive-mode iterations of the menu. Without restore, the next menu run would
            // already be on CPU-0 + High priority before its own stabilization step ran, masking
            // any stabilization-disabled comparison. Each setting is restored only if it was changed.
            if ((affinityChanged || priorityChanged) && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
            {
                if (affinityChanged)
                {
                    try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
                }

                if (priorityChanged)
                {
                    try { process.PriorityClass = origPriority; } catch { /* best-effort */ }
                }
            }
        }
    }

    #region Benchmark Execution

private static List<BenchmarkResult> RunBenchmarksForTestData(TestDataSet testData, string mode, string serializerMode)
    {
        // Purpose: benchmark every serializer returned by CreateSerializers against ONE test-data cell and
        // return one BenchmarkResult per serializer (each result is also printed as soon as it is ready).
        //   testData       — the cell to measure; its DisplayName labels every result row.
        //   mode           — "all", "serialize"/"ser" or "deserialize"/"des"; selects which phases run.
        //   serializerMode — forwarded unchanged to CreateSerializers.
        // A failed round-trip verification terminates the whole process via Environment.Exit(1).
        var results = new List<BenchmarkResult>();
        var serializers = CreateSerializers(testData, serializerMode);

        // Phase selection is loop-invariant, so evaluate it once.
        var runSerialize = mode is "all" or "serialize" or "ser";
        var runDeserialize = mode is "all" or "deserialize" or "des";

        try
        {
            // Round-trip correctness check — once per (cell × serializer), BEFORE warmup. Aborts the entire benchmark on failure.
            System.Console.WriteLine("Verifying round-trip correctness...");

            foreach (var serializer in serializers)
            {
                if (!serializer.VerifyRoundTrip())
                {
                    System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}");
                    System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting.");

                    // Terminates the process immediately; the finally block below is not relied upon here.
                    Environment.Exit(1);
                }
            }

            System.Console.WriteLine("✓ All serializers passed round-trip verification.");

            // Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: each serializer's Ser-path and
            // Des-path get COMPLETELY ISOLATED warmup→measure rounds, with a GC.Collect at every phase boundary.
            //
            // Why phase-isolation: a combined warmup (Ser+Des interleaved) leaves the CPU I-cache + branch-predictor
            // in a "compromise state" — neither Ser nor Des code-set dominates. The first phase to measure pays a
            // cache-miss penalty as its code-set displaces the leftover-warmup-state. Isolated warmup→measure pairs
            // keep the I-cache hot for ONLY the measured path, both in the warmup (priming) and the measurement
            // (steady-state). Branch-predictor history also stays clean per path.
            //
            // GC.Collect at every boundary: removes residual allocation pressure from the previous phase (write-buffer
            // pool churn from Ser, deserialized object graph from Des) so the next phase starts with a quiescent heap.
            //
            // Configuration.JitSleep per-phase: lets tiered-JIT background promotion drain after each warmup, so each
            // phase's freshly-promoted methods settle before its timing starts.
            System.Console.WriteLine($"Running benchmarks (target ~{Configuration.TargetSampleMs} ms/sample × {Configuration.BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n");

            foreach (var serializer in serializers)
            {
                var result = new BenchmarkResult
                {
                    TestDataName = testData.DisplayName,  // Use DisplayName for IId% info
                    Engine = serializer.Engine,
                    IoMode = serializer.IoMode,
                    DispatchMode = serializer.DispatchMode,
                    OptionsPreset = serializer.OptionsPreset,
                    OptionsDescription = serializer.OptionsDescription,
                    SerializedSize = serializer.SerializedSize,
                    SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes,
                    SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes,
                    IsRoundTripOnly = serializer.IsRoundTripOnly
                };

                // Group label for the in-place progress line. Identifies the serializer row so a stuck
                // benchmark is visibly stuck on a specific row rather than silently hanging.
                var groupLabel = $"{result.SerializerName}";

                if (serializer.IsRoundTripOnly)
                {
                    // Round-trip-only benchmarks (NamedPipe etc.): single phase — Serialize() performs the full RT,
                    // Deserialize() is a no-op. The Ser-phase entry point (WarmupSerialize) warms the entire
                    // round-trip path, and the numbers are recorded into the RT result columns.
                    // mode == "deserialize" alone is meaningless for a round-trip-only benchmark; it is skipped silently.
                    if (runSerialize)
                    {
                        BenchmarkLoop.ForceGcCollect();
                        serializer.WarmupSerialize(Configuration.WarmupIterations);
                        if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                        var rtIter = BenchmarkLoop.CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
                        var (rtMed, rtMin, rtMax, rtStd) = BenchmarkLoop.RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");
                        result.RoundTripTimeMs = rtMed;
                        result.RoundTripTimeMinMs = rtMin;
                        result.RoundTripTimeMaxMs = rtMax;
                        result.RoundTripTimeStdDevMs = rtStd;
                        result.RoundTripIterations = rtIter;
                        // Process-wide allocation measurement: allocations made on the server-drain thread must be
                        // counted too — a current-thread-only measurement would see just the client side.
                        result.RoundTripAllocBytesPerOp = BenchmarkLoop.MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]");
                    }
                }
                else
                {
                    // ── Ser phase ── GC.Collect → isolated warmup → JitSleep → calibrate → time → alloc.
                    if (runSerialize)
                    {
                        BenchmarkLoop.ForceGcCollect();
                        serializer.WarmupSerialize(Configuration.WarmupIterations);
                        if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                        var serIter = BenchmarkLoop.CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
                        var (serMed, serMin, serMax, serStd) = BenchmarkLoop.RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");
                        result.SerializeTimeMs = serMed;
                        result.SerializeTimeMinMs = serMin;
                        result.SerializeTimeMaxMs = serMax;
                        result.SerializeTimeStdDevMs = serStd;
                        result.SerializeIterations = serIter;
                        // Dedicated alloc-only sample (separate from timing samples; keeps timing pure)
                        result.SerializeAllocBytesPerOp = BenchmarkLoop.MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
                    }

                    // ── Des phase ── GC.Collect → isolated warmup → JitSleep → calibrate → time → alloc.
                    // The GC.Collect here is critical: it discards the Ser-phase's write-buffer pool churn so the
                    // Des-phase's allocation measurement reflects ONLY Des-side allocations (deserialized object graph).
                    if (runDeserialize)
                    {
                        BenchmarkLoop.ForceGcCollect();
                        serializer.WarmupDeserialize(Configuration.WarmupIterations);
                        if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                        var desIter = BenchmarkLoop.CalibrateIterations(() => serializer.Deserialize(), Configuration.TargetSampleMs);
                        var (desMed, desMin, desMax, desStd) = BenchmarkLoop.RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");
                        result.DeserializeTimeMs = desMed;
                        result.DeserializeTimeMinMs = desMin;
                        result.DeserializeTimeMaxMs = desMax;
                        result.DeserializeTimeStdDevMs = desStd;
                        result.DeserializeIterations = desIter;
                        result.DeserializeAllocBytesPerOp = BenchmarkLoop.MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]");
                    }

                    // Compose RT from Ser+Des. Because Ser and Des may have DIFFERENT iter counts post-calibration,
                    // batch-time addition would be misleading. Instead: compute per-op µs (iter-independent),
                    // then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that
                    // RoundTripTimeMs / RoundTripIterations * 1000 equals the sum of the Ser and Des per-op times.
                    // NOTE(review): when only one phase ran, the other phase's time/iterations keep their
                    // BenchmarkResult defaults — confirm Output.ToPerOpMicros copes with that input.
                    var serPerOp = Output.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations);
                    var desPerOp = Output.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations);
                    var rtPerOp = serPerOp + desPerOp;
                    result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations);
                    result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
                    result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
                }

                results.Add(result);
                Output.PrintResult(result);
            }
        }
        finally
        {
            // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources that must be
            // released before the next test data builds new ones — otherwise pipes / handles leak across test cells).
            // Done in finally so an exception thrown mid-benchmark does not leak them either; this mirrors the
            // try/finally used by the global pre-warmup loop in RunBenchmark.
            foreach (var s in serializers) (s as IDisposable)?.Dispose();
        }

        return results;
    }

    /// <summary>
    /// Builds the list of serializer benchmarks to run against one test-data cell.
    /// </summary>
    /// <remarks>
    /// <paramref name="serializerMode"/> selects the set:
    /// "fastestbyte" → AcBinary FastMode Byte[] vs MemoryPack Byte[] only;
    /// "asyncpipe" → the four pipe / cross-thread round-trip rows only;
    /// any other value → the standard suite, which never contains the kernel NamedPipe rows.
    /// Every AcBinary options object created here gets <c>WireMode = Configuration.SelectedWireMode</c>.
    /// </remarks>
    /// <param name="testData">Test-data cell; its <c>Order</c> is handed to every benchmark constructor.</param>
    /// <param name="serializerMode">Serializer-set selector (compared case-sensitively).</param>
    /// <returns>A new list, in display order, of freshly constructed benchmarks.</returns>
    private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, string serializerMode)
    {
        switch (serializerMode)
        {
            case "fastestbyte":
                return BuildFastestByteSet();
            case "asyncpipe":
                return BuildAsyncPipeSet();
            default:
                return BuildStandardSet();
        }

        // FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path: AcBinary FastMode Byte[]
        // against MemoryPack Byte[]. Meant as a tight optimization-iteration loop that finishes much sooner
        // than the full suite.
        List<ISerializerBenchmark> BuildFastestByteSet()
        {
            var fastOptions = AcBinarySerializerOptions.FastMode;
            fastOptions.WireMode = Configuration.SelectedWireMode;

            var pair = new List<ISerializerBenchmark>();
            pair.Add(new AcBinaryBenchmark(testData.Order, fastOptions, "FastMode"));
            // FastWire row (UTF-16 raw memcpy) is disabled for the current optimization sprint; it served as a
            // noise-floor reference earlier. Re-enable when revisiting Fast wire-mode performance.
            //pair.Add(new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"));
            pair.Add(new MemoryPackBenchmark(testData.Order, "Default"));
            return pair;
        }

        // AsyncPipe-only mode — ONLY the streaming / cross-thread rows, no other serializer. Streaming I/O has
        // long-lived pipe setup + kernel-buffer overhead that would mask the steady-state numbers if it were
        // interleaved with the byte-array / IBufferWriter measurements, so it runs in isolation.
        // Together the four rows form a 2x2 matrix: [chunked, raw] × [kernel NamedPipe, in-memory].
        List<ISerializerBenchmark> BuildAsyncPipeSet()
        {
            // One options object shared by all four rows. Per the design notes, BufferWriterChunkSize drives both
            // the chunk-on-wire size and the kernel pipe buffer size of the NamedPipe rows, so
            // Configuration.PipeChunkSize is the single value to change when tuning.
            var pipeOptions = AcBinarySerializerOptions.FastMode;
            pipeOptions.BufferWriterChunkSize = Configuration.PipeChunkSize;
            pipeOptions.WireMode = Configuration.SelectedWireMode;

            var pipeRows = new List<ISerializerBenchmark>();

            // Chunked-framed streaming over a kernel NamedPipe: the full streaming-I/O stack
            // (wire framing + drain task + sliding-window buffer) on top of kernel transport.
            pipeRows.Add(new AcBinaryNamedPipeBenchmark(testData.Order, pipeOptions, "FastMode (PipeChunk)"));

            // Raw byte[] over the same kernel NamedPipe (sync receive, no chunk framing, no drain task).
            // Compared with the row above it isolates the streaming-framework overhead; compared with the
            // in-process Byte[] rows it isolates the kernel-transport overhead.
            pipeRows.Add(new AcBinaryNamedPipeRawByteArrayBenchmark(testData.Order, pipeOptions, "FastMode (PipeRaw)"));

            // Same chunked-streaming code path, but over an in-memory System.IO.Pipelines.Pipe — no kernel,
            // hence no per-chunk syscall. Shows the pure CPU cost of the chunked-streaming framework.
            pipeRows.Add(new AcBinaryInMemoryPipeBenchmark(testData.Order, pipeOptions, "FastMode (PipeChunk)"));

            // Raw byte[] via an in-memory cross-thread handoff (no transport at all): the apples-to-apples
            // baseline for the in-memory chunked row above.
            pipeRows.Add(new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, pipeOptions, "FastMode (PipeRaw)"));

            return pipeRows;
        }

        // Standard mode — every serializer EXCEPT the kernel NamedPipe rows (those are opt-in through the
        // AsyncPipe menu / CLI mode and are never bundled with the steady-state suite).
        List<ISerializerBenchmark> BuildStandardSet()
        {
            // The first two option sets are referenced only by the commented-out rows further down; they are
            // kept so those rows can be re-enabled by simply uncommenting them.
            var noInternOptions = AcBinarySerializerOptions.Default;
            noInternOptions.UseStringInterning = StringInterningMode.None;
            noInternOptions.WireMode = Configuration.SelectedWireMode;

            var defaultRuntimeOptions = AcBinarySerializerOptions.Default;
            defaultRuntimeOptions.UseGeneratedCode = false;
            defaultRuntimeOptions.WireMode = Configuration.SelectedWireMode;

            // FastMode without source-generated dispatch (runtime path).
            var fastRuntimeOptions = AcBinarySerializerOptions.FastMode;
            fastRuntimeOptions.UseGeneratedCode = false;
            fastRuntimeOptions.WireMode = Configuration.SelectedWireMode;

            // FastMode with UseGeneratedCode left untouched (the SGen path per the design notes).
            var fastOptions = AcBinarySerializerOptions.FastMode;
            fastOptions.WireMode = Configuration.SelectedWireMode;

            // Fresh-ArrayBufferWriter scenario: the chunk size drives the serializer's GetSpan(N) request and
            // therefore the writer's per-call allocation; a small chunk suits one-shot serialization.
            // NOTE(review): the row label says "4KB" but the size actually comes from
            // Configuration.PipeChunkSize — confirm the two agree.
            var freshWriterOptions = AcBinarySerializerOptions.FastMode;
            freshWriterOptions.BufferWriterChunkSize = Configuration.PipeChunkSize;
            freshWriterOptions.WireMode = Configuration.SelectedWireMode;

            // In-memory Pipe variant: same chunk size setting, shared by both in-memory rows below.
            var inMemoryPipeOptions = AcBinarySerializerOptions.FastMode;
            inMemoryPipeOptions.BufferWriterChunkSize = Configuration.PipeChunkSize;
            inMemoryPipeOptions.WireMode = Configuration.SelectedWireMode;

            // "Default" row: Default preset with string interning switched OFF and ReferenceHandling = OnlyId.
            // NOTE(review): an earlier comment described this row as UseStringInterning=All, which the
            // assignment below contradicts — confirm which configuration is intended.
            var defaultRowOptions = AcBinarySerializerOptions.Default;
            defaultRowOptions.UseStringInterning = StringInterningMode.None;
            defaultRowOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId;
            defaultRowOptions.WireMode = Configuration.SelectedWireMode;

            var suite = new List<ISerializerBenchmark>();

            // ============================================================
            // AcBinary — Byte[] API (uncomment rows to compare option presets side-by-side)
            // ============================================================
            // Fastest Byte[] — SGen path.
            suite.Add(new AcBinaryBenchmark(testData.Order, fastOptions, "FastMode"));
            // Fastest Byte[] — runtime path (UseGeneratedCode=false). Same preset, no source-generated dispatch;
            // always paired with the SGen row so the SGen speed-up can be compared apples-to-apples.
            suite.Add(new AcBinaryBenchmark(testData.Order, fastRuntimeOptions, "FastMode"));
            // Default-preset Byte[] row (see defaultRowOptions above).
            suite.Add(new AcBinaryBenchmark(testData.Order, defaultRowOptions, "Default"));
            //suite.Add(new AcBinaryBenchmark(testData.Order, defaultRuntimeOptions, "Default"));
            //suite.Add(new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"));
            //suite.Add(new AcBinaryBenchmark(testData.Order, noInternOptions, "NoIntern"));

            // AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario).
            suite.Add(new AcBinaryBufferWriterBenchmark(testData.Order, fastOptions, "FastMode"));

            // AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario).
            suite.Add(new AcBinaryFreshBufferWriterBenchmark(testData.Order, freshWriterOptions, "FastMode (4KB)"));

            // AcBinary chunked-streaming over an IN-MEMORY Pipe (no kernel transport): shows the streaming
            // framework's pure CPU cost next to the simpler in-process rows above.
            suite.Add(new AcBinaryInMemoryPipeBenchmark(testData.Order, inMemoryPipeOptions, "FastMode (PipeChunk)"));

            // Raw byte[] over an in-memory cross-thread handoff: baseline for the in-memory chunked row above.
            suite.Add(new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, inMemoryPipeOptions, "FastMode (PipeRaw)"));

            // The kernel NamedPipe streaming row (AcBinaryNamedPipeBenchmark) is intentionally OMITTED here —
            // run it through the dedicated AsyncPipe mode for isolated kernel-transport measurements.

            // ============================================================
            // MemoryPack — three I/O modes for apples-to-apples comparison
            // ============================================================
            suite.Add(new MemoryPackBenchmark(testData.Order, "Default"));
            suite.Add(new MemoryPackBufferWriterBenchmark(testData.Order, "Default"));
            suite.Add(new MemoryPackFreshBufferWriterBenchmark(testData.Order, "Default"));

            // ============================================================
            // MessagePack — for legacy comparison
            // ============================================================
#if !AYCODE_NATIVEAOT
            // Compiled out of the NativeAOT build: per the project notes MessagePack's dynamic generic
            // resolver fails under NativeAOT publish, so this row exists for regular JIT runs only.
            suite.Add(new MessagePackBenchmark(testData.Order, "ContractBased"));
#endif

            // System.Text.Json (disabled — JSON serializer for reference; not in the active suite)
            //suite.Add(new SystemTextJsonBenchmark(testData.Order, "Default"));

            return suite;
        }
    }

    #endregion

    // Serializer implementations (ISerializerBenchmark + 12 concrete benchmark classes) → Benchmarks/

    // Results / output formatters → Output.cs
    // BenchmarkResult DTO → BenchmarkResult.cs

}