// AyCode.Core/AyCode.Core.Serializers.Con.../BenchmarkLoop.cs

using AyCode.Core.Tests.TestModels;
using MemoryPack;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Text.Json;

namespace AyCode.Core.Serializers.Console;

/// <summary>
/// Benchmark execution helpers: timing (<see cref="RunTimed"/>), per-cell adaptive iteration
/// calibration (<see cref="CalibrateIterations"/>), allocation measurement
/// (<see cref="MeasureAllocation"/> + <see cref="MeasureAllocationTotal"/>), in-place
/// <c>\r</c>-progress reporting, full-GC phase-boundary helper (<see cref="ForceGcCollect"/>),
/// startup validation (<see cref="ValidateMemoryPackSetup"/>), per-cell round-trip equality
/// (<see cref="DeepEqualsViaJson"/>), and test-data layer filtering (<see cref="FilterByLayer"/>).
/// Pure benchmark-execution infrastructure — no display formatting (that lives in <c>Output</c>)
/// and no per-engine glue (which lives with the individual <c>ISerializerBenchmark</c>
/// implementations).
/// </summary>
internal static class BenchmarkLoop
{
    /// <summary>
    /// Phase-boundary heap reset for the benchmark loop. Performs a forced, blocking generation-2
    /// collection, waits for pending finalizers to run, then performs a second forced, blocking
    /// generation-2 collection so that objects released by those finalizers are reclaimed as well.
    /// The intent is that the next warmup/measurement phase does not inherit residual allocations
    /// from the previous phase (write-buffer pools, intern cache, write-plan arrays, etc.).
    /// Per the original notes, this is called at every Ser-phase / Des-phase boundary in
    /// <c>RunBenchmarksForTestData</c>.
    /// </summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    internal static void ForceGcCollect()
    {
        // Collect → drain finalizers → collect again.
        FullBlockingCollect();
        GC.WaitForPendingFinalizers();
        FullBlockingCollect();

        // One forced, blocking pass over generation 2 (which also covers the younger generations).
        static void FullBlockingCollect() => GC.Collect(2, GCCollectionMode.Forced, blocking: true);
    }

    /// <summary>
    /// Runs the action <paramref name="iterations"/> times for <see cref="Configuration.BenchmarkSamples"/> independent samples,
    /// returning the median, min, max, and population standard deviation of the per-sample elapsed time.
    /// Multi-sample design reduces single-run variance from ~±15% to ~±5% by smoothing transient effects
    /// (background activity, thermal/turbo state).
    /// When <see cref="Configuration.BenchmarkSamples"/> &lt;= 1, falls back to single-sample timing (Debug / quick mode);
    /// in that case median, min and max are all the single measurement and the standard deviation is 0.
    /// When <paramref name="progressLabel"/> is non-null, emits in-place <c>\r</c> progress updates so a
    /// stuck benchmark (e.g. deadlocked NamedPipe row) is visibly stuck at a specific % rather than
    /// silently hanging.
    ///
    /// Stabilization (added 2026-05-07):
    ///   1) Pilot sample is run BEFORE the recorded loop and discarded. The first measurement after
    ///      warmup tends to absorb residual JIT bookkeeping and GC bookkeeping; dropping it tightens
    ///      the min/max range without throwing away signal (the median is the SAME data as before).
    ///   2) GC.Collect / WaitForPendingFinalizers / GC.Collect runs BEFORE every recorded sample.
    ///      Without this, GC pressure from sample N occasionally triggered a Gen-2 pause inside
    ///      sample N+1, painting it as an outlier; collecting up-front gives every sample the
    ///      same starting heap shape.
    ///   3) Returns (median, min, max, stdDev) so the caller can surface the inter-sample range and
    ///      spread — visible noise floor for the row, replacing the previous "median only" view.
    /// </summary>
    /// <param name="action">The operation to time; invoked <paramref name="iterations"/> times per sample.</param>
    /// <param name="iterations">Number of invocations per sample.</param>
    /// <param name="progressLabel">Label for the in-place progress line, or <c>null</c> for silent execution.</param>
    /// <returns>Median, minimum, maximum and population standard deviation of the sample times, in milliseconds.</returns>
    internal static (double medianMs, double minMs, double maxMs, double stdDevMs) RunTimed(Action action, int iterations, string? progressLabel = null)
    {
        var samples = Configuration.BenchmarkSamples;
        if (samples <= 1)
        {
            // Single-sample fast path (Debug or trivial run) — no allocation, no sort, no stddev.
            var sw = Stopwatch.StartNew();
            RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
            sw.Stop();
            var ms = sw.Elapsed.TotalMilliseconds;
            EndProgress(progressLabel, ms);
            return (ms, ms, ms, 0);
        }

        // Pilot sample (discarded, so it is not timed at all). Counts as sample index 0 of
        // (samples + 1) for progress display so the user sees an extra "warmup-ish" tick before
        // the recorded samples start.
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();
        RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: 0);

        var times = new double[samples];
        for (var s = 0; s < samples; s++)
        {
            // Per-sample GC settle. Forces every sample to start from the same heap state, so
            // a Gen-2 pause caused by the previous sample doesn't bleed into the next sample's
            // timing. Cost is paid OUTSIDE the Stopwatch window — no impact on the measurement.
            GC.Collect();
            GC.WaitForPendingFinalizers();
            GC.Collect();

            var sw = Stopwatch.StartNew();
            RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: s + 1);
            sw.Stop();
            times[s] = sw.Elapsed.TotalMilliseconds;
        }

        // Pass 1: min / max / mean, captured BEFORE the in-place sort below.
        var minMs = double.MaxValue;
        var maxMs = double.MinValue;
        var sum = 0.0;
        foreach (var t in times)
        {
            sum += t;
            if (t < minMs) minMs = t;
            if (t > maxMs) maxMs = t;
        }
        var mean = sum / times.Length;

        // Pass 2: population stddev (not sample-stddev — the captured samples are treated as the
        // population for CV computation). Summing squared deviations from the mean avoids the
        // catastrophic cancellation of E[X²] - E[X]² when samples are nearly identical, and the
        // result can never go negative, so no clamp is needed.
        var sumSqDev = 0.0;
        foreach (var t in times)
        {
            var deviation = t - mean;
            sumSqDev += deviation * deviation;
        }
        var stdDevMs = Math.Sqrt(sumSqDev / times.Length);

        Array.Sort(times);
        // Median: middle value for odd sample counts, average of two middles for even counts.
        var medianMs = samples % 2 == 1 ? times[samples / 2] : (times[samples / 2 - 1] + times[samples / 2]) / 2.0;
        EndProgress(progressLabel, medianMs);

        return (medianMs, minMs, maxMs, stdDevMs);
    }

    /// <summary>
    /// Per-cell adaptive iteration calibration. Runs a 100-iter measurement after warmup and computes
    /// how many iterations are needed to reach <paramref name="targetMs"/> wall-clock per sample
    /// (callers typically pass <see cref="Configuration.TargetSampleMs"/>).
    /// Returns iter rounded UP to the nearest 1000, floored at 1000 (the prior fixed minimum) and
    /// ceiling-capped at 200_000 (sanity bound for pathologically fast ops). In Debug single-sample mode
    /// (<c>Configuration.BenchmarkSamples &lt;= 1</c>) returns the global <see cref="Configuration.TestIterations"/> unchanged —
    /// calibration overhead is unjustified there. Calibration runs OUTSIDE the timed sample loop and
    /// does NOT count toward warmup; its sole purpose is to measure per-op cost.
    /// </summary>
    /// <param name="action">The operation whose per-call cost is being estimated.</param>
    /// <param name="targetMs">Desired wall-clock duration of one sample, in milliseconds.</param>
    /// <returns>An iteration count in the range [1000, 200_000] that is a multiple of 1000
    /// (or <see cref="Configuration.TestIterations"/> in single-sample mode).</returns>
    internal static int CalibrateIterations(Action action, int targetMs)
    {
        const int calibIter = 100;
        const int minIterations = 1000;
        const int maxIterations = 200_000;
        const int roundingStep = 1000;

        if (Configuration.BenchmarkSamples <= 1) return Configuration.TestIterations; // Debug fast path

        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();

        var sw = Stopwatch.StartNew();
        for (var i = 0; i < calibIter; i++) action();
        sw.Stop();
        var ms = sw.Elapsed.TotalMilliseconds;

        // Pathologically-fast op below Stopwatch resolution — cap at ceiling (further calibration won't help).
        if (ms <= 0.0001) return maxIterations;

        var iterPerMs = calibIter / ms;
        var needed = Math.Ceiling(targetMs * iterPerMs);

        // Clamp while still in the double domain. For a very fast op combined with a large targetMs
        // the product can exceed int.MaxValue; casting first (or adding the rounding offset to a
        // near-max int) would wrap to a negative value and wrongly yield the floor instead of the cap.
        if (needed >= maxIterations) return maxIterations;
        if (needed <= minIterations) return minIterations;

        // Round UP to nearest 1000 — keeps numbers human-readable in the markdown output.
        // needed is strictly between the floor and the cap here, so the cast and the add are safe
        // and the rounded value cannot exceed the cap.
        return ((int)needed + roundingStep - 1) / roundingStep * roundingStep;
    }

    /// <summary>
    /// Measures per-call allocation in bytes on the calling thread after a clean GC. Single dedicated
    /// sample (no median) — keeps timing samples pure.
    /// </summary>
    /// <remarks>
    /// The byte counter is read via <see cref="GC.GetAllocatedBytesForCurrentThread"/> immediately
    /// before and after the iteration loop. NOTE: when <paramref name="progressLabel"/> is non-null,
    /// the progress lines built inside the loop are allocated within the measured window and are
    /// therefore included in the result.
    /// </remarks>
    /// <param name="action">The operation to measure; invoked <paramref name="iterations"/> times.</param>
    /// <param name="iterations">Number of invocations the allocation delta is averaged over.</param>
    /// <param name="progressLabel">Label for the in-place progress line, or <c>null</c> for silent execution.</param>
    /// <returns>Allocated bytes per call (integer division), or 0 when <paramref name="iterations"/> is not positive.</returns>
    internal static long MeasureAllocation(Action action, int iterations, string? progressLabel = null)
    {
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();

        // The Stopwatch is created before the first counter read so its own allocation is not counted.
        var sw = Stopwatch.StartNew();
        var before = GC.GetAllocatedBytesForCurrentThread();
        RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);

        var after = GC.GetAllocatedBytesForCurrentThread();
        sw.Stop();
        EndProgress(progressLabel, sw.Elapsed.TotalMilliseconds);

        // With zero iterations the loop above is a no-op; report 0 instead of dividing by zero.
        return iterations > 0 ? (after - before) / iterations : 0;
    }

    /// <summary>
    /// Process-wide allocation measurement — needed for round-trip-only benchmarks (NamedPipe etc.) where
    /// the work happens across multiple threads. <see cref="GC.GetAllocatedBytesForCurrentThread"/> would
    /// only count the caller-thread allocations, missing the server-side <c>new byte[len]</c> buffers and
    /// any drain-pump-thread allocations. <see cref="GC.GetTotalAllocatedBytes"/> covers the entire process.
    /// Slightly noisier than the per-thread variant (background threads / GC bookkeeping leak in), but
    /// over 1000 iterations the signal dominates.
    /// </summary>
    /// <param name="action">The operation to measure; invoked <paramref name="iterations"/> times.</param>
    /// <param name="iterations">Number of invocations the allocation delta is averaged over.</param>
    /// <param name="progressLabel">Label for the in-place progress line, or <c>null</c> for silent execution.</param>
    /// <returns>Process-wide allocated bytes per call (integer division), or 0 when <paramref name="iterations"/> is not positive.</returns>
    internal static long MeasureAllocationTotal(Action action, int iterations, string? progressLabel = null)
    {
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();

        // The Stopwatch is created before the first counter read so its own allocation is not counted.
        var sw = Stopwatch.StartNew();
        var before = GC.GetTotalAllocatedBytes(precise: true);
        RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);

        var after = GC.GetTotalAllocatedBytes(precise: true);
        sw.Stop();
        EndProgress(progressLabel, sw.Elapsed.TotalMilliseconds);

        // With zero iterations the loop above is a no-op; report 0 instead of dividing by zero.
        return iterations > 0 ? (after - before) / iterations : 0;
    }

    // ============================================================================================
    // Progress reporting — \r-driven in-place updates so a stuck benchmark surfaces the exact phase
    // and % where it stopped, instead of appearing as a silent hang. Used by RunTimed and the
    // MeasureAllocation* helpers when the caller passes a non-null progressLabel.
    // ============================================================================================

    // Length of the most recent progress line written by RunWithProgress (reset to 0 by
    // EndProgress). The next write pads with spaces up to this length so a shorter line fully
    // overwrites a longer previous one (avoids "ghost" trailing chars after \r).
    private static int _progressLastLineLen;

    /// <summary>
    /// Invokes <paramref name="action"/> <paramref name="iterations"/> times. With a non-null
    /// <paramref name="label"/>, a <c>\r</c>-overwriting progress line is written roughly every 10%
    /// of the run (about ten prints per sample) and always after the final iteration. With a null
    /// <paramref name="label"/> the loop runs silently — the null check happens once, up front.
    /// </summary>
    /// <param name="action">The operation to invoke.</param>
    /// <param name="iterations">Number of invocations.</param>
    /// <param name="label">Progress-line label, or <c>null</c> to suppress all output.</param>
    /// <param name="samples">Total sample count shown in the progress line; the "sample x/y" part is only printed when this is greater than 1.</param>
    /// <param name="sampleIndex">Zero-based index of the current sample (displayed one-based).</param>
    private static void RunWithProgress(Action action, int iterations, string? label, int samples, int sampleIndex)
    {
        // Silent mode: plain loop, no console traffic.
        if (label is null)
        {
            for (var n = 0; n < iterations; n++) action();
            return;
        }

        // Emit about ten times per run; printing on every iteration would be expensive enough
        // (Console.Write) to skew sub-µs benchmarks.
        var emitEvery = Math.Max(1, iterations / 10);

        // The sample counter is constant for the whole run, so format it once.
        var samplePart = samples > 1 ? $"sample {sampleIndex + 1}/{samples}  " : string.Empty;

        var completed = 0;
        while (completed < iterations)
        {
            action();
            completed++;

            var onStep = completed % emitEvery == 0;
            if (!onStep && completed != iterations) continue;

            var percent = (int)(completed * 100L / iterations);
            var text = $"    > {label}  {samplePart}{percent,3}%  ({completed}/{iterations})";

            System.Console.Write('\r');
            System.Console.Write(text);

            // Blank out whatever is left of a longer line written previously.
            var leftover = _progressLastLineLen - text.Length;
            if (leftover > 0)
                System.Console.Write(new string(' ', leftover));

            _progressLastLineLen = text.Length;
        }
    }

    /// <summary>
    /// Finishes the current progress line: overwrites it in place with a "done in … ms" summary,
    /// blanks any characters left over from a longer previous line, and ends the row with a newline
    /// so later <c>WriteLine</c> output appears underneath. Does nothing when
    /// <paramref name="label"/> is null.
    /// </summary>
    /// <param name="label">Progress-line label, or <c>null</c> when progress output is disabled.</param>
    /// <param name="elapsedMs">Elapsed time to report, in milliseconds.</param>
    private static void EndProgress(string? label, double elapsedMs)
    {
        if (label is not null)
        {
            var summary = $"    > {label}  done in {elapsedMs,7:F1} ms";
            var leftover = _progressLastLineLen - summary.Length;

            System.Console.Write('\r');
            System.Console.Write(summary);
            if (leftover > 0)
                System.Console.Write(new string(' ', leftover));
            System.Console.WriteLine();

            // The row is closed; the next progress session starts with nothing to overwrite.
            _progressLastLineLen = 0;
        }
    }

#if !AYCODE_NATIVEAOT
    // Serializer options used only by DeepEqualsViaJson to produce the two strings it compares:
    // compact output (no indentation), null-valued properties omitted, and reference cycles
    // handled via ReferenceHandler.IgnoreCycles. Compiled out under AYCODE_NATIVEAOT, where
    // DeepEqualsViaJson skips the JSON comparison entirely.
    private static readonly JsonSerializerOptions VerifyJsonOpts = new()
    {
        WriteIndented = false,

        DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull,
        ReferenceHandler = System.Text.Json.Serialization.ReferenceHandler.IgnoreCycles
    };
#endif

    /// <summary>
    /// Round-trip equality check. Both objects are serialized with System.Text.Json using
    /// <c>VerifyJsonOpts</c>, and the resulting JSON strings are compared. Slower than a
    /// property-by-property comparison, but needs no custom comparer for the object graph.
    /// Two nulls are equal; a null and a non-null are not.
    /// </summary>
    /// <remarks>
    /// AOT publish skip: <c>System.Text.Json</c>'s reflection path uses runtime closed-generic instantiation
    /// (<c>JsonPropertyInfo&lt;TestStatus&gt;</c> et al.) that the trimmer drops, causing
    /// <c>NotSupportedException: missing native code or metadata</c>. The validation is JIT-only — the actual
    /// benchmark Serialize/Deserialize loops don't touch this path. Under AOT this method returns
    /// <c>true</c> unconditionally, so all <c>VerifyRoundTrip()</c> calls pass without running the
    /// cross-format validation.
    /// </remarks>
    /// <param name="a">First object, may be null.</param>
    /// <param name="b">Second object, may be null.</param>
    /// <returns><c>true</c> when both serialize to the same JSON text (or both are null); always <c>true</c> under AOT.</returns>
    internal static bool DeepEqualsViaJson(object? a, object? b)
    {
#if AYCODE_NATIVEAOT
        // Cross-format validation is bypassed under AOT (STJ reflection path is incompatible).
        // The caller-side Serialize+Deserialize round trip still runs; only this compare is skipped.
        return true;
#else
        // Null handling first: equal only when both sides are null.
        if (a is null || b is null)
            return a is null && b is null;

        var left = JsonSerializer.Serialize(a, VerifyJsonOpts);
        var right = JsonSerializer.Serialize(b, VerifyJsonOpts);

        return string.Equals(left, right, StringComparison.Ordinal);
#endif
    }

    /// <summary>
    /// Startup guard for the MemoryPack comparison. Checks that every type in the required list
    /// (currently only <c>TestOrder</c>) carries <c>[MemoryPackable]</c>; if one does not, writes a
    /// fatal message to stderr and terminates the process with exit code 1.
    /// Without the attribute, MemoryPack falls back to runtime resolver (slower) — comparison would be INVALID.
    /// </summary>
    internal static void ValidateMemoryPackSetup()
    {
        Type[] requiredTypes = { typeof(TestOrder) };

        // First type lacking the attribute, if any. Environment.Exit ends the process, so only
        // the first offender is ever reported.
        var offender = Array.Find(
            requiredTypes,
            static t => t.GetCustomAttributes(typeof(MemoryPackableAttribute), inherit: true).Length == 0);

        if (offender is null) return;

        System.Console.Error.WriteLine($"❌ FATAL: {offender.FullName} is not [MemoryPackable] — MemoryPack would fall back to runtime resolver, comparison is INVALID for SGen-vs-SGen claim.");
        System.Console.Error.WriteLine("Add [MemoryPackable] to the type and any nested types referenced from it.");

        Environment.Exit(1);
    }

    /// <summary>
    /// Filters test data sets by layer keyword. Layered approach lets you run only what's needed for the iteration cadence.
    /// P1: only "Core" data exists (Small/Medium/Large/Repeated/Deep). Comprehensive and Edge layers will be expanded in P2.
    /// </summary>
    /// <remarks>
    /// Keywords are matched exactly (case-sensitive). <c>"all"</c> and any unrecognized keyword return
    /// every data set. The layer keywords (<c>core</c>/<c>comprehensive</c>/<c>edge</c>) match data-set
    /// names by case-sensitive ordinal prefix; the single-cell keywords
    /// (<c>small</c>/<c>medium</c>/<c>large</c>/<c>repeated</c>/<c>deep</c>) match by case-insensitive
    /// ordinal prefix.
    /// </remarks>
    /// <param name="all">The full list of available data sets; never modified.</param>
    /// <param name="layer">The layer or single-cell keyword.</param>
    /// <returns>A new list containing the matching data sets.</returns>
    internal static List<TestDataSet> FilterByLayer(List<TestDataSet> all, string layer)
    {
        if (layer == "all") return all.ToList();

        var coreNames = new[] { "Small", "Medium", "Large", "Repeated", "Deep" };
        // P2 will add: "Flat", "Polymorphic", "Collection", "Numeric", "NonAscii", etc.
        var comprehensiveExtras = new string[] { /* P2 */ };
        // P3 will add: "ColdStart", "VeryLarge", "PathologicalString", etc.
        var edgeExtras = new string[] { /* P3 */ };

        return layer switch
        {
            "core" => all.Where(t => StartsWithAny(t.Name, coreNames)).ToList(),
            "comprehensive" => all.Where(t => StartsWithAny(t.Name, coreNames) || StartsWithAny(t.Name, comprehensiveExtras)).ToList(),
            "edge" => all.Where(t => StartsWithAny(t.Name, coreNames) || StartsWithAny(t.Name, comprehensiveExtras) || StartsWithAny(t.Name, edgeExtras)).ToList(),
            // Single-cell A/B mini-suite filters — match by case-insensitive prefix on Name.
            // Use case: tight optimization-iteration loop on one specific cell (e.g. `dotnet run -- repeated`
            // or interactive menu shortcut), avoiding the full ~110 sec suite when only one cell is in scope.
            // The keyword itself is the prefix; the comparison ignores case, so "small" matches "Small…".
            "small" or "medium" or "large" or "repeated" or "deep"
                => all.Where(t => t.Name.StartsWith(layer, StringComparison.OrdinalIgnoreCase)).ToList(),
            _ => all.ToList()
        };

        // Ordinal (culture-independent) prefix test. The parameterless string.StartsWith(string)
        // overload is culture-sensitive, which is not wanted for identifier-like data-set names.
        static bool StartsWithAny(string name, string[] prefixes)
        {
            foreach (var prefix in prefixes)
            {
                if (name.StartsWith(prefix, StringComparison.Ordinal)) return true;
            }

            return false;
        }
    }
}