AyCode.Core/AyCode.Core.Serializers.Con.../BenchmarkLoop.cs

367 lines
18 KiB
C#

using AyCode.Core.Tests.TestModels;
using MemoryPack;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Text.Json;
namespace AyCode.Core.Serializers.Console;
/// <summary>
/// Benchmark execution helpers: timing (<see cref="RunTimed"/>), per-cell adaptive iteration
/// calibration (<see cref="CalibrateIterations"/>), allocation measurement
/// (<see cref="MeasureAllocation"/> + <see cref="MeasureAllocationTotal"/>), in-place
/// <c>\r</c>-progress reporting, full-GC phase-boundary helper (<see cref="ForceGcCollect"/>),
/// startup validation (<see cref="ValidateMemoryPackSetup"/>), and per-cell round-trip equality
/// (<see cref="DeepEqualsViaJson"/>). Pure benchmark-execution infrastructure — no display
/// formatting (that lives in <c>Output</c>) and no per-engine glue (which lives with the
/// individual <c>ISerializerBenchmark</c> implementations).
/// </summary>
internal static class BenchmarkLoop
{
/// <summary>
/// Full-GC helper for phase boundaries in the benchmark loop. Performs a forced, blocking generation-2
/// collection, waits for pending finalizers to run, then performs a second forced, blocking generation-2
/// collection so that anything released by those finalizers is reclaimed as well.
/// Intended to give the next warmup/measurement phase a heap that is not carrying the previous phase's
/// residual allocations (e.g. between the Ser-phase and Des-phase in <c>RunBenchmarksForTestData</c>).
/// </summary>
/// <remarks>
/// Marked <see cref="MethodImplOptions.NoInlining"/> so the call stays a distinct frame at its call sites.
/// </remarks>
[MethodImpl(MethodImplOptions.NoInlining)]
internal static void ForceGcCollect()
{
    CollectAllGenerations();
    // Let finalizers run between the two passes; the second pass picks up what they released.
    GC.WaitForPendingFinalizers();
    CollectAllGenerations();

    static void CollectAllGenerations() => GC.Collect(2, GCCollectionMode.Forced, blocking: true);
}
/// <summary>
/// Times <paramref name="action"/> in batches ("samples") of <paramref name="iterations"/> calls and
/// summarizes the per-sample wall-clock durations.
/// <para>
/// When <see cref="Configuration.BenchmarkSamples"/> &lt;= 1 a single sample is timed (no GC settle, no
/// pilot) and returned as median = min = max with a standard deviation of 0 (Debug / quick mode).
/// </para>
/// <para>
/// Otherwise:
/// 1) One pilot sample is run BEFORE the recorded loop and discarded, so that the first measurement after
///    warmup (which tends to absorb residual JIT/GC bookkeeping) does not widen the min/max range.
/// 2) GC.Collect / WaitForPendingFinalizers / GC.Collect runs before the pilot and before EVERY recorded
///    sample, outside the Stopwatch window, so garbage produced by sample N is not collected inside
///    sample N+1 and every sample starts from the same heap shape.
/// 3) <see cref="Configuration.BenchmarkSamples"/> samples are recorded and summarized.
/// </para>
/// <para>
/// When <paramref name="progressLabel"/> is non-null, in-place <c>\r</c> progress updates are written so a
/// stuck benchmark (e.g. a deadlocked NamedPipe row) is visibly stuck at a specific % rather than silently
/// hanging. The pilot is displayed as sample 1 of (samples + 1). Note that these progress writes happen
/// inside the Stopwatch window of each sample.
/// </para>
/// </summary>
/// <param name="action">The operation to time; invoked <paramref name="iterations"/> times per sample.</param>
/// <param name="iterations">Number of calls to <paramref name="action"/> per sample.</param>
/// <param name="progressLabel">Label for progress output, or <c>null</c> to run silently.</param>
/// <returns>
/// Median, minimum, maximum and population standard deviation of the recorded per-sample durations,
/// all in milliseconds.
/// </returns>
internal static (double medianMs, double minMs, double maxMs, double stdDevMs) RunTimed(Action action, int iterations, string? progressLabel = null)
{
    var samples = Configuration.BenchmarkSamples;
    if (samples <= 1)
    {
        // Single-sample fast path (Debug or trivial run) — no array, no sort, no stddev.
        var sw = Stopwatch.StartNew();
        RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
        sw.Stop();
        var ms = sw.Elapsed.TotalMilliseconds;
        EndProgress(progressLabel, ms);
        return (ms, ms, ms, 0);
    }

    // Pilot sample (discarded, therefore not timed). Shown as sample index 0 of (samples + 1) so the
    // user sees an extra "warmup-ish" tick before the recorded samples start.
    GC.Collect();
    GC.WaitForPendingFinalizers();
    GC.Collect();
    RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: 0);

    var times = new double[samples];
    for (var s = 0; s < samples; s++)
    {
        // Per-sample GC settle. The cost is paid OUTSIDE the Stopwatch window.
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();

        var sw = Stopwatch.StartNew();
        RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: s + 1);
        sw.Stop();
        times[s] = sw.Elapsed.TotalMilliseconds;
    }

    // Pass 1: min / max / mean.
    var minMs = double.MaxValue;
    var maxMs = double.MinValue;
    var sum = 0.0;
    foreach (var t in times)
    {
        sum += t;
        if (t < minMs) minMs = t;
        if (t > maxMs) maxMs = t;
    }
    var mean = sum / times.Length;

    // Pass 2: population variance as the mean of squared deviations. This form cannot go negative and
    // does not suffer the cancellation of E[X²] - E[X]² when the samples are nearly identical.
    // Population (not sample) stddev: the captured samples are treated as the whole population.
    var sumSquaredDeviation = 0.0;
    foreach (var t in times)
    {
        var deviation = t - mean;
        sumSquaredDeviation += deviation * deviation;
    }
    var stdDevMs = Math.Sqrt(sumSquaredDeviation / times.Length);

    // Median: middle value for odd sample counts, average of the two middles for even counts.
    Array.Sort(times);
    var mid = samples / 2;
    var medianMs = samples % 2 == 1 ? times[mid] : (times[mid - 1] + times[mid]) / 2.0;

    EndProgress(progressLabel, medianMs);
    return (medianMs, minMs, maxMs, stdDevMs);
}
/// <summary>
/// Per-cell adaptive iteration calibration. Times 100 calls of <paramref name="action"/> (after a
/// GC settle) and extrapolates how many iterations are needed to fill <paramref name="targetMs"/>
/// milliseconds of wall-clock time per sample (callers typically pass
/// <see cref="Configuration.TargetSampleMs"/>).
/// The result is rounded UP to the nearest 1000, floored at 1000 and capped at 200_000 (sanity bound
/// for pathologically fast ops). Calibration runs OUTSIDE the timed sample loop; its sole purpose is
/// to measure per-op cost.
/// </summary>
/// <param name="action">The operation whose per-call cost is being estimated.</param>
/// <param name="targetMs">Desired wall-clock duration of one sample, in milliseconds.</param>
/// <returns>
/// <see cref="Configuration.TestIterations"/> unchanged when
/// <see cref="Configuration.BenchmarkSamples"/> &lt;= 1 (Debug single-sample mode, where calibration
/// overhead is unjustified); otherwise a multiple of 1000 in the range [1000, 200_000].
/// </returns>
internal static int CalibrateIterations(Action action, int targetMs)
{
    if (Configuration.BenchmarkSamples <= 1) return Configuration.TestIterations; // Debug fast path

    const int calibIter = 100;
    const int minIterations = 1000;
    const int maxIterations = 200_000;

    GC.Collect();
    GC.WaitForPendingFinalizers();
    GC.Collect();

    var sw = Stopwatch.StartNew();
    for (var i = 0; i < calibIter; i++) action();
    sw.Stop();
    var ms = sw.Elapsed.TotalMilliseconds;

    // Pathologically-fast op at/below Stopwatch resolution — cap at ceiling (further calibration won't help).
    if (ms <= 0.0001) return maxIterations;

    // Clamp while still in double: for a very fast op combined with a large target the raw estimate
    // can exceed int.MaxValue, and narrowing it first would turn the overflow into the 1000 floor
    // instead of the intended 200_000 cap.
    var raw = Math.Ceiling(targetMs * (calibIter / ms));
    if (raw >= maxIterations) return maxIterations;
    if (raw <= minIterations) return minIterations;

    // Round UP to nearest 1000 — keeps numbers human-readable in the markdown output.
    // raw is strictly between 1000 and 200_000 here, so the int arithmetic cannot overflow.
    return (((int)raw + 999) / 1000) * 1000;
}
/// <summary>
/// Measures the average number of bytes allocated on the CURRENT thread per call of
/// <paramref name="action"/>, after a Collect / WaitForPendingFinalizers / Collect settle.
/// Single dedicated sample (no median) — kept separate from the timing samples so those stay pure.
/// </summary>
/// <remarks>
/// When <paramref name="progressLabel"/> is non-null, the progress lines are formatted on this thread
/// inside the measured window, so their string allocations are included in the total before averaging.
/// </remarks>
/// <param name="action">The operation to measure.</param>
/// <param name="iterations">Number of calls to average over; must be positive.</param>
/// <param name="progressLabel">Label for progress output, or <c>null</c> to run silently.</param>
/// <returns>Allocated bytes per call (integer division of the total by <paramref name="iterations"/>).</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="iterations"/> is zero or negative.</exception>
internal static long MeasureAllocation(Action action, int iterations, string? progressLabel = null)
{
    // Fail fast: a zero count would otherwise run the whole measurement and then die with a
    // DivideByZeroException; a negative count would return a meaningless quotient.
    if (iterations <= 0)
        throw new ArgumentOutOfRangeException(nameof(iterations), iterations, "Iteration count must be positive to compute a per-call average.");

    GC.Collect();
    GC.WaitForPendingFinalizers();
    GC.Collect();

    // The Stopwatch is created before the first counter read so its own allocation is not measured.
    var sw = Stopwatch.StartNew();
    var before = GC.GetAllocatedBytesForCurrentThread();
    RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
    var after = GC.GetAllocatedBytesForCurrentThread();
    sw.Stop();

    EndProgress(progressLabel, sw.Elapsed.TotalMilliseconds);
    return (after - before) / iterations;
}
/// <summary>
/// Process-wide allocation measurement — needed for round-trip-only benchmarks (NamedPipe etc.) where
/// the work happens across multiple threads. <see cref="GC.GetAllocatedBytesForCurrentThread"/> would
/// only count the caller-thread allocations; <see cref="GC.GetTotalAllocatedBytes"/> (read here with
/// <c>precise: true</c>) covers the entire process.
/// Noisier than the per-thread variant because allocations from unrelated threads are counted too,
/// so the result is only meaningful when averaged over many iterations.
/// </summary>
/// <remarks>
/// When <paramref name="progressLabel"/> is non-null, the progress lines are formatted inside the
/// measured window, so their string allocations are included in the total before averaging.
/// </remarks>
/// <param name="action">The operation to measure.</param>
/// <param name="iterations">Number of calls to average over; must be positive.</param>
/// <param name="progressLabel">Label for progress output, or <c>null</c> to run silently.</param>
/// <returns>Allocated bytes per call (integer division of the total by <paramref name="iterations"/>).</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="iterations"/> is zero or negative.</exception>
internal static long MeasureAllocationTotal(Action action, int iterations, string? progressLabel = null)
{
    // Fail fast: a zero count would otherwise run the whole measurement and then die with a
    // DivideByZeroException; a negative count would return a meaningless quotient.
    if (iterations <= 0)
        throw new ArgumentOutOfRangeException(nameof(iterations), iterations, "Iteration count must be positive to compute a per-call average.");

    GC.Collect();
    GC.WaitForPendingFinalizers();
    GC.Collect();

    // The Stopwatch is created before the first counter read so its own allocation is not measured.
    var sw = Stopwatch.StartNew();
    var before = GC.GetTotalAllocatedBytes(precise: true);
    RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
    var after = GC.GetTotalAllocatedBytes(precise: true);
    sw.Stop();

    EndProgress(progressLabel, sw.Elapsed.TotalMilliseconds);
    return (after - before) / iterations;
}
// ============================================================================================
// Progress reporting — \r-driven in-place updates so a stuck benchmark surfaces the exact phase
// and % where it stopped, instead of appearing as a silent hang. Used by RunTimed and the
// MeasureAllocation* helpers when the caller passes a non-null progressLabel.
// ============================================================================================
// Length of the most recently written progress line in the current progress session. The next
// progress/done write pads with spaces up to this length, so leftover characters from a longer
// previous line are overwritten (avoids "ghost" trailing chars after \r). EndProgress resets it
// to 0. Plain static field with no synchronization.
private static int _progressLastLineLen;
/// <summary>
/// Invokes <paramref name="action"/> <paramref name="iterations"/> times. With a non-null
/// <paramref name="label"/> it rewrites a single console row (via <c>\r</c>) roughly every 10% of the
/// run and once more after the final iteration; with a null label it is a bare loop with no output
/// (the null check is done once, up front).
/// </summary>
/// <param name="action">The operation to invoke.</param>
/// <param name="iterations">How many times to invoke it.</param>
/// <param name="label">Text shown in the progress row, or <c>null</c> for silent execution.</param>
/// <param name="samples">Total sample count shown in the row; the "sample x/y" part is only printed when this is &gt; 1.</param>
/// <param name="sampleIndex">Zero-based index of the current sample (displayed one-based).</param>
private static void RunWithProgress(Action action, int iterations, string? label, int samples, int sampleIndex)
{
    if (label is null)
    {
        for (var n = 0; n < iterations; n++) action();
        return;
    }

    // Roughly ten emits per run: Console.Write is expensive enough to skew sub-µs benchmarks
    // if it were done on every iteration.
    var interval = Math.Max(1, iterations / 10);
    var lastIndex = iterations - 1;
    var showSample = samples > 1;

    for (var index = 0; index < iterations; index++)
    {
        action();

        var completed = index + 1;
        var dueForUpdate = completed % interval == 0 || index == lastIndex;
        if (!dueForUpdate) continue;

        var pct = (int)(completed * 100L / iterations);
        string line;
        if (showSample)
            line = $" > {label} sample {sampleIndex + 1}/{samples} {pct,3}% ({completed}/{iterations})";
        else
            line = $" > {label} {pct,3}% ({completed}/{iterations})";

        // Return to column 0, then write the row padded out to the previous row's length so a
        // shorter row fully overwrites a longer predecessor.
        System.Console.Write('\r');
        System.Console.Write(line.PadRight(_progressLastLineLen));
        _progressLastLineLen = line.Length;
    }
}
/// <summary>
/// Finishes a progress row: overwrites it in place with a final "done in … ms" message (padded so no
/// characters of a longer previous row remain), ends the line so later <c>WriteLine</c> calls render
/// below it, and resets the tracked row length. Does nothing when <paramref name="label"/> is null.
/// </summary>
/// <param name="label">The label used for the progress row, or <c>null</c> if progress was disabled.</param>
/// <param name="elapsedMs">Elapsed time to report, in milliseconds.</param>
private static void EndProgress(string? label, double elapsedMs)
{
    if (label is not null)
    {
        var summary = $" > {label} done in {elapsedMs,7:F1} ms";
        System.Console.Write('\r');
        // PadRight leaves the text untouched when it is already at least as long as the previous row.
        System.Console.WriteLine(summary.PadRight(_progressLastLineLen));
        _progressLastLineLen = 0;
    }
}
#if !AYCODE_NATIVEAOT
// Serializer settings used only by DeepEqualsViaJson: compact output, null-valued members omitted,
// and object cycles ignored instead of throwing.
private static readonly JsonSerializerOptions VerifyJsonOpts = new()
{
    ReferenceHandler = System.Text.Json.Serialization.ReferenceHandler.IgnoreCycles,
    DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull,
    WriteIndented = false
};
#endif
/// <summary>
/// Round-trip equality check: both objects are serialized with System.Text.Json using the same options
/// and the resulting strings are compared. Slower than a property-by-property comparison, but it works
/// for any object graph without a custom comparer. Two nulls are equal; a null never equals a non-null.
/// </summary>
/// <remarks>
/// AOT publish skip: <c>System.Text.Json</c>'s reflection path uses runtime closed-generic instantiation
/// (<c>JsonPropertyInfo&lt;TestStatus&gt;</c> et al.) that the trimmer drops, causing
/// <c>NotSupportedException: missing native code or metadata</c>. When <c>AYCODE_NATIVEAOT</c> is defined
/// this method therefore returns <c>true</c> unconditionally, so <c>VerifyRoundTrip()</c> calls pass
/// without running the cross-format validation.
/// </remarks>
/// <param name="a">First object, or <c>null</c>.</param>
/// <param name="b">Second object, or <c>null</c>.</param>
/// <returns><c>true</c> when both serialize to the same JSON text (or both are null); otherwise <c>false</c>.</returns>
internal static bool DeepEqualsViaJson(object? a, object? b)
{
#if AYCODE_NATIVEAOT
    // The STJ reflection path is incompatible with AOT, so the JSON-canonical compare is bypassed.
    // The caller-side Serialize+Deserialize round trip itself still runs.
    return true;
#else
    if (a is null || b is null)
        return a is null && b is null;

    return string.Equals(
        JsonSerializer.Serialize(a, VerifyJsonOpts),
        JsonSerializer.Serialize(b, VerifyJsonOpts));
#endif
}
/// <summary>
/// Startup sanity check for the MemoryPack comparison: verifies that <c>TestOrder</c> carries
/// <c>[MemoryPackable]</c>. If the attribute is missing, two diagnostic lines are written to stderr and
/// the process exits with code 1, because MemoryPack would otherwise fall back to its runtime resolver
/// (slower) and the comparison would be INVALID.
/// </summary>
/// <remarks>
/// Only the types listed in this method are inspected; types referenced from them are not walked.
/// </remarks>
internal static void ValidateMemoryPackSetup()
{
    Type[] requiredTypes = { typeof(TestOrder) };
    foreach (var candidate in requiredTypes)
    {
        // IsDefined answers "is the attribute present?" without materializing attribute instances.
        if (candidate.IsDefined(typeof(MemoryPackableAttribute), inherit: true))
            continue;

        var stderr = System.Console.Error;
        stderr.WriteLine($"❌ FATAL: {candidate.FullName} is not [MemoryPackable] — MemoryPack would fall back to runtime resolver, comparison is INVALID for SGen-vs-SGen claim.");
        stderr.WriteLine("Add [MemoryPackable] to the type and any nested types referenced from it.");
        Environment.Exit(1);
    }
}
/// <summary>
/// Filters test data sets by layer keyword, so only the data needed for the current iteration cadence runs.
/// P1: only "Core" data exists (Small/Medium/Large/Repeated/Deep). Comprehensive and Edge layers will be
/// expanded in P2/P3; until then they select the same sets as "core".
/// </summary>
/// <remarks>
/// <paramref name="layer"/> is matched exactly (case-sensitive). The layer filters ("core",
/// "comprehensive", "edge") match data-set names by case-sensitive ordinal prefix; the single-cell
/// shortcuts ("small", "medium", "large", "repeated", "deep") match by case-insensitive ordinal prefix.
/// "all" and any unrecognized keyword select everything.
/// </remarks>
/// <param name="all">The complete list of data sets; never modified.</param>
/// <param name="layer">Layer keyword or single-cell shortcut.</param>
/// <returns>A new list containing the selected data sets in their original order.</returns>
internal static List<TestDataSet> FilterByLayer(List<TestDataSet> all, string layer)
{
    var coreNames = new[] { "Small", "Medium", "Large", "Repeated", "Deep" };
    // P2 will add: "Flat", "Polymorphic", "Collection", "Numeric", "NonAscii", etc.
    var comprehensiveExtras = new string[] { /* P2 */ };
    // P3 will add: "ColdStart", "VeryLarge", "PathologicalString", etc.
    var edgeExtras = new string[] { /* P3 */ };

    return layer switch
    {
        "core" => all.Where(t => StartsWithAny(t.Name, coreNames)).ToList(),
        "comprehensive" => all.Where(t => StartsWithAny(t.Name, coreNames) || StartsWithAny(t.Name, comprehensiveExtras)).ToList(),
        "edge" => all.Where(t => StartsWithAny(t.Name, coreNames) || StartsWithAny(t.Name, comprehensiveExtras) || StartsWithAny(t.Name, edgeExtras)).ToList(),
        // Single-cell A/B mini-suite filters — match by case-insensitive prefix on Name.
        // Use case: tight optimization-iteration loop on one specific cell (e.g. `dotnet run -- repeated`
        // or interactive menu shortcut), avoiding the full suite when only one cell is in scope.
        "small" => SingleCell("Small"),
        "medium" => SingleCell("Medium"),
        "large" => SingleCell("Large"),
        "repeated" => SingleCell("Repeated"),
        "deep" => SingleCell("Deep"),
        // "all" and every unrecognized keyword: return a copy of the full list.
        _ => all.ToList()
    };

    List<TestDataSet> SingleCell(string prefix) =>
        all.Where(t => t.Name.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)).ToList();

    // Explicit ordinal comparison: these are identifiers, so matching must not depend on the current
    // culture (the bare string.StartsWith(string) overload is culture-sensitive).
    static bool StartsWithAny(string name, string[] prefixes) =>
        prefixes.Any(prefix => name.StartsWith(prefix, StringComparison.Ordinal));
}
}