AyCode.Core/AyCode.Core.Serializers.Con.../BenchmarkLoop.cs

889 lines
55 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using AyCode.Core.Benchmarks.Reporting;
using AyCode.Core.Benchmarks.Workloads.Scenarios;
using AyCode.Core.Serializers.Binaries;
using AyCode.Core.Tests.TestModels;
using MemoryPack;
using System.Diagnostics;
using System.Runtime.CompilerServices;
namespace AyCode.Core.Serializers.Console;
/// <summary>
/// Benchmark execution: end-to-end orchestration (<see cref="RunBenchmark"/>), per-cell loop
/// (<see cref="RunBenchmarksForTestData"/>), serializer factory (<see cref="CreateSerializers"/>),
/// and the timing / calibration / allocation helpers. Pure benchmark-execution infrastructure —
/// no display formatting (that lives in <c>Output</c>) and no UX-flow (that lives in <c>Program</c>
/// + <c>Menu</c>).
/// </summary>
internal static class BenchmarkLoop
{
/// <summary>
/// Runs the benchmark suite end-to-end for the given configuration: optional process stabilization →
/// global JIT pre-warmup → per-cell warmup + measurement → grouped results print → save to disk.
/// Used by both the CLI and interactive menu paths; the interactive loop calls this repeatedly without
/// restarting the process, so every process-wide change made here is undone before returning.
/// </summary>
/// <param name="layer">Layer filter applied to the built test cells via <c>FilterByLayer</c>.</param>
/// <param name="opMode">Phase selection forwarded to <see cref="RunBenchmarksForTestData"/>.</param>
/// <param name="serializerMode">Serializer-set selection forwarded to <see cref="CreateSerializers"/>.</param>
internal static void RunBenchmark(BenchmarkLayer layer, BenchmarkOpMode opMode, SerializerSelectionMode serializerMode)
{
    System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗");
    System.Console.WriteLine("║ COMPREHENSIVE SERIALIZER BENCHMARK SUITE ║");
    System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");

    // Stabilization: pin the entire benchmark process to a single logical CPU and bump the priority
    // class. Single-core affinity stops the OS from migrating the bench thread between cores
    // mid-sample; High priority reduces preemption by background tasks.
    // Skipped in single-sample mode (Configuration.BenchmarkSamples <= 1) where stabilization is moot.
    //
    // The two steps are tracked SEPARATELY: the affinity change can succeed while the priority change
    // fails (e.g. a Linux container without CAP_SYS_NICE). A single "both applied" flag would leave
    // the process pinned to CPU 0 for the rest of its life in that case, because the restore step
    // would never run.
    using var process = Process.GetCurrentProcess();
    var origAffinity = (IntPtr)0;
    var origPriority = ProcessPriorityClass.Normal;
    var affinityApplied = false;
    var priorityApplied = false;

    // ProcessorAffinity is only supported on Windows + Linux (CA1416). On any other platform
    // (e.g. macOS) the whole stabilization step is skipped — neither affinity nor priority is touched.
    if (Configuration.BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
    {
        try
        {
            origAffinity = process.ProcessorAffinity;
            origPriority = process.PriorityClass;

            // Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one
            // core, consistently". Tweak the mask here if CPU 0 is heavily contended on the host.
            process.ProcessorAffinity = (IntPtr)1;
            affinityApplied = true;

            process.PriorityClass = ProcessPriorityClass.High;
            priorityApplied = true;

            System.Console.WriteLine("Stabilization: pinned to CPU 0 (affinity=0x1), priority=High.");
        }
        catch (Exception ex)
        {
            // Affinity/priority changes may fail on locked-down hosts (group policies, restricted
            // containers, etc.). Surface and continue — the benchmark still works. Report a partial
            // outcome honestly so the reader knows the run was pinned but not priority-boosted.
            var outcome = affinityApplied
                ? "PARTIAL (CPU 0 affinity applied, priority unchanged)"
                : "SKIPPED";
            System.Console.WriteLine($"Stabilization {outcome}: {ex.GetType().Name}: {ex.Message}");
        }
    }

    try
    {
        var allResults = new List<BenchmarkResult>();
        var allTestDataSets = BuildMultiVariantTestDataSets();
        var testDataSets = FilterByLayer(allTestDataSets, layer);

        System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {Configuration.GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{Configuration.TargetSampleMs} ms target) | Warmup: {Configuration.WarmupIterations} per phase (Ser/Des isolated) | Samples: {Configuration.BenchmarkSamples} (median) + pilot discard");
        System.Console.WriteLine($"Build: {Configuration.BuildConfiguration} | .NET: {Environment.Version} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
        System.Console.WriteLine();

        // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing
        // happens, so the first measured cell does not carry tier-promotion latency that later cells
        // avoid. The per-cell warmup in RunBenchmarksForTestData still runs afterwards.
        if (Configuration.BenchmarkSamples > 1) // skip in single-sample fast-iteration mode
        {
            System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");
            foreach (var testData in testDataSets)
            {
                var preSerializers = CreateSerializers(testData, serializerMode);
                try
                {
                    foreach (var s in preSerializers)
                    {
                        // Light, phase-isolated warmup: Ser path first, then Des path.
                        s.WarmupSerialize(2000);
                        s.WarmupDeserialize(2000);
                    }
                }
                finally
                {
                    // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
                    foreach (var s in preSerializers) (s as IDisposable)?.Dispose();
                }
            }

            // Let background tiered-JIT compilation drain before we begin measuring.
            if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
            System.Console.WriteLine("✓ Global pre-warmup complete.\n");
        }

        // 70-character rule printed above and below each cell title.
        var rule = new string('═', 70);
        foreach (var testData in testDataSets)
        {
            System.Console.WriteLine($"\n{rule}");
            System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
            System.Console.WriteLine(rule);

            var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
            allResults.AddRange(results);
        }

        // Build the reporting context (results directory + snapshot of the run configuration).
        var ctx = new ReportingContext(
            SourceTag: "Console",
            ResultsDirectory: ReportingContext.ResolveResultsDirectory(),
            BuildConfiguration: Configuration.BuildConfiguration,
            Utf8NoBom: Configuration.Utf8NoBom,
            CharsetName: Configuration.GetCurrentCharsetName(),
            WarmupIterations: Configuration.WarmupIterations,
            BenchmarkSamples: Configuration.BenchmarkSamples,
            TargetSampleMs: Configuration.TargetSampleMs,
            UnstableCVThreshold: Configuration.UnstableCVThreshold,
            MicroOptCVThreshold: Configuration.MicroOptCVThreshold);

        // Print grouped results, then persist them.
        BenchmarkReportWriter.PrintGroupedResults(allResults, testDataSets);
        BenchmarkReportWriter.SaveAll(ctx, allResults, testDataSets);
        System.Console.WriteLine("\n✓ Benchmark complete!");
    }
    finally
    {
        // Restore process state — affinity/priority changes are process-wide and would otherwise
        // persist across interactive-mode iterations of the menu. Each step is undone only if it
        // was actually applied, and each restore is best-effort so one failure cannot block the other.
        if (OperatingSystem.IsWindows() || OperatingSystem.IsLinux())
        {
            if (affinityApplied)
            {
                try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
            }

            if (priorityApplied)
            {
                try { process.PriorityClass = origPriority; } catch { /* best-effort */ }
            }
        }
    }
}
/// <summary>
/// Runs the full per-cell cycle for one test data set: builds the serializers, verifies round-trip
/// correctness for each, then measures each serializer phase by phase (warmup → calibrate → time →
/// allocation) and prints the row as it completes.
/// </summary>
/// <param name="testData">The test cell to benchmark.</param>
/// <param name="mode">Selects which phases (Serialize / Deserialize / All) are measured.</param>
/// <param name="serializerMode">Serializer-set selection forwarded to <see cref="CreateSerializers"/>.</param>
/// <returns>One <see cref="BenchmarkResult"/> per serializer, in the order produced by <see cref="CreateSerializers"/>.</returns>
private static List<BenchmarkResult> RunBenchmarksForTestData(TestDataSet testData, BenchmarkOpMode mode, SerializerSelectionMode serializerMode)
{
    var results = new List<BenchmarkResult>();
    var serializers = CreateSerializers(testData, serializerMode);

    // Everything that uses the serializers runs inside try/finally so that IDisposable serializers
    // (NamedPipe / FileStream variants own OS resources) are released even when a warmup, calibration
    // or measurement step throws. The interactive menu re-enters the benchmark in the same process,
    // so a leaked pipe or handle would otherwise survive into the next run.
    try
    {
        // Round-trip correctness check — once per (cell × serializer), BEFORE warmup. A failure
        // aborts the entire process: Environment.Exit does not return.
        System.Console.WriteLine("Verifying round-trip correctness...");
        foreach (var serializer in serializers)
        {
            if (!serializer.VerifyRoundTrip())
            {
                System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}");
                System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting.");
                Environment.Exit(1);
            }
        }
        System.Console.WriteLine("✓ All serializers passed round-trip verification.");

        // Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: each serializer's Ser path
        // and Des path get isolated warmup→measure rounds, with a forced GC at every phase boundary.
        //
        // Why phase isolation: warming Ser and Des together would leave caches and branch history in a
        // mixed state, so whichever phase is measured first pays for displacing the other's leftovers.
        //
        // Why GC at every boundary: it clears residual allocation pressure from the previous phase so
        // GC activity during a measurement is driven only by that phase's own allocations.
        //
        // Configuration.JitSleep after each warmup gives background tiered-JIT promotion time to settle
        // before timing starts (the sleep is skipped when the configured value is 0).
        System.Console.WriteLine($"Running benchmarks (target ~{Configuration.TargetSampleMs} ms/sample × {Configuration.BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n");
        foreach (var serializer in serializers)
        {
            var result = new BenchmarkResult
            {
                TestDataName = testData.DisplayName, // DisplayName carries the extra cell info shown in reports
                Engine = serializer.Engine,
                IoMode = serializer.IoMode,
                DispatchMode = serializer.DispatchMode,
                OptionsPreset = serializer.OptionsPreset,
                OrderTypeName = serializer.OrderTypeName,
                OptionsDescription = serializer.OptionsDescription,
                SerializedSize = serializer.SerializedSize,
                SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes,
                SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes,
                IsRoundTripOnly = serializer.IsRoundTripOnly
            };

            // Label passed to the timing/allocation helpers. It identifies the serializer row so a
            // stuck benchmark is visibly stuck on a specific row rather than silently hanging.
            var groupLabel = $"{result.SerializerName}";

            if (serializer.IsRoundTripOnly)
            {
                // Round-trip-only benchmarks (NamedPipe etc.): single phase — Serialize() performs the
                // full round trip. The Ser-phase entry point (WarmupSerialize) warms the whole path and
                // the numbers are recorded into the RT result columns.
                if (mode is BenchmarkOpMode.All or BenchmarkOpMode.Serialize)
                {
                    ForceGcCollect();
                    serializer.WarmupSerialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                    var rtIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
                    var (rtMed, rtMin, rtMax, rtStd) = RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");
                    result.RoundTripTimeMs = rtMed;
                    result.RoundTripTimeMinMs = rtMin;
                    result.RoundTripTimeMaxMs = rtMax;
                    result.RoundTripTimeStdDevMs = rtStd;
                    result.RoundTripIterations = rtIter;

                    // Process-wide allocation measurement so allocations made on the server-drain
                    // thread are counted too, not just those on the calling thread.
                    result.RoundTripAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]", processWide: true);
                }
                // mode == BenchmarkOpMode.Deserialize alone is meaningless for a round-trip-only benchmark; skip silently.
            }
            else
            {
                // ── Ser phase ── GC → isolated warmup → JitSleep → calibrate → time → alloc.
                if (mode is BenchmarkOpMode.All or BenchmarkOpMode.Serialize)
                {
                    ForceGcCollect();
                    serializer.WarmupSerialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                    var serIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
                    var (serMed, serMin, serMax, serStd) = RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");
                    result.SerializeTimeMs = serMed;
                    result.SerializeTimeMinMs = serMin;
                    result.SerializeTimeMaxMs = serMax;
                    result.SerializeTimeStdDevMs = serStd;
                    result.SerializeIterations = serIter;

                    // Dedicated alloc-only sample (separate from timing samples; keeps timing pure).
                    result.SerializeAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
                }

                // ── Des phase ── GC → isolated warmup → JitSleep → calibrate → time → alloc.
                // The GC here discards the Ser phase's leftovers so the Des allocation measurement
                // reflects only Des-side allocations.
                if (mode is BenchmarkOpMode.All or BenchmarkOpMode.Deserialize)
                {
                    ForceGcCollect();
                    serializer.WarmupDeserialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                    var desIter = CalibrateIterations(() => serializer.Deserialize(), Configuration.TargetSampleMs);
                    var (desMed, desMin, desMax, desStd) = RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");
                    result.DeserializeTimeMs = desMed;
                    result.DeserializeTimeMinMs = desMin;
                    result.DeserializeTimeMaxMs = desMax;
                    result.DeserializeTimeStdDevMs = desStd;
                    result.DeserializeIterations = desIter;
                    result.DeserializeAllocBytesPerOp = MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]");
                }

                // Compose RT from Ser+Des. Ser and Des may have DIFFERENT iteration counts after
                // calibration, so adding batch times would be misleading. Instead: compute per-op µs
                // (iteration-independent), then synthesize RoundTripTimeMs against
                // RoundTripIterations = max(serIter, desIter) so that
                // RoundTripTimeMs / RoundTripIterations * 1000 equals serPerOp + desPerOp.
                var serPerOp = BenchmarkReportWriter.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations);
                var desPerOp = BenchmarkReportWriter.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations);
                var rtPerOp = serPerOp + desPerOp;
                result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations);
                result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
                result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
            }

            results.Add(result);
            BenchmarkReportWriter.PrintResult(result);
        }
    }
    finally
    {
        // Release OS resources before the next test cell builds new serializers — otherwise
        // pipes / handles leak across test cells (and, on failure, across interactive runs).
        foreach (var s in serializers) (s as IDisposable)?.Dispose();
    }

    return results;
}
/// <summary>
/// Multi-variant test-data builder. Builds every cell in both the _All_False and _All_True
/// families, then registers each _All_True order as a variant on the matching _All_False data set,
/// so that <see cref="CreateSerializers"/> can fetch either variant from the same cell.
/// </summary>
/// <remarks>
/// The returned list is the _All_False family (the cell primaries). MemoryPack / MessagePack rows in
/// <see cref="CreateSerializers"/> consume that _All_False primary; the AcBinary rows pick their
/// variant per options preset (see <see cref="UsesAllFalseVariant"/>).
/// Approximate memory cost noted by the original author: ~600 KB across 5 cells, with the Large cell
/// dominating (~340 KB for both variants).
/// </remarks>
/// <returns>The _All_False data sets, each with its _All_True counterpart registered as a variant.</returns>
/// <exception cref="InvalidOperationException">
/// The two providers returned a different number of cells, so pairing them by position would be unsafe.
/// </exception>
private static List<TestDataSet> BuildMultiVariantTestDataSets()
{
    var allFalse = BenchmarkTestDataProvider_All_False.CreateTestDataSets();
    var allTrue = BenchmarkTestDataProvider.CreateTestDataSets();

    // The pairing below is purely positional, so it is only valid when both providers emit the same
    // cells in the same order. Fail fast with a clear message if the counts ever diverge, instead of
    // an opaque out-of-range error (fewer _All_True cells) or a silently truncated pairing (more).
    if (allTrue.Count != allFalse.Count)
    {
        throw new InvalidOperationException(
            $"Test data providers are out of sync: _All_False produced {allFalse.Count} cell(s) but _All_True produced {allTrue.Count}. Both must emit the same cells in the same order.");
    }

    // Zip by ordinal and cross-register the _All_True order on the _All_False primary.
    for (var i = 0; i < allFalse.Count; i++)
    {
        var falseDs = (TestDataSet<TestOrder_All_False>)allFalse[i];
        var trueDs = (TestDataSet<TestOrder_All_True>)allTrue[i];
        falseDs.RegisterVariant(trueDs.Order);
    }

    return allFalse;
}
/// <summary>
/// Variant dispatch rule for AcBinary: returns <c>true</c> (use <c>TestOrder_All_False</c>) only when
/// every feature flag checked here is off — no string interning, no reference handling, no metadata
/// and no property filter. As soon as one of them is enabled the benchmark is fed
/// <c>TestOrder_All_True</c> instead.
/// </summary>
/// <remarks>
/// Only the four options tested below take part in the decision. Other options (wire mode,
/// generated-code mode, etc.) are not inspected here.
/// </remarks>
/// <param name="options">The AcBinary options preset to classify.</param>
/// <returns><c>true</c> when all four inspected features are disabled; otherwise <c>false</c>.</returns>
private static bool UsesAllFalseVariant(AcBinarySerializerOptions options)
{
    // Checks run in the same order as a short-circuiting && chain: the first enabled feature decides.
    if (options.UseStringInterning != StringInterningMode.None)
    {
        return false;
    }

    if (options.ReferenceHandling != ReferenceHandlingMode.None)
    {
        return false;
    }

    if (options.UseMetadata)
    {
        return false;
    }

    return options.PropertyFilter == null;
}
// Per-class factory helpers — each returns ISerializerBenchmark closed over the variant T
// selected by UsesAllFalseVariant(options). Compile-time T at the new T() call site preserves
// SGen apples-to-apples (no runtime reflection, no type erasure across the JIT boundary).
/// <summary>
/// Creates an <c>AcBinaryBenchmark&lt;T&gt;</c> for the cell, with <c>T</c> chosen by
/// <see cref="UsesAllFalseVariant"/>: <c>TestOrder_All_False</c> when it returns <c>true</c>,
/// otherwise <c>TestOrder_All_True</c>.
/// </summary>
/// <param name="td">Test cell providing the order for the chosen variant.</param>
/// <param name="opt">AcBinary options passed to the benchmark and used to pick the variant.</param>
/// <param name="preset">Preset label passed through to the benchmark.</param>
private static ISerializerBenchmark MakeAcBinary(TestDataSet td, AcBinarySerializerOptions opt, string preset)
{
    if (UsesAllFalseVariant(opt))
    {
        return new AcBinaryBenchmark<TestOrder_All_False>(td.GetOrder<TestOrder_All_False>(), opt, preset);
    }

    return new AcBinaryBenchmark<TestOrder_All_True>(td.GetOrder<TestOrder_All_True>(), opt, preset);
}
/// <summary>
/// Creates an <c>AcBinaryBufferWriterBenchmark&lt;T&gt;</c> for the cell, with <c>T</c> chosen by
/// <see cref="UsesAllFalseVariant"/>: <c>TestOrder_All_False</c> when it returns <c>true</c>,
/// otherwise <c>TestOrder_All_True</c>.
/// </summary>
/// <param name="td">Test cell providing the order for the chosen variant.</param>
/// <param name="opt">AcBinary options passed to the benchmark and used to pick the variant.</param>
/// <param name="preset">Preset label passed through to the benchmark.</param>
private static ISerializerBenchmark MakeAcBinaryBufferWriter(TestDataSet td, AcBinarySerializerOptions opt, string preset)
{
    if (UsesAllFalseVariant(opt))
    {
        return new AcBinaryBufferWriterBenchmark<TestOrder_All_False>(td.GetOrder<TestOrder_All_False>(), opt, preset);
    }

    return new AcBinaryBufferWriterBenchmark<TestOrder_All_True>(td.GetOrder<TestOrder_All_True>(), opt, preset);
}
/// <summary>
/// Creates an <c>AcBinaryFreshBufferWriterBenchmark&lt;T&gt;</c> for the cell, with <c>T</c> chosen by
/// <see cref="UsesAllFalseVariant"/>: <c>TestOrder_All_False</c> when it returns <c>true</c>,
/// otherwise <c>TestOrder_All_True</c>.
/// </summary>
/// <param name="td">Test cell providing the order for the chosen variant.</param>
/// <param name="opt">AcBinary options passed to the benchmark and used to pick the variant.</param>
/// <param name="preset">Preset label passed through to the benchmark.</param>
private static ISerializerBenchmark MakeAcBinaryFreshBufferWriter(TestDataSet td, AcBinarySerializerOptions opt, string preset)
{
    if (UsesAllFalseVariant(opt))
    {
        return new AcBinaryFreshBufferWriterBenchmark<TestOrder_All_False>(td.GetOrder<TestOrder_All_False>(), opt, preset);
    }

    return new AcBinaryFreshBufferWriterBenchmark<TestOrder_All_True>(td.GetOrder<TestOrder_All_True>(), opt, preset);
}
/// <summary>
/// Creates an <c>AcBinaryNamedPipeBenchmark&lt;T&gt;</c> for the cell, with <c>T</c> chosen by
/// <see cref="UsesAllFalseVariant"/>: <c>TestOrder_All_False</c> when it returns <c>true</c>,
/// otherwise <c>TestOrder_All_True</c>.
/// </summary>
/// <param name="td">Test cell providing the order for the chosen variant.</param>
/// <param name="opt">AcBinary options passed to the benchmark and used to pick the variant.</param>
/// <param name="preset">Preset label passed through to the benchmark.</param>
private static ISerializerBenchmark MakeAcBinaryNamedPipe(TestDataSet td, AcBinarySerializerOptions opt, string preset)
{
    if (UsesAllFalseVariant(opt))
    {
        return new AcBinaryNamedPipeBenchmark<TestOrder_All_False>(td.GetOrder<TestOrder_All_False>(), opt, preset);
    }

    return new AcBinaryNamedPipeBenchmark<TestOrder_All_True>(td.GetOrder<TestOrder_All_True>(), opt, preset);
}
/// <summary>
/// Creates an <c>AcBinaryNamedPipeRawByteArrayBenchmark&lt;T&gt;</c> for the cell, with <c>T</c> chosen by
/// <see cref="UsesAllFalseVariant"/>: <c>TestOrder_All_False</c> when it returns <c>true</c>,
/// otherwise <c>TestOrder_All_True</c>.
/// </summary>
/// <param name="td">Test cell providing the order for the chosen variant.</param>
/// <param name="opt">AcBinary options passed to the benchmark and used to pick the variant.</param>
/// <param name="preset">Preset label passed through to the benchmark.</param>
private static ISerializerBenchmark MakeAcBinaryNamedPipeRaw(TestDataSet td, AcBinarySerializerOptions opt, string preset)
{
    if (UsesAllFalseVariant(opt))
    {
        return new AcBinaryNamedPipeRawByteArrayBenchmark<TestOrder_All_False>(td.GetOrder<TestOrder_All_False>(), opt, preset);
    }

    return new AcBinaryNamedPipeRawByteArrayBenchmark<TestOrder_All_True>(td.GetOrder<TestOrder_All_True>(), opt, preset);
}
/// <summary>
/// Creates an <c>AcBinaryInMemoryPipeBenchmark&lt;T&gt;</c> for the cell, with <c>T</c> chosen by
/// <see cref="UsesAllFalseVariant"/>: <c>TestOrder_All_False</c> when it returns <c>true</c>,
/// otherwise <c>TestOrder_All_True</c>.
/// </summary>
/// <param name="td">Test cell providing the order for the chosen variant.</param>
/// <param name="opt">AcBinary options passed to the benchmark and used to pick the variant.</param>
/// <param name="preset">Preset label passed through to the benchmark.</param>
private static ISerializerBenchmark MakeAcBinaryInMemoryPipe(TestDataSet td, AcBinarySerializerOptions opt, string preset)
{
    if (UsesAllFalseVariant(opt))
    {
        return new AcBinaryInMemoryPipeBenchmark<TestOrder_All_False>(td.GetOrder<TestOrder_All_False>(), opt, preset);
    }

    return new AcBinaryInMemoryPipeBenchmark<TestOrder_All_True>(td.GetOrder<TestOrder_All_True>(), opt, preset);
}
/// <summary>
/// Creates an <c>AcBinaryInMemoryRawByteArrayBenchmark&lt;T&gt;</c> for the cell, with <c>T</c> chosen by
/// <see cref="UsesAllFalseVariant"/>: <c>TestOrder_All_False</c> when it returns <c>true</c>,
/// otherwise <c>TestOrder_All_True</c>.
/// </summary>
/// <param name="td">Test cell providing the order for the chosen variant.</param>
/// <param name="opt">AcBinary options passed to the benchmark and used to pick the variant.</param>
/// <param name="preset">Preset label passed through to the benchmark.</param>
private static ISerializerBenchmark MakeAcBinaryInMemoryRaw(TestDataSet td, AcBinarySerializerOptions opt, string preset)
{
    if (UsesAllFalseVariant(opt))
    {
        return new AcBinaryInMemoryRawByteArrayBenchmark<TestOrder_All_False>(td.GetOrder<TestOrder_All_False>(), opt, preset);
    }

    return new AcBinaryInMemoryRawByteArrayBenchmark<TestOrder_All_True>(td.GetOrder<TestOrder_All_True>(), opt, preset);
}
/// <summary>
/// Builds the benchmark rows for one test cell. The set of rows depends on
/// <paramref name="serializerMode"/>: a focused two-row comparison (<c>FastestByte</c>), the four
/// pipe rows only (<c>AsyncPipe</c>), or the standard suite for every other mode.
/// </summary>
/// <remarks>
/// AcBinary rows go through the <c>MakeAcBinary*</c> helpers, which pick the order variant per
/// <see cref="UsesAllFalseVariant"/>. MemoryPack / MessagePack rows are always built on the
/// <c>TestOrder_All_False</c> order. Callers own the returned benchmarks and dispose the ones that
/// implement <see cref="IDisposable"/>.
/// </remarks>
/// <param name="testData">The test cell the rows are built for.</param>
/// <param name="serializerMode">Selects which set of rows is produced.</param>
/// <returns>The benchmark rows, in display/measurement order.</returns>
private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, SerializerSelectionMode serializerMode)
{
    // The _All_False order is the cell primary. It is fetched up front in every mode (even the
    // AsyncPipe mode, which does not consume it) and feeds the MemoryPack / MessagePack rows directly.
    var allFalseOrder = testData.GetOrder<TestOrder_All_False>();

    // ── FastestByte mode ── focused comparison on the Byte[] path:
    // AcBinary FastMode (selected wire mode) vs MemoryPack. Two rows only, for a short iteration loop.
    if (serializerMode == SerializerSelectionMode.FastestByte)
    {
        var focusedOptions = AcBinarySerializerOptions.FastMode;
        focusedOptions.WireMode = Configuration.SelectedWireMode;

        var focusedRows = new List<ISerializerBenchmark>();
        focusedRows.Add(MakeAcBinary(testData, focusedOptions, "FastMode"));
        // FastWire reference row is currently disabled; re-enable when revisiting Fast wire-mode performance.
        //focusedRows.Add(MakeAcBinary(testData, fastWireOptions, "FastMode (FastWire)"));
        focusedRows.Add(new MemoryPackBenchmark<TestOrder_All_False>(allFalseOrder, Configuration.SelectedWireMode, "Default"));
        return focusedRows;
    }

    // ── AsyncPipe mode ── ONLY the pipe rows, so streaming-I/O setup and transport overhead does
    // not interleave with the steady-state byte-array / IBufferWriter measurements.
    if (serializerMode == SerializerSelectionMode.AsyncPipe)
    {
        // One options instance drives all four rows. BufferWriterChunkSize comes from
        // Configuration.PipeChunkSize — change that single setting when tuning the chunk size.
        var pipeOptions = AcBinarySerializerOptions.FastMode;
        pipeOptions.BufferWriterChunkSize = Configuration.PipeChunkSize;
        pipeOptions.WireMode = Configuration.SelectedWireMode;

        // The four rows form a 2×2 matrix: [chunked, raw] × [NamedPipe, in-memory].
        var pipeRows = new List<ISerializerBenchmark>();
        // Chunked streaming over a NamedPipe.
        pipeRows.Add(MakeAcBinaryNamedPipe(testData, pipeOptions, "FastMode (PipeChunk)"));
        // Raw byte[] over a NamedPipe — same transport, no chunk framing.
        pipeRows.Add(MakeAcBinaryNamedPipeRaw(testData, pipeOptions, "FastMode (PipeRaw)"));
        // Chunked streaming over an in-memory pipe — same streaming path, no NamedPipe transport.
        pipeRows.Add(MakeAcBinaryInMemoryPipe(testData, pipeOptions, "FastMode (PipeChunk)"));
        // Raw byte[] handed over in memory — baseline for the in-memory chunked row.
        pipeRows.Add(MakeAcBinaryInMemoryRaw(testData, pipeOptions, "FastMode (PipeRaw)"));
        return pipeRows;
    }

    // ── Standard mode ── every serializer EXCEPT the NamedPipe streaming rows (those are opt-in via
    // the AsyncPipe mode above and never bundled with the steady-state suite).

    // The next two presets are only consumed by the disabled rows further down; they are still
    // built so those rows can be re-enabled by uncommenting a single line.
    var noInternOptions = AcBinarySerializerOptions.Default;
    noInternOptions.UseStringInterning = StringInterningMode.None;
    noInternOptions.WireMode = Configuration.SelectedWireMode;

    var defaultRuntimeOptions = AcBinarySerializerOptions.Default;
    defaultRuntimeOptions.UseGeneratedCode = false;
    defaultRuntimeOptions.WireMode = Configuration.SelectedWireMode;

    // FastMode with generated code switched off (runtime dispatch path).
    var fastRuntimeOptions = AcBinarySerializerOptions.FastMode;
    fastRuntimeOptions.UseGeneratedCode = false;
    fastRuntimeOptions.WireMode = Configuration.SelectedWireMode;

    // FastMode as-is (apart from the selected wire mode).
    var fastOptions = AcBinarySerializerOptions.FastMode;
    fastOptions.WireMode = Configuration.SelectedWireMode;

    // FastMode for the fresh-ArrayBufferWriter row, with the chunk size taken from
    // Configuration.PipeChunkSize.
    // NOTE(review): the row label below says "4KB" — confirm it matches Configuration.PipeChunkSize.
    var freshWriterOptions = AcBinarySerializerOptions.FastMode;
    freshWriterOptions.BufferWriterChunkSize = Configuration.PipeChunkSize;
    freshWriterOptions.WireMode = Configuration.SelectedWireMode;

    // FastMode for the in-memory pipe rows; same chunk-size source as above.
    var inMemoryPipeOptions = AcBinarySerializerOptions.FastMode;
    inMemoryPipeOptions.BufferWriterChunkSize = Configuration.PipeChunkSize;
    inMemoryPipeOptions.WireMode = Configuration.SelectedWireMode;

    // "Default" row preset: string interning explicitly OFF, reference handling OnlyId.
    // Reference handling is not None, so UsesAllFalseVariant returns false and this row
    // is fed the _All_True order.
    var defaultPresetOptions = AcBinarySerializerOptions.Default;
    defaultPresetOptions.UseStringInterning = StringInterningMode.None;
    defaultPresetOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId;
    defaultPresetOptions.WireMode = Configuration.SelectedWireMode;

    var rows = new List<ISerializerBenchmark>();

    // ============================================================
    // AcBinary — Byte[] API (uncomment the disabled rows to compare option presets side-by-side)
    // ============================================================
    // FastMode, generated-code path.
    rows.Add(MakeAcBinary(testData, fastOptions, "FastMode"));
    // FastMode, runtime path (UseGeneratedCode = false). Paired with the row above so the two
    // dispatch paths can be compared on otherwise identical options.
    rows.Add(MakeAcBinary(testData, fastRuntimeOptions, "FastMode"));
    // Default preset with ReferenceHandling = OnlyId and string interning off.
    rows.Add(MakeAcBinary(testData, defaultPresetOptions, "Default"));
    //rows.Add(MakeAcBinary(testData, defaultRuntimeOptions, "Default"));
    //rows.Add(MakeAcBinary(testData, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"));
    //rows.Add(MakeAcBinary(testData, noInternOptions, "NoIntern"));

    // AcBinary via IBufferWriter (reused writer).
    rows.Add(MakeAcBinaryBufferWriter(testData, fastOptions, "FastMode"));
    // AcBinary via IBufferWriter (fresh writer per call).
    rows.Add(MakeAcBinaryFreshBufferWriter(testData, freshWriterOptions, "FastMode (4KB)"));
    // AcBinary chunked streaming over an in-memory pipe (no NamedPipe transport).
    rows.Add(MakeAcBinaryInMemoryPipe(testData, inMemoryPipeOptions, "FastMode (PipeChunk)"));
    // Raw byte[] handed over in memory — baseline for the in-memory chunked row above.
    rows.Add(MakeAcBinaryInMemoryRaw(testData, inMemoryPipeOptions, "FastMode (PipeRaw)"));
    // The NamedPipe rows are intentionally OMITTED here — run them via the AsyncPipe mode.

    // ============================================================
    // MemoryPack — three I/O modes, all on the _All_False order
    // ============================================================
    rows.Add(new MemoryPackBenchmark<TestOrder_All_False>(allFalseOrder, Configuration.SelectedWireMode, "Default"));
    rows.Add(new MemoryPackBufferWriterBenchmark<TestOrder_All_False>(allFalseOrder, Configuration.SelectedWireMode, "Default"));
    rows.Add(new MemoryPackFreshBufferWriterBenchmark<TestOrder_All_False>(allFalseOrder, Configuration.SelectedWireMode, "Default"));

    // ============================================================
    // MessagePack — for legacy comparison
    // ============================================================
#if !AYCODE_NATIVEAOT
    // Compiled out when AYCODE_NATIVEAOT is defined; per the original author's note MessagePack's
    // dynamic resolver fails under NativeAOT publish, so the row exists for regular JIT runs only.
    rows.Add(new MessagePackBenchmark<TestOrder_All_False>(allFalseOrder, "ContractBased"));
#endif

    // System.Text.Json (disabled — JSON serializer for reference; not in the active suite)
    //rows.Add(new SystemTextJsonBenchmark<TestOrder_All_False>(allFalseOrder, "Default"));

    return rows;
}
/// <summary>
/// Phase-boundary heap reset: a forced, blocking gen-2 collection, a finalizer drain, then a second
/// identical collection. The first pass collects unreachable objects and hands the finalizable ones to
/// the finalizer thread; <c>WaitForPendingFinalizers</c> blocks until that thread has run them; the
/// second pass reclaims whatever those finalizers released. The goal is that the next warmup /
/// measurement phase starts from a quiescent heap instead of inheriting the previous phase's residual
/// allocations. Meant to be called at the Ser-phase / Des-phase boundaries of
/// <see cref="RunBenchmarksForTestData"/>.
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
internal static void ForceGcCollect()
{
    FullBlockingCollect();
    GC.WaitForPendingFinalizers();
    FullBlockingCollect();

    // One forced, blocking collection of generations 0..2.
    static void FullBlockingCollect() => GC.Collect(generation: 2, mode: GCCollectionMode.Forced, blocking: true);
}
/// <summary>
/// Times <paramref name="action"/>. Each sample invokes it <paramref name="iterations"/> times, and
/// <see cref="Configuration.BenchmarkSamples"/> independent samples are recorded. Multi-sample design reduces
/// single-run variance from ~±15% to ~±5% by smoothing transient effects (background activity, thermal/turbo state).
/// When <see cref="Configuration.BenchmarkSamples"/> &lt;= 1, falls back to single-sample timing (Debug / quick mode).
/// When <paramref name="progressLabel"/> is non-null, emits in-place <c>\r</c> progress updates so a
/// stuck benchmark (e.g. deadlocked NamedPipe row) is visibly stuck at a specific % rather than
/// silently hanging.
///
/// Stabilization (added 2026-05-07):
///   1) A pilot sample is run BEFORE the recorded loop and discarded. The first measurement after
///      warmup tends to absorb residual JIT bookkeeping and GC bookkeeping; dropping it tightens
///      the min/max range without throwing away signal.
///   2) GC.Collect / WaitForPendingFinalizers / GC.Collect runs BEFORE every recorded sample, outside
///      the Stopwatch window, so every sample starts from the same heap shape.
///   3) The inter-sample range (min, max) and the population standard deviation are returned next to
///      the median so the caller can surface the noise floor for the row.
/// </summary>
/// <param name="action">The operation to time; invoked <paramref name="iterations"/> times per sample.</param>
/// <param name="iterations">Number of invocations of <paramref name="action"/> in each sample.</param>
/// <param name="progressLabel">Label for the in-place progress line; <c>null</c> disables progress output.</param>
/// <returns>
/// Elapsed milliseconds per sample: <c>medianMs</c> is the median of the samples (with 4 or more samples the
/// single fastest and single slowest are dropped first); <c>minMs</c>, <c>maxMs</c> and <c>stdDevMs</c>
/// (population standard deviation) are computed over ALL recorded samples. In single-sample mode the
/// first three values are the one measurement and <c>stdDevMs</c> is 0.
/// </returns>
internal static (double medianMs, double minMs, double maxMs, double stdDevMs) RunTimed(Action action, int iterations, string? progressLabel = null)
{
    var samples = Configuration.BenchmarkSamples;
    if (samples <= 1)
    {
        // Single-sample fast path (Debug or trivial run) — no allocation, no sort, no stddev.
        var sw = Stopwatch.StartNew();
        RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
        sw.Stop();
        var ms = sw.Elapsed.TotalMilliseconds;
        EndProgress(progressLabel, ms);
        return (ms, ms, ms, 0);
    }

    // Pilot sample (discarded, therefore not timed). Counts as sample index 0 of (samples + 1) for
    // progress display so the user sees an extra "warmup-ish" tick before the recorded samples start.
    GC.Collect();
    GC.WaitForPendingFinalizers();
    GC.Collect();
    RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: 0);

    var times = new double[samples];
    for (var s = 0; s < samples; s++)
    {
        // Per-sample GC settle. Forces every sample to start from the same heap state, so
        // a Gen-2 pause caused by the previous sample doesn't bleed into the next sample's
        // timing. Cost is paid OUTSIDE the Stopwatch window — no impact on the measurement.
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();

        // Inter-sample thermal-settle: CPU boost-clock can drop mid-batch under sustained load
        // (e.g. 10×250ms = 2.5 sec burst). InterSampleSettleMs lets the boost-clock state
        // settle so later samples don't read systematically slower than early ones. Skip before
        // the first sample (no prior heat to settle from). Set to 0 in Configuration to disable.
        if (s > 0 && Configuration.InterSampleSettleMs > 0)
            Thread.Sleep(Configuration.InterSampleSettleMs);

        var sw = Stopwatch.StartNew();
        RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: s + 1);
        sw.Stop();
        times[s] = sw.Elapsed.TotalMilliseconds;
    }

    // Population stddev (not sample-stddev — the captured samples are treated as the population for
    // CV computation). Two-pass form: the mean first, then the sum of squared deviations. Unlike
    // E[X²] - E[X]², this cannot go negative and does not lose precision to cancellation when the
    // samples are nearly identical.
    var sum = 0.0;
    foreach (var t in times)
    {
        sum += t;
    }
    var mean = sum / times.Length;

    var sumSqDev = 0.0;
    foreach (var t in times)
    {
        var deviation = t - mean;
        sumSqDev += deviation * deviation;
    }
    var stdDevMs = Math.Sqrt(sumSqDev / times.Length);

    // Sorting is order-independent for the statistics above; afterwards the extremes sit at the ends.
    Array.Sort(times);
    var minMs = times[0];
    var maxMs = times[times.Length - 1];

    // Trimmed median: when samples >= 4, drop the single min and single max (sorted-array
    // first and last) and compute median on the remaining (samples - 2) entries. Removes the
    // worst per-sample contamination (a thermal spike, OS preempt, or a GC pause that escaped
    // the per-sample GC.Collect settle) without throwing away too much signal. The min/max /
    // stdDev outputs still reflect the FULL sample population — the trim affects only the
    // headline median figure, so the visible range still shows the actual measurement extremes.
    var trimStart = samples >= 4 ? 1 : 0;
    var trimCount = samples >= 4 ? samples - 2 : samples;
    var medianMs = trimCount % 2 == 1
        ? times[trimStart + trimCount / 2]
        : (times[trimStart + trimCount / 2 - 1] + times[trimStart + trimCount / 2]) / 2.0;

    EndProgress(progressLabel, medianMs);
    return (medianMs, minMs, maxMs, stdDevMs);
}
/// <summary>
/// Per-cell adaptive iteration calibration. Runs a 100-iter measurement (after a GC settle) and computes
/// how many iterations are needed to reach <paramref name="targetMs"/> wall-clock per sample.
/// Returns iter rounded UP to the nearest 1000, floored at 1000 (the prior fixed minimum) and
/// ceiling-capped at 200_000 (sanity bound for pathologically fast ops). In Debug single-sample mode
/// (<c>Configuration.BenchmarkSamples &lt;= 1</c>) returns the global <see cref="Configuration.TestIterations"/> unchanged —
/// calibration overhead is unjustified there. Calibration runs OUTSIDE the timed sample loop and
/// does NOT count toward warmup; its sole purpose is to measure per-op cost.
/// </summary>
/// <param name="action">The operation whose per-call cost is being estimated.</param>
/// <param name="targetMs">Desired wall-clock duration of one timed sample, in milliseconds.</param>
/// <returns>
/// A multiple of 1000 in the range [1000, 200_000], or <see cref="Configuration.TestIterations"/> in
/// single-sample mode.
/// </returns>
internal static int CalibrateIterations(Action action, int targetMs)
{
    if (Configuration.BenchmarkSamples <= 1) return Configuration.TestIterations; // Debug fast path

    GC.Collect();
    GC.WaitForPendingFinalizers();
    GC.Collect();

    const int calibIter = 100;
    const int minIterations = 1000;
    const int maxIterations = 200_000;

    var sw = Stopwatch.StartNew();
    for (var i = 0; i < calibIter; i++) action();
    sw.Stop();
    var ms = sw.Elapsed.TotalMilliseconds;

    // Pathologically-fast op below Stopwatch resolution — cap at ceiling (further calibration won't help).
    if (ms <= 0.0001) return maxIterations;

    // Clamp while the estimate is still a double: for a very fast op combined with a large targetMs
    // the product can exceed int.MaxValue, and an out-of-range double→int cast would wrap to a
    // garbage value that then lands on the FLOOR instead of the ceiling.
    var iterPerMs = calibIter / ms;
    var needed = Math.Ceiling(targetMs * iterPerMs);
    if (needed >= maxIterations) return maxIterations;
    if (needed <= minIterations) return minIterations;

    // Round UP to nearest 1000 — keeps numbers human-readable in the markdown output.
    // needed is strictly between the bounds here, so the result stays within [2000, 200_000].
    var raw = (int)needed;
    return ((raw + 999) / 1000) * 1000;
}
/// <summary>
/// Measures per-call allocation in bytes after a clean GC. Single dedicated sample (no median) — keeps
/// timing samples pure. When <paramref name="processWide"/> is <c>true</c>, uses
/// <see cref="GC.GetTotalAllocatedBytes"/> instead of <see cref="GC.GetAllocatedBytesForCurrentThread"/>
/// — needed for round-trip-only benchmarks (NamedPipe etc.) where the work happens across multiple
/// threads (server-side <c>new byte[len]</c> buffers, drain-pump-thread allocations). Per-thread mode
/// is slightly cleaner for in-memory benchmarks; process-wide mode is slightly noisier (background
/// threads / GC bookkeeping leak in) but over 1000 iterations the signal dominates.
/// Note: when <paramref name="progressLabel"/> is non-null, the progress lines are built inside the
/// measured window, so their string allocations are part of the reported figure.
/// </summary>
/// <param name="action">The operation whose allocations are measured.</param>
/// <param name="iterations">Number of invocations; the allocated-bytes delta is divided by this value.</param>
/// <param name="progressLabel">Label for the in-place progress line; <c>null</c> disables progress output.</param>
/// <param name="processWide"><c>true</c> to read the process-wide counter, <c>false</c> for the current thread only.</param>
/// <returns>
/// Allocated bytes per invocation (integer division), or 0 when <paramref name="iterations"/> is not
/// positive (nothing was invoked, and dividing would throw).
/// </returns>
internal static long MeasureAllocation(Action action, int iterations, string? progressLabel = null, bool processWide = false)
{
    GC.Collect();
    GC.WaitForPendingFinalizers();
    GC.Collect();

    // The Stopwatch only feeds the "done in … ms" progress line. It is created before the first
    // counter read, so its own allocation is not part of the measured delta.
    var sw = Stopwatch.StartNew();
    var before = ReadAllocatedBytes(processWide);
    RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
    var after = ReadAllocatedBytes(processWide);
    sw.Stop();
    EndProgress(progressLabel, sw.Elapsed.TotalMilliseconds);

    // Guard the division: iterations == 0 would otherwise raise DivideByZeroException even though
    // the timing helpers accept a zero iteration count without complaint.
    return iterations > 0 ? (after - before) / iterations : 0;

    // Reads the allocation counter selected by the caller.
    static long ReadAllocatedBytes(bool wholeProcess) =>
        wholeProcess ? GC.GetTotalAllocatedBytes(precise: true) : GC.GetAllocatedBytesForCurrentThread();
}
// ============================================================================================
// Progress reporting — \r-driven in-place updates so a stuck benchmark surfaces the exact phase
// and % where it stopped, instead of appearing as a silent hang. Used by RunTimed and the
// MeasureAllocation* helpers when the caller passes a non-null progressLabel.
// ============================================================================================
// Length of the most recent progress line written by RunWithProgress (reset to 0 by EndProgress).
// The next write compares against it and pads with spaces when the new line is shorter, which
// avoids "ghost" trailing chars from the previous, longer line surviving the \r overwrite.
private static int _progressLastLineLen;
/// <summary>
/// Invokes <paramref name="action"/> exactly <paramref name="iterations"/> times. With a non-null
/// <paramref name="label"/> the work is split into batches of roughly a tenth of the total and a
/// \r-overwriting progress line is printed after each batch; with a null label the calls run in one
/// plain loop and nothing is printed.
/// </summary>
/// <param name="action">The operation to invoke.</param>
/// <param name="iterations">Total number of invocations.</param>
/// <param name="label">Text shown in the progress line; <c>null</c> disables progress output.</param>
/// <param name="samples">Total sample count for the "sample i/n" part (shown only when greater than 1).</param>
/// <param name="sampleIndex">Zero-based index of the current sample; displayed one-based.</param>
private static void RunWithProgress(Action action, int iterations, string? label, int samples, int sampleIndex)
{
    if (label is not null)
    {
        // Progress is emitted on batch boundaries only, so the inner loop carries no per-iteration
        // modulo or branch — just the action() calls being measured.
        var batchSize = Math.Max(1, iterations / 10);
        for (var completed = 0; completed < iterations;)
        {
            var thisBatch = Math.Min(batchSize, iterations - completed);
            for (var k = 0; k < thisBatch; k++) action();
            completed += thisBatch;

            // 100L forces 64-bit arithmetic so large iteration counts cannot overflow.
            var percent = (int)(completed * 100L / iterations);

            string line;
            if (samples > 1)
                line = $" > {label} sample {sampleIndex + 1}/{samples} {percent,3}% ({completed}/{iterations})";
            else
                line = $" > {label} {percent,3}% ({completed}/{iterations})";

            System.Console.Write('\r');
            System.Console.Write(line);

            // Blank out whatever the previous, longer line left behind to the right of this one.
            var stale = _progressLastLineLen - line.Length;
            if (stale > 0)
                System.Console.Write(new string(' ', stale));
            _progressLastLineLen = line.Length;
        }

        return;
    }

    // Silent mode: a bare loop with no progress bookkeeping at all.
    for (var k = 0; k < iterations; k++) action();
}
/// <summary>
/// Finishes the current progress row: overwrites it with a final "done" summary, blanks any
/// characters left over from a longer previous line, and ends the row with a newline so later
/// <c>WriteLine</c> calls render below it. Does nothing when <paramref name="label"/> is null.
/// </summary>
/// <param name="label">Text shown in the summary; <c>null</c> means no progress session is active.</param>
/// <param name="elapsedMs">Elapsed time in milliseconds, printed with one decimal place.</param>
private static void EndProgress(string? label, double elapsedMs)
{
    if (label is not null)
    {
        var summary = $" > {label} done in {elapsedMs,7:F1} ms";
        var stale = _progressLastLineLen - summary.Length;

        System.Console.Write('\r');
        System.Console.Write(summary);
        if (stale > 0)
            System.Console.Write(new string(' ', stale));
        System.Console.WriteLine();

        // The row is closed; the next progress session starts with nothing to clear.
        _progressLastLineLen = 0;
    }
}
/// <summary>
/// Validates MemoryPack setup at startup. Terminates the process with exit code 1 if any of the checked
/// model types is not annotated with <c>[MemoryPackable]</c>; per the error text below, without the
/// attribute MemoryPack would not use its source-generated path and the comparison would be INVALID.
/// Both <c>TestOrder_All_True</c> and <c>TestOrder_All_False</c> are checked: the MemoryPack benchmark
/// rows in this file are instantiated with <c>TestOrder_All_False</c>, so validating only the
/// <c>_All_True</c> variant would leave the type that is actually benchmarked unverified.
/// </summary>
internal static void ValidateMemoryPackSetup()
{
    Type[] typesToCheck = { typeof(TestOrder_All_True), typeof(TestOrder_All_False) };

    foreach (var type in typesToCheck)
    {
        // IsDefined answers the yes/no question directly, without materialising attribute instances.
        if (type.IsDefined(typeof(MemoryPackableAttribute), inherit: true)) continue;

        System.Console.Error.WriteLine($"❌ FATAL: {type.FullName} is not [MemoryPackable] — MemoryPack would fall back to runtime resolver, comparison is INVALID for SGen-vs-SGen claim.");
        System.Console.Error.WriteLine("Add [MemoryPackable] to the type and any nested types referenced from it.");
        Environment.Exit(1);
    }
}
/// <summary>
/// Filters test data sets by layer keyword. Layered approach lets you run only what's needed for the iteration cadence.
/// P1: only "Core" data exists (Small/Medium/Large/Repeated/Deep). Comprehensive and Edge layers will be expanded in P2.
/// Every layer matches by ordinal, case-insensitive prefix on the data set name, so the Core layer is
/// exactly the union of the single-cell Small / Medium / Large / Repeated / Deep filters.
/// </summary>
/// <param name="all">All available test data sets; never modified.</param>
/// <param name="layer">The layer (or single-cell shortcut) to select.</param>
/// <returns>
/// A new list with the matching data sets in their original order. <see cref="BenchmarkLayer.All"/> and
/// any layer value not handled here yield a copy of <paramref name="all"/>.
/// </returns>
internal static List<TestDataSet> FilterByLayer(List<TestDataSet> all, BenchmarkLayer layer)
{
    if (layer == BenchmarkLayer.All) return all.ToList();

    var coreNames = new[] { "Small", "Medium", "Large", "Repeated", "Deep" };
    // P2 will add: "Flat", "Polymorphic", "Collection", "Numeric", "NonAscii", etc.
    var comprehensiveExtras = new string[] { /* P2 */ };
    // P3 will add: "ColdStart", "VeryLarge", "PathologicalString", etc.
    var edgeExtras = new string[] { /* P3 */ };

    return layer switch
    {
        BenchmarkLayer.Core => WithPrefixes(coreNames),
        BenchmarkLayer.Comprehensive => WithPrefixes(coreNames, comprehensiveExtras),
        BenchmarkLayer.Edge => WithPrefixes(coreNames, comprehensiveExtras, edgeExtras),
        // Single-cell A/B mini-suite filters.
        // Use case: tight optimization-iteration loop on one specific cell (e.g. `dotnet run -- repeated`
        // or interactive menu shortcut), avoiding the full ~110 sec suite when only one cell is in scope.
        BenchmarkLayer.Small => WithPrefixes(new[] { "Small" }),
        BenchmarkLayer.Medium => WithPrefixes(new[] { "Medium" }),
        BenchmarkLayer.Large => WithPrefixes(new[] { "Large" }),
        BenchmarkLayer.Repeated => WithPrefixes(new[] { "Repeated" }),
        BenchmarkLayer.Deep => WithPrefixes(new[] { "Deep" }),
        _ => all.ToList()
    };

    // Data sets whose name starts with at least one prefix from at least one of the given groups.
    List<TestDataSet> WithPrefixes(params string[][] prefixGroups) =>
        all.Where(t => prefixGroups.Any(group => StartsWithAny(t.Name, group))).ToList();

    // Explicit StringComparison: the single-argument string.StartsWith is culture-sensitive and
    // case-sensitive, which made the layer filters disagree with the single-cell filters.
    static bool StartsWithAny(string name, string[] prefixes) =>
        prefixes.Any(prefix => name.StartsWith(prefix, StringComparison.OrdinalIgnoreCase));
}
}