AyCode.Core/AyCode.Core.Serializers.Con.../BenchmarkLoop.cs

795 lines
47 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using AyCode.Core.Serializers.Binaries;
using AyCode.Core.Serializers.Console.Benchmarks;
using AyCode.Core.Tests.TestModels;
using MemoryPack;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Text.Json;
namespace AyCode.Core.Serializers.Console;
/// <summary>
/// Benchmark execution: end-to-end orchestration (<see cref="RunBenchmark"/>), per-cell loop
/// (<see cref="RunBenchmarksForTestData"/>), serializer factory (<see cref="CreateSerializers"/>),
/// and the timing / calibration / allocation helpers. Pure benchmark-execution infrastructure —
/// no display formatting (that lives in <c>Output</c>) and no UX-flow (that lives in <c>Program</c>
/// + <c>Menu</c>).
/// </summary>
internal static class BenchmarkLoop
{
/// <summary>
/// Runs the benchmark suite end-to-end for the given configuration: pre-warmup → per-cell warmup
/// + measurement → grouped results print → save to disk. Used by both the CLI and interactive
/// menu paths; the interactive loop calls this repeatedly without restarting the process.
/// </summary>
internal static void RunBenchmark(BenchmarkLayer layer, BenchmarkOpMode opMode, SerializerSelectionMode serializerMode)
{
System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗");
System.Console.WriteLine("║ COMPREHENSIVE SERIALIZER BENCHMARK SUITE ║");
System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");
// Stabilization: pin the entire benchmark process to a single logical CPU and bump priority
// class. Single-core affinity stops Windows from migrating the bench thread between cores
// mid-sample (a migration evicts L1/L2 caches and corrupts a measurement); High priority
// reduces preemption by background tasks (Defender scans, indexer, etc.) that otherwise
// randomly inflate samples by 5-15%.
// Try/finally guarantees the original state is restored even if a benchmark throws — leaving
// a developer machine pinned to one core after a crashed run is a real foot-gun.
// Skipped on Debug single-sample mode (Configuration.BenchmarkSamples <= 1) where stabilization is moot.
var process = Process.GetCurrentProcess();
var origAffinity = (IntPtr)0;
var origPriority = ProcessPriorityClass.Normal;
var stabilizationApplied = false;
// ProcessorAffinity is only supported on Windows + Linux (CA1416). macOS would throw at
// runtime; skip the affinity step there but still raise priority class (which IS supported
// on macOS, just less effective for stabilization than affinity pinning).
if (Configuration.BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
{
try
{
origAffinity = process.ProcessorAffinity;
origPriority = process.PriorityClass;
// Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one
// core, consistently" — not which one. If CPU 0 is heavily contended on the host
// (e.g. dedicated to system-wide IRQs on some Windows configs), the user can tweak
// the mask here. The benchmark is single-threaded for the in-memory rows so single
// core is sufficient; round-trip-only NamedPipe rows have a server-drain thread
// that will share the core (acceptable — the bench measures end-to-end RT anyway).
process.ProcessorAffinity = (IntPtr)1;
process.PriorityClass = ProcessPriorityClass.High;
stabilizationApplied = true;
System.Console.WriteLine($"Stabilization: pinned to CPU 0 (affinity=0x1), priority=High.");
}
catch (Exception ex)
{
// Affinity/priority changes may fail on locked-down hosts (group policies, containers
// without CAP_SYS_NICE on Linux, etc.). Surface and continue — the benchmark still
// works, just with the platform default scheduling.
System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}");
}
}
try
{
var allResults = new List<BenchmarkResult>();
var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
var testDataSets = FilterByLayer(allTestDataSets, layer);
System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {Configuration.GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{Configuration.TargetSampleMs} ms target) | Warmup: {Configuration.WarmupIterations} per phase (Ser/Des isolated) | Samples: {Configuration.BenchmarkSamples} (median) + pilot discard");
System.Console.WriteLine($"Build: {Configuration.BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
System.Console.WriteLine();
// Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens.
// Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup
// alone doesn't ensure that every Serialize<T>/IBufferWriter overload is fully Tier 1 by the time we
// start measuring. Symptom: first cell's BufferWriter variants run ~2x slower than the SAME variants
// on later cells (e.g. Small BufWr reuse 9ms vs Medium BufWr reuse 4ms — even though Medium is bigger).
// Pre-warmup runs every overload at least once with each data shape so .NET 9's tiered JIT promotes
// them all in the background; the per-cell warmup that follows then locks in cache + branch state.
if (Configuration.BenchmarkSamples > 1) // skip in DEBUG (single-sample fast iteration)
{
System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");
foreach (var testData in testDataSets)
{
var preSerializers = CreateSerializers(testData, serializerMode);
try
{
foreach (var s in preSerializers)
{
// Light warmup just to trigger Tier 0 → Tier 1 promotion. Phase-isolated:
// Ser path first, then Des path — same pattern as the per-cell warmup in
// RunBenchmarksForTestData (which still runs afterwards for cache/BTB warming).
s.WarmupSerialize(2000);
s.WarmupDeserialize(2000);
}
}
finally
{
// Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
foreach (var s in preSerializers) (s as IDisposable)?.Dispose();
}
}
// Let background tiered-JIT compilation drain before we begin measuring.
if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
System.Console.WriteLine("✓ Global pre-warmup complete.\n");
}
foreach (var testData in testDataSets)
{
System.Console.WriteLine($"\n{'═'.ToString().PadRight(70, '═')}");
System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
System.Console.WriteLine($"{'═'.ToString().PadRight(70, '═')}");
var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
allResults.AddRange(results);
}
// Print grouped results
Output.PrintGroupedResults(allResults, testDataSets);
// Save results to file
Output.SaveResults(allResults, testDataSets);
System.Console.WriteLine("\n✓ Benchmark complete!");
}
finally
{
// Restore process state — affinity/priority changes are process-wide and persist across
// interactive-mode iterations of the menu. Without restore, the second menu run would
// already be on CPU-0 + High priority before its own try-block applied them, masking
// any stabilization-disabled comparison.
if (stabilizationApplied && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
{
try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
try { process.PriorityClass = origPriority; } catch { /* best-effort */ }
}
}
}
private static List<BenchmarkResult> RunBenchmarksForTestData(TestDataSet testData, BenchmarkOpMode mode, SerializerSelectionMode serializerMode)
{
var results = new List<BenchmarkResult>();
var serializers = CreateSerializers(testData, serializerMode);
// Round-trip correctness check — once per (cell × serializer), BEFORE warmup. Aborts the entire benchmark on failure.
System.Console.WriteLine("Verifying round-trip correctness...");
foreach (var serializer in serializers)
{
if (!serializer.VerifyRoundTrip())
{
System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}");
System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting.");
Environment.Exit(1);
}
}
System.Console.WriteLine("✓ All serializers passed round-trip verification.");
// Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: each serializer's Ser-path and
// Des-path get COMPLETELY ISOLATED warmup→measure rounds, with a GC.Collect at every phase boundary.
//
// Why phase-isolation: a combined warmup (Ser+Des interleaved) leaves the CPU I-cache + branch-predictor
// in a "compromise state" — neither Ser nor Des code-set dominates. The first phase to measure pays a
// cache-miss penalty as its code-set displaces the leftover-warmup-state. Isolated warmup→measure pairs
// keep the I-cache HOT for ONLY the measured path, both in the warmup (priming) and the measurement
// (steady-state). Branch-predictor history also stays clean per path.
//
// GC.Collect at every boundary: removes residual allocation pressure from the previous phase (write-buffer
// pool churn from Ser, deserialized object graph from Des) so the next phase starts with a quiescent
// heap — GC tier-promotion timing during measurement is then driven only by THAT phase's allocations.
//
// Configuration.JitSleep per-phase: tiered JIT background promotion drain after each warmup (mode-aware: 0 ms in AOT).
// Each phase's freshly-promoted methods settle before its timing starts.
System.Console.WriteLine($"Running benchmarks (target ~{Configuration.TargetSampleMs} ms/sample × {Configuration.BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n");
foreach (var serializer in serializers)
{
var result = new BenchmarkResult
{
TestDataName = testData.DisplayName, // Use DisplayName for IId% info
Engine = serializer.Engine,
IoMode = serializer.IoMode,
DispatchMode = serializer.DispatchMode,
OptionsPreset = serializer.OptionsPreset,
OptionsDescription = serializer.OptionsDescription,
SerializedSize = serializer.SerializedSize,
SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes,
SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes,
IsRoundTripOnly = serializer.IsRoundTripOnly
};
// Group label for in-place \r progress. Identifies (cell × serializer) so a stuck benchmark
// is visibly stuck on a specific row at a specific %% rather than silently hanging.
var groupLabel = $"{result.SerializerName}";
if (serializer.IsRoundTripOnly)
{
// Round-trip-only benchmarks (NamedPipe etc.): single phase — Serialize() performs the full RT,
// Deserialize() is a no-op. We use the Ser-phase entry-points (WarmupSerialize) to warm the
// entire round-trip path, then record into the RT result columns.
if (mode is BenchmarkOpMode.All or BenchmarkOpMode.Serialize)
{
ForceGcCollect();
serializer.WarmupSerialize(Configuration.WarmupIterations);
if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
var rtIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
var (rtMed, rtMin, rtMax, rtStd) = RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");
result.RoundTripTimeMs = rtMed;
result.RoundTripTimeMinMs = rtMin;
result.RoundTripTimeMaxMs = rtMax;
result.RoundTripTimeStdDevMs = rtStd;
result.RoundTripIterations = rtIter;
// Process-wide allocation measurement: server-drain-thread allocations (server-side new byte[len])
// also show up — otherwise current-thread alloc would only count the client side and look ~halved.
result.RoundTripAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]", processWide: true);
}
// mode == BenchmarkOpMode.Deserialize alone is meaningless for a round-trip-only benchmark; skip silently.
}
else
{
// ── Ser phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect.
if (mode is BenchmarkOpMode.All or BenchmarkOpMode.Serialize)
{
ForceGcCollect();
serializer.WarmupSerialize(Configuration.WarmupIterations);
if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
var serIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
var (serMed, serMin, serMax, serStd) = RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");
result.SerializeTimeMs = serMed;
result.SerializeTimeMinMs = serMin;
result.SerializeTimeMaxMs = serMax;
result.SerializeTimeStdDevMs = serStd;
result.SerializeIterations = serIter;
// Dedicated alloc-only sample (separate from timing samples; keeps timing pure)
result.SerializeAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
}
// ── Des phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect.
// The GC.Collect here is critical: it discards the Ser-phase's write-buffer pool churn so the
// Des-phase's allocation measurement reflects ONLY Des-side allocations (deserialized object graph).
if (mode is BenchmarkOpMode.All or BenchmarkOpMode.Deserialize)
{
ForceGcCollect();
serializer.WarmupDeserialize(Configuration.WarmupIterations);
if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);
var desIter = CalibrateIterations(() => serializer.Deserialize(), Configuration.TargetSampleMs);
var (desMed, desMin, desMax, desStd) = RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");
result.DeserializeTimeMs = desMed;
result.DeserializeTimeMinMs = desMin;
result.DeserializeTimeMaxMs = desMax;
result.DeserializeTimeStdDevMs = desStd;
result.DeserializeIterations = desIter;
result.DeserializeAllocBytesPerOp = MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]");
}
// Compose RT from Ser+Des. Because Ser and Des may have DIFFERENT iter counts post-calibration,
// batch-time addition would be misleading. Instead: compute per-op µs (iter-independent),
// then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that
// RoundTripTimeMs / RoundTripIterations * 1000 == Output.SerPerOp + Output.DesPerOp.
var serPerOp = Output.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations);
var desPerOp = Output.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations);
var rtPerOp = serPerOp + desPerOp;
result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations);
result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
}
results.Add(result);
Output.PrintResult(result);
}
// Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources that must be released
// before the next test data builds new ones — otherwise pipes / handles leak across test cells).
foreach (var s in serializers) (s as IDisposable)?.Dispose();
return results;
}
private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, SerializerSelectionMode serializerMode)
{
// FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path.
// TWO benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + MemoryPack Byte[].
// - Compact: smallest wire, UTF-8 encode/decode CPU cost vs MemPack head-to-head.
// Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min.
//
// FastWire row (UTF-16 raw memcpy) commented out for the current optimization sprint —
// we are tuning Compact mode against MemPack directly; FastWire was used as a noise-floor
// reference earlier. Re-enable when revisiting Fast wire-mode performance.
if (serializerMode == SerializerSelectionMode.FastestByte)
{
var fastestByteOptions = AcBinarySerializerOptions.FastMode;
fastestByteOptions.WireMode = Configuration.SelectedWireMode;
return new List<ISerializerBenchmark>
{
new AcBinaryBenchmark(testData.Order, fastestByteOptions, "FastMode"),
//new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"),
new MemoryPackBenchmark(testData.Order, "Default"),
};
}
// AsyncPipe-only mode — return ONLY the AsyncPipe streaming benchmark (no other serializer).
// Streaming I/O has long-lived pipe setup + kernel-buffer overhead that, when interleaved with
// the standard byte-array / IBufferWriter measurements, masks the steady-state numbers. Run it
// in isolation so the timing numbers reflect ONLY the streaming path.
if (serializerMode == SerializerSelectionMode.AsyncPipe)
{
// NamedPipe — pipe-aligned chunk size for the long-lived IPC scenario. The chunkSize here
// drives the AsyncPipeWriterOutput's chunk-on-wire size (header + data, page-aligned thanks to
// the AcquireChunk fix) AND the kernel pipe buffer size (inBufferSize/outBufferSize on the
// NamedPipeServerStream ctor). Same value across both layers = one WriteFile(chunkSize) syscall
// fits blocking-free in one kernel pipe-buffer slot. Single source of truth for both app-level
// wire chunk AND kernel transfer unit; change ONLY this line when tuning.
var binaryFastModePipeChunkOnly = AcBinarySerializerOptions.FastMode;
binaryFastModePipeChunkOnly.BufferWriterChunkSize = Configuration.PipeChunkSize;
binaryFastModePipeChunkOnly.WireMode = Configuration.SelectedWireMode;
return new List<ISerializerBenchmark>
{
// Chunked-framed AsyncPipe: SerializeChunkedFramed + AsyncPipeReaderInput.DrainFromAsync.
// Measures the FULL streaming-I/O stack — wire framing + drain task + sliding-window buffer +
// MRES wait-on-byte-shortage — over a kernel NamedPipe.
new AcBinaryNamedPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"),
// Raw byte[] over NamedPipe (sync receive, no chunk-framing). Same kernel-pipe transport,
// same inBufferSize, but: serialize → byte[] → Stream.Write → Stream.Read → Deserialize<T>(byte[]).
// No drain task, no AsyncPipeReaderInput, no [201][UINT16][data]…[202] framing. Side-by-side with
// the chunked-row above this isolates AsyncPipe-framework-overhead (Δ vs raw) from
// kernel-transport-overhead (raw vs in-process Byte[]).
new AcBinaryNamedPipeRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
// Chunked-framed AsyncPipe over an IN-MEMORY System.IO.Pipelines.Pipe (NO NamedPipe, NO kernel).
// Same chunked-streaming code path (SerializeChunkedFramed → AsyncPipeReaderInput) but with the
// kernel-pipe replaced by a managed-only Pipe. Eliminates per-chunk syscall overhead (~30 µs/chunk
// on NamedPipe → ~1-2 µs/chunk on in-memory Pipe). Side-by-side with the NamedPipe row above this
// isolates pure CPU cost of the chunked-streaming framework (vs kernel-pipe transport cost) — the
// in-memory Pipe row should be much closer to the raw-byte[] row, validating that NamedPipe loopback
// is the worst-case benchmark scenario for chunked-streaming and not representative of real network
// / file / cross-thread Pipe scenarios.
new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"),
// Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport at all). Apples-to-apples
// baseline for the in-memory chunked row above: same in-memory transport (zero kernel), but raw
// byte[] vs chunked-streaming wire format. Completes the 2x2 matrix [chunked,raw] × [kernel,memory].
new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
};
}
// Standard mode — all serializers EXCEPT AsyncPipe (the streaming benchmark is opt-in via the
// AsyncPipe menu / CLI mode, never bundled with the steady-state suite).
var binaryNoInternOption = AcBinarySerializerOptions.Default;
binaryNoInternOption.UseStringInterning = StringInterningMode.None;
binaryNoInternOption.WireMode = Configuration.SelectedWireMode;
var binaryDefaultNoSgenOption = AcBinarySerializerOptions.Default;
binaryDefaultNoSgenOption.UseGeneratedCode = false;
binaryDefaultNoSgenOption.WireMode = Configuration.SelectedWireMode;
var binaryFastModeNoSgenOption = AcBinarySerializerOptions.FastMode;
binaryFastModeNoSgenOption.UseGeneratedCode = false;
binaryFastModeNoSgenOption.WireMode = Configuration.SelectedWireMode;
var binaryFastModeOption = AcBinarySerializerOptions.FastMode;
binaryFastModeOption.WireMode = Configuration.SelectedWireMode;
// BufWr new — 4 KB chunk size for the FRESH ArrayBufferWriter scenario. The chunkSize here drives
// the serializer's GetSpan(N) request → the ArrayBufferWriter's internal allocation per call.
// Small chunk = small per-call allocation, optimum for one-shot serialization where each iteration
// allocates a fresh ABW. Independent of the AsyncPipe profile (different mechanism: alloc overhead
// vs syscall count).
var binaryFastModeBufWrChunk = AcBinarySerializerOptions.FastMode;
binaryFastModeBufWrChunk.BufferWriterChunkSize = Configuration.PipeChunkSize;
binaryFastModeBufWrChunk.WireMode = Configuration.SelectedWireMode;
// In-memory Pipe variant — same 4 KB chunkSize as the AsyncPipe mode, no kernel-pipe alignment
// concern (managed slabs are not page-aligned anyway). Drives SerializeChunkedFramed via the in-memory
// System.IO.Pipelines.Pipe (zero-copy slab handoff between producer and drain task).
var binaryFastModePipeChunkInMem = AcBinarySerializerOptions.FastMode;
binaryFastModePipeChunkInMem.BufferWriterChunkSize = Configuration.PipeChunkSize;
binaryFastModePipeChunkInMem.WireMode = Configuration.SelectedWireMode;
var defaultOptions = AcBinarySerializerOptions.Default;
defaultOptions.UseStringInterning = StringInterningMode.None;
defaultOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId;
defaultOptions.WireMode = Configuration.SelectedWireMode;
return new List<ISerializerBenchmark>
{
// ============================================================
// AcBinary — Byte[] API (uncomment to compare option presets side-by-side)
// ============================================================
// Fastest Byte[] — SGen path (UseGeneratedCode=true, default).
new AcBinaryBenchmark(testData.Order, binaryFastModeOption, "FastMode"),
// Fastest Byte[] — Runtime path (UseGeneratedCode=false). Same wire/options, no source-generated dispatch.
// Always paired with the SGen variant so every layer can compare the SGen speed-up apples-to-apples.
// NativeAOT-safe: AcSerializerCommon.Create*Getter/Setter falls back to reflection-based delegates
// when RuntimeFeature.IsDynamicCodeSupported is false (slower but works under AOT publish).
new AcBinaryBenchmark(testData.Order, binaryFastModeNoSgenOption, "FastMode"),
// Default preset Byte[] — RefHandling=OnlyId (deduplicates IId-shared references on the wire) +
// UseStringInterning=All (deduplicates repeated strings). Showcases the Default preset's wire-size
// and CPU trade-off vs FastMode on the ~20% IId-ref / repeated-string test data.
new AcBinaryBenchmark(testData.Order, defaultOptions, "Default"),
//new AcBinaryBenchmark(testData.Order, binaryDefaultNoSgenOption, "Default"),
//new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"),
//new AcBinaryBenchmark(testData.Order, binaryNoInternOption, "NoIntern"),
// AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario)
new AcBinaryBufferWriterBenchmark(testData.Order, binaryFastModeOption, "FastMode"),
// AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario).
// 4 KB chunk size from binaryFastModeBufWrChunk — minimises the per-call ArrayBufferWriter
// allocation. Optimum for this scenario.
new AcBinaryFreshBufferWriterBenchmark(testData.Order, binaryFastModeBufWrChunk, "FastMode (4KB)"),
// AcBinary chunked-streaming over an IN-MEMORY Pipe (no kernel transport). Side-by-side with the
// Byte[] / IBufferWriter rows above this shows the chunked-streaming framework's pure CPU cost
// (no NamedPipe loopback noise) vs the simpler in-process serialize-then-deserialize patterns.
// The IO column shows "Pipe(in-mem)" — distinct from the NamedPipe AsyncPipe rows in [P] mode.
new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeChunk)"),
// Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport, no kernel, no Pipe). Apples-to-
// apples baseline for the in-memory chunked row above: same in-memory pattern, but raw byte[] vs
// chunked-streaming wire format. The IO column shows "Bytes(in-mem)".
new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeRaw)"),
// AsyncPipe streaming over kernel NamedPipe (AcBinaryNamedPipeBenchmark) is intentionally OMITTED
// here — run it via the dedicated AsyncPipe menu [P] / CLI mode for isolated kernel-transport
// measurements.
// ============================================================
// MemoryPack — three I/O modes for apples-to-apples comparison
// ============================================================
new MemoryPackBenchmark(testData.Order, "Default"),
new MemoryPackBufferWriterBenchmark(testData.Order, "Default"),
new MemoryPackFreshBufferWriterBenchmark(testData.Order, "Default"),
// ============================================================
// MessagePack — for legacy comparison
// ============================================================
#if !AYCODE_NATIVEAOT
// MessagePack v3's DynamicGenericResolver uses Activator.CreateInstance on trimmed
// ListFormatter<T> et al. — fails under NativeAOT publish with "No parameterless constructor".
// Excluded from the AOT build; available for regular JIT runs only.
new MessagePackBenchmark(testData.Order, "ContractBased"),
#endif
// System.Text.Json (commented — JSON serializer for reference; not in active suite)
//new SystemTextJsonBenchmark(testData.Order, "Default")
};
}
/// <summary>
/// Forces a full GC cycle at a phase boundary in the benchmark loop. Two-pass collect with finalizer drain
/// in between: the first pass moves managed garbage to the finalization queue, <c>WaitForPendingFinalizers</c>
/// runs the finalizers, the second pass reclaims any objects the finalizers released. After this returns the
/// heap is in a known-quiescent state — the next warmup/measurement phase starts on a clean slate, isolated
/// from the previous phase's residual allocations (write-buffer pools, intern cache, write-plan arrays, etc.).
/// Called between every Ser-phase / Des-phase boundary in <see cref="RunBenchmarksForTestData"/>.
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
internal static void ForceGcCollect()
{
GC.Collect(2, GCCollectionMode.Forced, blocking: true);
GC.WaitForPendingFinalizers();
GC.Collect(2, GCCollectionMode.Forced, blocking: true);
}
/// <summary>
/// Runs the action <paramref name="iterations"/> times for <see cref="Configuration.BenchmarkSamples"/> independent samples,
/// returning the median, min, and max elapsed time. Multi-sample design reduces single-run variance
/// from ~±15% to ~±5% by smoothing transient effects (background activity, thermal/turbo state).
/// When <see cref="Configuration.BenchmarkSamples"/> &lt;= 1, falls back to single-sample timing (Debug / quick mode).
/// When <paramref name="progressLabel"/> is non-null, emits in-place <c>\r</c> progress updates so a
/// stuck benchmark (e.g. deadlocked NamedPipe row) is visibly stuck at a specific %% rather than
/// silently hanging.
///
/// Stabilization (added 2026-05-07):
/// 1) Pilot sample is run BEFORE the recorded loop and discarded. The first measurement after
/// warmup tends to absorb residual JIT bookkeeping and GC bookkeeping; dropping it tightens
/// the min/max range without throwing away signal (the median is the SAME data as before).
/// 2) GC.Collect / WaitForPendingFinalizers / GC.Collect runs BEFORE every recorded sample.
/// Without this, GC pressure from sample N occasionally triggered a Gen-2 pause inside
/// sample N+1, painting it as an outlier; collecting up-front gives every sample the
/// same starting heap shape.
/// 3) Returns (median, min, max) so the caller can surface the inter-sample range — visible
/// noise floor for the row, replacing the previous "median only" view.
/// </summary>
internal static (double medianMs, double minMs, double maxMs, double stdDevMs) RunTimed(Action action, int iterations, string? progressLabel = null)
{
var samples = Configuration.BenchmarkSamples;
if (samples <= 1)
{
// Single-sample fast path (Debug or trivial run) — no allocation, no sort, no stddev.
var sw = Stopwatch.StartNew();
RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
sw.Stop();
var ms = sw.Elapsed.TotalMilliseconds;
EndProgress(progressLabel, ms);
return (ms, ms, ms, 0);
}
// Pilot sample (discarded). Counts as sample index 0 of (samples + 1) for progress display
// so the user sees an extra "warmup-ish" tick before the recorded samples start.
GC.Collect();
GC.WaitForPendingFinalizers();
GC.Collect();
var pilotSw = Stopwatch.StartNew();
RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: 0);
pilotSw.Stop();
// intentionally not stored
var times = new double[samples];
for (var s = 0; s < samples; s++)
{
// Per-sample GC settle. Forces every sample to start from the same heap state, so
// a Gen-2 pause caused by the previous sample doesn't bleed into the next sample's
// timing. Cost is paid OUTSIDE the Stopwatch window — no impact on the measurement.
GC.Collect();
GC.WaitForPendingFinalizers();
GC.Collect();
var sw = Stopwatch.StartNew();
RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: s + 1);
sw.Stop();
times[s] = sw.Elapsed.TotalMilliseconds;
}
// Capture min/max/sum/sumSq BEFORE sort to avoid order ambiguity (Array.Sort is in-place).
var minMs = double.MaxValue;
var maxMs = double.MinValue;
var sum = 0.0;
var sumSq = 0.0;
for (var i = 0; i < times.Length; i++)
{
var t = times[i];
sum += t;
sumSq += t * t;
if (t < minMs) minMs = t;
if (t > maxMs) maxMs = t;
}
// Population stddev (not sample-stddev — we treat the captured samples as the population for
// CV computation). variance = E[X²] - E[X]² with Math.Max(0, ...) guard against tiny negative
// values from FP rounding when samples are nearly identical.
var mean = sum / times.Length;
var variance = (sumSq / times.Length) - (mean * mean);
var stdDevMs = Math.Sqrt(Math.Max(0.0, variance));
Array.Sort(times);
// Median: middle value for odd sample counts, average of two middles for even counts.
var medianMs = samples % 2 == 1 ? times[samples / 2] : (times[samples / 2 - 1] + times[samples / 2]) / 2.0;
EndProgress(progressLabel, medianMs);
return (medianMs, minMs, maxMs, stdDevMs);
}
/// <summary>
/// Per-cell adaptive iteration calibration. Runs a 100-iter measurement after warmup and computes
/// how many iterations are needed to reach <see cref="Configuration.TargetSampleMs"/> wall-clock per sample.
/// Returns iter rounded UP to the nearest 1000, floored at 1000 (the prior fixed minimum) and
/// ceiling-capped at 200_000 (sanity bound for pathologically fast ops). In Debug single-sample mode
/// (<c>Configuration.BenchmarkSamples &lt;= 1</c>) returns the global <see cref="Configuration.TestIterations"/> unchanged —
/// calibration overhead is unjustified there. Calibration runs OUTSIDE the timed sample loop and
/// does NOT count toward warmup; its sole purpose is to measure per-op cost.
/// </summary>
internal static int CalibrateIterations(Action action, int targetMs)
{
if (Configuration.BenchmarkSamples <= 1) return Configuration.TestIterations; // Debug fast path
GC.Collect();
GC.WaitForPendingFinalizers();
GC.Collect();
const int calibIter = 100;
var sw = Stopwatch.StartNew();
for (var i = 0; i < calibIter; i++) action();
sw.Stop();
var ms = sw.Elapsed.TotalMilliseconds;
// Pathologically-fast op below Stopwatch resolution — cap at ceiling (further calibration won't help).
if (ms <= 0.0001) return 200_000;
var iterPerMs = calibIter / ms;
var raw = (int)Math.Ceiling(targetMs * iterPerMs);
// Round UP to nearest 1000 — keeps numbers human-readable in the markdown output.
var rounded = ((raw + 999) / 1000) * 1000;
return rounded switch
{
< 1000 => 1000,
> 200_000 => 200_000,
_ => rounded
};
}
/// <summary>
/// Measures per-call allocation in bytes after a clean GC. Single dedicated sample (no median) — keeps
/// timing samples pure. When <paramref name="processWide"/> is <c>true</c>, uses
/// <see cref="GC.GetTotalAllocatedBytes"/> instead of <see cref="GC.GetAllocatedBytesForCurrentThread"/>
/// — needed for round-trip-only benchmarks (NamedPipe etc.) where the work happens across multiple
/// threads (server-side <c>new byte[len]</c> buffers, drain-pump-thread allocations). Per-thread mode
/// is slightly cleaner for in-memory benchmarks; process-wide mode is slightly noisier (background
/// threads / GC bookkeeping leak in) but over 1000 iterations the signal dominates.
/// </summary>
internal static long MeasureAllocation(Action action, int iterations, string? progressLabel = null, bool processWide = false)
{
GC.Collect();
GC.WaitForPendingFinalizers();
GC.Collect();
var sw = Stopwatch.StartNew();
var before = processWide ? GC.GetTotalAllocatedBytes(precise: true) : GC.GetAllocatedBytesForCurrentThread();
RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
var after = processWide ? GC.GetTotalAllocatedBytes(precise: true) : GC.GetAllocatedBytesForCurrentThread();
sw.Stop();
EndProgress(progressLabel, sw.Elapsed.TotalMilliseconds);
return (after - before) / iterations;
}
// ============================================================================================
// Progress reporting — \r-driven in-place updates so a stuck benchmark surfaces the exact phase
// and % where it stopped, instead of appearing as a silent hang. Used by RunTimed and the
// MeasureAllocation* helpers when the caller passes a non-null progressLabel.
// ============================================================================================
// Tracks the longest line written by the current progress session, so EndProgress can clear
// any leftover characters from a prior longer line (avoids "ghost" trailing chars after \r).
private static int _progressLastLineLen;
/// <summary>
/// Runs <paramref name="action"/> <paramref name="iterations"/> times, emitting \r-overwriting
/// progress every ~10% (approx. 10 progress prints per sample). When <paramref name="label"/>
/// is null, runs without any progress output (zero overhead beyond a null check per iter).
/// </summary>
private static void RunWithProgress(Action action, int iterations, string? label, int samples, int sampleIndex)
{
if (label is null)
{
for (var i = 0; i < iterations; i++) action();
return;
}
// ~10 progress emits per sample run. Avoid emitting on every iter (Console.Write is
// expensive enough to skew sub-µs benchmarks if overdone).
var step = Math.Max(1, iterations / 10);
for (var i = 0; i < iterations; i++)
{
action();
if ((i + 1) % step == 0 || i == iterations - 1)
{
var pct = (int)((i + 1) * 100L / iterations);
var line = samples > 1
? $" > {label} sample {sampleIndex + 1}/{samples} {pct,3}% ({i + 1}/{iterations})"
: $" > {label} {pct,3}% ({i + 1}/{iterations})";
System.Console.Write('\r');
System.Console.Write(line);
if (line.Length < _progressLastLineLen)
System.Console.Write(new string(' ', _progressLastLineLen - line.Length));
_progressLastLineLen = line.Length;
}
}
}
/// <summary>
/// Closes a progress line cleanly: clears any leftover chars and writes a final "done" line on
/// the same row, terminated by \n so subsequent <c>WriteLine</c> calls render below.
/// </summary>
private static void EndProgress(string? label, double elapsedMs)
{
if (label is null) return;
var done = $" > {label} done in {elapsedMs,7:F1} ms";
System.Console.Write('\r');
System.Console.Write(done);
if (done.Length < _progressLastLineLen)
System.Console.Write(new string(' ', _progressLastLineLen - done.Length));
System.Console.WriteLine();
_progressLastLineLen = 0;
}
#if !AYCODE_NATIVEAOT
private static readonly JsonSerializerOptions VerifyJsonOpts = new()
{
WriteIndented = false,
DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull,
ReferenceHandler = System.Text.Json.Serialization.ReferenceHandler.IgnoreCycles
};
#endif
/// <summary>
/// Round-trip equality check: serialize both via System.Text.Json (canonical form) and compare strings.
/// Slower than property-by-property compare, but universal — works for any object graph without custom comparer.
/// </summary>
/// <remarks>
/// AOT publish skip: <c>System.Text.Json</c>'s reflection path uses runtime closed-generic instantiation
/// (<c>JsonPropertyInfo&lt;TestStatus&gt;</c> et al.) that the trimmer drops, causing
/// <c>NotSupportedException: missing native code or metadata</c>. The validation is JIT-only — the actual
/// benchmark Serialize/Deserialize loops don't touch this path. Under AOT we return <c>true</c> so all
/// <c>VerifyRoundTrip()</c> calls pass without running the cross-format validation.
/// </remarks>
internal static bool DeepEqualsViaJson(object? a, object? b)
{
#if AYCODE_NATIVEAOT
// Skip cross-format validation under AOT — STJ reflection path is incompatible. The roundtrip
// itself still runs (caller-side Serialize+Deserialize), just the JSON-canonical compare is bypassed.
return true;
#else
if (a == null && b == null) return true;
if (a == null || b == null) return false;
var jsonA = JsonSerializer.Serialize(a, VerifyJsonOpts);
var jsonB = JsonSerializer.Serialize(b, VerifyJsonOpts);
return jsonA == jsonB;
#endif
}
/// <summary>
/// Validates MemoryPack setup at startup. Aborts the benchmark if TestOrder_All_True is not [MemoryPackable].
/// Without this attribute, MemoryPack falls back to runtime resolver (slower) — comparison would be INVALID.
/// </summary>
internal static void ValidateMemoryPackSetup()
{
var typesToCheck = new[] { typeof(TestOrder_All_True) };
foreach (var type in typesToCheck)
{
var hasAttr = type.GetCustomAttributes(typeof(MemoryPackableAttribute), inherit: true).Any();
if (!hasAttr)
{
System.Console.Error.WriteLine($"❌ FATAL: {type.FullName} is not [MemoryPackable] — MemoryPack would fall back to runtime resolver, comparison is INVALID for SGen-vs-SGen claim.");
System.Console.Error.WriteLine("Add [MemoryPackable] to the type and any nested types referenced from it.");
Environment.Exit(1);
}
}
}
/// <summary>
/// Filters test data sets by layer keyword. Layered approach lets you run only what's needed for the iteration cadence.
/// P1: only "Core" data exists (Small/Medium/Large/Repeated/Deep). Comprehensive and Edge layers will be expanded in P2.
/// </summary>
internal static List<TestDataSet> FilterByLayer(List<TestDataSet> all, BenchmarkLayer layer)
{
if (layer == BenchmarkLayer.All) return all.ToList();
var coreNames = new[] { "Small", "Medium", "Large", "Repeated", "Deep" };
// P2 will add: "Flat", "Polymorphic", "Collection", "Numeric", "NonAscii", etc.
var comprehensiveExtras = new string[] { /* P2 */ };
// P3 will add: "ColdStart", "VeryLarge", "PathologicalString", etc.
var edgeExtras = new string[] { /* P3 */ };
return layer switch
{
BenchmarkLayer.Core => all.Where(t => StartsWithAny(t.Name, coreNames)).ToList(),
BenchmarkLayer.Comprehensive => all.Where(t => StartsWithAny(t.Name, coreNames) || StartsWithAny(t.Name, comprehensiveExtras)).ToList(),
BenchmarkLayer.Edge => all.Where(t => StartsWithAny(t.Name, coreNames) || StartsWithAny(t.Name, comprehensiveExtras) || StartsWithAny(t.Name, edgeExtras)).ToList(),
// Single-cell A/B mini-suite filters — match by case-insensitive prefix on Name.
// Use case: tight optimization-iteration loop on one specific cell (e.g. `dotnet run -- repeated`
// or interactive menu shortcut), avoiding the full ~110 sec suite when only one cell is in scope.
BenchmarkLayer.Small => all.Where(t => t.Name.StartsWith("Small", StringComparison.OrdinalIgnoreCase)).ToList(),
BenchmarkLayer.Medium => all.Where(t => t.Name.StartsWith("Medium", StringComparison.OrdinalIgnoreCase)).ToList(),
BenchmarkLayer.Large => all.Where(t => t.Name.StartsWith("Large", StringComparison.OrdinalIgnoreCase)).ToList(),
BenchmarkLayer.Repeated => all.Where(t => t.Name.StartsWith("Repeated", StringComparison.OrdinalIgnoreCase)).ToList(),
BenchmarkLayer.Deep => all.Where(t => t.Name.StartsWith("Deep", StringComparison.OrdinalIgnoreCase)).ToList(),
_ => all.ToList()
};
static bool StartsWithAny(string name, string[] prefixes) => prefixes.Any(name.StartsWith);
}
}