889 lines
55 KiB
C#
889 lines
55 KiB
C#
using AyCode.Core.Benchmarks.Reporting;
|
||
using AyCode.Core.Benchmarks.Workloads.Scenarios;
|
||
using AyCode.Core.Serializers.Binaries;
|
||
using AyCode.Core.Tests.TestModels;
|
||
using MemoryPack;
|
||
using System.Diagnostics;
|
||
using System.Runtime.CompilerServices;
|
||
|
||
namespace AyCode.Core.Serializers.Console;
|
||
|
||
/// <summary>
|
||
/// Benchmark execution: end-to-end orchestration (<see cref="RunBenchmark"/>), per-cell loop
|
||
/// (<see cref="RunBenchmarksForTestData"/>), serializer factory (<see cref="CreateSerializers"/>),
|
||
/// and the timing / calibration / allocation helpers. Pure benchmark-execution infrastructure —
|
||
/// no display formatting (that lives in <c>Output</c>) and no UX-flow (that lives in <c>Program</c>
|
||
/// + <c>Menu</c>).
|
||
/// </summary>
|
||
internal static class BenchmarkLoop
|
||
{
|
||
/// <summary>
/// Runs the benchmark suite end-to-end for the given configuration: process stabilization →
/// global JIT pre-warmup → per-cell warmup + measurement → grouped results print → save to disk.
/// Used by both the CLI and interactive menu paths; the interactive loop calls this repeatedly
/// without restarting the process, so every process-wide change made here is undone before returning.
/// </summary>
/// <param name="layer">Layer filter handed to <c>FilterByLayer</c> to select the test cells to run.</param>
/// <param name="opMode">Which phases to measure; forwarded to <see cref="RunBenchmarksForTestData"/>.</param>
/// <param name="serializerMode">Which serializer set to build; forwarded to <see cref="CreateSerializers"/>.</param>
internal static void RunBenchmark(BenchmarkLayer layer, BenchmarkOpMode opMode, SerializerSelectionMode serializerMode)
{
    System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗");
    System.Console.WriteLine("║ COMPREHENSIVE SERIALIZER BENCHMARK SUITE ║");
    System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");

    // Stabilization: pin the entire benchmark process to a single logical CPU and bump the priority
    // class. Single-core affinity stops the OS from migrating the bench thread between cores
    // mid-sample (a migration evicts L1/L2 caches and corrupts a measurement); High priority
    // reduces preemption by background tasks that otherwise randomly inflate samples.
    // Skipped in single-sample mode (Configuration.BenchmarkSamples <= 1) where stabilization is moot.
    //
    // The Process object is IDisposable; this method is invoked repeatedly from the interactive
    // menu, so release it deterministically instead of waiting for the finalizer.
    using var process = Process.GetCurrentProcess();
    var origAffinity = IntPtr.Zero;
    var origPriority = ProcessPriorityClass.Normal;

    // Tracked separately: the two setters can fail independently, and whatever WAS changed must
    // be restored even when the other step threw.
    var affinityPinned = false;
    var priorityRaised = false;

    // ProcessorAffinity is only supported on Windows + Linux (CA1416). On every other platform
    // (e.g. macOS) the whole stabilization step — affinity AND priority — is skipped.
    if (Configuration.BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
    {
        try
        {
            origAffinity = process.ProcessorAffinity;
            origPriority = process.PriorityClass;

            // Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one
            // core, consistently" — not which one. The user can tweak the mask here if CPU 0 is
            // heavily contended on the host.
            process.ProcessorAffinity = (IntPtr)1;
            affinityPinned = true;

            process.PriorityClass = ProcessPriorityClass.High;
            priorityRaised = true;

            System.Console.WriteLine("Stabilization: pinned to CPU 0 (affinity=0x1), priority=High.");
        }
        catch (Exception ex)
        {
            // Affinity/priority changes may fail on locked-down hosts (group policies, containers
            // without CAP_SYS_NICE on Linux, etc.). Surface and continue — the benchmark still
            // works, just with the platform default scheduling.
            System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}");

            // Partial failure (affinity set, priority rejected): undo the pin right away so the
            // run really does use default scheduling, as the message above claims.
            if (affinityPinned)
            {
                try
                {
                    process.ProcessorAffinity = origAffinity;
                    affinityPinned = false;
                }
                catch
                {
                    /* best-effort; the finally block below retries */
                }
            }
        }
    }

    try
    {
        var allResults = new List<BenchmarkResult>();
        var allTestDataSets = BuildMultiVariantTestDataSets();
        var testDataSets = FilterByLayer(allTestDataSets, layer);

        System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {Configuration.GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{Configuration.TargetSampleMs} ms target) | Warmup: {Configuration.WarmupIterations} per phase (Ser/Des isolated) | Samples: {Configuration.BenchmarkSamples} (median) + pilot discard");
        System.Console.WriteLine($"Build: {Configuration.BuildConfiguration} | .NET: {Environment.Version} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
        System.Console.WriteLine();

        // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing
        // happens. Without this, the FIRST test data measured carries JIT-tier-promotion latency:
        // the per-cell warmup alone doesn't ensure every Serialize<T>/IBufferWriter overload is
        // fully promoted by the time measuring starts. Pre-warmup runs every overload with each
        // data shape; the per-cell warmup that follows then locks in cache + branch state.
        if (Configuration.BenchmarkSamples > 1) // skip in single-sample fast-iteration mode
        {
            System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");

            foreach (var testData in testDataSets)
            {
                var preSerializers = CreateSerializers(testData, serializerMode);

                try
                {
                    foreach (var s in preSerializers)
                    {
                        // Light, phase-isolated warmup: Ser path first, then Des path — same
                        // pattern as the per-cell warmup in RunBenchmarksForTestData.
                        s.WarmupSerialize(2000);
                        s.WarmupDeserialize(2000);
                    }
                }
                finally
                {
                    // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
                    foreach (var s in preSerializers)
                    {
                        (s as IDisposable)?.Dispose();
                    }
                }
            }

            // Let background tiered-JIT compilation drain before we begin measuring.
            if (Configuration.JitSleep > 0)
            {
                Thread.Sleep(Configuration.JitSleep);
            }

            System.Console.WriteLine("✓ Global pre-warmup complete.\n");
        }

        // 70-character horizontal rule framing each test-data header.
        var rule = new string('═', 70);

        foreach (var testData in testDataSets)
        {
            System.Console.WriteLine($"\n{rule}");
            System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
            System.Console.WriteLine(rule);

            var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
            allResults.AddRange(results);
        }

        // Build the reporting context (results directory + snapshot of the run configuration).
        var ctx = new ReportingContext(
            SourceTag: "Console",
            ResultsDirectory: ReportingContext.ResolveResultsDirectory(),
            BuildConfiguration: Configuration.BuildConfiguration,
            Utf8NoBom: Configuration.Utf8NoBom,
            CharsetName: Configuration.GetCurrentCharsetName(),
            WarmupIterations: Configuration.WarmupIterations,
            BenchmarkSamples: Configuration.BenchmarkSamples,
            TargetSampleMs: Configuration.TargetSampleMs,
            UnstableCVThreshold: Configuration.UnstableCVThreshold,
            MicroOptCVThreshold: Configuration.MicroOptCVThreshold);

        // Print grouped results
        BenchmarkReportWriter.PrintGroupedResults(allResults, testDataSets);

        // Save results to file
        BenchmarkReportWriter.SaveAll(ctx, allResults, testDataSets);

        System.Console.WriteLine("\n✓ Benchmark complete!");
    }
    finally
    {
        // Restore process state — affinity/priority changes are process-wide and persist across
        // interactive-mode iterations of the menu. Without restore, the next menu run would
        // already be on CPU 0 + High priority before applying its own stabilization.
        if (affinityPinned && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
        {
            try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
        }

        if (priorityRaised)
        {
            try { process.PriorityClass = origPriority; } catch { /* best-effort */ }
        }
    }
}
|
||
|
||
/// <summary>
/// Benchmarks every serializer built by <see cref="CreateSerializers"/> for one test-data cell:
/// round-trip verification first, then a phase-isolated warmup → calibrate → time → allocation
/// cycle per serializer. Each finished row is printed as soon as it is available.
/// </summary>
/// <param name="testData">The test-data cell to benchmark.</param>
/// <param name="mode">Selects which phases run (All, Serialize only, or Deserialize only).</param>
/// <param name="serializerMode">Serializer selection forwarded to <see cref="CreateSerializers"/>.</param>
/// <returns>One <c>BenchmarkResult</c> per serializer, in creation order.</returns>
/// <remarks>
/// Terminates the process via <c>Environment.Exit(1)</c> when any serializer fails round-trip
/// verification. Serializers that implement <see cref="IDisposable"/> are disposed before the
/// method returns, including when a benchmark throws.
/// </remarks>
private static List<BenchmarkResult> RunBenchmarksForTestData(TestDataSet testData, BenchmarkOpMode mode, SerializerSelectionMode serializerMode)
{
    var results = new List<BenchmarkResult>();
    var serializers = CreateSerializers(testData, serializerMode);

    try
    {
        // Round-trip correctness check — once per (cell × serializer), BEFORE warmup.
        // Aborts the entire benchmark on failure.
        System.Console.WriteLine("Verifying round-trip correctness...");

        foreach (var serializer in serializers)
        {
            if (!serializer.VerifyRoundTrip())
            {
                System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}");
                System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting.");

                Environment.Exit(1);
            }
        }

        System.Console.WriteLine("✓ All serializers passed round-trip verification.");

        // Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: each serializer's
        // Ser-path and Des-path get isolated warmup→measure rounds, with a GC collect at every
        // phase boundary.
        //
        // Why phase-isolation: a combined (interleaved) warmup leaves instruction cache and
        // branch-predictor state as a compromise between both code paths, so the first measured
        // phase pays for displacing the other's leftovers. Isolated pairs keep only the measured
        // path hot.
        //
        // GC collect at every boundary: removes residual allocation pressure from the previous
        // phase so the next phase starts from a quiescent heap.
        //
        // Configuration.JitSleep after each warmup: lets background tiered-JIT promotion drain
        // before timing starts.
        System.Console.WriteLine($"Running benchmarks (target ~{Configuration.TargetSampleMs} ms/sample × {Configuration.BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n");

        foreach (var serializer in serializers)
        {
            var result = new BenchmarkResult
            {
                TestDataName = testData.DisplayName, // Use DisplayName for IId% info
                Engine = serializer.Engine,
                IoMode = serializer.IoMode,
                DispatchMode = serializer.DispatchMode,
                OptionsPreset = serializer.OptionsPreset,
                OrderTypeName = serializer.OrderTypeName,
                OptionsDescription = serializer.OptionsDescription,
                SerializedSize = serializer.SerializedSize,
                SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes,
                SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes,
                IsRoundTripOnly = serializer.IsRoundTripOnly
            };

            // Progress label identifying the row, so a stuck benchmark is visibly stuck on a
            // specific serializer rather than silently hanging.
            var groupLabel = $"{result.SerializerName}";

            if (serializer.IsRoundTripOnly)
            {
                // Round-trip-only benchmarks: single phase — Serialize() performs the full round
                // trip. The Ser-phase entry points warm and measure the whole path; numbers are
                // recorded into the RT result columns.
                if (mode is BenchmarkOpMode.All or BenchmarkOpMode.Serialize)
                {
                    ForceGcCollect();
                    serializer.WarmupSerialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0)
                    {
                        Thread.Sleep(Configuration.JitSleep);
                    }

                    var rtIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
                    var (rtMed, rtMin, rtMax, rtStd) = RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");
                    result.RoundTripTimeMs = rtMed;
                    result.RoundTripTimeMinMs = rtMin;
                    result.RoundTripTimeMaxMs = rtMax;
                    result.RoundTripTimeStdDevMs = rtStd;
                    result.RoundTripIterations = rtIter;

                    // Process-wide allocation measurement so allocations made on the server-side
                    // drain thread are counted too, not just the calling thread's.
                    result.RoundTripAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]", processWide: true);
                }

                // mode == BenchmarkOpMode.Deserialize alone is meaningless for a round-trip-only
                // benchmark; skip silently.
            }
            else
            {
                // ── Ser phase ── GC collect → isolated warmup → JitSleep → calibrate → time → alloc.
                if (mode is BenchmarkOpMode.All or BenchmarkOpMode.Serialize)
                {
                    ForceGcCollect();
                    serializer.WarmupSerialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0)
                    {
                        Thread.Sleep(Configuration.JitSleep);
                    }

                    var serIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
                    var (serMed, serMin, serMax, serStd) = RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");
                    result.SerializeTimeMs = serMed;
                    result.SerializeTimeMinMs = serMin;
                    result.SerializeTimeMaxMs = serMax;
                    result.SerializeTimeStdDevMs = serStd;
                    result.SerializeIterations = serIter;

                    // Dedicated alloc-only sample (separate from timing samples; keeps timing pure)
                    result.SerializeAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
                }

                // ── Des phase ── GC collect → isolated warmup → JitSleep → calibrate → time → alloc.
                // The GC collect discards the Ser phase's leftovers so the Des-phase allocation
                // measurement reflects only Des-side allocations.
                if (mode is BenchmarkOpMode.All or BenchmarkOpMode.Deserialize)
                {
                    ForceGcCollect();
                    serializer.WarmupDeserialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0)
                    {
                        Thread.Sleep(Configuration.JitSleep);
                    }

                    var desIter = CalibrateIterations(() => serializer.Deserialize(), Configuration.TargetSampleMs);
                    var (desMed, desMin, desMax, desStd) = RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");
                    result.DeserializeTimeMs = desMed;
                    result.DeserializeTimeMinMs = desMin;
                    result.DeserializeTimeMaxMs = desMax;
                    result.DeserializeTimeStdDevMs = desStd;
                    result.DeserializeIterations = desIter;
                    result.DeserializeAllocBytesPerOp = MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]");
                }

                // Compose RT from Ser+Des. Ser and Des may have DIFFERENT iteration counts after
                // calibration, so adding batch times would be misleading. Instead: compute per-op
                // µs (iteration-independent), then synthesize RoundTripTimeMs against
                // RoundTripIterations = max(serIter, desIter).
                // NOTE(review): when only one phase ran, the other phase's time/iteration fields
                // are never assigned here — confirm ToPerOpMicros tolerates that input.
                var serPerOp = BenchmarkReportWriter.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations);
                var desPerOp = BenchmarkReportWriter.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations);
                var rtPerOp = serPerOp + desPerOp;
                result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations);
                result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
                result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
            }

            results.Add(result);
            BenchmarkReportWriter.PrintResult(result);
        }
    }
    finally
    {
        // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources
        // that must be released before the next test data builds new ones). Done in a finally so
        // a throwing benchmark does not leak pipes / handles across test cells.
        foreach (var s in serializers)
        {
            (s as IDisposable)?.Dispose();
        }
    }

    return results;
}
|
||
|
||
/// <summary>
/// Phase 2 multi-variant test-data builder. Constructs each cell in both the _All_False and
/// _All_True families, then cross-registers the _All_True order on the matching _All_False
/// data set so <see cref="CreateSerializers"/> can pick the variant per AcBinary options preset.
/// </summary>
/// <remarks>
/// Memory cost: ~600 KB across 5 cells (Large dominates at ~340 KB for both variants). The two
/// families are built independently by their own providers. The returned list contains the
/// _All_False data sets (the cell primaries): <see cref="CreateSerializers"/> feeds the
/// MemoryPack benchmark the _All_False order, while AcBinary's variant is preset-dependent.
/// </remarks>
/// <returns>The _All_False data sets, each with its _All_True counterpart registered as a variant.</returns>
/// <exception cref="InvalidOperationException">
/// The two providers returned a different number of cells, so ordinal pairing is not possible.
/// </exception>
private static List<TestDataSet> BuildMultiVariantTestDataSets()
{
    var allFalse = BenchmarkTestDataProvider_All_False.CreateTestDataSets();
    var allTrue = BenchmarkTestDataProvider.CreateTestDataSets();

    // The pairing below is purely positional, so a count mismatch would either index past the
    // end of one list or silently leave cells without a variant. Fail fast with a clear message.
    if (allFalse.Count != allTrue.Count)
    {
        throw new InvalidOperationException(
            $"Test-data providers disagree on cell count: _All_False={allFalse.Count}, _All_True={allTrue.Count}.");
    }

    // Zip by ordinal — both providers are expected to emit the same cells in the same order.
    for (var i = 0; i < allFalse.Count; i++)
    {
        var falseDs = (TestDataSet<TestOrder_All_False>)allFalse[i];
        var trueDs = (TestDataSet<TestOrder_All_True>)allTrue[i];
        falseDs.RegisterVariant(trueDs.Order);
    }

    return allFalse;
}
|
||
|
||
/// <summary>
/// Phase 2 variant dispatch rule for AcBinary. Returns <c>true</c> — meaning the preset is fed the
/// <c>TestOrder_All_False</c> graph — only when every checked feature is off: string interning is
/// <c>None</c>, reference handling is <c>None</c>, metadata is disabled and no property filter is set.
/// As soon as one of those is enabled the preset is promoted to <c>TestOrder_All_True</c>.
/// WireMode, SGen mode and Compression are encoding-axis options and are intentionally NOT
/// consulted here.
/// </summary>
/// <param name="options">The AcBinary options preset to classify.</param>
/// <returns><c>true</c> for the _All_False variant; <c>false</c> for the _All_True variant.</returns>
private static bool UsesAllFalseVariant(AcBinarySerializerOptions options)
{
    // Each guard rejects on the first enabled feature, checked in a fixed order.
    if (options.UseStringInterning != StringInterningMode.None)
    {
        return false;
    }

    if (options.ReferenceHandling != ReferenceHandlingMode.None)
    {
        return false;
    }

    if (options.UseMetadata)
    {
        return false;
    }

    return options.PropertyFilter == null;
}
|
||
|
||
// Per-class factory helpers — each returns ISerializerBenchmark closed over the variant T
|
||
// selected by UsesAllFalseVariant(options). Compile-time T at the new T() call site preserves
|
||
// SGen apples-to-apples (no runtime reflection, no type erasure across the JIT boundary).
|
||
/// <summary>
/// Builds the AcBinary Byte[] benchmark for <paramref name="td"/>, closed over the order type
/// chosen by <see cref="UsesAllFalseVariant"/> for <paramref name="opt"/>.
/// </summary>
/// <param name="td">Test-data cell supplying the order graph.</param>
/// <param name="opt">AcBinary options; also decides which order variant is used.</param>
/// <param name="preset">Preset label passed through to the benchmark.</param>
private static ISerializerBenchmark MakeAcBinary(TestDataSet td, AcBinarySerializerOptions opt, string preset)
{
    if (UsesAllFalseVariant(opt))
    {
        return new AcBinaryBenchmark<TestOrder_All_False>(td.GetOrder<TestOrder_All_False>(), opt, preset);
    }

    return new AcBinaryBenchmark<TestOrder_All_True>(td.GetOrder<TestOrder_All_True>(), opt, preset);
}
|
||
|
||
/// <summary>
/// Builds the AcBinary buffer-writer benchmark for <paramref name="td"/>, closed over the order
/// type chosen by <see cref="UsesAllFalseVariant"/> for <paramref name="opt"/>.
/// </summary>
/// <param name="td">Test-data cell supplying the order graph.</param>
/// <param name="opt">AcBinary options; also decides which order variant is used.</param>
/// <param name="preset">Preset label passed through to the benchmark.</param>
private static ISerializerBenchmark MakeAcBinaryBufferWriter(TestDataSet td, AcBinarySerializerOptions opt, string preset)
{
    if (UsesAllFalseVariant(opt))
    {
        return new AcBinaryBufferWriterBenchmark<TestOrder_All_False>(td.GetOrder<TestOrder_All_False>(), opt, preset);
    }

    return new AcBinaryBufferWriterBenchmark<TestOrder_All_True>(td.GetOrder<TestOrder_All_True>(), opt, preset);
}
|
||
|
||
/// <summary>
/// Builds the AcBinary fresh-buffer-writer benchmark for <paramref name="td"/>, closed over the
/// order type chosen by <see cref="UsesAllFalseVariant"/> for <paramref name="opt"/>.
/// </summary>
/// <param name="td">Test-data cell supplying the order graph.</param>
/// <param name="opt">AcBinary options; also decides which order variant is used.</param>
/// <param name="preset">Preset label passed through to the benchmark.</param>
private static ISerializerBenchmark MakeAcBinaryFreshBufferWriter(TestDataSet td, AcBinarySerializerOptions opt, string preset)
{
    if (UsesAllFalseVariant(opt))
    {
        return new AcBinaryFreshBufferWriterBenchmark<TestOrder_All_False>(td.GetOrder<TestOrder_All_False>(), opt, preset);
    }

    return new AcBinaryFreshBufferWriterBenchmark<TestOrder_All_True>(td.GetOrder<TestOrder_All_True>(), opt, preset);
}
|
||
|
||
/// <summary>
/// Builds the AcBinary named-pipe benchmark for <paramref name="td"/>, closed over the order type
/// chosen by <see cref="UsesAllFalseVariant"/> for <paramref name="opt"/>.
/// </summary>
/// <param name="td">Test-data cell supplying the order graph.</param>
/// <param name="opt">AcBinary options; also decides which order variant is used.</param>
/// <param name="preset">Preset label passed through to the benchmark.</param>
private static ISerializerBenchmark MakeAcBinaryNamedPipe(TestDataSet td, AcBinarySerializerOptions opt, string preset)
{
    if (UsesAllFalseVariant(opt))
    {
        return new AcBinaryNamedPipeBenchmark<TestOrder_All_False>(td.GetOrder<TestOrder_All_False>(), opt, preset);
    }

    return new AcBinaryNamedPipeBenchmark<TestOrder_All_True>(td.GetOrder<TestOrder_All_True>(), opt, preset);
}
|
||
|
||
/// <summary>
/// Builds the AcBinary named-pipe raw-byte-array benchmark for <paramref name="td"/>, closed over
/// the order type chosen by <see cref="UsesAllFalseVariant"/> for <paramref name="opt"/>.
/// </summary>
/// <param name="td">Test-data cell supplying the order graph.</param>
/// <param name="opt">AcBinary options; also decides which order variant is used.</param>
/// <param name="preset">Preset label passed through to the benchmark.</param>
private static ISerializerBenchmark MakeAcBinaryNamedPipeRaw(TestDataSet td, AcBinarySerializerOptions opt, string preset)
{
    if (UsesAllFalseVariant(opt))
    {
        return new AcBinaryNamedPipeRawByteArrayBenchmark<TestOrder_All_False>(td.GetOrder<TestOrder_All_False>(), opt, preset);
    }

    return new AcBinaryNamedPipeRawByteArrayBenchmark<TestOrder_All_True>(td.GetOrder<TestOrder_All_True>(), opt, preset);
}
|
||
|
||
/// <summary>
/// Builds the AcBinary in-memory-pipe benchmark for <paramref name="td"/>, closed over the order
/// type chosen by <see cref="UsesAllFalseVariant"/> for <paramref name="opt"/>.
/// </summary>
/// <param name="td">Test-data cell supplying the order graph.</param>
/// <param name="opt">AcBinary options; also decides which order variant is used.</param>
/// <param name="preset">Preset label passed through to the benchmark.</param>
private static ISerializerBenchmark MakeAcBinaryInMemoryPipe(TestDataSet td, AcBinarySerializerOptions opt, string preset)
{
    if (UsesAllFalseVariant(opt))
    {
        return new AcBinaryInMemoryPipeBenchmark<TestOrder_All_False>(td.GetOrder<TestOrder_All_False>(), opt, preset);
    }

    return new AcBinaryInMemoryPipeBenchmark<TestOrder_All_True>(td.GetOrder<TestOrder_All_True>(), opt, preset);
}
|
||
|
||
/// <summary>
/// Builds the AcBinary in-memory raw-byte-array benchmark for <paramref name="td"/>, closed over
/// the order type chosen by <see cref="UsesAllFalseVariant"/> for <paramref name="opt"/>.
/// </summary>
/// <param name="td">Test-data cell supplying the order graph.</param>
/// <param name="opt">AcBinary options; also decides which order variant is used.</param>
/// <param name="preset">Preset label passed through to the benchmark.</param>
private static ISerializerBenchmark MakeAcBinaryInMemoryRaw(TestDataSet td, AcBinarySerializerOptions opt, string preset)
{
    if (UsesAllFalseVariant(opt))
    {
        return new AcBinaryInMemoryRawByteArrayBenchmark<TestOrder_All_False>(td.GetOrder<TestOrder_All_False>(), opt, preset);
    }

    return new AcBinaryInMemoryRawByteArrayBenchmark<TestOrder_All_True>(td.GetOrder<TestOrder_All_True>(), opt, preset);
}
|
||
|
||
private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, SerializerSelectionMode serializerMode)
|
||
{
|
||
// Phase 2 variant dispatch (refined): AcBinary picks variant per UsesAllFalseVariant(options).
|
||
// MemPack / MsgPack canonically use _All_False (no AcBinary opt-in/opt-out axis — both
|
||
// produce identical MemPack/MsgPack wire on either variant since their contract is family-
|
||
// agnostic). `orderFalse` is the cell primary; `orderTrue` is fetched on-demand by the AcBinary
|
||
// factory helpers when an options preset has a "true" flag.
|
||
var orderFalse = testData.GetOrder<TestOrder_All_False>();
|
||
|
||
// FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path.
|
||
// TWO benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + MemoryPack Byte[].
|
||
// - Compact: smallest wire, UTF-8 encode/decode CPU cost vs MemPack head-to-head.
|
||
// Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min.
|
||
//
|
||
// FastWire row (UTF-16 raw memcpy) commented out for the current optimization sprint —
|
||
// we are tuning Compact mode against MemPack directly; FastWire was used as a noise-floor
|
||
// reference earlier. Re-enable when revisiting Fast wire-mode performance.
|
||
if (serializerMode == SerializerSelectionMode.FastestByte)
|
||
{
|
||
var fastestByteOptions = AcBinarySerializerOptions.FastMode;
|
||
fastestByteOptions.WireMode = Configuration.SelectedWireMode;
|
||
|
||
return new List<ISerializerBenchmark>
|
||
{
|
||
MakeAcBinary(testData, fastestByteOptions, "FastMode"),
|
||
//MakeAcBinary(testData, fastWireOptions, "FastMode (FastWire)"),
|
||
// MemPack uses _All_False (the AcBinary opt-in/opt-out axis doesn't apply — MemoryPackable
|
||
// serialises identical bytes either way; _All_False matches the orderFalse variant the test
|
||
// data factory already built, no extra graph allocation needed).
|
||
new MemoryPackBenchmark<TestOrder_All_False>(orderFalse, Configuration.SelectedWireMode, "Default"),
|
||
};
|
||
}
|
||
|
||
// AsyncPipe-only mode — return ONLY the AsyncPipe streaming benchmark (no other serializer).
|
||
// Streaming I/O has long-lived pipe setup + kernel-buffer overhead that, when interleaved with
|
||
// the standard byte-array / IBufferWriter measurements, masks the steady-state numbers. Run it
|
||
// in isolation so the timing numbers reflect ONLY the streaming path.
|
||
if (serializerMode == SerializerSelectionMode.AsyncPipe)
|
||
{
|
||
// NamedPipe — pipe-aligned chunk size for the long-lived IPC scenario. The chunkSize here
|
||
// drives the AsyncPipeWriterOutput's chunk-on-wire size (header + data, page-aligned thanks to
|
||
// the AcquireChunk fix) AND the kernel pipe buffer size (inBufferSize/outBufferSize on the
|
||
// NamedPipeServerStream ctor). Same value across both layers = one WriteFile(chunkSize) syscall
|
||
// fits blocking-free in one kernel pipe-buffer slot. Single source of truth for both app-level
|
||
// wire chunk AND kernel transfer unit; change ONLY this line when tuning.
|
||
var binaryFastModePipeChunkOnly = AcBinarySerializerOptions.FastMode;
|
||
binaryFastModePipeChunkOnly.BufferWriterChunkSize = Configuration.PipeChunkSize;
|
||
binaryFastModePipeChunkOnly.WireMode = Configuration.SelectedWireMode;
|
||
|
||
return new List<ISerializerBenchmark>
|
||
{
|
||
// Chunked-framed AsyncPipe: SerializeChunkedFramed + AsyncPipeReaderInput.DrainFromAsync.
|
||
// Measures the FULL streaming-I/O stack — wire framing + drain task + sliding-window buffer +
|
||
// MRES wait-on-byte-shortage — over a kernel NamedPipe.
|
||
MakeAcBinaryNamedPipe(testData, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"),
|
||
// Raw byte[] over NamedPipe (sync receive, no chunk-framing). Same kernel-pipe transport,
|
||
// same inBufferSize, but: serialize → byte[] → Stream.Write → Stream.Read → Deserialize<T>(byte[]).
|
||
// No drain task, no AsyncPipeReaderInput, no [201][UINT16][data]…[202] framing. Side-by-side with
|
||
// the chunked-row above this isolates AsyncPipe-framework-overhead (Δ vs raw) from
|
||
// kernel-transport-overhead (raw vs in-process Byte[]).
|
||
MakeAcBinaryNamedPipeRaw(testData, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
|
||
// Chunked-framed AsyncPipe over an IN-MEMORY System.IO.Pipelines.Pipe (NO NamedPipe, NO kernel).
|
||
// Same chunked-streaming code path (SerializeChunkedFramed → AsyncPipeReaderInput) but with the
|
||
// kernel-pipe replaced by a managed-only Pipe. Eliminates per-chunk syscall overhead (~30 µs/chunk
|
||
// on NamedPipe → ~1-2 µs/chunk on in-memory Pipe). Side-by-side with the NamedPipe row above this
|
||
// isolates pure CPU cost of the chunked-streaming framework (vs kernel-pipe transport cost) — the
|
||
// in-memory Pipe row should be much closer to the raw-byte[] row, validating that NamedPipe loopback
|
||
// is the worst-case benchmark scenario for chunked-streaming and not representative of real network
|
||
// / file / cross-thread Pipe scenarios.
|
||
MakeAcBinaryInMemoryPipe(testData, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"),
|
||
// Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport at all). Apples-to-apples
|
||
// baseline for the in-memory chunked row above: same in-memory transport (zero kernel), but raw
|
||
// byte[] vs chunked-streaming wire format. Completes the 2x2 matrix [chunked,raw] × [kernel,memory].
|
||
MakeAcBinaryInMemoryRaw(testData, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
|
||
};
|
||
}
|
||
|
||
// Standard mode — all serializers EXCEPT AsyncPipe (the streaming benchmark is opt-in via the
|
||
// AsyncPipe menu / CLI mode, never bundled with the steady-state suite).
|
||
|
||
var binaryNoInternOption = AcBinarySerializerOptions.Default;
|
||
binaryNoInternOption.UseStringInterning = StringInterningMode.None;
|
||
binaryNoInternOption.WireMode = Configuration.SelectedWireMode;
|
||
|
||
var binaryDefaultNoSgenOption = AcBinarySerializerOptions.Default;
|
||
binaryDefaultNoSgenOption.UseGeneratedCode = false;
|
||
binaryDefaultNoSgenOption.WireMode = Configuration.SelectedWireMode;
|
||
|
||
var binaryFastModeNoSgenOption = AcBinarySerializerOptions.FastMode;
|
||
binaryFastModeNoSgenOption.UseGeneratedCode = false;
|
||
binaryFastModeNoSgenOption.WireMode = Configuration.SelectedWireMode;
|
||
|
||
var binaryFastModeOption = AcBinarySerializerOptions.FastMode;
|
||
binaryFastModeOption.WireMode = Configuration.SelectedWireMode;
|
||
|
||
// BufWr new — 4 KB chunk size for the FRESH ArrayBufferWriter scenario. The chunkSize here drives
|
||
// the serializer's GetSpan(N) request → the ArrayBufferWriter's internal allocation per call.
|
||
// Small chunk = small per-call allocation, optimum for one-shot serialization where each iteration
|
||
// allocates a fresh ABW. Independent of the AsyncPipe profile (different mechanism: alloc overhead
|
||
// vs syscall count).
|
||
var binaryFastModeBufWrChunk = AcBinarySerializerOptions.FastMode;
|
||
binaryFastModeBufWrChunk.BufferWriterChunkSize = Configuration.PipeChunkSize;
|
||
binaryFastModeBufWrChunk.WireMode = Configuration.SelectedWireMode;
|
||
|
||
// In-memory Pipe variant — same 4 KB chunkSize as the AsyncPipe mode, no kernel-pipe alignment
|
||
// concern (managed slabs are not page-aligned anyway). Drives SerializeChunkedFramed via the in-memory
|
||
// System.IO.Pipelines.Pipe (zero-copy slab handoff between producer and drain task).
|
||
var binaryFastModePipeChunkInMem = AcBinarySerializerOptions.FastMode;
|
||
binaryFastModePipeChunkInMem.BufferWriterChunkSize = Configuration.PipeChunkSize;
|
||
binaryFastModePipeChunkInMem.WireMode = Configuration.SelectedWireMode;
|
||
|
||
var defaultOptions = AcBinarySerializerOptions.Default;
|
||
defaultOptions.UseStringInterning = StringInterningMode.None;
|
||
defaultOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId;
|
||
defaultOptions.WireMode = Configuration.SelectedWireMode;
|
||
|
||
return new List<ISerializerBenchmark>
|
||
{
|
||
// ============================================================
|
||
// AcBinary — Byte[] API (uncomment to compare option presets side-by-side)
|
||
// ============================================================
|
||
// Fastest Byte[] — SGen path (UseGeneratedCode=true, default).
|
||
MakeAcBinary(testData, binaryFastModeOption, "FastMode"),
|
||
// Fastest Byte[] — Runtime path (UseGeneratedCode=false). Same wire/options, no source-generated dispatch.
|
||
// Always paired with the SGen variant so every layer can compare the SGen speed-up apples-to-apples.
|
||
// NativeAOT-safe: AcSerializerCommon.Create*Getter/Setter falls back to reflection-based delegates
|
||
// when RuntimeFeature.IsDynamicCodeSupported is false (slower but works under AOT publish).
|
||
MakeAcBinary(testData, binaryFastModeNoSgenOption, "FastMode"),
|
||
// Default preset Byte[] — RefHandling=OnlyId (deduplicates IId-shared references on the wire) +
|
||
// UseStringInterning=All (deduplicates repeated strings). Showcases the Default preset's wire-size
|
||
// and CPU trade-off vs FastMode on the ~20% IId-ref / repeated-string test data.
|
||
|
||
// Default preset (ReferenceHandling=OnlyId + StringInterning) → _All_True graph.
|
||
// Phase 2 variant-dispatch rule: any options preset with a "true"-flagged feature uses
|
||
// the _All_True family (rich graph, opt-out AcBinarySerializable attribute matches).
|
||
MakeAcBinary(testData, defaultOptions, "Default"),
|
||
//MakeAcBinary(testData, binaryDefaultNoSgenOption, "Default"),
|
||
//MakeAcBinary(testData, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"),
|
||
//MakeAcBinary(testData, binaryNoInternOption, "NoIntern"),
|
||
|
||
// AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario)
|
||
MakeAcBinaryBufferWriter(testData, binaryFastModeOption, "FastMode"),
|
||
|
||
// AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario).
|
||
// 4 KB chunk size from binaryFastModeBufWrChunk — minimises the per-call ArrayBufferWriter
|
||
// allocation. Optimum for this scenario.
|
||
MakeAcBinaryFreshBufferWriter(testData, binaryFastModeBufWrChunk, "FastMode (4KB)"),
|
||
|
||
// AcBinary chunked-streaming over an IN-MEMORY Pipe (no kernel transport). Side-by-side with the
|
||
// Byte[] / IBufferWriter rows above this shows the chunked-streaming framework's pure CPU cost
|
||
// (no NamedPipe loopback noise) vs the simpler in-process serialize-then-deserialize patterns.
|
||
// The IO column shows "Pipe(in-mem)" — distinct from the NamedPipe AsyncPipe rows in [P] mode.
|
||
MakeAcBinaryInMemoryPipe(testData, binaryFastModePipeChunkInMem, "FastMode (PipeChunk)"),
|
||
|
||
// Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport, no kernel, no Pipe). Apples-to-
|
||
// apples baseline for the in-memory chunked row above: same in-memory pattern, but raw byte[] vs
|
||
// chunked-streaming wire format. The IO column shows "Bytes(in-mem)".
|
||
MakeAcBinaryInMemoryRaw(testData, binaryFastModePipeChunkInMem, "FastMode (PipeRaw)"),
|
||
|
||
// AsyncPipe streaming over kernel NamedPipe (AcBinaryNamedPipeBenchmark) is intentionally OMITTED
|
||
// here — run it via the dedicated AsyncPipe menu [P] / CLI mode for isolated kernel-transport
|
||
// measurements.
|
||
|
||
// ============================================================
|
||
// MemoryPack — three I/O modes for apples-to-apples comparison
|
||
// ============================================================
|
||
// MemPack uses _All_False (see FastestByte-mode comment above for rationale).
|
||
new MemoryPackBenchmark<TestOrder_All_False>(orderFalse, Configuration.SelectedWireMode, "Default"),
|
||
new MemoryPackBufferWriterBenchmark<TestOrder_All_False>(orderFalse, Configuration.SelectedWireMode, "Default"),
|
||
new MemoryPackFreshBufferWriterBenchmark<TestOrder_All_False>(orderFalse, Configuration.SelectedWireMode, "Default"),
|
||
|
||
// ============================================================
|
||
// MessagePack — for legacy comparison
|
||
// ============================================================
|
||
#if !AYCODE_NATIVEAOT
|
||
// MessagePack v3's DynamicGenericResolver uses Activator.CreateInstance on trimmed
|
||
// ListFormatter<T> et al. — fails under NativeAOT publish with "No parameterless constructor".
|
||
// Excluded from the AOT build; available for regular JIT runs only.
|
||
new MessagePackBenchmark<TestOrder_All_False>(orderFalse, "ContractBased"),
|
||
#endif
|
||
|
||
// System.Text.Json (commented — JSON serializer for reference; not in active suite)
|
||
//new SystemTextJsonBenchmark<TestOrder_All_False>(orderFalse, "Default")
|
||
};
|
||
}
|
||
|
||
/// <summary>
/// Settles the managed heap at a phase boundary of the benchmark loop.
/// </summary>
/// <remarks>
/// Performs three steps in order: a forced, blocking generation-2 collection; a wait for all pending
/// finalizers; and a second forced, blocking generation-2 collection so that anything released by those
/// finalizers is reclaimed as well. The intent is that the next warmup / measurement phase does not
/// inherit garbage left behind by the previous one. Intended for the Ser-phase / Des-phase boundaries
/// in <see cref="RunBenchmarksForTestData"/>.
/// </remarks>
[MethodImpl(MethodImplOptions.NoInlining)]
internal static void ForceGcCollect()
{
    // Pass 1: collect garbage; finalizable objects are queued for finalization.
    CollectAllGenerations();

    // Let the finalizer thread run everything that pass 1 queued.
    GC.WaitForPendingFinalizers();

    // Pass 2: reclaim whatever the finalizers just released.
    CollectAllGenerations();

    static void CollectAllGenerations() => GC.Collect(2, GCCollectionMode.Forced, blocking: true);
}
|
||
|
||
/// <summary>
/// Times <paramref name="action"/> over <see cref="Configuration.BenchmarkSamples"/> independent samples of
/// <paramref name="iterations"/> calls each and summarizes the per-sample wall-clock times.
/// </summary>
/// <remarks>
/// <para>
/// When <see cref="Configuration.BenchmarkSamples"/> &lt;= 1 a single sample is timed (Debug / quick mode)
/// and returned as <c>(ms, ms, ms, 0)</c> — no GC settle, no pilot, no statistics.
/// </para>
/// <para>Stabilization (added 2026-05-07) for the multi-sample path:</para>
/// <list type="number">
/// <item><description>
/// A pilot sample runs BEFORE the recorded loop and is discarded, so residual post-warmup bookkeeping
/// is not attributed to a recorded sample. It is shown as sample 1 of (samples + 1) in the progress output.
/// </description></item>
/// <item><description>
/// GC.Collect / WaitForPendingFinalizers / GC.Collect runs before the pilot and before every recorded
/// sample, outside the Stopwatch window, so every sample starts from the same heap shape.
/// </description></item>
/// <item><description>
/// Between recorded samples the thread sleeps for <see cref="Configuration.InterSampleSettleMs"/>
/// (skipped before the first sample and when the value is 0).
/// </description></item>
/// <item><description>
/// The headline figure is a trimmed median: with 4 or more samples the single fastest and single slowest
/// sample are dropped before taking the median. Min, max and standard deviation always describe the
/// FULL set of recorded samples.
/// </description></item>
/// </list>
/// <para>
/// When <paramref name="progressLabel"/> is non-null, in-place <c>\r</c> progress updates are written so a
/// stuck benchmark (e.g. a deadlocked NamedPipe row) is visibly stuck at a specific % rather than
/// silently hanging.
/// </para>
/// </remarks>
/// <param name="action">The operation to measure; invoked <paramref name="iterations"/> times per sample.</param>
/// <param name="iterations">Number of <paramref name="action"/> calls per sample.</param>
/// <param name="progressLabel">Label for progress output, or <c>null</c> to run silently.</param>
/// <returns>
/// Per-sample elapsed milliseconds: (trimmed) median, minimum, maximum, and population standard deviation.
/// </returns>
internal static (double medianMs, double minMs, double maxMs, double stdDevMs) RunTimed(Action action, int iterations, string? progressLabel = null)
{
    var samples = Configuration.BenchmarkSamples;
    if (samples <= 1)
    {
        // Single-sample fast path (Debug or trivial run) — no allocation, no sort, no stddev.
        var single = Stopwatch.StartNew();
        RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
        single.Stop();

        var ms = single.Elapsed.TotalMilliseconds;
        EndProgress(progressLabel, ms);
        return (ms, ms, ms, 0);
    }

    // Pilot sample (discarded, therefore not timed). Counts as sample index 0 of (samples + 1) for
    // progress display so the user sees an extra "warmup-ish" tick before the recorded samples start.
    SettleHeap();
    RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: 0);

    var times = new double[samples];
    for (var s = 0; s < samples; s++)
    {
        // Per-sample GC settle, paid OUTSIDE the Stopwatch window: a Gen-2 pause caused by the
        // previous sample must not bleed into this sample's timing.
        SettleHeap();

        // Inter-sample thermal settle: lets the CPU boost-clock state recover so later samples do not
        // read systematically slower than early ones. Nothing to settle from before the first sample.
        if (s > 0 && Configuration.InterSampleSettleMs > 0)
        {
            Thread.Sleep(Configuration.InterSampleSettleMs);
        }

        var sw = Stopwatch.StartNew();
        RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: s + 1);
        sw.Stop();
        times[s] = sw.Elapsed.TotalMilliseconds;
    }

    // Sorting first makes min / max trivial and is required for the median below.
    Array.Sort(times);
    var minMs = times[0];
    var maxMs = times[^1];

    // Population stddev (not sample stddev — the captured samples are treated as the population for
    // CV computation). Two-pass form: sum of squared deviations from the mean. Unlike E[X²] - E[X]²
    // this cannot go negative and does not lose precision when samples are nearly identical.
    var sum = 0.0;
    foreach (var t in times)
    {
        sum += t;
    }

    var mean = sum / times.Length;
    var squaredDeviations = 0.0;
    foreach (var t in times)
    {
        var deviation = t - mean;
        squaredDeviations += deviation * deviation;
    }

    var stdDevMs = Math.Sqrt(squaredDeviations / times.Length);

    // Trimmed median: with >= 4 samples drop the single min and single max (first and last of the
    // sorted array) and take the median of the remaining (samples - 2) entries. This removes the worst
    // per-sample contamination (thermal spike, OS preempt, a GC pause that escaped the settle) while
    // min / max / stdDev above still reflect the full population.
    var trimStart = samples >= 4 ? 1 : 0;
    var trimCount = samples >= 4 ? samples - 2 : samples;
    var mid = trimStart + trimCount / 2;
    var medianMs = trimCount % 2 == 1
        ? times[mid]
        : (times[mid - 1] + times[mid]) / 2.0;

    EndProgress(progressLabel, medianMs);
    return (medianMs, minMs, maxMs, stdDevMs);

    // Collect → drain finalizers → collect again, so each sample starts from the same heap state.
    static void SettleHeap()
    {
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();
    }
}
|
||
|
||
/// <summary>
/// Per-cell adaptive iteration calibration: times 100 calls of <paramref name="action"/> and computes how
/// many iterations are needed for one sample to last roughly <paramref name="targetMs"/> milliseconds.
/// </summary>
/// <remarks>
/// <para>
/// The result is rounded UP to the nearest 1000, floored at 1000 and capped at 200_000 (sanity bound
/// for pathologically fast ops). The clamping is done on the floating-point estimate BEFORE it is
/// converted to <see cref="int"/>, so a very fast op combined with a large <paramref name="targetMs"/>
/// cannot overflow the conversion and fall through to the floor by accident.
/// </para>
/// <para>
/// In single-sample mode (<c>Configuration.BenchmarkSamples &lt;= 1</c>) the global
/// <see cref="Configuration.TestIterations"/> is returned unchanged and nothing is measured.
/// </para>
/// <para>
/// The 100 calibration calls are made directly by this method, after a
/// GC.Collect / WaitForPendingFinalizers / GC.Collect settle.
/// </para>
/// </remarks>
/// <param name="action">The operation whose per-call cost is being estimated.</param>
/// <param name="targetMs">Desired wall-clock duration of one sample, in milliseconds.</param>
/// <returns>An iteration count that is a multiple of 1000 in the range [1000, 200_000], or
/// <see cref="Configuration.TestIterations"/> in single-sample mode.</returns>
internal static int CalibrateIterations(Action action, int targetMs)
{
    const int calibIter = 100;
    const int roundTo = 1000;
    const int minIterations = 1000;
    const int maxIterations = 200_000;

    if (Configuration.BenchmarkSamples <= 1)
    {
        return Configuration.TestIterations; // Debug fast path
    }

    GC.Collect();
    GC.WaitForPendingFinalizers();
    GC.Collect();

    var sw = Stopwatch.StartNew();
    for (var i = 0; i < calibIter; i++)
    {
        action();
    }

    sw.Stop();
    var ms = sw.Elapsed.TotalMilliseconds;

    // Pathologically-fast op below Stopwatch resolution — cap at ceiling (further calibration won't help).
    if (ms <= 0.0001)
    {
        return maxIterations;
    }

    var iterPerMs = calibIter / ms;
    var required = Math.Ceiling(targetMs * iterPerMs);

    // Clamp while still in double: `required` can exceed int.MaxValue (fast op × large targetMs), and
    // casting such a value to int is not a defined saturation. Zero / negative targets land on the floor.
    if (required >= maxIterations)
    {
        return maxIterations;
    }

    if (required <= minIterations)
    {
        return minIterations;
    }

    // Round UP to nearest 1000 — keeps numbers human-readable in the markdown output.
    // `required` is now strictly between the floor and the cap, so the result stays within both.
    var raw = (int)required;
    return ((raw + roundTo - 1) / roundTo) * roundTo;
}
|
||
|
||
/// <summary>
/// Measures the average number of bytes allocated per call of <paramref name="action"/>, starting from a
/// freshly collected heap. Uses one dedicated run (no median) so the timing samples stay pure.
/// </summary>
/// <remarks>
/// When <paramref name="processWide"/> is <c>true</c> the counter is
/// <see cref="GC.GetTotalAllocatedBytes"/> (precise mode) instead of
/// <see cref="GC.GetAllocatedBytesForCurrentThread"/>. That is needed for round-trip-only benchmarks
/// (NamedPipe etc.) where the work happens across multiple threads; the per-thread counter only sees
/// allocations made on the calling thread. Process-wide mode also picks up allocations made by
/// unrelated threads during the run, so it is slightly noisier.
/// </remarks>
/// <param name="action">The operation to measure.</param>
/// <param name="iterations">Number of calls to make; the measured delta is divided by this count.</param>
/// <param name="progressLabel">Label for progress output, or <c>null</c> to run silently.</param>
/// <param name="processWide">Selects the process-wide allocation counter instead of the per-thread one.</param>
/// <returns>
/// Allocated bytes per call (integer division of the total delta), or 0 when
/// <paramref name="iterations"/> is not positive — no call was made, so there is nothing to average.
/// </returns>
internal static long MeasureAllocation(Action action, int iterations, string? progressLabel = null, bool processWide = false)
{
    GC.Collect();
    GC.WaitForPendingFinalizers();
    GC.Collect();

    // The stopwatch only feeds the "done in … ms" progress line; it is not part of the result.
    var sw = Stopwatch.StartNew();

    var before = ReadAllocatedBytes(processWide);
    RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
    var after = ReadAllocatedBytes(processWide);

    sw.Stop();
    EndProgress(progressLabel, sw.Elapsed.TotalMilliseconds);

    // Guard the division: iterations == 0 would throw DivideByZeroException, and a negative count
    // would produce a meaningless per-call figure.
    return iterations > 0 ? (after - before) / iterations : 0;

    static long ReadAllocatedBytes(bool wholeProcess) =>
        wholeProcess ? GC.GetTotalAllocatedBytes(precise: true) : GC.GetAllocatedBytesForCurrentThread();
}
|
||
|
||
// ============================================================================================
|
||
// Progress reporting — \r-driven in-place updates so a stuck benchmark surfaces the exact phase
|
||
// and % where it stopped, instead of appearing as a silent hang. Used by RunTimed and the
|
||
// MeasureAllocation* helpers when the caller passes a non-null progressLabel.
|
||
// ============================================================================================
|
||
|
||
// Tracks the longest line written by the current progress session, so EndProgress can clear
|
||
// any leftover characters from a prior longer line (avoids "ghost" trailing chars after \r).
|
||
// Length of the most recent progress line: RunWithProgress stores it after each print, EndProgress
// uses it to pad over leftover characters and then resets it to 0. Plain static field with no
// synchronization — presumably progress is only ever reported from one thread at a time (TODO confirm).
private static int _progressLastLineLen;
|
||
|
||
/// <summary>
/// Invokes <paramref name="action"/> exactly <paramref name="iterations"/> times. With a non-null
/// <paramref name="label"/> the calls are made in batches of roughly one tenth of the total, and a
/// <c>\r</c>-overwriting progress line is written after each batch; with a null label the calls run in
/// a single plain loop and nothing is written.
/// </summary>
/// <param name="action">The operation to invoke.</param>
/// <param name="iterations">Total number of invocations.</param>
/// <param name="label">Text shown in the progress line, or <c>null</c> for silent mode.</param>
/// <param name="samples">Total sample count shown in the progress line; the "sample x/y" part is
/// only printed when this is greater than 1.</param>
/// <param name="sampleIndex">Zero-based index of the current sample (printed one-based).</param>
private static void RunWithProgress(Action action, int iterations, string? label, int samples, int sampleIndex)
{
    if (label is not null)
    {
        // Progress is reported on batch boundaries only, so the inner loop contains nothing but the
        // measured call — no per-iteration modulo or progress check.
        var batchSize = Math.Max(1, iterations / 10);
        var completed = 0;

        while (completed < iterations)
        {
            var remaining = iterations - completed;
            var count = remaining < batchSize ? remaining : batchSize;

            for (var n = 0; n < count; n++)
            {
                action();
            }

            completed += count;
            Report(completed);
        }

        return;
    }

    // Silent mode: one tight loop, no console output.
    for (var i = 0; i < iterations; i++)
    {
        action();
    }

    // Rewrites the current console row with the latest progress and pads over any longer previous text.
    void Report(int doneSoFar)
    {
        var percent = (int)(doneSoFar * 100L / iterations);

        string text;
        if (samples > 1)
        {
            text = $" > {label} sample {sampleIndex + 1}/{samples} {percent,3}% ({doneSoFar}/{iterations})";
        }
        else
        {
            text = $" > {label} {percent,3}% ({doneSoFar}/{iterations})";
        }

        System.Console.Write('\r');
        System.Console.Write(text);

        var stale = _progressLastLineLen - text.Length;
        if (stale > 0)
        {
            System.Console.Write(new string(' ', stale));
        }

        _progressLastLineLen = text.Length;
    }
}
|
||
|
||
/// <summary>
/// Finishes a progress row: overwrites it with a final "done in … ms" message, blanks out any
/// characters left over from a longer earlier progress line, and ends the row with a newline so that
/// subsequent <c>WriteLine</c> calls render below it. Does nothing when <paramref name="label"/> is null.
/// </summary>
/// <param name="label">The label used for the progress row, or <c>null</c> if progress was silent.</param>
/// <param name="elapsedMs">Elapsed time to show in the final message, in milliseconds.</param>
private static void EndProgress(string? label, double elapsedMs)
{
    if (label is not null)
    {
        var summary = $" > {label} done in {elapsedMs,7:F1} ms";

        System.Console.Write('\r');
        System.Console.Write(summary);

        // Only pad when the previous progress text was longer than the summary.
        var leftover = _progressLastLineLen - summary.Length;
        if (leftover > 0)
        {
            System.Console.Write(new string(' ', leftover));
        }

        System.Console.WriteLine();

        // The row is closed; the next progress session starts with no remembered width.
        _progressLastLineLen = 0;
    }
}
|
||
|
||
/// <summary>
/// Validates the MemoryPack setup at startup. Terminates the process with exit code 1 if any of the
/// checked benchmark model types is not marked <c>[MemoryPackable]</c>, because the MemoryPack rows
/// would then not be a valid source-generated-vs-source-generated comparison.
/// </summary>
/// <remarks>
/// Checks both <c>TestOrder_All_True</c> and <c>TestOrder_All_False</c>. The latter is the type the
/// MemoryPack benchmark rows in this class are instantiated with, so it must be covered by the check.
/// </remarks>
internal static void ValidateMemoryPackSetup()
{
    var typesToCheck = new[] { typeof(TestOrder_All_True), typeof(TestOrder_All_False) };

    foreach (var type in typesToCheck)
    {
        // IsDefined answers the yes/no question directly, without materializing attribute instances.
        if (type.IsDefined(typeof(MemoryPackableAttribute), inherit: true))
        {
            continue;
        }

        System.Console.Error.WriteLine($"❌ FATAL: {type.FullName} is not [MemoryPackable] — MemoryPack would fall back to runtime resolver, comparison is INVALID for SGen-vs-SGen claim.");
        System.Console.Error.WriteLine("Add [MemoryPackable] to the type and any nested types referenced from it.");

        Environment.Exit(1);
    }
}
|
||
|
||
/// <summary>
/// Selects the test data sets that belong to <paramref name="layer"/>, matching on the start of each
/// data set's <c>Name</c>. Always returns a new list; <paramref name="all"/> is not modified.
/// </summary>
/// <remarks>
/// <para>
/// Layer groups (<c>Core</c>, <c>Comprehensive</c>, <c>Edge</c>) match their prefixes with an ordinal,
/// case-sensitive comparison. P1: only "Core" prefixes exist (Small/Medium/Large/Repeated/Deep); the
/// Comprehensive and Edge prefix lists are still empty, so those layers currently select the same
/// sets as Core.
/// </para>
/// <para>
/// Single-cell layers (<c>Small</c>, <c>Medium</c>, <c>Large</c>, <c>Repeated</c>, <c>Deep</c>) match one
/// prefix with an ordinal, case-insensitive comparison. Use case: a tight optimization-iteration loop
/// on one specific cell without running the full suite.
/// </para>
/// <para><c>All</c> and any unrecognized layer value return every data set.</para>
/// </remarks>
/// <param name="all">The complete list of available test data sets.</param>
/// <param name="layer">The layer or single cell to select.</param>
/// <returns>A new list containing the matching data sets in their original order.</returns>
internal static List<TestDataSet> FilterByLayer(List<TestDataSet> all, BenchmarkLayer layer)
{
    if (layer == BenchmarkLayer.All)
    {
        return all.ToList();
    }

    var coreNames = new[] { "Small", "Medium", "Large", "Repeated", "Deep" };
    // P2 will add: "Flat", "Polymorphic", "Collection", "Numeric", "NonAscii", etc.
    var comprehensiveExtras = new string[] { /* P2 */ };
    // P3 will add: "ColdStart", "VeryLarge", "PathologicalString", etc.
    var edgeExtras = new string[] { /* P3 */ };

    return layer switch
    {
        BenchmarkLayer.Core => WithAnyPrefix(coreNames),
        BenchmarkLayer.Comprehensive => WithAnyPrefix(coreNames, comprehensiveExtras),
        BenchmarkLayer.Edge => WithAnyPrefix(coreNames, comprehensiveExtras, edgeExtras),
        // Single-cell A/B mini-suite filters — match by case-insensitive prefix on Name.
        BenchmarkLayer.Small => WithPrefixIgnoreCase("Small"),
        BenchmarkLayer.Medium => WithPrefixIgnoreCase("Medium"),
        BenchmarkLayer.Large => WithPrefixIgnoreCase("Large"),
        BenchmarkLayer.Repeated => WithPrefixIgnoreCase("Repeated"),
        BenchmarkLayer.Deep => WithPrefixIgnoreCase("Deep"),
        _ => all.ToList()
    };

    // Data sets whose name starts with any prefix from any of the given groups. The comparison is
    // explicitly ordinal: these are identifiers, so the match must not depend on the current culture.
    List<TestDataSet> WithAnyPrefix(params string[][] prefixGroups) =>
        all.Where(t => prefixGroups.Any(group => StartsWithAny(t.Name, group))).ToList();

    // Data sets whose name starts with the given prefix, ignoring case.
    List<TestDataSet> WithPrefixIgnoreCase(string prefix) =>
        all.Where(t => t.Name.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)).ToList();

    static bool StartsWithAny(string name, string[] prefixes) =>
        prefixes.Any(prefix => name.StartsWith(prefix, StringComparison.Ordinal));
}
|
||
}
|