[LOADED_DOCS: 2 files, no new loads]
Refactor: move benchmark logic to BenchmarkLoop.cs Moved all benchmark execution logic (RunBenchmark, RunBenchmarksForTestData, CreateSerializers) from Program.cs into a new static class BenchmarkLoop in BenchmarkLoop.cs. Program.cs now delegates benchmark runs to BenchmarkLoop, improving separation of concerns. No changes to benchmark functionality.
This commit is contained in:
parent
c722f775f6
commit
ad9e05413c
|
|
@ -1,3 +1,5 @@
|
|||
using AyCode.Core.Serializers.Binaries;
|
||||
using AyCode.Core.Serializers.Console.Benchmarks;
|
||||
using AyCode.Core.Tests.TestModels;
|
||||
using MemoryPack;
|
||||
using System.Diagnostics;
|
||||
|
|
@ -7,24 +9,469 @@ using System.Text.Json;
|
|||
namespace AyCode.Core.Serializers.Console;
|
||||
|
||||
/// <summary>
|
||||
/// Benchmark execution helpers: timing (<see cref="RunTimed"/>), per-cell adaptive iteration
|
||||
/// calibration (<see cref="CalibrateIterations"/>), allocation measurement
|
||||
/// (<see cref="MeasureAllocation"/> + <see cref="MeasureAllocationTotal"/>), in-place
|
||||
/// <c>\r</c>-progress reporting, full-GC phase-boundary helper (<see cref="ForceGcCollect"/>),
|
||||
/// startup validation (<see cref="ValidateMemoryPackSetup"/>), and per-cell round-trip equality
|
||||
/// (<see cref="DeepEqualsViaJson"/>). Pure benchmark-execution infrastructure — no display
|
||||
/// formatting (that lives in <c>Output</c>) and no per-engine glue (which lives with the
|
||||
/// individual <c>ISerializerBenchmark</c> implementations).
|
||||
/// Benchmark execution: end-to-end orchestration (<see cref="RunBenchmark"/>), per-cell loop
|
||||
/// (<see cref="RunBenchmarksForTestData"/>), serializer factory (<see cref="CreateSerializers"/>),
|
||||
/// and the timing / calibration / allocation helpers. Pure benchmark-execution infrastructure —
|
||||
/// no display formatting (that lives in <c>Output</c>) and no UX-flow (that lives in <c>Program</c>
|
||||
/// + <c>Menu</c>).
|
||||
/// </summary>
|
||||
internal static class BenchmarkLoop
|
||||
{
|
||||
/// <summary>
/// Runs the benchmark suite end-to-end for the given configuration: pre-warmup → per-cell warmup
/// + measurement → grouped results print → save to disk. Used by both the CLI and interactive
/// menu paths; the interactive loop calls this repeatedly without restarting the process.
/// </summary>
/// <param name="layer">Layer filter handed to <c>FilterByLayer</c> to select the test cells to run.</param>
/// <param name="opMode">Operation mode forwarded unchanged to <see cref="RunBenchmarksForTestData"/>.</param>
/// <param name="serializerMode">Serializer-set selector forwarded unchanged to <see cref="CreateSerializers"/>.</param>
internal static void RunBenchmark(string layer, string opMode, string serializerMode)
{
    System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗");
    System.Console.WriteLine("║ COMPREHENSIVE SERIALIZER BENCHMARK SUITE ║");
    System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");

    // Stabilization: pin the entire benchmark process to a single logical CPU and bump priority
    // class. Single-core affinity stops the OS from migrating the bench thread between cores
    // mid-sample (a migration evicts L1/L2 caches and corrupts a measurement); High priority
    // reduces preemption by background tasks (Defender scans, indexer, etc.).
    // Skipped on Debug single-sample mode (Configuration.BenchmarkSamples <= 1) where stabilization is moot.
    //
    // Process.GetCurrentProcess() returns a new disposable wrapper on every call; 'using' releases it
    // when the run ends (the interactive menu calls this method repeatedly).
    using var process = Process.GetCurrentProcess();
    var origAffinity = (IntPtr)0;
    var origPriority = ProcessPriorityClass.Normal;

    // Tracked separately: the affinity change can succeed while the priority change throws.
    // A single "applied" flag would then leave the process pinned to one core with no restore.
    var affinityChanged = false;
    var priorityChanged = false;

    // Best-effort, idempotent rollback of whatever stabilization step actually took effect.
    void RestoreProcessState()
    {
        if (!(OperatingSystem.IsWindows() || OperatingSystem.IsLinux())) return;

        if (affinityChanged)
        {
            try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
            affinityChanged = false;
        }

        if (priorityChanged)
        {
            try { process.PriorityClass = origPriority; } catch { /* best-effort */ }
            priorityChanged = false;
        }
    }

    // ProcessorAffinity is only supported on Windows + Linux (CA1416). On every other platform
    // (macOS included) the whole stabilization step — affinity AND priority — is skipped.
    if (Configuration.BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
    {
        try
        {
            origAffinity = process.ProcessorAffinity;
            origPriority = process.PriorityClass;

            // Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one
            // core, consistently" — not which one. The user can tweak the mask here if CPU 0 is
            // heavily contended on the host.
            process.ProcessorAffinity = (IntPtr)1;
            affinityChanged = true;

            process.PriorityClass = ProcessPriorityClass.High;
            priorityChanged = true;

            System.Console.WriteLine("Stabilization: pinned to CPU 0 (affinity=0x1), priority=High.");
        }
        catch (Exception ex)
        {
            // Affinity/priority changes may fail on locked-down hosts (group policies, containers
            // without CAP_SYS_NICE on Linux, etc.). Undo any half-applied step so the run really
            // does use the platform default scheduling, then surface and continue.
            RestoreProcessState();
            System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}");
        }
    }

    try
    {
        var allResults = new List<BenchmarkResult>();
        var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
        var testDataSets = FilterByLayer(allTestDataSets, layer);

        System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {Configuration.GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{Configuration.TargetSampleMs} ms target) | Warmup: {Configuration.WarmupIterations} per phase (Ser/Des isolated) | Samples: {Configuration.BenchmarkSamples} (median) + pilot discard");
        System.Console.WriteLine($"Build: {Configuration.BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
        System.Console.WriteLine();

        // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens,
        // so the first measured cell does not carry JIT-tier-promotion latency that later cells avoid.
        // The per-cell warmup in RunBenchmarksForTestData still runs afterwards.
        if (Configuration.BenchmarkSamples > 1) // skip in DEBUG (single-sample fast iteration)
        {
            System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");

            foreach (var testData in testDataSets)
            {
                var preSerializers = CreateSerializers(testData, serializerMode);

                try
                {
                    foreach (var s in preSerializers)
                    {
                        // Light, phase-isolated warmup: Ser path first, then Des path.
                        s.WarmupSerialize(2000);
                        s.WarmupDeserialize(2000);
                    }
                }
                finally
                {
                    // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
                    foreach (var s in preSerializers) (s as IDisposable)?.Dispose();
                }
            }

            // Let background tiered-JIT compilation drain before we begin measuring.
            if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

            System.Console.WriteLine("✓ Global pre-warmup complete.\n");
        }

        // 70-character separator line printed above and below each test-data header.
        var separator = new string('═', 70);

        foreach (var testData in testDataSets)
        {
            System.Console.WriteLine($"\n{separator}");
            System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
            System.Console.WriteLine(separator);

            var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
            allResults.AddRange(results);
        }

        // Print grouped results
        Output.PrintGroupedResults(allResults, testDataSets);

        // Save results to file
        Output.SaveResults(allResults, testDataSets);

        System.Console.WriteLine("\n✓ Benchmark complete!");
    }
    finally
    {
        // Restore process state — affinity/priority changes are process-wide and persist across
        // interactive-mode iterations of the menu. Runs even if a benchmark throws, so a crashed
        // run does not leave the process pinned to one core.
        RestoreProcessState();
    }
}
|
||||
|
||||
/// <summary>
/// Runs every serializer produced by <see cref="CreateSerializers"/> against one test cell:
/// round-trip verification first, then a phase-isolated warmup → calibrate → time → alloc cycle
/// per serializer, printing each result as it completes.
/// </summary>
/// <param name="testData">The test cell to benchmark.</param>
/// <param name="mode">
/// Operation mode: "all" runs both phases, "serialize"/"ser" only the Ser phase,
/// "deserialize"/"des" only the Des phase. Any other value runs neither phase.
/// </param>
/// <param name="serializerMode">Serializer-set selector forwarded to <see cref="CreateSerializers"/>.</param>
/// <returns>One <c>BenchmarkResult</c> per serializer, in creation order.</returns>
/// <remarks>
/// Terminates the process via <c>Environment.Exit(1)</c> when a serializer fails round-trip verification.
/// </remarks>
private static List<BenchmarkResult> RunBenchmarksForTestData(TestDataSet testData, string mode, string serializerMode)
{
    var results = new List<BenchmarkResult>();
    var serializers = CreateSerializers(testData, serializerMode);

    var runSerPhase = mode is "all" or "serialize" or "ser";
    var runDesPhase = mode is "all" or "deserialize" or "des";

    try
    {
        // Round-trip correctness check — once per (cell × serializer), BEFORE warmup. Aborts the entire benchmark on failure.
        System.Console.WriteLine("Verifying round-trip correctness...");

        foreach (var serializer in serializers)
        {
            if (!serializer.VerifyRoundTrip())
            {
                System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}");
                System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting.");

                Environment.Exit(1);
            }
        }

        System.Console.WriteLine("✓ All serializers passed round-trip verification.");

        // Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: the Ser path and the Des path
        // each get their own warmup → measure round, with ForceGcCollect at every phase boundary so the
        // next phase does not inherit allocation pressure from the previous one.
        // Configuration.JitSleep (when > 0) is slept after each warmup, before timing starts.
        System.Console.WriteLine($"Running benchmarks (target ~{Configuration.TargetSampleMs} ms/sample × {Configuration.BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n");

        foreach (var serializer in serializers)
        {
            var result = new BenchmarkResult
            {
                TestDataName = testData.DisplayName, // Use DisplayName for IId% info
                Engine = serializer.Engine,
                IoMode = serializer.IoMode,
                DispatchMode = serializer.DispatchMode,
                OptionsPreset = serializer.OptionsPreset,
                OptionsDescription = serializer.OptionsDescription,
                SerializedSize = serializer.SerializedSize,
                SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes,
                SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes,
                IsRoundTripOnly = serializer.IsRoundTripOnly
            };

            // Label passed to the timing/allocation helpers so progress output identifies the row being measured.
            var groupLabel = $"{result.SerializerName}";

            if (serializer.IsRoundTripOnly)
            {
                // Round-trip-only benchmarks: single phase. The Ser-phase entry points (WarmupSerialize /
                // Serialize) drive the measurement and the numbers are recorded into the RT result columns.
                if (runSerPhase)
                {
                    ForceGcCollect();
                    serializer.WarmupSerialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                    var rtIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
                    var (rtMed, rtMin, rtMax, rtStd) = RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");

                    result.RoundTripTimeMs = rtMed;
                    result.RoundTripTimeMinMs = rtMin;
                    result.RoundTripTimeMaxMs = rtMax;
                    result.RoundTripTimeStdDevMs = rtStd;
                    result.RoundTripIterations = rtIter;

                    // Uses MeasureAllocationTotal (not MeasureAllocation) for round-trip-only rows.
                    result.RoundTripAllocBytesPerOp = MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]");
                }

                // mode == "deserialize" alone is meaningless for a round-trip-only benchmark; skip silently.
            }
            else
            {
                // ── Ser phase ── GC → warmup → JitSleep → calibrate → time → alloc.
                if (runSerPhase)
                {
                    ForceGcCollect();
                    serializer.WarmupSerialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                    var serIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
                    var (serMed, serMin, serMax, serStd) = RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");

                    result.SerializeTimeMs = serMed;
                    result.SerializeTimeMinMs = serMin;
                    result.SerializeTimeMaxMs = serMax;
                    result.SerializeTimeStdDevMs = serStd;
                    result.SerializeIterations = serIter;

                    // Dedicated alloc-only sample (separate from timing samples; keeps timing pure)
                    result.SerializeAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
                }

                // ── Des phase ── GC → warmup → JitSleep → calibrate → time → alloc.
                if (runDesPhase)
                {
                    ForceGcCollect();
                    serializer.WarmupDeserialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                    var desIter = CalibrateIterations(() => serializer.Deserialize(), Configuration.TargetSampleMs);
                    var (desMed, desMin, desMax, desStd) = RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");

                    result.DeserializeTimeMs = desMed;
                    result.DeserializeTimeMinMs = desMin;
                    result.DeserializeTimeMaxMs = desMax;
                    result.DeserializeTimeStdDevMs = desStd;
                    result.DeserializeIterations = desIter;
                    result.DeserializeAllocBytesPerOp = MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]");
                }

                // Compose RT from Ser+Des. Ser and Des may have DIFFERENT iteration counts after calibration,
                // so the per-op values are added and then scaled to RoundTripIterations = max(serIter, desIter).
                // NOTE(review): when only one phase ran, the other phase's time/iterations keep their
                // BenchmarkResult defaults — confirm Output.ToPerOpMicros handles that input as intended.
                var serPerOp = Output.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations);
                var desPerOp = Output.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations);
                var rtPerOp = serPerOp + desPerOp;

                result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations);
                result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
                result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
            }

            results.Add(result);
            Output.PrintResult(result);
        }
    }
    finally
    {
        // Dispose any IDisposable serializers even when a benchmark throws — NamedPipe / FileStream
        // variants own OS resources that must be released before the next test cell builds new ones.
        foreach (var s in serializers) (s as IDisposable)?.Dispose();
    }

    return results;
}
|
||||
|
||||
/// <summary>
/// Builds the serializer benchmark rows for one test cell. <paramref name="serializerMode"/> selects the set:
/// <c>"fastestbyte"</c> → AcBinary FastMode Byte[] vs MemoryPack Byte[];
/// <c>"asyncpipe"</c> → the four pipe / in-memory transport rows only;
/// anything else → the standard suite (which omits the NamedPipe rows).
/// Every AcBinary options object gets <c>WireMode = Configuration.SelectedWireMode</c>.
/// </summary>
/// <param name="testData">Test cell whose <c>Order</c> payload is handed to every benchmark.</param>
/// <param name="serializerMode">Serializer-set selector (see summary).</param>
/// <returns>A new list of benchmark instances; callers dispose the <see cref="IDisposable"/> ones.</returns>
private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, string serializerMode)
{
    switch (serializerMode)
    {
        case "fastestbyte":
            return BuildFastestByteRows();
        case "asyncpipe":
            return BuildAsyncPipeRows();
        default:
            return BuildStandardRows();
    }

    // FastestByte mode — focused 1:1 comparison on the Byte[] path: AcBinary FastMode vs MemoryPack.
    // The former FastWire reference row stays commented out.
    List<ISerializerBenchmark> BuildFastestByteRows()
    {
        var fastOptions = AcBinarySerializerOptions.FastMode;
        fastOptions.WireMode = Configuration.SelectedWireMode;

        var rows = new List<ISerializerBenchmark>();
        rows.Add(new AcBinaryBenchmark(testData.Order, fastOptions, "FastMode"));
        //rows.Add(new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"));
        rows.Add(new MemoryPackBenchmark(testData.Order, "Default"));
        return rows;
    }

    // AsyncPipe-only mode — the streaming / transport rows run in isolation from the steady-state suite.
    // One options object (FastMode + BufferWriterChunkSize = Configuration.PipeChunkSize) is shared by
    // all four rows, which form a [chunked, raw] × [NamedPipe, in-memory] matrix.
    List<ISerializerBenchmark> BuildAsyncPipeRows()
    {
        var pipeOptions = AcBinarySerializerOptions.FastMode;
        pipeOptions.BufferWriterChunkSize = Configuration.PipeChunkSize;
        pipeOptions.WireMode = Configuration.SelectedWireMode;

        var rows = new List<ISerializerBenchmark>();

        // Chunked-framed streaming over a kernel NamedPipe.
        rows.Add(new AcBinaryNamedPipeBenchmark(testData.Order, pipeOptions, "FastMode (PipeChunk)"));

        // Raw byte[] over a NamedPipe (no chunk framing) — transport baseline for the row above.
        rows.Add(new AcBinaryNamedPipeRawByteArrayBenchmark(testData.Order, pipeOptions, "FastMode (PipeRaw)"));

        // Chunked-framed streaming over an in-memory pipe (no kernel transport).
        rows.Add(new AcBinaryInMemoryPipeBenchmark(testData.Order, pipeOptions, "FastMode (PipeChunk)"));

        // Raw byte[] in-memory handoff — baseline for the in-memory chunked row above.
        rows.Add(new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, pipeOptions, "FastMode (PipeRaw)"));

        return rows;
    }

    // Standard mode — all serializers EXCEPT the kernel-NamedPipe rows (those are opt-in via "asyncpipe").
    List<ISerializerBenchmark> BuildStandardRows()
    {
        // Default preset without string interning. Referenced only by a commented-out row below.
        var noInternOptions = AcBinarySerializerOptions.Default;
        noInternOptions.UseStringInterning = StringInterningMode.None;
        noInternOptions.WireMode = Configuration.SelectedWireMode;

        // Default preset, generated code off. Referenced only by a commented-out row below.
        var defaultNoSgenOptions = AcBinarySerializerOptions.Default;
        defaultNoSgenOptions.UseGeneratedCode = false;
        defaultNoSgenOptions.WireMode = Configuration.SelectedWireMode;

        // FastMode with generated code off (runtime dispatch path).
        var fastNoSgenOptions = AcBinarySerializerOptions.FastMode;
        fastNoSgenOptions.UseGeneratedCode = false;
        fastNoSgenOptions.WireMode = Configuration.SelectedWireMode;

        // Plain FastMode.
        var fastOptions = AcBinarySerializerOptions.FastMode;
        fastOptions.WireMode = Configuration.SelectedWireMode;

        // FastMode for the fresh-ArrayBufferWriter row; chunk size comes from Configuration.PipeChunkSize.
        // NOTE(review): the row label below says "4KB" — confirm the configured value still matches.
        var fastBufWrChunkOptions = AcBinarySerializerOptions.FastMode;
        fastBufWrChunkOptions.BufferWriterChunkSize = Configuration.PipeChunkSize;
        fastBufWrChunkOptions.WireMode = Configuration.SelectedWireMode;

        // FastMode for the two in-memory transport rows; same chunk-size source as above.
        var fastInMemPipeOptions = AcBinarySerializerOptions.FastMode;
        fastInMemPipeOptions.BufferWriterChunkSize = Configuration.PipeChunkSize;
        fastInMemPipeOptions.WireMode = Configuration.SelectedWireMode;

        // Default preset with ReferenceHandling = OnlyId and string interning switched off.
        var defaultRowOptions = AcBinarySerializerOptions.Default;
        defaultRowOptions.UseStringInterning = StringInterningMode.None;
        defaultRowOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId;
        defaultRowOptions.WireMode = Configuration.SelectedWireMode;

        var rows = new List<ISerializerBenchmark>();

        // ── AcBinary — Byte[] API (uncomment rows to compare option presets side-by-side) ──

        // FastMode Byte[] with the preset's generated-code setting left untouched.
        rows.Add(new AcBinaryBenchmark(testData.Order, fastOptions, "FastMode"));

        // FastMode Byte[] with UseGeneratedCode = false — paired with the row above for comparison.
        rows.Add(new AcBinaryBenchmark(testData.Order, fastNoSgenOptions, "FastMode"));

        // Default-preset row (OnlyId reference handling, no string interning — see defaultRowOptions).
        rows.Add(new AcBinaryBenchmark(testData.Order, defaultRowOptions, "Default"));
        //rows.Add(new AcBinaryBenchmark(testData.Order, defaultNoSgenOptions, "Default"));
        //rows.Add(new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"));
        //rows.Add(new AcBinaryBenchmark(testData.Order, noInternOptions, "NoIntern"));

        // AcBinary via IBufferWriter (reused ArrayBufferWriter).
        rows.Add(new AcBinaryBufferWriterBenchmark(testData.Order, fastOptions, "FastMode"));

        // AcBinary via IBufferWriter (fresh ArrayBufferWriter per call).
        rows.Add(new AcBinaryFreshBufferWriterBenchmark(testData.Order, fastBufWrChunkOptions, "FastMode (4KB)"));

        // AcBinary chunked streaming over an in-memory pipe (no kernel transport).
        rows.Add(new AcBinaryInMemoryPipeBenchmark(testData.Order, fastInMemPipeOptions, "FastMode (PipeChunk)"));

        // Raw byte[] in-memory handoff — baseline for the in-memory chunked row above.
        rows.Add(new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, fastInMemPipeOptions, "FastMode (PipeRaw)"));

        // The kernel-NamedPipe streaming row (AcBinaryNamedPipeBenchmark) is intentionally OMITTED here;
        // it is only built in "asyncpipe" mode.

        // ── MemoryPack — three I/O modes for apples-to-apples comparison ──
        rows.Add(new MemoryPackBenchmark(testData.Order, "Default"));
        rows.Add(new MemoryPackBufferWriterBenchmark(testData.Order, "Default"));
        rows.Add(new MemoryPackFreshBufferWriterBenchmark(testData.Order, "Default"));

        // ── MessagePack — for legacy comparison ──
#if !AYCODE_NATIVEAOT
        // Excluded from the NativeAOT build; available for regular JIT runs only.
        rows.Add(new MessagePackBenchmark(testData.Order, "ContractBased"));
#endif

        // System.Text.Json (commented — JSON serializer for reference; not in active suite)
        //rows.Add(new SystemTextJsonBenchmark(testData.Order, "Default"));

        return rows;
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Forces a full GC cycle at a phase boundary in the benchmark loop. Two-pass collect with finalizer drain
|
||||
/// in between: the first pass moves managed garbage to the finalization queue, <c>WaitForPendingFinalizers</c>
|
||||
/// runs the finalizers, the second pass reclaims any objects the finalizers released. After this returns the
|
||||
/// heap is in a known-quiescent state — the next warmup/measurement phase starts on a clean slate, isolated
|
||||
/// from the previous phase's residual allocations (write-buffer pools, intern cache, write-plan arrays, etc.).
|
||||
/// Called between every Ser-phase / Des-phase boundary in <c>RunBenchmarksForTestData</c>.
|
||||
/// Called between every Ser-phase / Des-phase boundary in <see cref="RunBenchmarksForTestData"/>.
|
||||
/// </summary>
|
||||
[MethodImpl(MethodImplOptions.NoInlining)]
|
||||
internal static void ForceGcCollect()
|
||||
|
|
|
|||
|
|
@ -52,7 +52,7 @@ public static class Program
|
|||
if (!TryParseCliArgs(args, out var layer, out var opMode, out var serializerMode))
|
||||
return; // invalid args
|
||||
|
||||
RunBenchmark(layer, opMode, serializerMode);
|
||||
BenchmarkLoop.RunBenchmark(layer, opMode, serializerMode);
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -63,7 +63,7 @@ public static class Program
|
|||
var selection = Menu.ShowInteractiveMenu();
|
||||
if (selection == null) return; // user pressed Q
|
||||
|
||||
RunBenchmark(selection.Value.layer, "all", selection.Value.serializerMode);
|
||||
BenchmarkLoop.RunBenchmark(selection.Value.layer, "all", selection.Value.serializerMode);
|
||||
|
||||
System.Console.WriteLine();
|
||||
System.Console.WriteLine("─────────────────────────────────────────────────────────────────────");
|
||||
|
|
@ -124,457 +124,7 @@ public static class Program
|
|||
return true;
|
||||
}
|
||||
|
||||
/// <summary>
/// Runs the benchmark suite end-to-end for the given configuration: optional process stabilization →
/// global JIT pre-warmup → per-cell warmup + measurement → grouped results print → save to disk.
/// Used by both the CLI and interactive menu paths; the interactive loop calls this repeatedly
/// without restarting the process, so every process-wide change made here is undone before returning.
/// </summary>
/// <param name="layer">Layer selector handed to <c>BenchmarkLoop.FilterByLayer</c> to pick the test cells.</param>
/// <param name="opMode">Operation selector forwarded unchanged to <c>RunBenchmarksForTestData</c>.</param>
/// <param name="serializerMode">Serializer-set selector forwarded unchanged to <c>CreateSerializers</c>.</param>
private static void RunBenchmark(string layer, string opMode, string serializerMode)
{
    // Iteration count of the light global pre-warmup pass (per serializer, per phase).
    const int preWarmupIterations = 2000;

    System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗");
    System.Console.WriteLine("║ COMPREHENSIVE SERIALIZER BENCHMARK SUITE ║");
    System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");

    // Stabilization: pin the entire benchmark process to a single logical CPU and bump the priority
    // class. Single-core affinity stops the OS from migrating the bench thread between cores
    // mid-sample; High priority reduces preemption by background tasks.
    // Skipped in single-sample mode (Configuration.BenchmarkSamples <= 1) where stabilization is moot.
    // Process is IDisposable — the using declaration releases the handle when the method exits.
    using var process = Process.GetCurrentProcess();
    var origAffinity = (IntPtr)0;
    var origPriority = ProcessPriorityClass.Normal;
    var stabilizationApplied = false;

    // ProcessorAffinity is only supported on Windows + Linux (CA1416). On any other OS the whole
    // stabilization step (affinity AND priority) is skipped by this guard.
    if (Configuration.BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
    {
        // Tracks a half-applied state: affinity already changed, priority not yet.
        var affinityPinned = false;

        try
        {
            origAffinity = process.ProcessorAffinity;
            origPriority = process.PriorityClass;

            // Pin to CPU 0 (mask = 1). Which core is arbitrary; what matters is "exactly one core,
            // consistently". Tweak the mask here if CPU 0 is heavily contended on the host.
            process.ProcessorAffinity = (IntPtr)1;
            affinityPinned = true;

            process.PriorityClass = ProcessPriorityClass.High;
            stabilizationApplied = true;

            System.Console.WriteLine("Stabilization: pinned to CPU 0 (affinity=0x1), priority=High.");
        }
        catch (Exception ex)
        {
            // Affinity/priority changes may fail on locked-down hosts (group policies, containers
            // without the required capability, etc.). If the affinity write succeeded but the priority
            // write threw, undo the pin right away: otherwise the process would stay on one core for
            // the rest of its lifetime, because the finally block below only restores a fully
            // applied stabilization.
            if (affinityPinned)
            {
                try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
            }

            // Surface and continue — the benchmark still works, just with default scheduling.
            System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}");
        }
    }

    try
    {
        var allResults = new List<BenchmarkResult>();
        var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
        var testDataSets = BenchmarkLoop.FilterByLayer(allTestDataSets, layer);

        System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {Configuration.GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{Configuration.TargetSampleMs} ms target) | Warmup: {Configuration.WarmupIterations} per phase (Ser/Des isolated) | Samples: {Configuration.BenchmarkSamples} (median) + pilot discard");
        System.Console.WriteLine($"Build: {Configuration.BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
        System.Console.WriteLine();

        // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing
        // happens, so the first measured cell does not carry JIT-tier-promotion latency that later
        // cells no longer pay. The per-cell warmup in RunBenchmarksForTestData still runs afterwards.
        if (Configuration.BenchmarkSamples > 1) // skipped in single-sample (fast iteration) mode
        {
            System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");

            foreach (var testData in testDataSets)
            {
                var preSerializers = CreateSerializers(testData, serializerMode);

                try
                {
                    foreach (var s in preSerializers)
                    {
                        // Phase-isolated light pass: Ser path first, then Des path.
                        s.WarmupSerialize(preWarmupIterations);
                        s.WarmupDeserialize(preWarmupIterations);
                    }
                }
                finally
                {
                    // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
                    foreach (var s in preSerializers) (s as IDisposable)?.Dispose();
                }
            }

            // Let background tiered-JIT compilation drain before we begin measuring.
            if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

            System.Console.WriteLine("✓ Global pre-warmup complete.\n");
        }

        // 70-character horizontal rule framing each test-data header.
        var rule = new string('═', 70);

        foreach (var testData in testDataSets)
        {
            System.Console.WriteLine($"\n{rule}");
            System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
            System.Console.WriteLine(rule);

            var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
            allResults.AddRange(results);
        }

        // Print grouped results
        Output.PrintGroupedResults(allResults, testDataSets);

        // Save results to file
        Output.SaveResults(allResults, testDataSets);

        System.Console.WriteLine("\n✓ Benchmark complete!");
    }
    finally
    {
        // Restore process state — affinity/priority changes are process-wide and would otherwise
        // persist across interactive-mode iterations of the menu (and after a benchmark throws).
        if (stabilizationApplied && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
        {
            try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
            try { process.PriorityClass = origPriority; } catch { /* best-effort */ }
        }
    }
}
|
||||
|
||||
#region Benchmark Execution
|
||||
|
||||
/// <summary>
/// Benchmarks every serializer produced by <c>CreateSerializers</c> against one test-data cell:
/// verifies round-trip correctness first, then runs a phase-isolated
/// (GC → warmup → JIT sleep → calibrate → time → alloc) cycle per serializer and prints each result.
/// </summary>
/// <param name="testData">The test cell to benchmark; its <c>DisplayName</c> labels every result row.</param>
/// <param name="mode">
/// Which phases to run: <c>"all"</c>, <c>"serialize"</c>/<c>"ser"</c>, or <c>"deserialize"</c>/<c>"des"</c>.
/// Any other value runs no measurement phase.
/// </param>
/// <param name="serializerMode">Serializer-set selector forwarded to <c>CreateSerializers</c>.</param>
/// <returns>One <c>BenchmarkResult</c> per serializer, in creation order.</returns>
/// <remarks>
/// Terminates the process via <c>Environment.Exit(1)</c> when any serializer fails round-trip verification.
/// </remarks>
private static List<BenchmarkResult> RunBenchmarksForTestData(TestDataSet testData, string mode, string serializerMode)
{
    var results = new List<BenchmarkResult>();
    var serializers = CreateSerializers(testData, serializerMode);

    // Loop-invariant phase switches derived from the mode string.
    var runSerializePhase = mode is "all" or "serialize" or "ser";
    var runDeserializePhase = mode is "all" or "deserialize" or "des";

    try
    {
        // Round-trip correctness check — once per (cell × serializer), BEFORE warmup.
        // Aborts the entire benchmark on failure.
        System.Console.WriteLine("Verifying round-trip correctness...");

        foreach (var serializer in serializers)
        {
            if (!serializer.VerifyRoundTrip())
            {
                System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}");
                System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting.");

                Environment.Exit(1);
            }
        }

        System.Console.WriteLine("✓ All serializers passed round-trip verification.");

        // Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: the Ser path and the
        // Des path get completely isolated warmup→measure rounds, with a forced GC at every phase
        // boundary so each phase starts on a quiescent heap and is warmed only for the code it
        // measures. Configuration.JitSleep after each warmup lets background tiered-JIT promotion
        // settle before timing starts.
        System.Console.WriteLine($"Running benchmarks (target ~{Configuration.TargetSampleMs} ms/sample × {Configuration.BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n");

        foreach (var serializer in serializers)
        {
            var result = new BenchmarkResult
            {
                TestDataName = testData.DisplayName, // Use DisplayName for IId% info
                Engine = serializer.Engine,
                IoMode = serializer.IoMode,
                DispatchMode = serializer.DispatchMode,
                OptionsPreset = serializer.OptionsPreset,
                OptionsDescription = serializer.OptionsDescription,
                SerializedSize = serializer.SerializedSize,
                SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes,
                SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes,
                IsRoundTripOnly = serializer.IsRoundTripOnly
            };

            // Group label for in-place \r progress. Identifies the row so a stuck benchmark is
            // visibly stuck on a specific serializer at a specific % rather than silently hanging.
            var groupLabel = $"{result.SerializerName}";

            if (serializer.IsRoundTripOnly)
            {
                // Round-trip-only benchmarks (NamedPipe etc.): single phase — Serialize() performs the
                // full round trip, so the Ser-phase entry points warm and measure the whole path and
                // the numbers are recorded into the RT result columns.
                if (runSerializePhase)
                {
                    BenchmarkLoop.ForceGcCollect();
                    serializer.WarmupSerialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                    var rtIter = BenchmarkLoop.CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
                    var (rtMed, rtMin, rtMax, rtStd) = BenchmarkLoop.RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");
                    result.RoundTripTimeMs = rtMed;
                    result.RoundTripTimeMinMs = rtMin;
                    result.RoundTripTimeMaxMs = rtMax;
                    result.RoundTripTimeStdDevMs = rtStd;
                    result.RoundTripIterations = rtIter;

                    // Process-wide allocation measurement so allocations made on the server-drain
                    // thread are counted too, not just the client side.
                    result.RoundTripAllocBytesPerOp = BenchmarkLoop.MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]");
                }
                // A deserialize-only mode is meaningless for a round-trip-only benchmark; skip silently.
            }
            else
            {
                // ── Ser phase ── GC → isolated warmup → JIT sleep → calibrate → time → alloc.
                if (runSerializePhase)
                {
                    BenchmarkLoop.ForceGcCollect();
                    serializer.WarmupSerialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                    var serIter = BenchmarkLoop.CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs);
                    var (serMed, serMin, serMax, serStd) = BenchmarkLoop.RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");
                    result.SerializeTimeMs = serMed;
                    result.SerializeTimeMinMs = serMin;
                    result.SerializeTimeMaxMs = serMax;
                    result.SerializeTimeStdDevMs = serStd;
                    result.SerializeIterations = serIter;

                    // Dedicated alloc-only sample (separate from timing samples; keeps timing pure)
                    result.SerializeAllocBytesPerOp = BenchmarkLoop.MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
                }

                // ── Des phase ── GC → isolated warmup → JIT sleep → calibrate → time → alloc.
                // The GC here discards the Ser phase's leftovers so the Des allocation measurement
                // reflects only Des-side allocations.
                if (runDeserializePhase)
                {
                    BenchmarkLoop.ForceGcCollect();
                    serializer.WarmupDeserialize(Configuration.WarmupIterations);
                    if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep);

                    var desIter = BenchmarkLoop.CalibrateIterations(() => serializer.Deserialize(), Configuration.TargetSampleMs);
                    var (desMed, desMin, desMax, desStd) = BenchmarkLoop.RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");
                    result.DeserializeTimeMs = desMed;
                    result.DeserializeTimeMinMs = desMin;
                    result.DeserializeTimeMaxMs = desMax;
                    result.DeserializeTimeStdDevMs = desStd;
                    result.DeserializeIterations = desIter;
                    result.DeserializeAllocBytesPerOp = BenchmarkLoop.MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]");
                }

                // Compose RT from Ser+Des. Ser and Des may have DIFFERENT iteration counts after
                // calibration, so batch times cannot simply be added. Instead: compute per-op µs
                // (iteration-independent), then synthesize RoundTripTimeMs against
                // RoundTripIterations = max(serIter, desIter).
                // NOTE(review): when only one phase ran, the other phase's time/iterations are still
                // at their initial values — confirm Output.ToPerOpMicros tolerates that input.
                var serPerOp = Output.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations);
                var desPerOp = Output.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations);
                var rtPerOp = serPerOp + desPerOp;
                result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations);
                result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
                result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
            }

            results.Add(result);
            Output.PrintResult(result);
        }
    }
    finally
    {
        // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources that
        // must be released before the next test cell builds new ones). Running this in finally
        // releases them even when a warmup/measurement step throws — the same pattern the global
        // pre-warmup loop uses.
        foreach (var s in serializers) (s as IDisposable)?.Dispose();
    }

    return results;
}
|
||||
|
||||
/// <summary>
/// Builds the list of benchmark rows for one test cell, depending on <paramref name="serializerMode"/>:
/// <c>"fastestbyte"</c> → focused AcBinary FastMode vs MemoryPack Byte[] pair;
/// <c>"asyncpipe"</c> → only the pipe / cross-thread streaming rows;
/// anything else → the standard steady-state suite.
/// Every AcBinary options object gets <c>Configuration.SelectedWireMode</c> applied.
/// </summary>
/// <param name="testData">Test cell whose <c>Order</c> payload every benchmark serializes.</param>
/// <param name="serializerMode">Serializer-set selector (see summary).</param>
/// <returns>A fresh list of benchmark instances; callers dispose the ones that implement <c>IDisposable</c>.</returns>
private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, string serializerMode)
{
    switch (serializerMode)
    {
        // FastestByte mode — focused 1:1 comparison on the Byte[] path with exactly two rows:
        // AcBinary FastMode Byte[] and MemoryPack Byte[]. Intended as a short optimization-iteration loop.
        case "fastestbyte":
        {
            var fastestPreset = AcBinarySerializerOptions.FastMode;
            fastestPreset.WireMode = Configuration.SelectedWireMode;

            var fastestRows = new List<ISerializerBenchmark>
            {
                new AcBinaryBenchmark(testData.Order, fastestPreset, "FastMode"),
                // FastWire row is currently disabled; re-enable when revisiting Fast wire-mode performance.
                //new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"),
                new MemoryPackBenchmark(testData.Order, "Default"),
            };
            return fastestRows;
        }

        // AsyncPipe-only mode — returns ONLY the streaming rows so their long-lived pipe setup and
        // transport overhead is measured in isolation from the byte-array / IBufferWriter rows.
        case "asyncpipe":
        {
            // One options object shared by all four rows. BufferWriterChunkSize comes from
            // Configuration.PipeChunkSize — change that single value when tuning the chunk size.
            var pipePreset = AcBinarySerializerOptions.FastMode;
            pipePreset.BufferWriterChunkSize = Configuration.PipeChunkSize;
            pipePreset.WireMode = Configuration.SelectedWireMode;

            // The four rows form a 2x2 matrix: [chunked, raw] × [kernel NamedPipe, in-memory].
            var pipeRows = new List<ISerializerBenchmark>
            {
                // Chunked-framed streaming over a kernel NamedPipe.
                new AcBinaryNamedPipeBenchmark(testData.Order, pipePreset, "FastMode (PipeChunk)"),
                // Raw byte[] over a kernel NamedPipe (no chunk framing) — isolates framing overhead
                // from kernel-transport overhead when compared with the row above.
                new AcBinaryNamedPipeRawByteArrayBenchmark(testData.Order, pipePreset, "FastMode (PipeRaw)"),
                // Chunked-framed streaming over an in-memory pipe (no kernel transport) — isolates the
                // CPU cost of the chunked-streaming path from the NamedPipe transport cost.
                new AcBinaryInMemoryPipeBenchmark(testData.Order, pipePreset, "FastMode (PipeChunk)"),
                // Raw byte[] over an in-memory cross-thread handoff — baseline for the row above.
                new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, pipePreset, "FastMode (PipeRaw)"),
            };
            return pipeRows;
        }
    }

    // Standard mode — all serializers EXCEPT the kernel NamedPipe rows (those are opt-in via the
    // "asyncpipe" mode and never bundled with the steady-state suite).

    // Default preset without string interning. Only referenced by a commented-out row below.
    var noInternPreset = AcBinarySerializerOptions.Default;
    noInternPreset.UseStringInterning = StringInterningMode.None;
    noInternPreset.WireMode = Configuration.SelectedWireMode;

    // Default preset without generated code. Only referenced by a commented-out row below.
    var defaultRuntimePreset = AcBinarySerializerOptions.Default;
    defaultRuntimePreset.UseGeneratedCode = false;
    defaultRuntimePreset.WireMode = Configuration.SelectedWireMode;

    // FastMode with generated code switched off (runtime dispatch path).
    var fastRuntimePreset = AcBinarySerializerOptions.FastMode;
    fastRuntimePreset.UseGeneratedCode = false;
    fastRuntimePreset.WireMode = Configuration.SelectedWireMode;

    // FastMode as-is (generated-code path).
    var fastPreset = AcBinarySerializerOptions.FastMode;
    fastPreset.WireMode = Configuration.SelectedWireMode;

    // FastMode for the FRESH-buffer-writer row; chunk size taken from Configuration.PipeChunkSize.
    // NOTE(review): the row label below says "4KB" — confirm it matches Configuration.PipeChunkSize.
    var freshWriterPreset = AcBinarySerializerOptions.FastMode;
    freshWriterPreset.BufferWriterChunkSize = Configuration.PipeChunkSize;
    freshWriterPreset.WireMode = Configuration.SelectedWireMode;

    // FastMode for the in-memory pipe rows; same chunk size source as above.
    var inMemoryPipePreset = AcBinarySerializerOptions.FastMode;
    inMemoryPipePreset.BufferWriterChunkSize = Configuration.PipeChunkSize;
    inMemoryPipePreset.WireMode = Configuration.SelectedWireMode;

    // Default preset with string interning disabled and reference handling set to OnlyId.
    var defaultOnlyIdPreset = AcBinarySerializerOptions.Default;
    defaultOnlyIdPreset.UseStringInterning = StringInterningMode.None;
    defaultOnlyIdPreset.ReferenceHandling = ReferenceHandlingMode.OnlyId;
    defaultOnlyIdPreset.WireMode = Configuration.SelectedWireMode;

    var standardRows = new List<ISerializerBenchmark>
    {
        // ============================================================
        // AcBinary — Byte[] API (uncomment rows to compare option presets side-by-side)
        // ============================================================
        // FastMode Byte[] — generated-code path.
        new AcBinaryBenchmark(testData.Order, fastPreset, "FastMode"),
        // FastMode Byte[] — runtime path (UseGeneratedCode=false). Paired with the row above so the
        // generated-code speed-up can be compared on identical options.
        new AcBinaryBenchmark(testData.Order, fastRuntimePreset, "FastMode"),
        // Default preset Byte[] with ReferenceHandling=OnlyId and string interning turned OFF
        // (see defaultOnlyIdPreset above) — shows the wire-size / CPU trade-off vs FastMode.
        new AcBinaryBenchmark(testData.Order, defaultOnlyIdPreset, "Default"),
        //new AcBinaryBenchmark(testData.Order, defaultRuntimePreset, "Default"),
        //new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"),
        //new AcBinaryBenchmark(testData.Order, noInternPreset, "NoIntern"),

        // AcBinary via IBufferWriter (reused writer — long-running service / batch scenario)
        new AcBinaryBufferWriterBenchmark(testData.Order, fastPreset, "FastMode"),

        // AcBinary via IBufferWriter (FRESH writer per call — one-shot scenario).
        new AcBinaryFreshBufferWriterBenchmark(testData.Order, freshWriterPreset, "FastMode (4KB)"),

        // AcBinary chunked-streaming over an IN-MEMORY pipe (no kernel transport) — shows the
        // chunked-streaming path next to the simpler in-process rows above.
        new AcBinaryInMemoryPipeBenchmark(testData.Order, inMemoryPipePreset, "FastMode (PipeChunk)"),

        // Raw byte[] over an in-memory cross-thread handoff — baseline for the in-memory chunked row.
        new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, inMemoryPipePreset, "FastMode (PipeRaw)"),

        // The kernel NamedPipe streaming rows are intentionally OMITTED here — run them via the
        // dedicated "asyncpipe" mode for isolated kernel-transport measurements.

        // ============================================================
        // MemoryPack — three I/O modes for apples-to-apples comparison
        // ============================================================
        new MemoryPackBenchmark(testData.Order, "Default"),
        new MemoryPackBufferWriterBenchmark(testData.Order, "Default"),
        new MemoryPackFreshBufferWriterBenchmark(testData.Order, "Default"),

        // ============================================================
        // MessagePack — for legacy comparison
        // ============================================================
#if !AYCODE_NATIVEAOT
        // Excluded from the NativeAOT build (the original author notes MessagePack's dynamic
        // resolver fails under AOT publish); available for regular JIT runs only.
        new MessagePackBenchmark(testData.Order, "ContractBased"),
#endif

        // System.Text.Json (commented — JSON serializer for reference; not in active suite)
        //new SystemTextJsonBenchmark(testData.Order, "Default")
    };

    return standardRows;
}
|
||||
|
||||
#endregion
|
||||
// RunBenchmark + RunBenchmarksForTestData + CreateSerializers → BenchmarkLoop.cs
|
||||
|
||||
// Serializer implementations (ISerializerBenchmark + 12 concrete benchmark classes) → Benchmarks/
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue