From ad9e05413c5115dd697eced622597fa8d0983f63 Mon Sep 17 00:00:00 2001 From: Loretta Date: Tue, 12 May 2026 14:09:43 +0200 Subject: [PATCH] [LOADED_DOCS: 2 files, no new loads] Refactor: move benchmark logic to BenchmarkLoop.cs Moved all benchmark execution logic (RunBenchmark, RunBenchmarksForTestData, CreateSerializers) from Program.cs into a new static class BenchmarkLoop in BenchmarkLoop.cs. Program.cs now delegates benchmark runs to BenchmarkLoop, improving separation of concerns. No changes to benchmark functionality. --- .../BenchmarkLoop.cs | 465 +++++++++++++++++- AyCode.Core.Serializers.Console/Program.cs | 456 +---------------- 2 files changed, 459 insertions(+), 462 deletions(-) diff --git a/AyCode.Core.Serializers.Console/BenchmarkLoop.cs b/AyCode.Core.Serializers.Console/BenchmarkLoop.cs index 4e41e66..13fc83b 100644 --- a/AyCode.Core.Serializers.Console/BenchmarkLoop.cs +++ b/AyCode.Core.Serializers.Console/BenchmarkLoop.cs @@ -1,3 +1,5 @@ +using AyCode.Core.Serializers.Binaries; +using AyCode.Core.Serializers.Console.Benchmarks; using AyCode.Core.Tests.TestModels; using MemoryPack; using System.Diagnostics; @@ -7,24 +9,469 @@ using System.Text.Json; namespace AyCode.Core.Serializers.Console; /// -/// Benchmark execution helpers: timing (), per-cell adaptive iteration -/// calibration (), allocation measurement -/// ( + ), in-place -/// \r-progress reporting, full-GC phase-boundary helper (), -/// startup validation (), and per-cell round-trip equality -/// (). Pure benchmark-execution infrastructure — no display -/// formatting (that lives in Output) and no per-engine glue (which lives with the -/// individual ISerializerBenchmark implementations). +/// Benchmark execution: end-to-end orchestration (), per-cell loop +/// (), serializer factory (), +/// and the timing / calibration / allocation helpers. Pure benchmark-execution infrastructure — +/// no display formatting (that lives in Output) and no UX-flow (that lives in Program +/// + Menu). 
/// internal static class BenchmarkLoop { + /// + /// Runs the benchmark suite end-to-end for the given configuration: pre-warmup → per-cell warmup + /// + measurement → grouped results print → save to disk. Used by both the CLI and interactive + /// menu paths; the interactive loop calls this repeatedly without restarting the process. + /// + internal static void RunBenchmark(string layer, string opMode, string serializerMode) + { + System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗"); + System.Console.WriteLine("║ COMPREHENSIVE SERIALIZER BENCHMARK SUITE ║"); + System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝"); + + // Stabilization: pin the entire benchmark process to a single logical CPU and bump priority + // class. Single-core affinity stops Windows from migrating the bench thread between cores + // mid-sample (a migration evicts L1/L2 caches and corrupts a measurement); High priority + // reduces preemption by background tasks (Defender scans, indexer, etc.) that otherwise + // randomly inflate samples by 5-15%. + // Try/finally guarantees the original state is restored even if a benchmark throws — leaving + // a developer machine pinned to one core after a crashed run is a real foot-gun. + // Skipped on Debug single-sample mode (Configuration.BenchmarkSamples <= 1) where stabilization is moot. + var process = Process.GetCurrentProcess(); + var origAffinity = (IntPtr)0; + var origPriority = ProcessPriorityClass.Normal; + var stabilizationApplied = false; + + // ProcessorAffinity is only supported on Windows + Linux (CA1416); macOS would throw at + // runtime. The guard below therefore skips the WHOLE stabilization step on other platforms — + // neither the affinity pin nor the priority-class bump is applied, so macOS uses default scheduling. 
+ if (Configuration.BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux())) + { + try + { + origAffinity = process.ProcessorAffinity; + origPriority = process.PriorityClass; + // Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one + // core, consistently" — not which one. If CPU 0 is heavily contended on the host + // (e.g. dedicated to system-wide IRQs on some Windows configs), the user can tweak + // the mask here. The benchmark is single-threaded for the in-memory rows so single + // core is sufficient; round-trip-only NamedPipe rows have a server-drain thread + // that will share the core (acceptable — the bench measures end-to-end RT anyway). + process.ProcessorAffinity = (IntPtr)1; + process.PriorityClass = ProcessPriorityClass.High; + stabilizationApplied = true; + System.Console.WriteLine($"Stabilization: pinned to CPU 0 (affinity=0x1), priority=High."); + } + catch (Exception ex) + { + // Affinity/priority changes may fail on locked-down hosts (group policies, containers + // without CAP_SYS_NICE on Linux, etc.). Surface and continue — the benchmark still + // works, just with the platform default scheduling. + System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}"); + } + } + + try + { + var allResults = new List(); + var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets(); + var testDataSets = FilterByLayer(allTestDataSets, layer); + + System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {Configuration.GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{Configuration.TargetSampleMs} ms target) | Warmup: {Configuration.WarmupIterations} per phase (Ser/Des isolated) | Samples: {Configuration.BenchmarkSamples} (median) + pilot discard"); + System.Console.WriteLine($"Build: {Configuration.BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? 
"unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}"); + System.Console.WriteLine(); + + // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens. + // Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup + // alone doesn't ensure that every Serialize/IBufferWriter overload is fully Tier 1 by the time we + // start measuring. Symptom: first cell's BufferWriter variants run ~2x slower than the SAME variants + // on later cells (e.g. Small BufWr reuse 9ms vs Medium BufWr reuse 4ms — even though Medium is bigger). + // Pre-warmup runs every overload at least once with each data shape so .NET 9's tiered JIT promotes + // them all in the background; the per-cell warmup that follows then locks in cache + branch state. + if (Configuration.BenchmarkSamples > 1) // skip in DEBUG (single-sample fast iteration) + { + System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)..."); + + foreach (var testData in testDataSets) + { + var preSerializers = CreateSerializers(testData, serializerMode); + try + { + foreach (var s in preSerializers) + { + // Light warmup just to trigger Tier 0 → Tier 1 promotion. Phase-isolated: + // Ser path first, then Des path — same pattern as the per-cell warmup in + // RunBenchmarksForTestData (which still runs afterwards for cache/BTB warming). + s.WarmupSerialize(2000); + s.WarmupDeserialize(2000); + } + } + finally + { + // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources). + foreach (var s in preSerializers) (s as IDisposable)?.Dispose(); + } + } + + // Let background tiered-JIT compilation drain before we begin measuring. 
+ if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep); + System.Console.WriteLine("✓ Global pre-warmup complete.\n"); + } + + foreach (var testData in testDataSets) + { + System.Console.WriteLine($"\n{'═'.ToString().PadRight(70, '═')}"); + System.Console.WriteLine($"TEST DATA: {testData.DisplayName}"); + System.Console.WriteLine($"{'═'.ToString().PadRight(70, '═')}"); + + var results = RunBenchmarksForTestData(testData, opMode, serializerMode); + allResults.AddRange(results); + } + + // Print grouped results + Output.PrintGroupedResults(allResults, testDataSets); + + // Save results to file + Output.SaveResults(allResults, testDataSets); + + System.Console.WriteLine("\n✓ Benchmark complete!"); + } + finally + { + // Restore process state — affinity/priority changes are process-wide and persist across + // interactive-mode iterations of the menu. Without restore, the second menu run would + // already be on CPU-0 + High priority before its own try-block applied them, masking + // any stabilization-disabled comparison. + if (stabilizationApplied && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux())) + { + try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ } + try { process.PriorityClass = origPriority; } catch { /* best-effort */ } + } + } + } + + private static List RunBenchmarksForTestData(TestDataSet testData, string mode, string serializerMode) + { + var results = new List(); + var serializers = CreateSerializers(testData, serializerMode); + + // Round-trip correctness check — once per (cell × serializer), BEFORE warmup. Aborts the entire benchmark on failure. 
+ System.Console.WriteLine("Verifying round-trip correctness..."); + + foreach (var serializer in serializers) + { + if (!serializer.VerifyRoundTrip()) + { + System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}"); + System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting."); + + Environment.Exit(1); + } + } + + System.Console.WriteLine("✓ All serializers passed round-trip verification."); + + // Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: each serializer's Ser-path and + // Des-path get COMPLETELY ISOLATED warmup→measure rounds, with a GC.Collect at every phase boundary. + // + // Why phase-isolation: a combined warmup (Ser+Des interleaved) leaves the CPU I-cache + branch-predictor + // in a "compromise state" — neither Ser nor Des code-set dominates. The first phase to measure pays a + // cache-miss penalty as its code-set displaces the leftover-warmup-state. Isolated warmup→measure pairs + // keep the I-cache HOT for ONLY the measured path, both in the warmup (priming) and the measurement + // (steady-state). Branch-predictor history also stays clean per path. + // + // GC.Collect at every boundary: removes residual allocation pressure from the previous phase (write-buffer + // pool churn from Ser, deserialized object graph from Des) so the next phase starts with a quiescent + // heap — GC tier-promotion timing during measurement is then driven only by THAT phase's allocations. + // + // Configuration.JitSleep per-phase: tiered JIT background promotion drain after each warmup (mode-aware: 0 ms in AOT). + // Each phase's freshly-promoted methods settle before its timing starts. 
+ System.Console.WriteLine($"Running benchmarks (target ~{Configuration.TargetSampleMs} ms/sample × {Configuration.BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n"); + + foreach (var serializer in serializers) + { + var result = new BenchmarkResult + { + TestDataName = testData.DisplayName, // Use DisplayName for IId% info + Engine = serializer.Engine, + IoMode = serializer.IoMode, + DispatchMode = serializer.DispatchMode, + OptionsPreset = serializer.OptionsPreset, + OptionsDescription = serializer.OptionsDescription, + SerializedSize = serializer.SerializedSize, + SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes, + SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes, + IsRoundTripOnly = serializer.IsRoundTripOnly + }; + + // Group label for in-place \r progress. Identifies (cell × serializer) so a stuck benchmark + // is visibly stuck on a specific row at a specific %% rather than silently hanging. + var groupLabel = $"{result.SerializerName}"; + + if (serializer.IsRoundTripOnly) + { + // Round-trip-only benchmarks (NamedPipe etc.): single phase — Serialize() performs the full RT, + // Deserialize() is a no-op. We use the Ser-phase entry-points (WarmupSerialize) to warm the + // entire round-trip path, then record into the RT result columns. 
+ if (mode is "all" or "serialize" or "ser") + { + ForceGcCollect(); + serializer.WarmupSerialize(Configuration.WarmupIterations); + if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep); + + var rtIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs); + var (rtMed, rtMin, rtMax, rtStd) = RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]"); + result.RoundTripTimeMs = rtMed; + result.RoundTripTimeMinMs = rtMin; + result.RoundTripTimeMaxMs = rtMax; + result.RoundTripTimeStdDevMs = rtStd; + result.RoundTripIterations = rtIter; + // Process-wide allocation measurement: server-drain-thread allocations (server-side new byte[len]) + // also show up — otherwise current-thread alloc would only count the client side and look ~halved. + result.RoundTripAllocBytesPerOp = MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]"); + } + // mode == "deserialize" alone is meaningless for a round-trip-only benchmark; skip silently. + } + else + { + // ── Ser phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect. 
+ if (mode is "all" or "serialize" or "ser") + { + ForceGcCollect(); + serializer.WarmupSerialize(Configuration.WarmupIterations); + if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep); + + var serIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs); + var (serMed, serMin, serMax, serStd) = RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]"); + result.SerializeTimeMs = serMed; + result.SerializeTimeMinMs = serMin; + result.SerializeTimeMaxMs = serMax; + result.SerializeTimeStdDevMs = serStd; + result.SerializeIterations = serIter; + // Dedicated alloc-only sample (separate from timing samples; keeps timing pure) + result.SerializeAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]"); + } + + // ── Des phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect. + // The GC.Collect here is critical: it discards the Ser-phase's write-buffer pool churn so the + // Des-phase's allocation measurement reflects ONLY Des-side allocations (deserialized object graph). + if (mode is "all" or "deserialize" or "des") + { + ForceGcCollect(); + serializer.WarmupDeserialize(Configuration.WarmupIterations); + if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep); + + var desIter = CalibrateIterations(() => serializer.Deserialize(), Configuration.TargetSampleMs); + var (desMed, desMin, desMax, desStd) = RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]"); + result.DeserializeTimeMs = desMed; + result.DeserializeTimeMinMs = desMin; + result.DeserializeTimeMaxMs = desMax; + result.DeserializeTimeStdDevMs = desStd; + result.DeserializeIterations = desIter; + result.DeserializeAllocBytesPerOp = MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]"); + } + + // Compose RT from Ser+Des. 
Because Ser and Des may have DIFFERENT iter counts post-calibration, + // batch-time addition would be misleading. Instead: compute per-op µs (iter-independent), + // then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that + // RoundTripTimeMs / RoundTripIterations * 1000 == Output.SerPerOp + Output.DesPerOp. + var serPerOp = Output.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations); + var desPerOp = Output.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations); + var rtPerOp = serPerOp + desPerOp; + result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations); + result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations; + result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp; + } + + results.Add(result); + Output.PrintResult(result); + } + + // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources that must be released + // before the next test data builds new ones — otherwise pipes / handles leak across test cells). + foreach (var s in serializers) (s as IDisposable)?.Dispose(); + + return results; + } + + private static List CreateSerializers(TestDataSet testData, string serializerMode) + { + // FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path. + // TWO benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + MemoryPack Byte[]. + // - Compact: smallest wire, UTF-8 encode/decode CPU cost vs MemPack head-to-head. + // Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min. + // + // FastWire row (UTF-16 raw memcpy) commented out for the current optimization sprint — + // we are tuning Compact mode against MemPack directly; FastWire was used as a noise-floor + // reference earlier. Re-enable when revisiting Fast wire-mode performance. 
+ if (serializerMode == "fastestbyte") + { + var fastestByteOptions = AcBinarySerializerOptions.FastMode; + fastestByteOptions.WireMode = Configuration.SelectedWireMode; + + return new List + { + new AcBinaryBenchmark(testData.Order, fastestByteOptions, "FastMode"), + //new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"), + new MemoryPackBenchmark(testData.Order, "Default"), + }; + } + + // AsyncPipe-only mode — return ONLY the AsyncPipe streaming benchmark (no other serializer). + // Streaming I/O has long-lived pipe setup + kernel-buffer overhead that, when interleaved with + // the standard byte-array / IBufferWriter measurements, masks the steady-state numbers. Run it + // in isolation so the timing numbers reflect ONLY the streaming path. + if (serializerMode == "asyncpipe") + { + // NamedPipe — pipe-aligned chunk size for the long-lived IPC scenario. The chunkSize here + // drives the AsyncPipeWriterOutput's chunk-on-wire size (header + data, page-aligned thanks to + // the AcquireChunk fix) AND the kernel pipe buffer size (inBufferSize/outBufferSize on the + // NamedPipeServerStream ctor). Same value across both layers = one WriteFile(chunkSize) syscall + // fits blocking-free in one kernel pipe-buffer slot. Single source of truth for both app-level + // wire chunk AND kernel transfer unit; change ONLY this line when tuning. + var binaryFastModePipeChunkOnly = AcBinarySerializerOptions.FastMode; + binaryFastModePipeChunkOnly.BufferWriterChunkSize = Configuration.PipeChunkSize; + binaryFastModePipeChunkOnly.WireMode = Configuration.SelectedWireMode; + + return new List + { + // Chunked-framed AsyncPipe: SerializeChunkedFramed + AsyncPipeReaderInput.DrainFromAsync. + // Measures the FULL streaming-I/O stack — wire framing + drain task + sliding-window buffer + + // MRES wait-on-byte-shortage — over a kernel NamedPipe. 
+ new AcBinaryNamedPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"), + // Raw byte[] over NamedPipe (sync receive, no chunk-framing). Same kernel-pipe transport, + // same inBufferSize, but: serialize → byte[] → Stream.Write → Stream.Read → Deserialize(byte[]). + // No drain task, no AsyncPipeReaderInput, no [201][UINT16][data]…[202] framing. Side-by-side with + // the chunked-row above this isolates AsyncPipe-framework-overhead (Δ vs raw) from + // kernel-transport-overhead (raw vs in-process Byte[]). + new AcBinaryNamedPipeRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"), + // Chunked-framed AsyncPipe over an IN-MEMORY System.IO.Pipelines.Pipe (NO NamedPipe, NO kernel). + // Same chunked-streaming code path (SerializeChunkedFramed → AsyncPipeReaderInput) but with the + // kernel-pipe replaced by a managed-only Pipe. Eliminates per-chunk syscall overhead (~30 µs/chunk + // on NamedPipe → ~1-2 µs/chunk on in-memory Pipe). Side-by-side with the NamedPipe row above this + // isolates pure CPU cost of the chunked-streaming framework (vs kernel-pipe transport cost) — the + // in-memory Pipe row should be much closer to the raw-byte[] row, validating that NamedPipe loopback + // is the worst-case benchmark scenario for chunked-streaming and not representative of real network + // / file / cross-thread Pipe scenarios. + new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"), + // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport at all). Apples-to-apples + // baseline for the in-memory chunked row above: same in-memory transport (zero kernel), but raw + // byte[] vs chunked-streaming wire format. Completes the 2x2 matrix [chunked,raw] × [kernel,memory]. 
+ new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"), + }; + } + + // Standard mode — all serializers EXCEPT AsyncPipe (the streaming benchmark is opt-in via the + // AsyncPipe menu / CLI mode, never bundled with the steady-state suite). + + var binaryNoInternOption = AcBinarySerializerOptions.Default; + binaryNoInternOption.UseStringInterning = StringInterningMode.None; + binaryNoInternOption.WireMode = Configuration.SelectedWireMode; + + var binaryDefaultNoSgenOption = AcBinarySerializerOptions.Default; + binaryDefaultNoSgenOption.UseGeneratedCode = false; + binaryDefaultNoSgenOption.WireMode = Configuration.SelectedWireMode; + + var binaryFastModeNoSgenOption = AcBinarySerializerOptions.FastMode; + binaryFastModeNoSgenOption.UseGeneratedCode = false; + binaryFastModeNoSgenOption.WireMode = Configuration.SelectedWireMode; + + var binaryFastModeOption = AcBinarySerializerOptions.FastMode; + binaryFastModeOption.WireMode = Configuration.SelectedWireMode; + + // BufWr new — 4 KB chunk size for the FRESH ArrayBufferWriter scenario. The chunkSize here drives + // the serializer's GetSpan(N) request → the ArrayBufferWriter's internal allocation per call. + // Small chunk = small per-call allocation, optimum for one-shot serialization where each iteration + // allocates a fresh ABW. Independent of the AsyncPipe profile (different mechanism: alloc overhead + // vs syscall count). + var binaryFastModeBufWrChunk = AcBinarySerializerOptions.FastMode; + binaryFastModeBufWrChunk.BufferWriterChunkSize = Configuration.PipeChunkSize; + binaryFastModeBufWrChunk.WireMode = Configuration.SelectedWireMode; + + // In-memory Pipe variant — same 4 KB chunkSize as the AsyncPipe mode, no kernel-pipe alignment + // concern (managed slabs are not page-aligned anyway). Drives SerializeChunkedFramed via the in-memory + // System.IO.Pipelines.Pipe (zero-copy slab handoff between producer and drain task). 
+ var binaryFastModePipeChunkInMem = AcBinarySerializerOptions.FastMode; + binaryFastModePipeChunkInMem.BufferWriterChunkSize = Configuration.PipeChunkSize; + binaryFastModePipeChunkInMem.WireMode = Configuration.SelectedWireMode; + + var defaultOptions = AcBinarySerializerOptions.Default; + defaultOptions.UseStringInterning = StringInterningMode.None; + defaultOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId; + defaultOptions.WireMode = Configuration.SelectedWireMode; + + return new List + { + // ============================================================ + // AcBinary — Byte[] API (uncomment to compare option presets side-by-side) + // ============================================================ + // Fastest Byte[] — SGen path (UseGeneratedCode=true, default). + new AcBinaryBenchmark(testData.Order, binaryFastModeOption, "FastMode"), + // Fastest Byte[] — Runtime path (UseGeneratedCode=false). Same wire/options, no source-generated dispatch. + // Always paired with the SGen variant so every layer can compare the SGen speed-up apples-to-apples. + // NativeAOT-safe: AcSerializerCommon.Create*Getter/Setter falls back to reflection-based delegates + // when RuntimeFeature.IsDynamicCodeSupported is false (slower but works under AOT publish). + new AcBinaryBenchmark(testData.Order, binaryFastModeNoSgenOption, "FastMode"), + // Default preset Byte[] — RefHandling=OnlyId (deduplicates IId-shared references on the wire) + + // UseStringInterning=None (defaultOptions turns string interning off). Showcases the preset's wire-size + // and CPU trade-off vs FastMode on the ~20% IId-ref / repeated-string test data. 
+ + new AcBinaryBenchmark(testData.Order, defaultOptions, "Default"), + //new AcBinaryBenchmark(testData.Order, binaryDefaultNoSgenOption, "Default"), + //new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"), + //new AcBinaryBenchmark(testData.Order, binaryNoInternOption, "NoIntern"), + + // AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario) + new AcBinaryBufferWriterBenchmark(testData.Order, binaryFastModeOption, "FastMode"), + + // AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario). + // 4 KB chunk size from binaryFastModeBufWrChunk — minimises the per-call ArrayBufferWriter + // allocation. Optimum for this scenario. + new AcBinaryFreshBufferWriterBenchmark(testData.Order, binaryFastModeBufWrChunk, "FastMode (4KB)"), + + // AcBinary chunked-streaming over an IN-MEMORY Pipe (no kernel transport). Side-by-side with the + // Byte[] / IBufferWriter rows above this shows the chunked-streaming framework's pure CPU cost + // (no NamedPipe loopback noise) vs the simpler in-process serialize-then-deserialize patterns. + // The IO column shows "Pipe(in-mem)" — distinct from the NamedPipe AsyncPipe rows in [P] mode. + new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeChunk)"), + + // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport, no kernel, no Pipe). Apples-to- + // apples baseline for the in-memory chunked row above: same in-memory pattern, but raw byte[] vs + // chunked-streaming wire format. The IO column shows "Bytes(in-mem)". + new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeRaw)"), + + // AsyncPipe streaming over kernel NamedPipe (AcBinaryNamedPipeBenchmark) is intentionally OMITTED + // here — run it via the dedicated AsyncPipe menu [P] / CLI mode for isolated kernel-transport + // measurements. 
+ + // ============================================================ + // MemoryPack — three I/O modes for apples-to-apples comparison + // ============================================================ + new MemoryPackBenchmark(testData.Order, "Default"), + new MemoryPackBufferWriterBenchmark(testData.Order, "Default"), + new MemoryPackFreshBufferWriterBenchmark(testData.Order, "Default"), + + // ============================================================ + // MessagePack — for legacy comparison + // ============================================================ +#if !AYCODE_NATIVEAOT + // MessagePack v3's DynamicGenericResolver uses Activator.CreateInstance on trimmed + // ListFormatter et al. — fails under NativeAOT publish with "No parameterless constructor". + // Excluded from the AOT build; available for regular JIT runs only. + new MessagePackBenchmark(testData.Order, "ContractBased"), +#endif + + // System.Text.Json (commented — JSON serializer for reference; not in active suite) + //new SystemTextJsonBenchmark(testData.Order, "Default") + }; + } + /// /// Forces a full GC cycle at a phase boundary in the benchmark loop. Two-pass collect with finalizer drain /// in between: the first pass moves managed garbage to the finalization queue, WaitForPendingFinalizers /// runs the finalizers, the second pass reclaims any objects the finalizers released. After this returns the /// heap is in a known-quiescent state — the next warmup/measurement phase starts on a clean slate, isolated /// from the previous phase's residual allocations (write-buffer pools, intern cache, write-plan arrays, etc.). - /// Called between every Ser-phase / Des-phase boundary in RunBenchmarksForTestData. + /// Called between every Ser-phase / Des-phase boundary in . 
/// [MethodImpl(MethodImplOptions.NoInlining)] internal static void ForceGcCollect() diff --git a/AyCode.Core.Serializers.Console/Program.cs b/AyCode.Core.Serializers.Console/Program.cs index 77d86d6..1287dd1 100644 --- a/AyCode.Core.Serializers.Console/Program.cs +++ b/AyCode.Core.Serializers.Console/Program.cs @@ -52,7 +52,7 @@ public static class Program if (!TryParseCliArgs(args, out var layer, out var opMode, out var serializerMode)) return; // invalid args - RunBenchmark(layer, opMode, serializerMode); + BenchmarkLoop.RunBenchmark(layer, opMode, serializerMode); return; } @@ -63,7 +63,7 @@ public static class Program var selection = Menu.ShowInteractiveMenu(); if (selection == null) return; // user pressed Q - RunBenchmark(selection.Value.layer, "all", selection.Value.serializerMode); + BenchmarkLoop.RunBenchmark(selection.Value.layer, "all", selection.Value.serializerMode); System.Console.WriteLine(); System.Console.WriteLine("─────────────────────────────────────────────────────────────────────"); @@ -124,457 +124,7 @@ public static class Program return true; } - /// - /// Runs the benchmark suite end-to-end for the given configuration: pre-warmup → per-cell warmup - /// + measurement → grouped results print → save to disk. Used by both the CLI and interactive - /// menu paths; the interactive loop calls this repeatedly without restarting the process. - /// - private static void RunBenchmark(string layer, string opMode, string serializerMode) - { - System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗"); - System.Console.WriteLine("║ COMPREHENSIVE SERIALIZER BENCHMARK SUITE ║"); - System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝"); - - // Stabilization: pin the entire benchmark process to a single logical CPU and bump priority - // class. 
Single-core affinity stops Windows from migrating the bench thread between cores - // mid-sample (a migration evicts L1/L2 caches and corrupts a measurement); High priority - // reduces preemption by background tasks (Defender scans, indexer, etc.) that otherwise - // randomly inflate samples by 5-15%. - // Try/finally guarantees the original state is restored even if a benchmark throws — leaving - // a developer machine pinned to one core after a crashed run is a real foot-gun. - // Skipped on Debug single-sample mode (Configuration.BenchmarkSamples <= 1) where stabilization is moot. - var process = Process.GetCurrentProcess(); - var origAffinity = (IntPtr)0; - var origPriority = ProcessPriorityClass.Normal; - var stabilizationApplied = false; - - // ProcessorAffinity is only supported on Windows + Linux (CA1416). macOS would throw at - // runtime; skip the affinity step there but still raise priority class (which IS supported - // on macOS, just less effective for stabilization than affinity pinning). - if (Configuration.BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux())) - { - try - { - origAffinity = process.ProcessorAffinity; - origPriority = process.PriorityClass; - // Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one - // core, consistently" — not which one. If CPU 0 is heavily contended on the host - // (e.g. dedicated to system-wide IRQs on some Windows configs), the user can tweak - // the mask here. The benchmark is single-threaded for the in-memory rows so single - // core is sufficient; round-trip-only NamedPipe rows have a server-drain thread - // that will share the core (acceptable — the bench measures end-to-end RT anyway). 
- process.ProcessorAffinity = (IntPtr)1; - process.PriorityClass = ProcessPriorityClass.High; - stabilizationApplied = true; - System.Console.WriteLine($"Stabilization: pinned to CPU 0 (affinity=0x1), priority=High."); - } - catch (Exception ex) - { - // Affinity/priority changes may fail on locked-down hosts (group policies, containers - // without CAP_SYS_NICE on Linux, etc.). Surface and continue — the benchmark still - // works, just with the platform default scheduling. - System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}"); - } - } - - try - { - var allResults = new List(); - var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets(); - var testDataSets = BenchmarkLoop.FilterByLayer(allTestDataSets, layer); - - System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {Configuration.GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{Configuration.TargetSampleMs} ms target) | Warmup: {Configuration.WarmupIterations} per phase (Ser/Des isolated) | Samples: {Configuration.BenchmarkSamples} (median) + pilot discard"); - System.Console.WriteLine($"Build: {Configuration.BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}"); - System.Console.WriteLine(); - - // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens. - // Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup - // alone doesn't ensure that every Serialize/IBufferWriter overload is fully Tier 1 by the time we - // start measuring. Symptom: first cell's BufferWriter variants run ~2x slower than the SAME variants - // on later cells (e.g. Small BufWr reuse 9ms vs Medium BufWr reuse 4ms — even though Medium is bigger). 
- // Pre-warmup runs every overload at least once with each data shape so .NET 9's tiered JIT promotes - // them all in the background; the per-cell warmup that follows then locks in cache + branch state. - if (Configuration.BenchmarkSamples > 1) // skip in DEBUG (single-sample fast iteration) - { - System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)..."); - - foreach (var testData in testDataSets) - { - var preSerializers = CreateSerializers(testData, serializerMode); - try - { - foreach (var s in preSerializers) - { - // Light warmup just to trigger Tier 0 → Tier 1 promotion. Phase-isolated: - // Ser path first, then Des path — same pattern as the per-cell warmup in - // RunBenchmarksForTestData (which still runs afterwards for cache/BTB warming). - s.WarmupSerialize(2000); - s.WarmupDeserialize(2000); - } - } - finally - { - // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources). - foreach (var s in preSerializers) (s as IDisposable)?.Dispose(); - } - } - - // Let background tiered-JIT compilation drain before we begin measuring. 
- if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep); - System.Console.WriteLine("✓ Global pre-warmup complete.\n"); - } - - foreach (var testData in testDataSets) - { - System.Console.WriteLine($"\n{'═'.ToString().PadRight(70, '═')}"); - System.Console.WriteLine($"TEST DATA: {testData.DisplayName}"); - System.Console.WriteLine($"{'═'.ToString().PadRight(70, '═')}"); - - var results = RunBenchmarksForTestData(testData, opMode, serializerMode); - allResults.AddRange(results); - } - - // Print grouped results - Output.PrintGroupedResults(allResults, testDataSets); - - // Save results to file - Output.SaveResults(allResults, testDataSets); - - System.Console.WriteLine("\n✓ Benchmark complete!"); - } - finally - { - // Restore process state — affinity/priority changes are process-wide and persist across - // interactive-mode iterations of the menu. Without restore, the second menu run would - // already be on CPU-0 + High priority before its own try-block applied them, masking - // any stabilization-disabled comparison. - if (stabilizationApplied && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux())) - { - try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ } - try { process.PriorityClass = origPriority; } catch { /* best-effort */ } - } - } - } - - #region Benchmark Execution - -private static List RunBenchmarksForTestData(TestDataSet testData, string mode, string serializerMode) - { - var results = new List(); - var serializers = CreateSerializers(testData, serializerMode); - - // Round-trip correctness check — once per (cell × serializer), BEFORE warmup. Aborts the entire benchmark on failure. 
- System.Console.WriteLine("Verifying round-trip correctness..."); - - foreach (var serializer in serializers) - { - if (!serializer.VerifyRoundTrip()) - { - System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}"); - System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting."); - - Environment.Exit(1); - } - } - - System.Console.WriteLine("✓ All serializers passed round-trip verification."); - - // Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: each serializer's Ser-path and - // Des-path get COMPLETELY ISOLATED warmup→measure rounds, with a GC.Collect at every phase boundary. - // - // Why phase-isolation: a combined warmup (Ser+Des interleaved) leaves the CPU I-cache + branch-predictor - // in a "compromise state" — neither Ser nor Des code-set dominates. The first phase to measure pays a - // cache-miss penalty as its code-set displaces the leftover-warmup-state. Isolated warmup→measure pairs - // keep the I-cache HOT for ONLY the measured path, both in the warmup (priming) and the measurement - // (steady-state). Branch-predictor history also stays clean per path. - // - // GC.Collect at every boundary: removes residual allocation pressure from the previous phase (write-buffer - // pool churn from Ser, deserialized object graph from Des) so the next phase starts with a quiescent - // heap — GC tier-promotion timing during measurement is then driven only by THAT phase's allocations. - // - // Configuration.JitSleep per-phase: tiered JIT background promotion drain after each warmup (mode-aware: 0 ms in AOT). - // Each phase's freshly-promoted methods settle before its timing starts. 
- System.Console.WriteLine($"Running benchmarks (target ~{Configuration.TargetSampleMs} ms/sample × {Configuration.BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n"); - - foreach (var serializer in serializers) - { - var result = new BenchmarkResult - { - TestDataName = testData.DisplayName, // Use DisplayName for IId% info - Engine = serializer.Engine, - IoMode = serializer.IoMode, - DispatchMode = serializer.DispatchMode, - OptionsPreset = serializer.OptionsPreset, - OptionsDescription = serializer.OptionsDescription, - SerializedSize = serializer.SerializedSize, - SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes, - SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes, - IsRoundTripOnly = serializer.IsRoundTripOnly - }; - - // Group label for in-place \r progress. Identifies (cell × serializer) so a stuck benchmark - // is visibly stuck on a specific row at a specific %% rather than silently hanging. - var groupLabel = $"{result.SerializerName}"; - - if (serializer.IsRoundTripOnly) - { - // Round-trip-only benchmarks (NamedPipe etc.): single phase — Serialize() performs the full RT, - // Deserialize() is a no-op. We use the Ser-phase entry-points (WarmupSerialize) to warm the - // entire round-trip path, then record into the RT result columns. 
- if (mode is "all" or "serialize" or "ser") - { - BenchmarkLoop.ForceGcCollect(); - serializer.WarmupSerialize(Configuration.WarmupIterations); - if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep); - - var rtIter = BenchmarkLoop.CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs); - var (rtMed, rtMin, rtMax, rtStd) = BenchmarkLoop.RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]"); - result.RoundTripTimeMs = rtMed; - result.RoundTripTimeMinMs = rtMin; - result.RoundTripTimeMaxMs = rtMax; - result.RoundTripTimeStdDevMs = rtStd; - result.RoundTripIterations = rtIter; - // Process-wide allocation measurement: server-drain-thread allocations (server-side new byte[len]) - // also show up — otherwise current-thread alloc would only count the client side and look ~halved. - result.RoundTripAllocBytesPerOp = BenchmarkLoop.MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]"); - } - // mode == "deserialize" alone is meaningless for a round-trip-only benchmark; skip silently. - } - else - { - // ── Ser phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect. 
- if (mode is "all" or "serialize" or "ser") - { - BenchmarkLoop.ForceGcCollect(); - serializer.WarmupSerialize(Configuration.WarmupIterations); - if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep); - - var serIter = BenchmarkLoop.CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs); - var (serMed, serMin, serMax, serStd) = BenchmarkLoop.RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]"); - result.SerializeTimeMs = serMed; - result.SerializeTimeMinMs = serMin; - result.SerializeTimeMaxMs = serMax; - result.SerializeTimeStdDevMs = serStd; - result.SerializeIterations = serIter; - // Dedicated alloc-only sample (separate from timing samples; keeps timing pure) - result.SerializeAllocBytesPerOp = BenchmarkLoop.MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]"); - } - - // ── Des phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect. - // The GC.Collect here is critical: it discards the Ser-phase's write-buffer pool churn so the - // Des-phase's allocation measurement reflects ONLY Des-side allocations (deserialized object graph). 
- if (mode is "all" or "deserialize" or "des") - { - BenchmarkLoop.ForceGcCollect(); - serializer.WarmupDeserialize(Configuration.WarmupIterations); - if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep); - - var desIter = BenchmarkLoop.CalibrateIterations(() => serializer.Deserialize(), Configuration.TargetSampleMs); - var (desMed, desMin, desMax, desStd) = BenchmarkLoop.RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]"); - result.DeserializeTimeMs = desMed; - result.DeserializeTimeMinMs = desMin; - result.DeserializeTimeMaxMs = desMax; - result.DeserializeTimeStdDevMs = desStd; - result.DeserializeIterations = desIter; - result.DeserializeAllocBytesPerOp = BenchmarkLoop.MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]"); - } - - // Compose RT from Ser+Des. Because Ser and Des may have DIFFERENT iter counts post-calibration, - // batch-time addition would be misleading. Instead: compute per-op µs (iter-independent), - // then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that - // RoundTripTimeMs / RoundTripIterations * 1000 == Output.SerPerOp + Output.DesPerOp. - var serPerOp = Output.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations); - var desPerOp = Output.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations); - var rtPerOp = serPerOp + desPerOp; - result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations); - result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations; - result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp; - } - - results.Add(result); - Output.PrintResult(result); - } - - // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources that must be released - // before the next test data builds new ones — otherwise pipes / handles leak across test cells). 
- foreach (var s in serializers) (s as IDisposable)?.Dispose(); - - return results; - } - - private static List CreateSerializers(TestDataSet testData, string serializerMode) - { - // FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path. - // TWO benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + MemoryPack Byte[]. - // - Compact: smallest wire, UTF-8 encode/decode CPU cost vs MemPack head-to-head. - // Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min. - // - // FastWire row (UTF-16 raw memcpy) commented out for the current optimization sprint — - // we are tuning Compact mode against MemPack directly; FastWire was used as a noise-floor - // reference earlier. Re-enable when revisiting Fast wire-mode performance. - if (serializerMode == "fastestbyte") - { - var fastestByteOptions = AcBinarySerializerOptions.FastMode; - fastestByteOptions.WireMode = Configuration.SelectedWireMode; - - return new List - { - new AcBinaryBenchmark(testData.Order, fastestByteOptions, "FastMode"), - //new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"), - new MemoryPackBenchmark(testData.Order, "Default"), - }; - } - - // AsyncPipe-only mode — return ONLY the AsyncPipe streaming benchmark (no other serializer). - // Streaming I/O has long-lived pipe setup + kernel-buffer overhead that, when interleaved with - // the standard byte-array / IBufferWriter measurements, masks the steady-state numbers. Run it - // in isolation so the timing numbers reflect ONLY the streaming path. - if (serializerMode == "asyncpipe") - { - // NamedPipe — pipe-aligned chunk size for the long-lived IPC scenario. The chunkSize here - // drives the AsyncPipeWriterOutput's chunk-on-wire size (header + data, page-aligned thanks to - // the AcquireChunk fix) AND the kernel pipe buffer size (inBufferSize/outBufferSize on the - // NamedPipeServerStream ctor). 
Same value across both layers = one WriteFile(chunkSize) syscall - // fits blocking-free in one kernel pipe-buffer slot. Single source of truth for both app-level - // wire chunk AND kernel transfer unit; change ONLY this line when tuning. - var binaryFastModePipeChunkOnly = AcBinarySerializerOptions.FastMode; - binaryFastModePipeChunkOnly.BufferWriterChunkSize = Configuration.PipeChunkSize; - binaryFastModePipeChunkOnly.WireMode = Configuration.SelectedWireMode; - - return new List - { - // Chunked-framed AsyncPipe: SerializeChunkedFramed + AsyncPipeReaderInput.DrainFromAsync. - // Measures the FULL streaming-I/O stack — wire framing + drain task + sliding-window buffer + - // MRES wait-on-byte-shortage — over a kernel NamedPipe. - new AcBinaryNamedPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"), - // Raw byte[] over NamedPipe (sync receive, no chunk-framing). Same kernel-pipe transport, - // same inBufferSize, but: serialize → byte[] → Stream.Write → Stream.Read → Deserialize(byte[]). - // No drain task, no AsyncPipeReaderInput, no [201][UINT16][data]…[202] framing. Side-by-side with - // the chunked-row above this isolates AsyncPipe-framework-overhead (Δ vs raw) from - // kernel-transport-overhead (raw vs in-process Byte[]). - new AcBinaryNamedPipeRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"), - // Chunked-framed AsyncPipe over an IN-MEMORY System.IO.Pipelines.Pipe (NO NamedPipe, NO kernel). - // Same chunked-streaming code path (SerializeChunkedFramed → AsyncPipeReaderInput) but with the - // kernel-pipe replaced by a managed-only Pipe. Eliminates per-chunk syscall overhead (~30 µs/chunk - // on NamedPipe → ~1-2 µs/chunk on in-memory Pipe). 
Side-by-side with the NamedPipe row above this - // isolates pure CPU cost of the chunked-streaming framework (vs kernel-pipe transport cost) — the - // in-memory Pipe row should be much closer to the raw-byte[] row, validating that NamedPipe loopback - // is the worst-case benchmark scenario for chunked-streaming and not representative of real network - // / file / cross-thread Pipe scenarios. - new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"), - // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport at all). Apples-to-apples - // baseline for the in-memory chunked row above: same in-memory transport (zero kernel), but raw - // byte[] vs chunked-streaming wire format. Completes the 2x2 matrix [chunked,raw] × [kernel,memory]. - new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"), - }; - } - - // Standard mode — all serializers EXCEPT AsyncPipe (the streaming benchmark is opt-in via the - // AsyncPipe menu / CLI mode, never bundled with the steady-state suite). - - var binaryNoInternOption = AcBinarySerializerOptions.Default; - binaryNoInternOption.UseStringInterning = StringInterningMode.None; - binaryNoInternOption.WireMode = Configuration.SelectedWireMode; - - var binaryDefaultNoSgenOption = AcBinarySerializerOptions.Default; - binaryDefaultNoSgenOption.UseGeneratedCode = false; - binaryDefaultNoSgenOption.WireMode = Configuration.SelectedWireMode; - - var binaryFastModeNoSgenOption = AcBinarySerializerOptions.FastMode; - binaryFastModeNoSgenOption.UseGeneratedCode = false; - binaryFastModeNoSgenOption.WireMode = Configuration.SelectedWireMode; - - var binaryFastModeOption = AcBinarySerializerOptions.FastMode; - binaryFastModeOption.WireMode = Configuration.SelectedWireMode; - - // BufWr new — 4 KB chunk size for the FRESH ArrayBufferWriter scenario. 
The chunkSize here drives - // the serializer's GetSpan(N) request → the ArrayBufferWriter's internal allocation per call. - // Small chunk = small per-call allocation, optimum for one-shot serialization where each iteration - // allocates a fresh ABW. Independent of the AsyncPipe profile (different mechanism: alloc overhead - // vs syscall count). - var binaryFastModeBufWrChunk = AcBinarySerializerOptions.FastMode; - binaryFastModeBufWrChunk.BufferWriterChunkSize = Configuration.PipeChunkSize; - binaryFastModeBufWrChunk.WireMode = Configuration.SelectedWireMode; - - // In-memory Pipe variant — same 4 KB chunkSize as the AsyncPipe mode, no kernel-pipe alignment - // concern (managed slabs are not page-aligned anyway). Drives SerializeChunkedFramed via the in-memory - // System.IO.Pipelines.Pipe (zero-copy slab handoff between producer and drain task). - var binaryFastModePipeChunkInMem = AcBinarySerializerOptions.FastMode; - binaryFastModePipeChunkInMem.BufferWriterChunkSize = Configuration.PipeChunkSize; - binaryFastModePipeChunkInMem.WireMode = Configuration.SelectedWireMode; - - var defaultOptions = AcBinarySerializerOptions.Default; - defaultOptions.UseStringInterning = StringInterningMode.None; - defaultOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId; - defaultOptions.WireMode = Configuration.SelectedWireMode; - - return new List - { - // ============================================================ - // AcBinary — Byte[] API (uncomment to compare option presets side-by-side) - // ============================================================ - // Fastest Byte[] — SGen path (UseGeneratedCode=true, default). - new AcBinaryBenchmark(testData.Order, binaryFastModeOption, "FastMode"), - // Fastest Byte[] — Runtime path (UseGeneratedCode=false). Same wire/options, no source-generated dispatch. - // Always paired with the SGen variant so every layer can compare the SGen speed-up apples-to-apples. 
- // NativeAOT-safe: AcSerializerCommon.Create*Getter/Setter falls back to reflection-based delegates - // when RuntimeFeature.IsDynamicCodeSupported is false (slower but works under AOT publish). - new AcBinaryBenchmark(testData.Order, binaryFastModeNoSgenOption, "FastMode"), - // Default preset Byte[] — RefHandling=OnlyId (deduplicates IId-shared references on the wire) + - // UseStringInterning=All (deduplicates repeated strings). Showcases the Default preset's wire-size - // and CPU trade-off vs FastMode on the ~20% IId-ref / repeated-string test data. - - new AcBinaryBenchmark(testData.Order, defaultOptions, "Default"), - //new AcBinaryBenchmark(testData.Order, binaryDefaultNoSgenOption, "Default"), - //new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"), - //new AcBinaryBenchmark(testData.Order, binaryNoInternOption, "NoIntern"), - - // AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario) - new AcBinaryBufferWriterBenchmark(testData.Order, binaryFastModeOption, "FastMode"), - - // AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario). - // 4 KB chunk size from binaryFastModeBufWrChunk — minimises the per-call ArrayBufferWriter - // allocation. Optimum for this scenario. - new AcBinaryFreshBufferWriterBenchmark(testData.Order, binaryFastModeBufWrChunk, "FastMode (4KB)"), - - // AcBinary chunked-streaming over an IN-MEMORY Pipe (no kernel transport). Side-by-side with the - // Byte[] / IBufferWriter rows above this shows the chunked-streaming framework's pure CPU cost - // (no NamedPipe loopback noise) vs the simpler in-process serialize-then-deserialize patterns. - // The IO column shows "Pipe(in-mem)" — distinct from the NamedPipe AsyncPipe rows in [P] mode. 
- new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeChunk)"), - - // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport, no kernel, no Pipe). Apples-to- - // apples baseline for the in-memory chunked row above: same in-memory pattern, but raw byte[] vs - // chunked-streaming wire format. The IO column shows "Bytes(in-mem)". - new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeRaw)"), - - // AsyncPipe streaming over kernel NamedPipe (AcBinaryNamedPipeBenchmark) is intentionally OMITTED - // here — run it via the dedicated AsyncPipe menu [P] / CLI mode for isolated kernel-transport - // measurements. - - // ============================================================ - // MemoryPack — three I/O modes for apples-to-apples comparison - // ============================================================ - new MemoryPackBenchmark(testData.Order, "Default"), - new MemoryPackBufferWriterBenchmark(testData.Order, "Default"), - new MemoryPackFreshBufferWriterBenchmark(testData.Order, "Default"), - - // ============================================================ - // MessagePack — for legacy comparison - // ============================================================ -#if !AYCODE_NATIVEAOT - // MessagePack v3's DynamicGenericResolver uses Activator.CreateInstance on trimmed - // ListFormatter et al. — fails under NativeAOT publish with "No parameterless constructor". - // Excluded from the AOT build; available for regular JIT runs only. - new MessagePackBenchmark(testData.Order, "ContractBased"), -#endif - - // System.Text.Json (commented — JSON serializer for reference; not in active suite) - //new SystemTextJsonBenchmark(testData.Order, "Default") - }; - } - - #endregion + // RunBenchmark + RunBenchmarksForTestData + CreateSerializers → BenchmarkLoop.cs // Serializer implementations (ISerializerBenchmark + 12 concrete benchmark classes) → Benchmarks/