using AyCode.Core.Benchmarks.Reporting; using AyCode.Core.Benchmarks.Workloads.Scenarios; using AyCode.Core.Serializers.Binaries; using AyCode.Core.Tests.TestModels; using MemoryPack; using System.Diagnostics; using System.Runtime.CompilerServices; namespace AyCode.Core.Serializers.Console; /// /// Benchmark execution: end-to-end orchestration (), per-cell loop /// (), serializer factory (), /// and the timing / calibration / allocation helpers. Pure benchmark-execution infrastructure — /// no display formatting (that lives in Output) and no UX-flow (that lives in Program /// + Menu). /// internal static class BenchmarkLoop { /// /// Runs the benchmark suite end-to-end for the given configuration: pre-warmup → per-cell warmup /// + measurement → grouped results print → save to disk. Used by both the CLI and interactive /// menu paths; the interactive loop calls this repeatedly without restarting the process. /// internal static void RunBenchmark(BenchmarkLayer layer, BenchmarkOpMode opMode, SerializerSelectionMode serializerMode) { System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗"); System.Console.WriteLine("║ COMPREHENSIVE SERIALIZER BENCHMARK SUITE ║"); System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝"); // Stabilization: pin the entire benchmark process to a single logical CPU and bump priority // class. Single-core affinity stops Windows from migrating the bench thread between cores // mid-sample (a migration evicts L1/L2 caches and corrupts a measurement); High priority // reduces preemption by background tasks (Defender scans, indexer, etc.) that otherwise // randomly inflate samples by 5-15%. // Try/finally guarantees the original state is restored even if a benchmark throws — leaving // a developer machine pinned to one core after a crashed run is a real foot-gun. 
// Skipped on Debug single-sample mode (Configuration.BenchmarkSamples <= 1) where stabilization is moot. var process = Process.GetCurrentProcess(); var origAffinity = (IntPtr)0; var origPriority = ProcessPriorityClass.Normal; var stabilizationApplied = false; // ProcessorAffinity is only supported on Windows + Linux (CA1416). macOS would throw at // runtime; skip the affinity step there but still raise priority class (which IS supported // on macOS, just less effective for stabilization than affinity pinning). if (Configuration.BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux())) { try { origAffinity = process.ProcessorAffinity; origPriority = process.PriorityClass; // Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one // core, consistently" — not which one. If CPU 0 is heavily contended on the host // (e.g. dedicated to system-wide IRQs on some Windows configs), the user can tweak // the mask here. The benchmark is single-threaded for the in-memory rows so single // core is sufficient; round-trip-only NamedPipe rows have a server-drain thread // that will share the core (acceptable — the bench measures end-to-end RT anyway). process.ProcessorAffinity = (IntPtr)1; process.PriorityClass = ProcessPriorityClass.High; stabilizationApplied = true; System.Console.WriteLine($"Stabilization: pinned to CPU 0 (affinity=0x1), priority=High."); } catch (Exception ex) { // Affinity/priority changes may fail on locked-down hosts (group policies, containers // without CAP_SYS_NICE on Linux, etc.). Surface and continue — the benchmark still // works, just with the platform default scheduling. 
System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}"); } } try { var allResults = new List(); var allTestDataSets = BuildMultiVariantTestDataSets(); var testDataSets = FilterByLayer(allTestDataSets, layer); System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {Configuration.GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{Configuration.TargetSampleMs} ms target) | Warmup: {Configuration.WarmupIterations} per phase (Ser/Des isolated) | Samples: {Configuration.BenchmarkSamples} (median) + pilot discard"); System.Console.WriteLine($"Build: {Configuration.BuildConfiguration} | .NET: {Environment.Version} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}"); System.Console.WriteLine(); // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens. // Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup // alone doesn't ensure that every Serialize/IBufferWriter overload is fully Tier 1 by the time we // start measuring. Symptom: first cell's BufferWriter variants run ~2x slower than the SAME variants // on later cells (e.g. Small BufWr reuse 9ms vs Medium BufWr reuse 4ms — even though Medium is bigger). // Pre-warmup runs every overload at least once with each data shape so .NET 9's tiered JIT promotes // them all in the background; the per-cell warmup that follows then locks in cache + branch state. if (Configuration.BenchmarkSamples > 1) // skip in DEBUG (single-sample fast iteration) { System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)..."); foreach (var testData in testDataSets) { var preSerializers = CreateSerializers(testData, serializerMode); try { foreach (var s in preSerializers) { // Light warmup just to trigger Tier 0 → Tier 1 promotion. 
Phase-isolated: // Ser path first, then Des path — same pattern as the per-cell warmup in // RunBenchmarksForTestData (which still runs afterwards for cache/BTB warming). s.WarmupSerialize(2000); s.WarmupDeserialize(2000); } } finally { // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources). foreach (var s in preSerializers) (s as IDisposable)?.Dispose(); } } // Let background tiered-JIT compilation drain before we begin measuring. if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep); System.Console.WriteLine("✓ Global pre-warmup complete.\n"); } foreach (var testData in testDataSets) { System.Console.WriteLine($"\n{'═'.ToString().PadRight(70, '═')}"); System.Console.WriteLine($"TEST DATA: {testData.DisplayName}"); System.Console.WriteLine($"{'═'.ToString().PadRight(70, '═')}"); var results = RunBenchmarksForTestData(testData, opMode, serializerMode); allResults.AddRange(results); } // Build the reporting context (resolves path via walk-up to .sln, snapshots run-config). var ctx = new ReportingContext( SourceTag: "Console", ResultsDirectory: ReportingContext.ResolveResultsDirectory(), BuildConfiguration: Configuration.BuildConfiguration, Utf8NoBom: Configuration.Utf8NoBom, CharsetName: Configuration.GetCurrentCharsetName(), WarmupIterations: Configuration.WarmupIterations, BenchmarkSamples: Configuration.BenchmarkSamples, TargetSampleMs: Configuration.TargetSampleMs, UnstableCVThreshold: Configuration.UnstableCVThreshold, MicroOptCVThreshold: Configuration.MicroOptCVThreshold); // Print grouped results BenchmarkReportWriter.PrintGroupedResults(allResults, testDataSets); // Save results to file (.log + .LLM + .output) BenchmarkReportWriter.SaveAll(ctx, allResults, testDataSets); System.Console.WriteLine("\n✓ Benchmark complete!"); } finally { // Restore process state — affinity/priority changes are process-wide and persist across // interactive-mode iterations of the menu. 
Without restore, the second menu run would // already be on CPU-0 + High priority before its own try-block applied them, masking // any stabilization-disabled comparison. if (stabilizationApplied && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux())) { try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ } try { process.PriorityClass = origPriority; } catch { /* best-effort */ } } } } private static List RunBenchmarksForTestData(TestDataSet testData, BenchmarkOpMode mode, SerializerSelectionMode serializerMode) { var results = new List(); var serializers = CreateSerializers(testData, serializerMode); // Round-trip correctness check — once per (cell × serializer), BEFORE warmup. Aborts the entire benchmark on failure. System.Console.WriteLine("Verifying round-trip correctness..."); foreach (var serializer in serializers) { if (!serializer.VerifyRoundTrip()) { System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}"); System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting."); Environment.Exit(1); } } System.Console.WriteLine("✓ All serializers passed round-trip verification."); // Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: each serializer's Ser-path and // Des-path get COMPLETELY ISOLATED warmup→measure rounds, with a GC.Collect at every phase boundary. // // Why phase-isolation: a combined warmup (Ser+Des interleaved) leaves the CPU I-cache + branch-predictor // in a "compromise state" — neither Ser nor Des code-set dominates. The first phase to measure pays a // cache-miss penalty as its code-set displaces the leftover-warmup-state. Isolated warmup→measure pairs // keep the I-cache HOT for ONLY the measured path, both in the warmup (priming) and the measurement // (steady-state). Branch-predictor history also stays clean per path. 
// // GC.Collect at every boundary: removes residual allocation pressure from the previous phase (write-buffer // pool churn from Ser, deserialized object graph from Des) so the next phase starts with a quiescent // heap — GC tier-promotion timing during measurement is then driven only by THAT phase's allocations. // // Configuration.JitSleep per-phase: tiered JIT background promotion drain after each warmup (mode-aware: 0 ms in AOT). // Each phase's freshly-promoted methods settle before its timing starts. System.Console.WriteLine($"Running benchmarks (target ~{Configuration.TargetSampleMs} ms/sample × {Configuration.BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n"); foreach (var serializer in serializers) { var result = new BenchmarkResult { TestDataName = testData.DisplayName, // Use DisplayName for IId% info Engine = serializer.Engine, IoMode = serializer.IoMode, DispatchMode = serializer.DispatchMode, OptionsPreset = serializer.OptionsPreset, OrderTypeName = serializer.OrderTypeName, OptionsDescription = serializer.OptionsDescription, SerializedSize = serializer.SerializedSize, SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes, SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes, IsRoundTripOnly = serializer.IsRoundTripOnly }; // Group label for in-place \r progress. Identifies (cell × serializer) so a stuck benchmark // is visibly stuck on a specific row at a specific %% rather than silently hanging. var groupLabel = $"{result.SerializerName}"; if (serializer.IsRoundTripOnly) { // Round-trip-only benchmarks (NamedPipe etc.): single phase — Serialize() performs the full RT, // Deserialize() is a no-op. We use the Ser-phase entry-points (WarmupSerialize) to warm the // entire round-trip path, then record into the RT result columns. 
if (mode is BenchmarkOpMode.All or BenchmarkOpMode.Serialize) { ForceGcCollect(); serializer.WarmupSerialize(Configuration.WarmupIterations); if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep); var rtIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs); var (rtMed, rtMin, rtMax, rtStd) = RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]"); result.RoundTripTimeMs = rtMed; result.RoundTripTimeMinMs = rtMin; result.RoundTripTimeMaxMs = rtMax; result.RoundTripTimeStdDevMs = rtStd; result.RoundTripIterations = rtIter; // Process-wide allocation measurement: server-drain-thread allocations (server-side new byte[len]) // also show up — otherwise current-thread alloc would only count the client side and look ~halved. result.RoundTripAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]", processWide: true); } // mode == BenchmarkOpMode.Deserialize alone is meaningless for a round-trip-only benchmark; skip silently. } else { // ── Ser phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect. 
if (mode is BenchmarkOpMode.All or BenchmarkOpMode.Serialize) { ForceGcCollect(); serializer.WarmupSerialize(Configuration.WarmupIterations); if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep); var serIter = CalibrateIterations(() => serializer.Serialize(), Configuration.TargetSampleMs); var (serMed, serMin, serMax, serStd) = RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]"); result.SerializeTimeMs = serMed; result.SerializeTimeMinMs = serMin; result.SerializeTimeMaxMs = serMax; result.SerializeTimeStdDevMs = serStd; result.SerializeIterations = serIter; // Dedicated alloc-only sample (separate from timing samples; keeps timing pure) result.SerializeAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]"); } // ── Des phase ── isolated warmup → Configuration.JitSleep → calibrate → time → alloc; preceded by GC.Collect. // The GC.Collect here is critical: it discards the Ser-phase's write-buffer pool churn so the // Des-phase's allocation measurement reflects ONLY Des-side allocations (deserialized object graph). if (mode is BenchmarkOpMode.All or BenchmarkOpMode.Deserialize) { ForceGcCollect(); serializer.WarmupDeserialize(Configuration.WarmupIterations); if (Configuration.JitSleep > 0) Thread.Sleep(Configuration.JitSleep); var desIter = CalibrateIterations(() => serializer.Deserialize(), Configuration.TargetSampleMs); var (desMed, desMin, desMax, desStd) = RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]"); result.DeserializeTimeMs = desMed; result.DeserializeTimeMinMs = desMin; result.DeserializeTimeMaxMs = desMax; result.DeserializeTimeStdDevMs = desStd; result.DeserializeIterations = desIter; result.DeserializeAllocBytesPerOp = MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]"); } // Compose RT from Ser+Des. 
Because Ser and Des may have DIFFERENT iter counts post-calibration, // batch-time addition would be misleading. Instead: compute per-op µs (iter-independent), // then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that // RoundTripTimeMs / RoundTripIterations * 1000 == Output.SerPerOp + Output.DesPerOp. var serPerOp = BenchmarkReportWriter.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations); var desPerOp = BenchmarkReportWriter.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations); var rtPerOp = serPerOp + desPerOp; result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations); result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations; result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp; } results.Add(result); BenchmarkReportWriter.PrintResult(result); } // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources that must be released // before the next test data builds new ones — otherwise pipes / handles leak across test cells). foreach (var s in serializers) (s as IDisposable)?.Dispose(); return results; } /// /// Phase 2 multi-variant test-data builder. Constructs each cell in both the _All_False and /// _All_True families, then cross-registers _All_True on the _All_False primaries so the /// CreateSerializers downstream can pick the matching variant per AcBinary options preset. /// /// /// Memory cost: ~600 KB across 5 cells (Large dominates at ~340 KB for both variants). The two /// families are built independently — same data values + same numeric sequence (per-family /// _idCounter reset). MemPack/MsgPack benchmarks consume the _All_True variant canonically; /// AcBinary's variant is preset-dependent (see CreateSerializers). 
/// private static List BuildMultiVariantTestDataSets() { var allFalse = BenchmarkTestDataProvider_All_False.CreateTestDataSets(); var allTrue = BenchmarkTestDataProvider.CreateTestDataSets(); // Zip by ordinal — both providers emit the same 5 cells in the same order // (Small / Medium / Large / Repeated / Deep), confirmed by their identical // CreateTestDataSets call sequence on the generic base. for (var i = 0; i < allFalse.Count; i++) { var falseDs = (TestDataSet)allFalse[i]; var trueDs = (TestDataSet)allTrue[i]; falseDs.RegisterVariant(trueDs.Order); } return allFalse; } /// /// Phase 2 variant dispatch rule for AcBinary: a preset uses TestOrder_All_False iff every /// AcBinary "feature flag" is off (no string interning, no reference handling, no metadata, no /// property filter). Any "true"-flagged feature promotes the benchmark to TestOrder_All_True /// — the richer graph + opt-out attribute model exercises the feature's deduplication / dispatch /// path on real shared-reference content. WireMode, SGen mode, and Compression are encoding-axis /// options and intentionally NOT part of this decision (they don't change which graph shape is /// meaningful to feed). /// private static bool UsesAllFalseVariant(AcBinarySerializerOptions options) => options.UseStringInterning == StringInterningMode.None && options.ReferenceHandling == ReferenceHandlingMode.None && !options.UseMetadata && options.PropertyFilter == null; // Per-class factory helpers — each returns ISerializerBenchmark closed over the variant T // selected by UsesAllFalseVariant(options). Compile-time T at the new T() call site preserves // SGen apples-to-apples (no runtime reflection, no type erasure across the JIT boundary). private static ISerializerBenchmark MakeAcBinary(TestDataSet td, AcBinarySerializerOptions opt, string preset) => UsesAllFalseVariant(opt) ? 
new AcBinaryBenchmark(td.GetOrder(), opt, preset) : new AcBinaryBenchmark(td.GetOrder(), opt, preset); private static ISerializerBenchmark MakeAcBinaryBufferWriter(TestDataSet td, AcBinarySerializerOptions opt, string preset) => UsesAllFalseVariant(opt) ? new AcBinaryBufferWriterBenchmark(td.GetOrder(), opt, preset) : new AcBinaryBufferWriterBenchmark(td.GetOrder(), opt, preset); private static ISerializerBenchmark MakeAcBinaryFreshBufferWriter(TestDataSet td, AcBinarySerializerOptions opt, string preset) => UsesAllFalseVariant(opt) ? new AcBinaryFreshBufferWriterBenchmark(td.GetOrder(), opt, preset) : new AcBinaryFreshBufferWriterBenchmark(td.GetOrder(), opt, preset); private static ISerializerBenchmark MakeAcBinaryNamedPipe(TestDataSet td, AcBinarySerializerOptions opt, string preset) => UsesAllFalseVariant(opt) ? new AcBinaryNamedPipeBenchmark(td.GetOrder(), opt, preset) : new AcBinaryNamedPipeBenchmark(td.GetOrder(), opt, preset); private static ISerializerBenchmark MakeAcBinaryNamedPipeRaw(TestDataSet td, AcBinarySerializerOptions opt, string preset) => UsesAllFalseVariant(opt) ? new AcBinaryNamedPipeRawByteArrayBenchmark(td.GetOrder(), opt, preset) : new AcBinaryNamedPipeRawByteArrayBenchmark(td.GetOrder(), opt, preset); private static ISerializerBenchmark MakeAcBinaryInMemoryPipe(TestDataSet td, AcBinarySerializerOptions opt, string preset) => UsesAllFalseVariant(opt) ? new AcBinaryInMemoryPipeBenchmark(td.GetOrder(), opt, preset) : new AcBinaryInMemoryPipeBenchmark(td.GetOrder(), opt, preset); private static ISerializerBenchmark MakeAcBinaryInMemoryRaw(TestDataSet td, AcBinarySerializerOptions opt, string preset) => UsesAllFalseVariant(opt) ? 
new AcBinaryInMemoryRawByteArrayBenchmark(td.GetOrder(), opt, preset) : new AcBinaryInMemoryRawByteArrayBenchmark(td.GetOrder(), opt, preset); private static List CreateSerializers(TestDataSet testData, SerializerSelectionMode serializerMode) { // Phase 2 variant dispatch (refined): AcBinary picks variant per UsesAllFalseVariant(options). // MemPack / MsgPack canonically use _All_False (no AcBinary opt-in/opt-out axis — both // produce identical MemPack/MsgPack wire on either variant since their contract is family- // agnostic). `orderFalse` is the cell primary; `orderTrue` is fetched on-demand by the AcBinary // factory helpers when an options preset has a "true" flag. var orderFalse = testData.GetOrder(); // FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path. // TWO benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + MemoryPack Byte[]. // - Compact: smallest wire, UTF-8 encode/decode CPU cost vs MemPack head-to-head. // Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min. // // FastWire row (UTF-16 raw memcpy) commented out for the current optimization sprint — // we are tuning Compact mode against MemPack directly; FastWire was used as a noise-floor // reference earlier. Re-enable when revisiting Fast wire-mode performance. if (serializerMode == SerializerSelectionMode.FastestByte) { var fastestByteOptions = AcBinarySerializerOptions.FastMode; fastestByteOptions.WireMode = Configuration.SelectedWireMode; return new List { MakeAcBinary(testData, fastestByteOptions, "FastMode"), //MakeAcBinary(testData, fastWireOptions, "FastMode (FastWire)"), // MemPack uses _All_False (the AcBinary opt-in/opt-out axis doesn't apply — MemoryPackable // serialises identical bytes either way; _All_False matches the orderFalse variant the test // data factory already built, no extra graph allocation needed). 
new MemoryPackBenchmark(orderFalse, Configuration.SelectedWireMode, "Default"), }; } // AsyncPipe-only mode — return ONLY the AsyncPipe streaming benchmark (no other serializer). // Streaming I/O has long-lived pipe setup + kernel-buffer overhead that, when interleaved with // the standard byte-array / IBufferWriter measurements, masks the steady-state numbers. Run it // in isolation so the timing numbers reflect ONLY the streaming path. if (serializerMode == SerializerSelectionMode.AsyncPipe) { // NamedPipe — pipe-aligned chunk size for the long-lived IPC scenario. The chunkSize here // drives the AsyncPipeWriterOutput's chunk-on-wire size (header + data, page-aligned thanks to // the AcquireChunk fix) AND the kernel pipe buffer size (inBufferSize/outBufferSize on the // NamedPipeServerStream ctor). Same value across both layers = one WriteFile(chunkSize) syscall // fits blocking-free in one kernel pipe-buffer slot. Single source of truth for both app-level // wire chunk AND kernel transfer unit; change ONLY this line when tuning. var binaryFastModePipeChunkOnly = AcBinarySerializerOptions.FastMode; binaryFastModePipeChunkOnly.BufferWriterChunkSize = Configuration.PipeChunkSize; binaryFastModePipeChunkOnly.WireMode = Configuration.SelectedWireMode; return new List { // Chunked-framed AsyncPipe: SerializeChunkedFramed + AsyncPipeReaderInput.DrainFromAsync. // Measures the FULL streaming-I/O stack — wire framing + drain task + sliding-window buffer + // MRES wait-on-byte-shortage — over a kernel NamedPipe. MakeAcBinaryNamedPipe(testData, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"), // Raw byte[] over NamedPipe (sync receive, no chunk-framing). Same kernel-pipe transport, // same inBufferSize, but: serialize → byte[] → Stream.Write → Stream.Read → Deserialize(byte[]). // No drain task, no AsyncPipeReaderInput, no [201][UINT16][data]…[202] framing. 
Side-by-side with // the chunked-row above this isolates AsyncPipe-framework-overhead (Δ vs raw) from // kernel-transport-overhead (raw vs in-process Byte[]). MakeAcBinaryNamedPipeRaw(testData, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"), // Chunked-framed AsyncPipe over an IN-MEMORY System.IO.Pipelines.Pipe (NO NamedPipe, NO kernel). // Same chunked-streaming code path (SerializeChunkedFramed → AsyncPipeReaderInput) but with the // kernel-pipe replaced by a managed-only Pipe. Eliminates per-chunk syscall overhead (~30 µs/chunk // on NamedPipe → ~1-2 µs/chunk on in-memory Pipe). Side-by-side with the NamedPipe row above this // isolates pure CPU cost of the chunked-streaming framework (vs kernel-pipe transport cost) — the // in-memory Pipe row should be much closer to the raw-byte[] row, validating that NamedPipe loopback // is the worst-case benchmark scenario for chunked-streaming and not representative of real network // / file / cross-thread Pipe scenarios. MakeAcBinaryInMemoryPipe(testData, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"), // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport at all). Apples-to-apples // baseline for the in-memory chunked row above: same in-memory transport (zero kernel), but raw // byte[] vs chunked-streaming wire format. Completes the 2x2 matrix [chunked,raw] × [kernel,memory]. MakeAcBinaryInMemoryRaw(testData, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"), }; } // Standard mode — all serializers EXCEPT AsyncPipe (the streaming benchmark is opt-in via the // AsyncPipe menu / CLI mode, never bundled with the steady-state suite). 
var binaryNoInternOption = AcBinarySerializerOptions.Default; binaryNoInternOption.UseStringInterning = StringInterningMode.None; binaryNoInternOption.WireMode = Configuration.SelectedWireMode; var binaryDefaultNoSgenOption = AcBinarySerializerOptions.Default; binaryDefaultNoSgenOption.UseGeneratedCode = false; binaryDefaultNoSgenOption.WireMode = Configuration.SelectedWireMode; var binaryFastModeNoSgenOption = AcBinarySerializerOptions.FastMode; binaryFastModeNoSgenOption.UseGeneratedCode = false; binaryFastModeNoSgenOption.WireMode = Configuration.SelectedWireMode; var binaryFastModeOption = AcBinarySerializerOptions.FastMode; binaryFastModeOption.WireMode = Configuration.SelectedWireMode; // BufWr new — 4 KB chunk size for the FRESH ArrayBufferWriter scenario. The chunkSize here drives // the serializer's GetSpan(N) request → the ArrayBufferWriter's internal allocation per call. // Small chunk = small per-call allocation, optimum for one-shot serialization where each iteration // allocates a fresh ABW. Independent of the AsyncPipe profile (different mechanism: alloc overhead // vs syscall count). var binaryFastModeBufWrChunk = AcBinarySerializerOptions.FastMode; binaryFastModeBufWrChunk.BufferWriterChunkSize = Configuration.PipeChunkSize; binaryFastModeBufWrChunk.WireMode = Configuration.SelectedWireMode; // In-memory Pipe variant — same 4 KB chunkSize as the AsyncPipe mode, no kernel-pipe alignment // concern (managed slabs are not page-aligned anyway). Drives SerializeChunkedFramed via the in-memory // System.IO.Pipelines.Pipe (zero-copy slab handoff between producer and drain task). 
var binaryFastModePipeChunkInMem = AcBinarySerializerOptions.FastMode; binaryFastModePipeChunkInMem.BufferWriterChunkSize = Configuration.PipeChunkSize; binaryFastModePipeChunkInMem.WireMode = Configuration.SelectedWireMode; var defaultOptions = AcBinarySerializerOptions.Default; defaultOptions.UseStringInterning = StringInterningMode.None; defaultOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId; defaultOptions.WireMode = Configuration.SelectedWireMode; return new List { // ============================================================ // AcBinary — Byte[] API (uncomment to compare option presets side-by-side) // ============================================================ // Fastest Byte[] — SGen path (UseGeneratedCode=true, default). MakeAcBinary(testData, binaryFastModeOption, "FastMode"), // Fastest Byte[] — Runtime path (UseGeneratedCode=false). Same wire/options, no source-generated dispatch. // Always paired with the SGen variant so every layer can compare the SGen speed-up apples-to-apples. // NativeAOT-safe: AcSerializerCommon.Create*Getter/Setter falls back to reflection-based delegates // when RuntimeFeature.IsDynamicCodeSupported is false (slower but works under AOT publish). MakeAcBinary(testData, binaryFastModeNoSgenOption, "FastMode"), // Default preset Byte[] — RefHandling=OnlyId (deduplicates IId-shared references on the wire) + // UseStringInterning=All (deduplicates repeated strings). Showcases the Default preset's wire-size // and CPU trade-off vs FastMode on the ~20% IId-ref / repeated-string test data. // Default preset (ReferenceHandling=OnlyId + StringInterning) → _All_True graph. // Phase 2 variant-dispatch rule: any options preset with a "true"-flagged feature uses // the _All_True family (rich graph, opt-out AcBinarySerializable attribute matches). 
MakeAcBinary(testData, defaultOptions, "Default"), //MakeAcBinary(testData, binaryDefaultNoSgenOption, "Default"), //MakeAcBinary(testData, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"), //MakeAcBinary(testData, binaryNoInternOption, "NoIntern"), // AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario) MakeAcBinaryBufferWriter(testData, binaryFastModeOption, "FastMode"), // AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario). // 4 KB chunk size from binaryFastModeBufWrChunk — minimises the per-call ArrayBufferWriter // allocation. Optimum for this scenario. MakeAcBinaryFreshBufferWriter(testData, binaryFastModeBufWrChunk, "FastMode (4KB)"), // AcBinary chunked-streaming over an IN-MEMORY Pipe (no kernel transport). Side-by-side with the // Byte[] / IBufferWriter rows above this shows the chunked-streaming framework's pure CPU cost // (no NamedPipe loopback noise) vs the simpler in-process serialize-then-deserialize patterns. // The IO column shows "Pipe(in-mem)" — distinct from the NamedPipe AsyncPipe rows in [P] mode. MakeAcBinaryInMemoryPipe(testData, binaryFastModePipeChunkInMem, "FastMode (PipeChunk)"), // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport, no kernel, no Pipe). Apples-to- // apples baseline for the in-memory chunked row above: same in-memory pattern, but raw byte[] vs // chunked-streaming wire format. The IO column shows "Bytes(in-mem)". MakeAcBinaryInMemoryRaw(testData, binaryFastModePipeChunkInMem, "FastMode (PipeRaw)"), // AsyncPipe streaming over kernel NamedPipe (AcBinaryNamedPipeBenchmark) is intentionally OMITTED // here — run it via the dedicated AsyncPipe menu [P] / CLI mode for isolated kernel-transport // measurements. 
// ============================================================ // MemoryPack — three I/O modes for apples-to-apples comparison // ============================================================ // MemPack uses _All_False (see FastestByte-mode comment above for rationale). new MemoryPackBenchmark(orderFalse, Configuration.SelectedWireMode, "Default"), new MemoryPackBufferWriterBenchmark(orderFalse, Configuration.SelectedWireMode, "Default"), new MemoryPackFreshBufferWriterBenchmark(orderFalse, Configuration.SelectedWireMode, "Default"), // ============================================================ // MessagePack — for legacy comparison // ============================================================ #if !AYCODE_NATIVEAOT // MessagePack v3's DynamicGenericResolver uses Activator.CreateInstance on trimmed // ListFormatter et al. — fails under NativeAOT publish with "No parameterless constructor". // Excluded from the AOT build; available for regular JIT runs only. new MessagePackBenchmark(orderFalse, "ContractBased"), #endif // System.Text.Json (commented — JSON serializer for reference; not in active suite) //new SystemTextJsonBenchmark(orderFalse, "Default") }; } /// /// Forces a full GC cycle at a phase boundary in the benchmark loop. Two-pass collect with finalizer drain /// in between: the first pass moves managed garbage to the finalization queue, WaitForPendingFinalizers /// runs the finalizers, the second pass reclaims any objects the finalizers released. After this returns the /// heap is in a known-quiescent state — the next warmup/measurement phase starts on a clean slate, isolated /// from the previous phase's residual allocations (write-buffer pools, intern cache, write-plan arrays, etc.). /// Called between every Ser-phase / Des-phase boundary in . 
///
    [MethodImpl(MethodImplOptions.NoInlining)]
    internal static void ForceGcCollect()
    {
        // Collect, drain the finalizer queue, then collect again so that anything released by
        // the finalizers is reclaimed before the next phase starts.
        GC.Collect(2, GCCollectionMode.Forced, blocking: true);
        GC.WaitForPendingFinalizers();
        GC.Collect(2, GCCollectionMode.Forced, blocking: true);
    }

    /// <summary>
    /// Runs <paramref name="action"/> <paramref name="iterations"/> times per sample for
    /// <c>Configuration.BenchmarkSamples</c> independent samples and returns the (trimmed) median, min, max
    /// and population standard deviation of the per-sample elapsed time. Multi-sample design reduces
    /// single-run variance from ~±15% to ~±5% by smoothing transient effects (background activity,
    /// thermal/turbo state).
    /// When <c>Configuration.BenchmarkSamples</c> &lt;= 1, falls back to single-sample timing (Debug / quick mode).
    /// When <paramref name="progressLabel"/> is non-null, emits in-place \r progress updates so a
    /// stuck benchmark (e.g. deadlocked NamedPipe row) is visibly stuck at a specific % rather than
    /// silently hanging.
    ///
    /// Stabilization (added 2026-05-07):
    ///   1) Pilot sample is run BEFORE the recorded loop and discarded. The first measurement after
    ///      warmup tends to absorb residual JIT bookkeeping and GC bookkeeping; dropping it tightens
    ///      the min/max range without throwing away signal.
    ///   2) <see cref="ForceGcCollect"/> runs BEFORE every recorded sample (outside the Stopwatch window).
    ///      Without this, GC pressure from sample N occasionally triggered a Gen-2 pause inside
    ///      sample N+1, painting it as an outlier; collecting up-front gives every sample the
    ///      same starting heap shape.
    ///   3) Returns (median, min, max, stdDev) so the caller can surface the inter-sample range —
    ///      visible noise floor for the row, replacing the previous "median only" view.
    /// </summary>
    /// <param name="action">The operation to time.</param>
    /// <param name="iterations">Number of <paramref name="action"/> invocations per sample.</param>
    /// <param name="progressLabel">Label for the in-place progress line; <c>null</c> disables progress output.</param>
    /// <returns>
    /// Median, minimum, maximum and population standard deviation of the per-sample wall-clock time in
    /// milliseconds. In single-sample mode all three timings are the same value and the deviation is 0.
    /// </returns>
    internal static (double medianMs, double minMs, double maxMs, double stdDevMs) RunTimed(Action action, int iterations, string? progressLabel = null)
    {
        var samples = Configuration.BenchmarkSamples;
        if (samples <= 1)
        {
            // Single-sample fast path (Debug or trivial run) — no allocation, no sort, no stddev.
            var sw = Stopwatch.StartNew();
            RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
            sw.Stop();
            var ms = sw.Elapsed.TotalMilliseconds;
            EndProgress(progressLabel, ms);
            return (ms, ms, ms, 0);
        }

        // Pilot sample (discarded, so it is not timed). Counts as sample index 0 of (samples + 1) for
        // progress display so the user sees an extra "warmup-ish" tick before the recorded samples start.
        ForceGcCollect();
        RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: 0);

        var times = new double[samples];
        for (var s = 0; s < samples; s++)
        {
            // Per-sample GC settle. Forces every sample to start from the same heap state, so
            // a Gen-2 pause caused by the previous sample doesn't bleed into the next sample's
            // timing. Cost is paid OUTSIDE the Stopwatch window — no impact on the measurement.
            ForceGcCollect();

            // Inter-sample thermal-settle: CPU boost-clock can drop mid-batch under sustained load
            // (e.g. 10×250ms = 2.5 sec burst). InterSampleSettleMs lets the boost-clock state
            // settle so later samples don't read systematically slower than early ones. Skip before
            // the first sample (no prior heat to settle from). Set to 0 in Configuration to disable.
            if (s > 0 && Configuration.InterSampleSettleMs > 0)
                Thread.Sleep(Configuration.InterSampleSettleMs);

            var sw = Stopwatch.StartNew();
            RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: s + 1);
            sw.Stop();
            times[s] = sw.Elapsed.TotalMilliseconds;
        }

        var summary = SummarizeSamples(times);
        EndProgress(progressLabel, summary.medianMs);
        return summary;
    }

    /// <summary>
    /// Reduces the recorded per-sample timings to (trimmed median, min, max, population stddev).
    /// Sorts <paramref name="times"/> in place; the array must contain at least one element.
    /// </summary>
    private static (double medianMs, double minMs, double maxMs, double stdDevMs) SummarizeSamples(double[] times)
    {
        var count = times.Length;

        var minMs = double.MaxValue;
        var maxMs = double.MinValue;
        var sum = 0.0;
        foreach (var t in times)
        {
            sum += t;
            if (t < minMs) minMs = t;
            if (t > maxMs) maxMs = t;
        }

        // Population stddev (not sample-stddev — the captured samples are treated as the population
        // for CV computation). Two-pass form: summing squared deviations from the mean cannot go
        // negative and avoids the cancellation of E[X²] - E[X]² when samples are nearly identical.
        var mean = sum / count;
        var squaredDeviationSum = 0.0;
        foreach (var t in times)
        {
            var deviation = t - mean;
            squaredDeviationSum += deviation * deviation;
        }
        var stdDevMs = Math.Sqrt(squaredDeviationSum / count);

        Array.Sort(times);

        // Trimmed median: with 4 or more samples, drop the single min and single max (sorted-array
        // first and last) and take the median of the remaining (count - 2) entries. Removes the
        // worst per-sample contamination (a thermal spike, OS preempt, or a GC pause that escaped
        // the per-sample settle) without throwing away too much signal. The min/max/stdDev outputs
        // still reflect the FULL sample population — the trim affects only the headline median.
        var trimStart = count >= 4 ? 1 : 0;
        var trimCount = count >= 4 ? count - 2 : count;
        var mid = trimStart + trimCount / 2;
        var medianMs = trimCount % 2 == 1
            ? times[mid]
            : (times[mid - 1] + times[mid]) / 2.0;

        return (medianMs, minMs, maxMs, stdDevMs);
    }

    ///
    /// Per-cell adaptive iteration calibration. Runs a 100-iter measurement after warmup and computes
    /// how many iterations are needed to reach targetMs wall-clock per sample.
    /// Returns iter rounded UP to the nearest 1000, floored at 1000 (the prior fixed minimum) and
    /// ceiling-capped at 200_000 (sanity bound for pathologically fast ops). In Debug single-sample mode
    /// (Configuration.BenchmarkSamples &lt;= 1) returns the global Configuration.TestIterations unchanged —
    /// calibration overhead is unjustified there.
/// Calibration runs OUTSIDE the timed sample loop and
    /// does NOT count toward warmup; its sole purpose is to measure per-op cost.
    ///
    /// <param name="action">The operation whose per-call cost is measured.</param>
    /// <param name="targetMs">Desired wall-clock duration of one sample, in milliseconds.</param>
    /// <returns>An iteration count that is a multiple of 1000 in the range [1000, 200_000], or
    /// Configuration.TestIterations in single-sample mode.</returns>
    internal static int CalibrateIterations(Action action, int targetMs)
    {
        const int calibIter = 100;
        const int minIterations = 1000;
        const int maxIterations = 200_000;

        if (Configuration.BenchmarkSamples <= 1) return Configuration.TestIterations; // Debug fast path

        ForceGcCollect();

        var sw = Stopwatch.StartNew();
        for (var i = 0; i < calibIter; i++) action();
        sw.Stop();
        var ms = sw.Elapsed.TotalMilliseconds;

        // Pathologically-fast op below Stopwatch resolution — cap at ceiling (further calibration won't help).
        if (ms <= 0.0001) return maxIterations;

        var iterPerMs = calibIter / ms;
        var raw = Math.Ceiling(targetMs * iterPerMs);

        // Clamp while still in double space: a very fast op combined with a large targetMs can push
        // the estimate past int.MaxValue, and converting such a value to int is not well-defined.
        if (raw >= maxIterations) return maxIterations;
        if (raw <= minIterations) return minIterations;

        // Round UP to nearest 1000 — keeps numbers human-readable in the markdown output.
        // raw is strictly between the two bounds here, so the result stays within [2000, 200_000].
        return ((int)raw + 999) / 1000 * 1000;
    }

    /// <summary>
    /// Measures per-call allocation in bytes after a clean GC. Single dedicated sample (no median) — keeps
    /// timing samples pure. When <paramref name="processWide"/> is true, uses
    /// <c>GC.GetTotalAllocatedBytes(precise: true)</c> instead of <c>GC.GetAllocatedBytesForCurrentThread()</c>
    /// — needed for round-trip-only benchmarks (NamedPipe etc.) where the work happens across multiple
    /// threads (server-side new byte[len] buffers, drain-pump-thread allocations). Per-thread mode
    /// is slightly cleaner for in-memory benchmarks; process-wide mode is slightly noisier (background
    /// threads / GC bookkeeping leak in) but over 1000 iterations the signal dominates.
    /// </summary>
    /// <param name="action">The operation whose allocations are measured.</param>
    /// <param name="iterations">Number of invocations the allocated-byte delta is averaged over.</param>
    /// <param name="progressLabel">Label for the in-place progress line; <c>null</c> disables progress output.</param>
    /// <param name="processWide">Selects the process-wide counter instead of the current-thread counter.</param>
    /// <returns>Allocated bytes per call (integer division), or 0 when <paramref name="iterations"/> is not positive.</returns>
    internal static long MeasureAllocation(Action action, int iterations, string? progressLabel = null, bool processWide = false)
    {
        // Nothing would run, and the average below would divide by zero.
        if (iterations <= 0) return 0;

        ForceGcCollect();

        var sw = Stopwatch.StartNew();
        var before = processWide ? GC.GetTotalAllocatedBytes(precise: true) : GC.GetAllocatedBytesForCurrentThread();
        RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
        var after = processWide ? GC.GetTotalAllocatedBytes(precise: true) : GC.GetAllocatedBytesForCurrentThread();
        sw.Stop();

        EndProgress(progressLabel, sw.Elapsed.TotalMilliseconds);
        return (after - before) / iterations;
    }

    // ============================================================================================
    // Progress reporting — \r-driven in-place updates so a stuck benchmark surfaces the exact phase
    // and % where it stopped, instead of appearing as a silent hang. Used by RunTimed and the
    // MeasureAllocation* helpers when the caller passes a non-null progressLabel.
    // ============================================================================================

    // Tracks the length of the most recent progress line, so the next write (and EndProgress) can
    // blank out leftover characters from a prior longer line (avoids "ghost" trailing chars after \r).
    private static int _progressLastLineLen;

    /// <summary>
    /// Runs <paramref name="action"/> <paramref name="iterations"/> times, emitting \r-overwriting
    /// progress every ~10% (approx. 10 progress prints per sample). When <paramref name="label"/>
    /// is null, runs without any progress output (a single null check per call, then a bare loop).
    /// </summary>
    /// <param name="action">The operation to invoke.</param>
    /// <param name="iterations">Total number of invocations.</param>
    /// <param name="label">Progress label, or <c>null</c> for silent execution.</param>
    /// <param name="samples">Total sample count shown in the progress line; values &lt;= 1 omit the sample counter.</param>
    /// <param name="sampleIndex">Zero-based index of the current sample (displayed one-based).</param>
    private static void RunWithProgress(Action action, int iterations, string? label, int samples, int sampleIndex)
    {
        if (label is null)
        {
            for (var i = 0; i < iterations; i++) action();
            return;
        }

        // Batch-based progress emit — ~10 progress prints per sample. The inner loop has no per-iter
        // modulo / progress check, so the per-iter overhead is bare `action()` cost. The outer loop
        // drives the batches; progress emit happens once per batch on the boundary. This keeps sub-µs
        // ops cleanly measurable — the prior `if ((i + 1) % step == 0)` check added a per-iter branch
        // that distorted hot loops near the Stopwatch resolution.
        var step = Math.Max(1, iterations / 10);
        var done = 0;
        while (done < iterations)
        {
            // The final batch may be shorter when iterations is not a multiple of step.
            var batch = Math.Min(step, iterations - done);

            // Inner tight loop: no progress check, no modulo. Just the measured action() calls.
            for (var i = 0; i < batch; i++) action();
            done += batch;

            // 100L forces 64-bit arithmetic so done * 100 cannot overflow int.
            var pct = (int)(done * 100L / iterations);
            var line = samples > 1
                ? $" > {label} sample {sampleIndex + 1}/{samples} {pct,3}% ({done}/{iterations})"
                : $" > {label} {pct,3}% ({done}/{iterations})";

            System.Console.Write('\r');
            System.Console.Write(line);
            // Pad with spaces when the new line is shorter than the previous one.
            if (line.Length < _progressLastLineLen)
                System.Console.Write(new string(' ', _progressLastLineLen - line.Length));
            _progressLastLineLen = line.Length;
        }
    }

    /// <summary>
    /// Closes a progress line cleanly: clears any leftover chars and writes a final "done" line on
    /// the same row, terminated by \n so subsequent WriteLine calls render below.
    /// Does nothing when <paramref name="label"/> is null.
    /// </summary>
    /// <param name="label">Progress label used for the session, or <c>null</c> if progress was disabled.</param>
    /// <param name="elapsedMs">Elapsed time to report, in milliseconds.</param>
    private static void EndProgress(string? label, double elapsedMs)
    {
        if (label is null) return;

        var done = $" > {label} done in {elapsedMs,7:F1} ms";
        System.Console.Write('\r');
        System.Console.Write(done);
        if (done.Length < _progressLastLineLen)
            System.Console.Write(new string(' ', _progressLastLineLen - done.Length));
        System.Console.WriteLine();

        // Reset so the next progress session starts without stale padding.
        _progressLastLineLen = 0;
    }

    /// <summary>
    /// Validates MemoryPack setup at startup. Aborts the benchmark if TestOrder_All_True is not [MemoryPackable].
    /// Without this attribute, MemoryPack falls back to runtime resolver (slower) — comparison would be INVALID.
    /// On failure, writes the diagnostics to stderr and terminates the process with exit code 1.
    /// </summary>
    internal static void ValidateMemoryPackSetup()
    {
        var typesToCheck = new[]
        {
            typeof(TestOrder_All_True)
        };

        foreach (var type in typesToCheck)
        {
            // IsDefined answers the presence question without materializing attribute instances.
            if (type.IsDefined(typeof(MemoryPackableAttribute), inherit: true)) continue;

            System.Console.Error.WriteLine($"❌ FATAL: {type.FullName} is not [MemoryPackable] — MemoryPack would fall back to runtime resolver, comparison is INVALID for SGen-vs-SGen claim.");
            System.Console.Error.WriteLine("Add [MemoryPackable] to the type and any nested types referenced from it.");
            Environment.Exit(1);
        }
    }

    ///
    /// Filters test data sets by layer keyword. Layered approach lets you run only what's needed for the iteration cadence.
/// P1: only "Core" data exists (Small/Medium/Large/Repeated/Deep). Comprehensive and Edge layers will be expanded in P2.
    ///
    /// <param name="all">Every available test data set; never modified — a new list is always returned.</param>
    /// <param name="layer">Layer (or single-cell shortcut) selecting which data sets to keep.</param>
    /// <returns>A new list holding the data sets whose Name matches the requested layer.</returns>
    internal static List FilterByLayer(List all, BenchmarkLayer layer)
    {
        if (layer == BenchmarkLayer.All) return all.ToList();

        var coreNames = new[] { "Small", "Medium", "Large", "Repeated", "Deep" };
        // P2 will add: "Flat", "Polymorphic", "Collection", "Numeric", "NonAscii", etc.
        var comprehensiveExtras = new string[] { /* P2 */ };
        // P3 will add: "ColdStart", "VeryLarge", "PathologicalString", etc.
        var edgeExtras = new string[] { /* P3 */ };

        return layer switch
        {
            // Cumulative layers: each one includes everything from the layer before it.
            BenchmarkLayer.Core => all.Where(t => StartsWithAny(t.Name, coreNames)).ToList(),
            BenchmarkLayer.Comprehensive => all.Where(t => StartsWithAny(t.Name, coreNames) || StartsWithAny(t.Name, comprehensiveExtras)).ToList(),
            BenchmarkLayer.Edge => all.Where(t => StartsWithAny(t.Name, coreNames) || StartsWithAny(t.Name, comprehensiveExtras) || StartsWithAny(t.Name, edgeExtras)).ToList(),

            // Single-cell A/B mini-suite filters — match by case-insensitive prefix on Name.
            // Use case: tight optimization-iteration loop on one specific cell (e.g. `dotnet run -- repeated`
            // or interactive menu shortcut), avoiding the full ~110 sec suite when only one cell is in scope.
            BenchmarkLayer.Small => all.Where(t => t.Name.StartsWith("Small", StringComparison.OrdinalIgnoreCase)).ToList(),
            BenchmarkLayer.Medium => all.Where(t => t.Name.StartsWith("Medium", StringComparison.OrdinalIgnoreCase)).ToList(),
            BenchmarkLayer.Large => all.Where(t => t.Name.StartsWith("Large", StringComparison.OrdinalIgnoreCase)).ToList(),
            BenchmarkLayer.Repeated => all.Where(t => t.Name.StartsWith("Repeated", StringComparison.OrdinalIgnoreCase)).ToList(),
            BenchmarkLayer.Deep => all.Where(t => t.Name.StartsWith("Deep", StringComparison.OrdinalIgnoreCase)).ToList(),

            // Unrecognized layer values fall back to the unfiltered set.
            _ => all.ToList()
        };

        // Case-sensitive ordinal prefix test. The comparison is spelled out because the bare
        // StartsWith(string) overload compares using the current culture, which is not wanted
        // for fixed identifier prefixes.
        static bool StartsWithAny(string name, string[] prefixes)
        {
            foreach (var prefix in prefixes)
            {
                if (name.StartsWith(prefix, StringComparison.Ordinal)) return true;
            }
            return false;
        }
    }
}