using AyCode.Core.Compression; using AyCode.Core.Serializers.Attributes; using AyCode.Core.Serializers.Binaries; using AyCode.Core.Tests.Serialization; // DrainFromAsync extension (test-only, used by benchmark) using AyCode.Core.Tests.TestModels; using MemoryPack; #if !AYCODE_NATIVEAOT using MessagePack; using MessagePack.Resolvers; #endif using Microsoft.Extensions.Options; using System.Buffers; using System.Diagnostics; using System.IO.Pipelines; using System.IO.Pipes; using System.Reflection; using System.Runtime.CompilerServices; using System.Text; using System.Text.Json; namespace AyCode.Core.Serializers.Console; /// /// Comprehensive benchmark application for all serializers. /// Compares: AcBinary (all options), MemoryPack, MessagePack, Newtonsoft.Json, System.Text.Json /// /// Usage: /// dotnet run # Run all benchmarks /// dotnet run -- quick # Quick mode (fewer iterations) /// dotnet run -- serialize # Serialize only /// dotnet run -- deserialize # Deserialize only /// public static class Program { private const string ResultsDirectory = @"H:\Applications\Aycode\Source\AyCode.Core\Test_Benchmark_Results\Benchmark"; #if DEBUG private const string BuildConfiguration = "Debug"; #else private const string BuildConfiguration = "Release"; #endif #if DEBUG private static int WarmupIterations = 0; private static int TestIterations = 1; private static int BenchmarkSamples = 1; // Debug: single sample, fast iteration #else private static int WarmupIterations = 10000; //5000 private static int TestIterations = 1000; //1000 private static int BenchmarkSamples = 10; #endif // Interactive settings: selected AcBinary wire mode for benchmark runs. 
// 1 = Compact, 2 = Fast private static WireMode SelectedWireMode = WireMode.Compact; // Serializer name constants // Engine identifiers (used in Engine column + comparison logic) private const string EngineAcBinary = "AcBinary"; private const string EngineMemoryPack = "MemoryPack"; #if !AYCODE_NATIVEAOT private const string EngineMessagePack = "MessagePack"; #endif private const string EngineSystemTextJson = "System.Text.Json"; // IO mode identifiers (used in IO column + comparison logic) private const string IoByteArray = "Byte[]"; private const string IoBufWrReuse = "BufWr reuse"; private const string IoBufWrNew = "BufWr new"; private const string IoString = "String"; private const string IoNamedPipe = "NamedPipe"; private const string IoNamedPipeRaw = "NamedPipe"; private const string IoInMemoryPipe = "Pipe(in-mem)"; private const string IoInMemoryRaw = "Pipe(in-mem)"; // Single source of truth for the chunk size used by ALL pipe-related benchmarks (NamedPipe PipeChunk, // NamedPipe PipeRaw, in-memory Pipe, in-memory RawMem) AND the NamedPipe server's inBufferSize/outBufferSize. // Same value across both layers ensures apples-to-apples comparison: chunked-streaming chunk-on-wire size // matches the kernel pipe-buffer slot exactly. Tweak HERE when experimenting; do NOT scatter chunkSize // overrides across individual benchmark rows. private const int PipeChunkSize = 4096; // Dispatch mode identifiers — describes how property access / type dispatch happens for a given run. // SGen = compile-time source generator path (Unsafe.As direct fields, slot-array wrapper lookup). // Runtime= reflection / compiled-delegate path. // Hybrid = SGen root with non-SGen child types reached via bridge methods. See docs/BINARY/BINARY_SGEN.md. private const string ModeSGen = "SGen"; private const string ModeRuntime = "Runtime"; private const string ModeHybrid = "Hybrid"; // Per-cell adaptive iteration target wall-clock duration. 
Each Ser/Des function calibrates its // own iteration count post-warmup so the sample batch lands in this range — equalizes the // per-sample window across cells of vastly different per-op cost (Small ~6 ns/op vs Large // ~140 µs/op). Below ~100 ms Stopwatch precision and OS preempt spikes start to dominate. private const int TargetSampleMs = 250; // CV (coefficient of variation = stddev / mean) threshold above which a row's range is flagged // as "unstable" in the markdown output (⚠️ marker). 3% is a reasonable noise-floor expectation // for stabilized in-memory benchmarks; rows above it should be discounted when reading // sub-3% inter-engine deltas. private const double UnstableCVThreshold = 0.03; // JIT-tier-promotion drain delay between warmup and measurement. // - JIT mode (RuntimeFeature.IsDynamicCodeCompiled == true): tiered JIT promotes hot methods // in a background thread; we wait briefly for the queue to drain so the first measurement // sample doesn't catch a Tier-0 → Tier-1 transition mid-flight. // - AOT mode (NativeAOT publish): no dynamic compilation happens; the sleep is pure noise. // 250ms (vs the historical 3000ms) is sufficient for a few-method working set under .NET 9's // tiered JIT — empirically the queue drains in <100ms for the bench's hot path. private static int JitSleep => System.Runtime.CompilerServices.RuntimeFeature.IsDynamicCodeCompiled ? 250 : 0; // OptionsPreset values are passed per-instance (constructor argument), not constants — // each CreateSerializers call line specifies its own preset name (e.g. "FastMode", "NoIntern"). private static readonly UTF8Encoding Utf8NoBom = new(encoderShouldEmitUTF8Identifier: false); /// /// Aggregated feature flags across every type tagged with /// the attribute in the loaded assemblies. Cached on first access (single reflection scan at startup). 
/// Used by BuildAcBinaryOptionsDescription so the per-row Options column shows BOTH the
/// configured options-level value AND the effective attribute-level enable flag — a feature flagged
/// off at the type level overrides the options regardless of preset, and that asymmetry must surface
/// in the log to avoid misreading a "RefHandling=OnlyId" / "Interning=All" line as actually active.
/// Aggregation rule: if ALL tagged types have the feature enabled → true; if any tagged type
/// disables it → false (a single disabling type suppresses the feature on the type-graph).
/// When no tagged type is found at all, every flag is reported as false.
private static readonly (bool refHandling, bool internString, bool metadata, bool idTracking) _attrFlags = ScanAcBinaryAttributeFlags();

/// <summary>
/// Scans every assembly currently loaded in the AppDomain, collects the attribute instance from each
/// type that carries one, and AND-combines the four Enable*Feature flags across all of them.
/// </summary>
/// <returns>
/// One boolean per feature (ref handling, string interning, metadata, id tracking). Each is true only
/// when every tagged type enables it; all four are false when no tagged type exists.
/// </returns>
private static (bool refHandling, bool internString, bool metadata, bool idTracking) ScanAcBinaryAttributeFlags()
{
    var attrs = AppDomain.CurrentDomain.GetAssemblies()
        // An assembly whose type list cannot be enumerated contributes no types instead of aborting the scan.
        .SelectMany(a => { try { return a.GetTypes(); } catch { return Array.Empty(); } })
        .Select(t => t.GetCustomAttribute())
        // Keep only the types that actually carry the attribute.
        .Where(a => a != null)
        .ToList();

    // Nothing tagged: report every feature as disabled rather than letting All() return true on an empty list.
    if (attrs.Count == 0) return (false, false, false, false);

    // All(): a single tagged type with the flag off turns the aggregated flag off.
    return (
        refHandling: attrs.All(a => a!.EnableRefHandlingFeature),
        internString: attrs.All(a => a!.EnableInternStringFeature),
        metadata: attrs.All(a => a!.EnableMetadataFeature),
        idTracking: attrs.All(a => a!.EnableIdTrackingFeature));
}

/// Common Options-column formatter for every AcBinary serializer benchmark row. Renders the
/// configured options-level value AND the effective attribute-level enable flag side-by-side
/// (e.g. Interning=All(opt) | False (attr)) so attribute-suppressed features cannot
/// silently mislead. Pass any benchmark-specific extras (e.g. ", BufferSize=4096B")
/// in extra — they are appended after the common fields.
///
private static string BuildAcBinaryOptionsDescription(AcBinarySerializerOptions options, string extra = "")
{
    // Each feature is rendered as "<options value>(opt) | <aggregated attribute flag> (attr)".
    // WireMode, SGen and Compression have no attribute-level counterpart and are printed on their own.
    return $"WireMode={options.WireMode}, " +
           $"RefHandling={options.ReferenceHandling}(opt) | {_attrFlags.refHandling} (attr), " +
           $"Interning={options.UseStringInterning}(opt) | {_attrFlags.internString} (attr), " +
           $"Metadata={options.UseMetadata}(opt) | {_attrFlags.metadata} (attr), " +
           $"SGen={options.UseGeneratedCode}, " +
           $"Compression={options.UseCompression}{extra}";
}

/// <summary>
/// Returns MemoryPack serializer options aligned with SelectedWireMode for a fair
/// apples-to-apples wire-format comparison:
///   - WireMode.Fast → MemoryPackSerializerOptions.Utf16 (UTF-16 raw memcpy) — both engines write
///     UTF-16 raw bytes, so wire-size and CPU comparison reflect the same string-encoding family.
///   - any other wire mode → MemoryPackSerializerOptions.Default (UTF-8) — both engines encode UTF-8,
///     comparison is purely about header / tier / dispatch overhead.
/// Without this alignment the FastWire vs MemPack-default comparison conflates two unrelated dimensions
/// (UTF-16 raw vs UTF-8 encoded) and produces a misleading +40% wire-size delta that is structurally
/// the encoding-family difference, NOT an AcBinary-specific overhead.
/// </summary>
private static MemoryPackSerializerOptions GetMemPackOptions()
    => SelectedWireMode == WireMode.Fast
        ? MemoryPackSerializerOptions.Utf16
        : MemoryPackSerializerOptions.Default;

/// <summary>
/// Converts a total batch time (in ms across <paramref name="iterations"/> operations) into
/// per-operation microseconds. Formula: totalMs / iterations × 1000.
/// Per-op µs is the iter-independent unit: 1000 iter and 50000 iter of the same operation should
/// produce the same per-op µs (within noise). Necessary because per-cell adaptive iteration makes
/// iterations a per-row property — there is no single global iteration count to divide by.
/// Symmetric with the already-per-op *AllocBytesPerOp fields.
/// </summary>
/// <param name="totalMs">Wall-clock time of the whole batch, in milliseconds.</param>
/// <param name="iterations">Number of operations executed in that batch.</param>
/// <returns>Microseconds per operation, or 0 when <paramref name="iterations"/> is not positive.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static double ToPerOpMicros(double totalMs, int iterations)
    => iterations > 0 ? totalMs / iterations * 1000.0 : 0;

// Per-row per-op µs accessors — pull batch-time + iter from BenchmarkResult and convert. Used wherever
// averaging or comparison happens across rows with potentially different iter counts (Winners summary,
// Overall comparison, per-cell summary row). Keeping these as methods rather than properties on
// BenchmarkResult preserves the result-as-data-bag distinction.
private static double SerPerOp(BenchmarkResult r) => ToPerOpMicros(r.SerializeTimeMs, r.SerializeIterations);
private static double DesPerOp(BenchmarkResult r) => ToPerOpMicros(r.DeserializeTimeMs, r.DeserializeIterations);
private static double RtPerOp(BenchmarkResult r) => ToPerOpMicros(r.RoundTripTimeMs, r.RoundTripIterations);

/// <summary>
/// Per-cell-paired aggregation of an overall comparison. Captures three different aggregation
/// strategies so the reader can judge whether the headline delta is dominated by one large cell
/// (arithmetic mean) or representative of typical workload (geometric mean / median).
/// </summary>
/// <param name="ArithMeanPct">Arithmetic mean of µs/op — magnitude-weighted; biased toward Large cell.</param>
/// <param name="GeoMeanPct">Geometric mean of per-cell ratios — magnitude-neutral; each cell weighted equally.</param>
/// <param name="MedianPct">Median of per-cell ratios — outlier-resistant.</param>
/// <param name="AcAvg">Arithmetic mean AcBinary value (µs/op or bytes).</param>
/// <param name="MpAvg">Arithmetic mean MemPack value.</param>
/// <param name="CellCount">Number of paired cells contributing to the geo/median.</param>
private record OverallStats(double ArithMeanPct, double GeoMeanPct, double MedianPct, double AcAvg, double MpAvg, int CellCount);

/// <summary>
/// Computes arithmetic + geometric + median aggregation of an AcBinary-vs-MemPack comparison
/// across paired cells (joined by TestDataName). Per-cell pairing is required for the
/// geo/median variants — a cell where AcBinary or MemPack is missing, or where either value is
/// not strictly positive, is dropped from all stats.
/// </summary>
/// <param name="acResults">AcBinary rows to compare.</param>
/// <param name="mpResults">MemPack rows to compare against.</param>
/// <param name="getValue">Selects the compared metric from a row.</param>
/// <returns>
/// The aggregated percentages (value - 1, × 100, relative to MemPack) and averages, or null when
/// either input list is empty or no paired cell has a valid value.
/// </returns>
private static OverallStats? ComputeOverallStats(
    List acResults,
    List mpResults,
    Func getValue)
{
    if (acResults.Count == 0 || mpResults.Count == 0) return null;

    // Inner join on TestDataName; non-positive values are excluded so the ratio and its log stay defined.
    var pairs = (from ac in acResults
                 join mp in mpResults on ac.TestDataName equals mp.TestDataName
                 let acV = getValue(ac)
                 let mpV = getValue(mp)
                 where acV > 0 && mpV > 0
                 select (ac: acV, mp: mpV)).ToList();

    if (pairs.Count == 0) return null;

    var acAvg = pairs.Average(p => p.ac);
    var mpAvg = pairs.Average(p => p.mp);
    var ratios = pairs.Select(p => p.ac / p.mp).ToList();

    // Geometric mean: exp(avg(ln(ratios))) — numerically stable vs Π ratios then ^(1/N).
    var geoMean = Math.Exp(ratios.Sum(Math.Log) / ratios.Count);

    // Median (paired-ratio): for even N use the midpoint of the two middle values.
    var sorted = ratios.OrderBy(r => r).ToList();
    var median = sorted.Count % 2 == 1
        ? sorted[sorted.Count / 2]
        : (sorted[sorted.Count / 2 - 1] + sorted[sorted.Count / 2]) / 2.0;

    // Every percentage is expressed as "AcBinary relative to MemPack": 0 means parity, negative means AcBinary is lower.
    return new OverallStats(
        ArithMeanPct: (acAvg / mpAvg - 1) * 100,
        GeoMeanPct: (geoMean - 1) * 100,
        MedianPct: (median - 1) * 100,
        AcAvg: acAvg,
        MpAvg: mpAvg,
        CellCount: ratios.Count);
}

/// Formats a per-op micros value with its inter-sample range and CV-threshold marker as
/// "26.86 (24.50..29.10)" or "26.86 (24.50..29.10) ⚠️5.2%". Median first, range in parentheses,
/// CV warning suffix only when CV > UnstableCVThreshold. When min == max == median
/// (single-sample / Debug / quick mode), collapses to bare median to avoid visual clutter.
/// All time inputs are total-batch milliseconds; iterations is the per-row iter
/// count (post-adaptive-calibration). Values are rendered with two decimals using the supplied culture.
private static string FormatMicrosWithRange(double medianMs, double minMs, double maxMs, double stdDevMs, int iterations, System.Globalization.CultureInfo inv)
{
    var med = ToPerOpMicros(medianMs, iterations);
    var medText = med.ToString("F2", inv);

    // No range data (single-sample fast path) — surface as bare median, identical to the prior format.
    if (minMs <= 0 && maxMs <= 0) return medText;

    // min >= median >= max can only hold when all three coincide: nothing to show as a range.
    if (minMs >= medianMs && maxMs <= medianMs) return medText;

    var min = ToPerOpMicros(minMs, iterations);
    var max = ToPerOpMicros(maxMs, iterations);
    var range = $"{medText} ({min.ToString("F2", inv)}..{max.ToString("F2", inv)})";

    // CV (coefficient of variation) — computed here as stddev / median, the median standing in for the
    // mean. Rows above the unstable threshold are flagged so a small inter-engine delta on a high-CV
    // row is easy to discount as noise.
    if (medianMs > 0 && stdDevMs > 0)
    {
        var cv = stdDevMs / medianMs;
        if (cv > UnstableCVThreshold)
        {
            var cvPct = (cv * 100).ToString("F1", inv);
            return $"{range} ⚠️{cvPct}%";
        }
    }

    return range;
}

/// <summary>
/// Converts a byte count to KB (1 KB = 1024 B). Display-only helper so allocation columns can
/// render compact F2 KB values (e.g. 4.05 KB instead of 4,144 B) — header carries
/// the unit so per-row entries stay numbers-only. CSV / raw-data outputs keep the precise byte
/// integers untouched.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static double ToKilobytes(long bytes) => bytes / 1024.0;

/// <summary>
/// Entry point. With arguments: parses them, runs the benchmark once and exits. Without arguments:
/// loops the interactive menu until the user quits.
/// </summary>
public static void Main(string[] args)
{
    // Set console encoding to UTF-8 for proper Unicode character display
    System.Console.OutputEncoding = Encoding.UTF8;

    // Setup validation — abort BEFORE any benchmark logic if MemoryPack baseline is invalid.
    // Done early so user is told immediately, not after warmup.
    ValidateMemoryPackSetup();

    // CLI mode (args provided): run once, parse args, exit. Backward-compatible behaviour.
    if (args.Length > 0)
    {
        if (!TryParseCliArgs(args, out var layer, out var opMode, out var serializerMode))
            return; // profiler mode (already ran) or invalid args

        RunBenchmark(layer, opMode, serializerMode);
        return;
    }

    // Interactive mode (no args): loop the menu so the user doesn't have to restart between runs.
    // Q exits the menu (and the application).
    while (true)
    {
        var selection = ShowInteractiveMenu();
        if (selection == null) return; // user pressed Q

        RunBenchmark(selection.Value.layer, "all", selection.Value.serializerMode);

        System.Console.WriteLine();
        System.Console.WriteLine("─────────────────────────────────────────────────────────────────────");
        System.Console.WriteLine("Returning to menu — press any key to continue, or Q to quit...");
        var key = System.Console.ReadKey(intercept: true);
        if (key.Key == ConsoleKey.Q) return;
        System.Console.WriteLine();
    }
}

/// <summary>
/// Parses CLI arguments into (layer, opMode, serializerMode). Only the first argument is inspected.
/// Returns false if the args indicate a special mode that has already been handled (e.g. profiler);
/// the caller should then exit without running the standard benchmark.
/// </summary>
/// <param name="args">Command-line arguments; the caller guarantees at least one element.</param>
/// <param name="layer">Test-data layer filter; defaults to "all".</param>
/// <param name="opMode">"all", "serialize" or "deserialize"; defaults to "all".</param>
/// <param name="serializerMode">Serializer set to run; defaults to "standard".</param>
private static bool TryParseCliArgs(string[] args, out string layer, out string opMode, out string serializerMode)
{
    layer = "all";
    opMode = "all";
    serializerMode = "standard";

    // Invariant lowering: keywords are ASCII identifiers, and culture-sensitive lowering would turn
    // "QUICK" into "quıck" under a Turkish culture and miss the match.
    var arg = args[0].ToLowerInvariant();

    switch (arg)
    {
        // Profiler mode: runs its own fixed sequence, then the caller exits (for memory profiler analysis)
        case "profiler":
            RunProfilerMode();
            return false;

        // Quick mode: short warmup, few iterations, small sample count; layer stays "all".
        case "quick":
            WarmupIterations = 5;
            TestIterations = 100;
            BenchmarkSamples = 3;
            break;

        case "core" or "comprehensive" or "edge" or "all" or "small" or "medium" or "large" or "repeated" or "deep":
            layer = arg;
            break;

        // AsyncPipe-only mode: streaming I/O isolation across all test data.
        case "asyncpipe" or "pipe":
            serializerMode = "asyncpipe";
            break;

        case "ser" or "serialize":
            opMode = "serialize";
            break;

        case "des" or "deserialize":
            opMode = "deserialize";
            break;

        // Backwards compat: unknown arg → treat as layer keyword
        default:
            layer = arg;
            break;
    }

    return true;
}

/// <summary>
/// Runs the benchmark suite end-to-end for the given configuration: pre-warmup → per-cell warmup
/// + measurement → grouped results print → save to disk. Used by both the CLI and interactive
/// menu paths; the interactive loop calls this repeatedly without restarting the process.
/// </summary>
private static void RunBenchmark(string layer, string opMode, string serializerMode)
{
    System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗");
    System.Console.WriteLine("║ COMPREHENSIVE SERIALIZER BENCHMARK SUITE ║");
    System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");

    // Stabilization: pin the entire benchmark process to a single logical CPU and bump priority
    // class. Single-core affinity stops the OS from migrating the bench thread between cores
    // mid-sample (a migration evicts L1/L2 caches and corrupts a measurement); High priority
    // reduces preemption by background tasks (Defender scans, indexer, etc.) that otherwise
    // randomly inflate samples by 5-15%.
    // Try/finally guarantees the original state is restored even if a benchmark throws — leaving
    // a developer machine pinned to one core after a crashed run is a real foot-gun.
    // Skipped on Debug single-sample mode (BenchmarkSamples <= 1) where stabilization is moot.
    // The Process object wraps an OS handle; dispose it when this run ends so repeated interactive
    // runs do not accumulate handles.
    using var process = Process.GetCurrentProcess();
    var origAffinity = IntPtr.Zero;
    var origPriority = ProcessPriorityClass.Normal;

    // Tracked per step: each setter can fail independently, and only what actually changed must be undone.
    var affinityChanged = false;
    var priorityChanged = false;

    // ProcessorAffinity is only supported on Windows + Linux (CA1416). On every other platform the whole
    // stabilization step (affinity AND priority) is skipped and the run uses default scheduling.
    if (BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
    {
        try
        {
            origAffinity = process.ProcessorAffinity;
            origPriority = process.PriorityClass;

            // Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one
            // core, consistently" — not which one. If CPU 0 is heavily contended on the host
            // (e.g. dedicated to system-wide IRQs on some Windows configs), the user can tweak
            // the mask here. The benchmark is single-threaded for the in-memory rows so single
            // core is sufficient; round-trip-only NamedPipe rows have a server-drain thread
            // that will share the core (acceptable — the bench measures end-to-end RT anyway).
            process.ProcessorAffinity = (IntPtr)1;
            affinityChanged = true;

            process.PriorityClass = ProcessPriorityClass.High;
            priorityChanged = true;

            System.Console.WriteLine("Stabilization: pinned to CPU 0 (affinity=0x1), priority=High.");
        }
        catch (Exception ex)
        {
            // Affinity/priority changes may fail on locked-down hosts (group policies, containers
            // without CAP_SYS_NICE on Linux, etc.). If the affinity was already applied before the
            // failure, undo it now so "SKIPPED" is true for the whole run, then continue with the
            // platform default scheduling.
            if (affinityChanged)
            {
                try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
                affinityChanged = false;
            }

            System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}");
        }
    }

    try
    {
        var allResults = new List();
        var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
        var testDataSets = FilterByLayer(allTestDataSets, layer);

        System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{TargetSampleMs} ms target) | Warmup: {WarmupIterations} | Samples: {BenchmarkSamples} (median) + pilot discard");
        System.Console.WriteLine($"Build: {BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
        System.Console.WriteLine();

        // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens.
        // Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup
        // alone doesn't ensure that every Serialize/IBufferWriter overload is fully Tier 1 by the time we
        // start measuring. Symptom: first cell's BufferWriter variants run ~2x slower than the SAME variants
        // on later cells (e.g. Small BufWr reuse 9ms vs Medium BufWr reuse 4ms — even though Medium is bigger).
        // Pre-warmup runs every overload with each data shape so the tiered JIT promotes them all in the
        // background; the per-cell warmup that follows then locks in cache + branch state.
        if (BenchmarkSamples > 1) // skip in DEBUG (single-sample fast iteration)
        {
            System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");
            foreach (var testData in testDataSets)
            {
                var preSerializers = CreateSerializers(testData, serializerMode);
                try
                {
                    foreach (var s in preSerializers)
                    {
                        // Light warmup just to trigger Tier 0 → Tier 1 promotion. The per-serializer
                        // WarmupIterations warmup inside RunBenchmarksForTestData still runs afterwards
                        // for cache/BTB warming.
                        s.Warmup(2000);
                    }
                }
                finally
                {
                    // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
                    foreach (var s in preSerializers) (s as IDisposable)?.Dispose();
                }
            }

            // Let background tiered-JIT compilation drain before we begin measuring.
            if (JitSleep > 0) Thread.Sleep(JitSleep);
            System.Console.WriteLine("✓ Global pre-warmup complete.\n");
        }

        // 70-character horizontal rule framing each test-data header.
        var rule = new string('═', 70);
        foreach (var testData in testDataSets)
        {
            System.Console.WriteLine($"\n{rule}");
            System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
            System.Console.WriteLine($"{rule}");

            var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
            allResults.AddRange(results);
        }

        // Print grouped results
        PrintGroupedResults(allResults, testDataSets);

        // Save results to file
        SaveResults(allResults, testDataSets);

        System.Console.WriteLine("\n✓ Benchmark complete!");
    }
    finally
    {
        // Restore process state — affinity/priority changes are process-wide and persist across
        // interactive-mode iterations of the menu. Without restore, the second menu run would
        // already be on CPU-0 + High priority before its own try-block applied them, masking
        // any stabilization-disabled comparison.
        if (affinityChanged && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
        {
            try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
        }

        if (priorityChanged)
        {
            try { process.PriorityClass = origPriority; } catch { /* best-effort */ }
        }
    }
}

///
/// Profiler mode: warms up the AcBinary paths, runs the hot-path loops, then waits for a key press before returning.
/// Usage: dotnet run -- profiler
/// Sequence: serialize once → 1000 warmup round-trips → 2 s pause → 1000 serialize calls →
/// 1000 deserialize calls → wait for a key press.
private static void RunProfilerMode()
{
    System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗");
    System.Console.WriteLine("║ PROFILER MODE (AcBinary only) ║");
    System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");
    System.Console.WriteLine($"Build: {BuildConfiguration} | .NET: {Environment.Version}");
    System.Console.WriteLine();

    var order = BenchmarkTestDataProvider.CreateProfilerOrder();
    var options = AcBinarySerializerOptions.WithoutReferenceHandling;
    options.UseStringInterning = StringInterningMode.None;

    // Payload reused by every deserialize call below.
    var bytes = AcBinarySerializer.Serialize(order, options);

    // Warmup (fills caches)
    System.Console.WriteLine("Warming up (1000 iterations)...");
    for (var i = 0; i < 1000; i++)
    {
        _ = AcBinarySerializer.Serialize(order, options);
        _ = AcBinaryDeserializer.Deserialize(bytes);
    }

    Thread.Sleep(2000);
    System.Console.WriteLine("Warmup complete. Caches are now populated.");
    System.Console.WriteLine();

    // HOT PATH - this is what the profiler should capture! Serialize and deserialize run as two
    // separate loops so each shows up as its own phase.
    System.Console.WriteLine("Running hot path serialization (1000 iterations for profiling)...");
    for (var i = 0; i < 1000; i++)
    {
        _ = AcBinarySerializer.Serialize(order, options);
    }

    System.Console.WriteLine("Running hot path deserialization (1000 iterations for profiling)...");
    for (var i = 0; i < 1000; i++)
    {
        _ = AcBinaryDeserializer.Deserialize(bytes);
    }

    System.Console.WriteLine("Hot path complete.");
    System.Console.WriteLine();
    System.Console.WriteLine(">>> ATTACH MEMORY PROFILER NOW <<<");
    System.Console.WriteLine("Press any key to exit...");
    System.Console.ReadKey(intercept: true);
    System.Console.WriteLine();
    System.Console.WriteLine("✓ Profiler mode complete. Exiting now.");
}

#region Benchmark Execution

/// <summary>
/// Benchmarks every serializer of the given serializer mode against one test-data cell:
/// round-trip verification → per-serializer warmup → iteration calibration → timed samples +
/// allocation measurement. Each result is printed as soon as it is complete.
/// The process is terminated with exit code 1 if any serializer fails round-trip verification.
/// </summary>
/// <param name="testData">The test-data cell to benchmark.</param>
/// <param name="mode">"all", "serialize"/"ser" or "deserialize"/"des" — selects which operations are measured.</param>
/// <param name="serializerMode">Serializer set passed through to CreateSerializers.</param>
/// <returns>One result row per serializer, in creation order.</returns>
private static List RunBenchmarksForTestData(TestDataSet testData, string mode, string serializerMode)
{
    var results = new List();
    var serializers = CreateSerializers(testData, serializerMode);

    var measureSerialize = mode is "all" or "serialize" or "ser";
    var measureDeserialize = mode is "all" or "deserialize" or "des";

    try
    {
        // Round-trip correctness check — once per (cell × serializer), BEFORE warmup. Aborts the entire benchmark on failure.
        System.Console.WriteLine("Verifying round-trip correctness...");
        foreach (var serializer in serializers)
        {
            if (!serializer.VerifyRoundTrip())
            {
                System.Console.Error.WriteLine($"❌ FATAL: Round-trip verification FAILED for {serializer.Name} on {testData.DisplayName}");
                System.Console.Error.WriteLine("Benchmark numbers from a serializer with broken round-trip would be meaningless. Aborting.");
                Environment.Exit(1);
            }
        }

        System.Console.WriteLine("✓ All serializers passed round-trip verification.");

        // Per-serializer (warmup → calibrate → measurement) cycle: each serializer warms up IMMEDIATELY
        // before its own bench, then calibrates iter per-function (Ser and Des independently) so each
        // sample lands at ~TargetSampleMs wall-clock. This avoids cache pollution AND equalizes sample
        // window length across cells of vastly different per-op cost.
        System.Console.WriteLine($"Running benchmarks (target ~{TargetSampleMs} ms/sample × {BenchmarkSamples} samples median, per-serializer warmup + adaptive iter)...\n");

        foreach (var serializer in serializers)
        {
            // Warmup THIS serializer right before benching it — keeps its hot code/data in cache.
            serializer.Warmup(WarmupIterations);

            // Wait for tiered JIT background compilation to drain (mode-aware: 0ms in AOT).
            // Per-serializer instead of once globally — guarantees this serializer's freshly-promoted
            // methods are settled before timing, regardless of when it appears in the iteration order.
            if (JitSleep > 0) Thread.Sleep(JitSleep);

            // Adaptive iter calibration — per Ser/Des/RT function, post-warmup. Each function gets its
            // own iter count tuned to TargetSampleMs. The calibration cost is amortized over the
            // BenchmarkSamples + 1 (pilot) recorded measurements that follow. Functions that are not
            // measured in this mode keep the TestIterations default.
            int serIter = TestIterations, desIter = TestIterations, rtIter = TestIterations;
            if (serializer.IsRoundTripOnly)
            {
                if (measureSerialize) rtIter = CalibrateIterations(() => serializer.Serialize(), TargetSampleMs);
            }
            else
            {
                if (measureSerialize) serIter = CalibrateIterations(() => serializer.Serialize(), TargetSampleMs);
                if (measureDeserialize) desIter = CalibrateIterations(() => serializer.Deserialize(), TargetSampleMs);
            }

            var result = new BenchmarkResult
            {
                TestDataName = testData.DisplayName, // Use DisplayName for IId% info
                Engine = serializer.Engine,
                IoMode = serializer.IoMode,
                DispatchMode = serializer.DispatchMode,
                OptionsPreset = serializer.OptionsPreset,
                OptionsDescription = serializer.OptionsDescription,
                SerializedSize = serializer.SerializedSize,
                SetupSerializeAllocBytes = serializer.SetupSerializeAllocBytes,
                SetupDeserializeAllocBytes = serializer.SetupDeserializeAllocBytes,
                IsRoundTripOnly = serializer.IsRoundTripOnly
            };

            // Group label for in-place \r progress. Identifies (cell × serializer) so a stuck benchmark
            // is visibly stuck on a specific row at a specific % rather than silently hanging.
            var groupLabel = $"{result.SerializerName}";

            if (serializer.IsRoundTripOnly)
            {
                // Round-trip-only benchmarks (NamedPipe etc.): measure the full pipe round-trip directly into the RT
                // columns. Ser ms / SerAlloc / Des ms / DesAlloc stay 0 → display as "N/A". Allocation uses the
                // process-wide measurement so the server-drain-thread allocations (e.g. server-side new byte[len])
                // also show up — otherwise current-thread alloc would only count the client side and look ~halved.
                // mode == "deserialize" alone is meaningless for a round-trip-only benchmark; skip silently.
                if (measureSerialize)
                {
                    var (rtMed, rtMin, rtMax, rtStd) = RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");
                    result.RoundTripTimeMs = rtMed;
                    result.RoundTripTimeMinMs = rtMin;
                    result.RoundTripTimeMaxMs = rtMax;
                    result.RoundTripTimeStdDevMs = rtStd;
                    result.RoundTripIterations = rtIter;
                    result.RoundTripAllocBytesPerOp = MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]");
                }
            }
            else
            {
                if (measureSerialize)
                {
                    var (serMed, serMin, serMax, serStd) = RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");
                    result.SerializeTimeMs = serMed;
                    result.SerializeTimeMinMs = serMin;
                    result.SerializeTimeMaxMs = serMax;
                    result.SerializeTimeStdDevMs = serStd;
                    result.SerializeIterations = serIter;

                    // Dedicated alloc-only sample (separate from timing samples; keeps timing pure)
                    result.SerializeAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
                }

                if (measureDeserialize)
                {
                    var (desMed, desMin, desMax, desStd) = RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");
                    result.DeserializeTimeMs = desMed;
                    result.DeserializeTimeMinMs = desMin;
                    result.DeserializeTimeMaxMs = desMax;
                    result.DeserializeTimeStdDevMs = desStd;
                    result.DeserializeIterations = desIter;
                    result.DeserializeAllocBytesPerOp = MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]");
                }

                // Compose RT from Ser+Des. Because Ser and Des may have DIFFERENT iter counts post-calibration,
                // batch-time addition would be misleading. Instead: compute per-op µs (iter-independent),
                // then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that
                // RoundTripTimeMs / RoundTripIterations * 1000 == SerPerOp + DesPerOp.
                var serPerOp = ToPerOpMicros(result.SerializeTimeMs, serIter);
                var desPerOp = ToPerOpMicros(result.DeserializeTimeMs, desIter);
                var rtPerOp = serPerOp + desPerOp;
                result.RoundTripIterations = Math.Max(serIter, desIter);
                result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
                result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
            }

            results.Add(result);
            PrintResult(result);
        }
    }
    finally
    {
        // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources that must be released
        // before the next test data builds new ones — otherwise pipes / handles leak across test cells).
        // In a finally block so a throwing warmup or measurement cannot leave them open.
        foreach (var s in serializers) (s as IDisposable)?.Dispose();
    }

    return results;
}

private static List CreateSerializers(TestDataSet testData, string serializerMode)
{
    // FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path.
    // TWO benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + MemoryPack Byte[].
    // - Compact: smallest wire, UTF-8 encode/decode CPU cost vs MemPack head-to-head.
    // Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min.
    //
    // FastWire row (UTF-16 raw memcpy) commented out for the current optimization sprint —
    // we are tuning Compact mode against MemPack directly; FastWire was used as a noise-floor
    // reference earlier. Re-enable when revisiting Fast wire-mode performance.
    if (serializerMode == "fastestbyte")
    {
        var fastestByteOptions = AcBinarySerializerOptions.FastMode;
        fastestByteOptions.WireMode = SelectedWireMode;
        return new List
        {
            new AcBinaryBenchmark(testData.Order, fastestByteOptions, "FastMode"),
            //new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"),
            new MemoryPackBenchmark(testData.Order, "Default"),
        };
    }

    // AsyncPipe-only mode — return ONLY the AsyncPipe streaming benchmark (no other serializer).
// Streaming I/O has long-lived pipe setup + kernel-buffer overhead that, when interleaved with // the standard byte-array / IBufferWriter measurements, masks the steady-state numbers. Run it // in isolation so the timing numbers reflect ONLY the streaming path. if (serializerMode == "asyncpipe") { // NamedPipe — pipe-aligned chunk size for the long-lived IPC scenario. The chunkSize here // drives the AsyncPipeWriterOutput's chunk-on-wire size (header + data, page-aligned thanks to // the AcquireChunk fix) AND the kernel pipe buffer size (inBufferSize/outBufferSize on the // NamedPipeServerStream ctor). Same value across both layers = one WriteFile(chunkSize) syscall // fits blocking-free in one kernel pipe-buffer slot. Single source of truth for both app-level // wire chunk AND kernel transfer unit; change ONLY this line when tuning. var binaryFastModePipeChunkOnly = AcBinarySerializerOptions.FastMode; binaryFastModePipeChunkOnly.BufferWriterChunkSize = PipeChunkSize; binaryFastModePipeChunkOnly.WireMode = SelectedWireMode; return new List { // Chunked-framed AsyncPipe: SerializeChunkedFramed + AsyncPipeReaderInput.DrainFromAsync. // Measures the FULL streaming-I/O stack — wire framing + drain task + sliding-window buffer + // MRES wait-on-byte-shortage — over a kernel NamedPipe. new AcBinaryNamedPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"), // Raw byte[] over NamedPipe (sync receive, no chunk-framing). Same kernel-pipe transport, // same inBufferSize, but: serialize → byte[] → Stream.Write → Stream.Read → Deserialize(byte[]). // No drain task, no AsyncPipeReaderInput, no [201][UINT16][data]…[202] framing. Side-by-side with // the chunked-row above this isolates AsyncPipe-framework-overhead (Δ vs raw) from // kernel-transport-overhead (raw vs in-process Byte[]). 
new AcBinaryNamedPipeRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"), // Chunked-framed AsyncPipe over an IN-MEMORY System.IO.Pipelines.Pipe (NO NamedPipe, NO kernel). // Same chunked-streaming code path (SerializeChunkedFramed → AsyncPipeReaderInput) but with the // kernel-pipe replaced by a managed-only Pipe. Eliminates per-chunk syscall overhead (~30 µs/chunk // on NamedPipe → ~1-2 µs/chunk on in-memory Pipe). Side-by-side with the NamedPipe row above this // isolates pure CPU cost of the chunked-streaming framework (vs kernel-pipe transport cost) — the // in-memory Pipe row should be much closer to the raw-byte[] row, validating that NamedPipe loopback // is the worst-case benchmark scenario for chunked-streaming and not representative of real network // / file / cross-thread Pipe scenarios. new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"), // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport at all). Apples-to-apples // baseline for the in-memory chunked row above: same in-memory transport (zero kernel), but raw // byte[] vs chunked-streaming wire format. Completes the 2x2 matrix [chunked,raw] × [kernel,memory]. new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"), }; } // Standard mode — all serializers EXCEPT AsyncPipe (the streaming benchmark is opt-in via the // AsyncPipe menu / CLI mode, never bundled with the steady-state suite). 
var binaryNoInternOption = AcBinarySerializerOptions.Default; binaryNoInternOption.UseStringInterning = StringInterningMode.None; binaryNoInternOption.WireMode = SelectedWireMode; var binaryDefaultNoSgenOption = AcBinarySerializerOptions.Default; binaryDefaultNoSgenOption.UseGeneratedCode = false; binaryDefaultNoSgenOption.WireMode = SelectedWireMode; var binaryFastModeNoSgenOption = AcBinarySerializerOptions.FastMode; binaryFastModeNoSgenOption.UseGeneratedCode = false; binaryFastModeNoSgenOption.WireMode = SelectedWireMode; var binaryFastModeOption = AcBinarySerializerOptions.FastMode; binaryFastModeOption.WireMode = SelectedWireMode; // BufWr new — 4 KB chunk size for the FRESH ArrayBufferWriter scenario. The chunkSize here drives // the serializer's GetSpan(N) request → the ArrayBufferWriter's internal allocation per call. // Small chunk = small per-call allocation, optimum for one-shot serialization where each iteration // allocates a fresh ABW. Independent of the AsyncPipe profile (different mechanism: alloc overhead // vs syscall count). var binaryFastModeBufWrChunk = AcBinarySerializerOptions.FastMode; binaryFastModeBufWrChunk.BufferWriterChunkSize = PipeChunkSize; binaryFastModeBufWrChunk.WireMode = SelectedWireMode; // In-memory Pipe variant — same 4 KB chunkSize as the AsyncPipe mode, no kernel-pipe alignment // concern (managed slabs are not page-aligned anyway). Drives SerializeChunkedFramed via the in-memory // System.IO.Pipelines.Pipe (zero-copy slab handoff between producer and drain task). 
var binaryFastModePipeChunkInMem = AcBinarySerializerOptions.FastMode; binaryFastModePipeChunkInMem.BufferWriterChunkSize = PipeChunkSize; binaryFastModePipeChunkInMem.WireMode = SelectedWireMode; var defaultOptions = AcBinarySerializerOptions.Default; defaultOptions.UseStringInterning = StringInterningMode.None; defaultOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId; defaultOptions.WireMode = SelectedWireMode; return new List { // ============================================================ // AcBinary — Byte[] API (uncomment to compare option presets side-by-side) // ============================================================ // Fastest Byte[] — SGen path (UseGeneratedCode=true, default). new AcBinaryBenchmark(testData.Order, binaryFastModeOption, "FastMode"), // Fastest Byte[] — Runtime path (UseGeneratedCode=false). Same wire/options, no source-generated dispatch. // Always paired with the SGen variant so every layer can compare the SGen speed-up apples-to-apples. // NativeAOT-safe: AcSerializerCommon.Create*Getter/Setter falls back to reflection-based delegates // when RuntimeFeature.IsDynamicCodeSupported is false (slower but works under AOT publish). new AcBinaryBenchmark(testData.Order, binaryFastModeNoSgenOption, "FastMode"), // Default preset Byte[] — RefHandling=OnlyId (deduplicates IId-shared references on the wire) + // UseStringInterning=All (deduplicates repeated strings). Showcases the Default preset's wire-size // and CPU trade-off vs FastMode on the ~20% IId-ref / repeated-string test data. 
new AcBinaryBenchmark(testData.Order, defaultOptions, "Default"), //new AcBinaryBenchmark(testData.Order, binaryDefaultNoSgenOption, "Default"), //new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.WithoutReferenceHandling, "NoRef"), //new AcBinaryBenchmark(testData.Order, binaryNoInternOption, "NoIntern"), // AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario) new AcBinaryBufferWriterBenchmark(testData.Order, binaryFastModeOption, "FastMode"), // AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario). // 4 KB chunk size from binaryFastModeBufWrChunk — minimises the per-call ArrayBufferWriter // allocation. Optimum for this scenario. new AcBinaryFreshBufferWriterBenchmark(testData.Order, binaryFastModeBufWrChunk, "FastMode (4KB)"), // AcBinary chunked-streaming over an IN-MEMORY Pipe (no kernel transport). Side-by-side with the // Byte[] / IBufferWriter rows above this shows the chunked-streaming framework's pure CPU cost // (no NamedPipe loopback noise) vs the simpler in-process serialize-then-deserialize patterns. // The IO column shows "Pipe(in-mem)" — distinct from the NamedPipe AsyncPipe rows in [P] mode. new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeChunk)"), // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport, no kernel, no Pipe). Apples-to- // apples baseline for the in-memory chunked row above: same in-memory pattern, but raw byte[] vs // chunked-streaming wire format. The IO column shows "Bytes(in-mem)". new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeRaw)"), // AsyncPipe streaming over kernel NamedPipe (AcBinaryNamedPipeBenchmark) is intentionally OMITTED // here — run it via the dedicated AsyncPipe menu [P] / CLI mode for isolated kernel-transport // measurements. 
// ============================================================ // MemoryPack — three I/O modes for apples-to-apples comparison // ============================================================ new MemoryPackBenchmark(testData.Order, "Default"), new MemoryPackBufferWriterBenchmark(testData.Order, "Default"), new MemoryPackFreshBufferWriterBenchmark(testData.Order, "Default"), // ============================================================ // MessagePack — for legacy comparison // ============================================================ #if !AYCODE_NATIVEAOT // MessagePack v3's DynamicGenericResolver uses Activator.CreateInstance on trimmed // ListFormatter et al. — fails under NativeAOT publish with "No parameterless constructor". // Excluded from the AOT build; available for regular JIT runs only. new MessagePackBenchmark(testData.Order, "ContractBased"), #endif // System.Text.Json (commented — JSON serializer for reference; not in active suite) //new SystemTextJsonBenchmark(testData.Order, "Default") }; } /// /// Runs the action times for independent samples, /// returning the median, min, and max elapsed time. Multi-sample design reduces single-run variance /// from ~±15% to ~±5% by smoothing transient effects (background activity, thermal/turbo state). /// When <= 1, falls back to single-sample timing (Debug / quick mode). /// When is non-null, emits in-place \r progress updates so a /// stuck benchmark (e.g. deadlocked NamedPipe row) is visibly stuck at a specific %% rather than /// silently hanging. /// /// Stabilization (added 2026-05-07): /// 1) Pilot sample is run BEFORE the recorded loop and discarded. The first measurement after /// warmup tends to absorb residual JIT bookkeeping and GC bookkeeping; dropping it tightens /// the min/max range without throwing away signal (the median is the SAME data as before). /// 2) GC.Collect / WaitForPendingFinalizers / GC.Collect runs BEFORE every recorded sample. 
///    Without this, GC pressure from sample N occasionally triggered a Gen-2 pause inside
    ///    sample N+1, painting it as an outlier; collecting up-front gives every sample the
    ///    same starting heap shape.
    /// 3) Returns (median, min, max, population stddev) so the caller can surface the inter-sample
    ///    range — visible noise floor for the row, replacing the previous "median only" view.
    ///
    private static (double medianMs, double minMs, double maxMs, double stdDevMs) RunTimed(Action action, int iterations, string? progressLabel = null)
    {
        var samples = BenchmarkSamples;
        if (samples <= 1)
        {
            // Single-sample fast path (Debug or trivial run) — no allocation, no sort, no stddev.
            var singleSw = Stopwatch.StartNew();
            RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
            singleSw.Stop();

            var singleMs = singleSw.Elapsed.TotalMilliseconds;
            EndProgress(progressLabel, singleMs);
            return (singleMs, singleMs, singleMs, 0);
        }

        // Pilot sample (discarded, therefore not timed). Counts as sample index 0 of (samples + 1) for
        // progress display so the user sees an extra "warmup-ish" tick before the recorded samples start.
        SettleHeap();
        RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: 0);

        var times = new double[samples];
        for (var s = 0; s < samples; s++)
        {
            // Per-sample GC settle. Forces every sample to start from the same heap state, so
            // a Gen-2 pause caused by the previous sample doesn't bleed into the next sample's
            // timing. Cost is paid OUTSIDE the Stopwatch window — no impact on the measurement.
            SettleHeap();

            var sw = Stopwatch.StartNew();
            RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: s + 1);
            sw.Stop();
            times[s] = sw.Elapsed.TotalMilliseconds;
        }

        // Capture min/max/sum/sumSq BEFORE sort to avoid order ambiguity (Array.Sort is in-place).
        var minMs = double.MaxValue;
        var maxMs = double.MinValue;
        var sum = 0.0;
        var sumSq = 0.0;
        foreach (var t in times)
        {
            sum += t;
            sumSq += t * t;
            if (t < minMs) minMs = t;
            if (t > maxMs) maxMs = t;
        }

        // Population stddev (not sample-stddev — we treat the captured samples as the population for
        // CV computation). variance = E[X²] - E[X]² with Math.Max(0, ...) guard against tiny negative
        // values from FP rounding when samples are nearly identical.
        var mean = sum / times.Length;
        var variance = (sumSq / times.Length) - (mean * mean);
        var stdDevMs = Math.Sqrt(Math.Max(0.0, variance));

        Array.Sort(times);

        // Median: middle value for odd sample counts, average of two middles for even counts.
        var medianMs = samples % 2 == 1
            ? times[samples / 2]
            : (times[samples / 2 - 1] + times[samples / 2]) / 2.0;

        EndProgress(progressLabel, medianMs);
        return (medianMs, minMs, maxMs, stdDevMs);

        // Full blocking collection so each measured run starts from the same heap shape.
        static void SettleHeap()
        {
            GC.Collect();
            GC.WaitForPendingFinalizers();
            GC.Collect();
        }
    }

    /// <summary>
    /// Per-cell adaptive iteration calibration. Runs a 100-iter measurement after warmup and computes
    /// how many iterations are needed to reach <paramref name="targetMs"/> wall-clock per sample.
    /// Returns iter rounded UP to the nearest 1000, floored at 1000 (the prior fixed minimum) and
    /// ceiling-capped at 200_000 (sanity bound for pathologically fast ops). In Debug single-sample mode
    /// (BenchmarkSamples &lt;= 1) returns the global TestIterations unchanged —
    /// calibration overhead is unjustified there. Calibration runs OUTSIDE the timed sample loop and
    /// does NOT count toward warmup; its sole purpose is to measure per-op cost.
    /// </summary>
    private static int CalibrateIterations(Action action, int targetMs)
    {
        if (BenchmarkSamples <= 1) return TestIterations; // Debug fast path

        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();

        const int calibIter = 100;
        var sw = Stopwatch.StartNew();
        for (var i = 0; i < calibIter; i++) action();
        sw.Stop();
        var ms = sw.Elapsed.TotalMilliseconds;

        // Pathologically-fast op below Stopwatch resolution — cap at ceiling (further calibration won't help).
if (ms <= 0.0001) return 200_000;

        var iterPerMs = calibIter / ms;

        // Clamp in floating point BEFORE narrowing to int. With a large targetMs and a very fast op the
        // product can exceed int.MaxValue, and an out-of-range double→int cast is unspecified in an
        // unchecked context (it may come back negative and be mistaken for "needs the 1000 floor").
        // Clamping to [0, 200_000] gives the same results as before for every in-range value, because
        // anything at or above the ceiling was capped to 200_000 and anything at or below zero was
        // floored to 1000.
        var raw = (int)Math.Clamp(Math.Ceiling(targetMs * iterPerMs), 0.0, 200_000.0);

        // Round UP to nearest 1000 — keeps numbers human-readable in the markdown output.
        // raw is at most 200_000 (already a multiple of 1000), so rounding cannot exceed the ceiling.
        var rounded = ((raw + 999) / 1000) * 1000;
        return rounded < 1000 ? 1000 : rounded;
    }

    /// <summary>
    /// Measures per-call allocation in bytes after a clean GC. Single dedicated sample (no median) — keeps timing samples pure.
    /// </summary>
    /// <param name="action">Operation to run; allocations are counted on the calling thread only.</param>
    /// <param name="iterations">Number of calls; the byte delta is divided by this value, so it must be at least 1.</param>
    /// <param name="progressLabel">Optional label; when non-null, progress and a final "done" line are written to the console.</param>
    /// <returns>Allocated bytes per call (integer division of the total by <paramref name="iterations"/>).</returns>
    private static long MeasureAllocation(Action action, int iterations, string? progressLabel = null)
    {
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();

        // The stopwatch only feeds the "done in … ms" progress line; it is not part of the result.
        var sw = Stopwatch.StartNew();
        var before = GC.GetAllocatedBytesForCurrentThread();
        RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
        var after = GC.GetAllocatedBytesForCurrentThread();
        sw.Stop();

        EndProgress(progressLabel, sw.Elapsed.TotalMilliseconds);
        return (after - before) / iterations;
    }

    /// <summary>
    /// Process-wide allocation measurement — needed for round-trip-only benchmarks (NamedPipe etc.) where
    /// the work happens across multiple threads. GC.GetAllocatedBytesForCurrentThread would
    /// only count the caller-thread allocations, missing the server-side <c>new byte[len]</c> buffers and
    /// any drain-pump-thread allocations. GC.GetTotalAllocatedBytes covers the entire process.
    /// Slightly noisier than the per-thread variant (background threads / GC bookkeeping leak in), but
    /// over 1000 iterations the signal dominates.
    /// </summary>
    private static long MeasureAllocationTotal(Action action, int iterations, string?
progressLabel = null)
    {
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();

        // Timer is only used for the closing progress line, not for the returned value.
        var timer = Stopwatch.StartNew();
        var bytesAtStart = GC.GetTotalAllocatedBytes(precise: true);
        RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
        var bytesAtEnd = GC.GetTotalAllocatedBytes(precise: true);
        timer.Stop();

        EndProgress(progressLabel, timer.Elapsed.TotalMilliseconds);

        var totalBytes = bytesAtEnd - bytesAtStart;
        return totalBytes / iterations;
    }

    // ============================================================================================
    // Progress reporting — \r-driven in-place updates so a stuck benchmark surfaces the exact phase
    // and % where it stopped, instead of appearing as a silent hang. Used by RunTimed and the
    // MeasureAllocation* helpers when the caller passes a non-null progressLabel.
    // ============================================================================================

    // Tracks the longest line written by the current progress session, so EndProgress can clear
    // any leftover characters from a prior longer line (avoids "ghost" trailing chars after \r).
    private static int _progressLastLineLen;

    /// <summary>
    /// Invokes <paramref name="action"/> <paramref name="iterations"/> times. With a non-null
    /// <paramref name="label"/> it rewrites a single console row (via \r) roughly ten times per run,
    /// showing the percentage and the iteration counter; with a null label it just runs the loop.
    /// </summary>
    /// <param name="action">Operation to execute.</param>
    /// <param name="iterations">How many times to execute it.</param>
    /// <param name="label">Progress label, or null to suppress all console output.</param>
    /// <param name="samples">Total sample count; values above 1 add "sample x/y" to the progress line.</param>
    /// <param name="sampleIndex">Zero-based index of the current sample (displayed one-based).</param>
    private static void RunWithProgress(Action action, int iterations, string? label, int samples, int sampleIndex)
    {
        if (label is null)
        {
            // Silent path: nothing but the calls themselves.
            for (var n = 0; n < iterations; n++) action();
            return;
        }

        // ~10 progress emits per sample run. Console.Write is costly enough to skew very short
        // benchmarks if it ran on every iteration.
        var step = Math.Max(1, iterations / 10);
        var multiSample = samples > 1;

        for (var i = 0; i < iterations; i++)
        {
            action();

            var completed = i + 1;
            var isStepBoundary = completed % step == 0;
            var isFinalIteration = i == iterations - 1;
            if (!isStepBoundary && !isFinalIteration) continue;

            var pct = (int)(completed * 100L / iterations);

            string line;
            if (multiSample)
                line = $" > {label} sample {sampleIndex + 1}/{samples} {pct,3}% ({completed}/{iterations})";
            else
                line = $" > {label} {pct,3}% ({completed}/{iterations})";

            OverwriteProgressRow(line);
            _progressLastLineLen = line.Length;
        }
    }

    /// <summary>
    /// Finishes the current progress row: overwrites it with a "done in … ms" message, blanks any
    /// leftover characters of a longer previous line, ends the row with a newline and resets the
    /// tracked line length. Does nothing when <paramref name="label"/> is null.
    /// </summary>
    private static void EndProgress(string? label, double elapsedMs)
    {
        if (label is null) return;

        OverwriteProgressRow($" > {label} done in {elapsedMs,7:F1} ms");
        System.Console.WriteLine();
        _progressLastLineLen = 0;
    }

    // Returns the cursor to column 0, writes the text, then pads with spaces up to the length of the
    // previously written progress line so no stale characters remain visible.
    private static void OverwriteProgressRow(string text)
    {
        System.Console.Write('\r');
        System.Console.Write(text);

        var stale = _progressLastLineLen - text.Length;
        if (stale > 0) System.Console.Write(new string(' ', stale));
    }

#if !AYCODE_NATIVEAOT
    // System.Text.Json options used by the round-trip verification: compact output, null properties
    // omitted, reference cycles ignored.
    private static readonly JsonSerializerOptions VerifyJsonOpts = new()
    {
        WriteIndented = false,
        DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull,
        ReferenceHandler = System.Text.Json.Serialization.ReferenceHandler.IgnoreCycles
    };
#endif

    /// <summary>
    /// Round-trip equality check: serialize both via System.Text.Json (canonical form) and compare strings.
    /// Slower than property-by-property compare, but universal — works for any object graph without custom comparer.
    /// </summary>
    /// <remarks>
    /// AOT publish skip: System.Text.Json's reflection path uses runtime closed-generic instantiation
    /// (<c>JsonPropertyInfo&lt;TestStatus&gt;</c> et al.) that the trimmer drops, causing
    /// <c>NotSupportedException: missing native code or metadata</c>. The validation is JIT-only — the actual
    /// benchmark Serialize/Deserialize loops don't touch this path. Under AOT we return <c>true</c> so all
    /// <c>VerifyRoundTrip()</c> calls pass without running the cross-format validation.
    /// </remarks>
    private static bool DeepEqualsViaJson(object? a, object?
b)
    {
#if AYCODE_NATIVEAOT
        // Skip cross-format validation under AOT — STJ reflection path is incompatible. The roundtrip
        // itself still runs (caller-side Serialize+Deserialize), just the JSON-canonical compare is bypassed.
        return true;
#else
        // Two nulls are equal; exactly one null is not.
        if (a == null || b == null) return a == null && b == null;

        var canonicalA = JsonSerializer.Serialize(a, VerifyJsonOpts);
        var canonicalB = JsonSerializer.Serialize(b, VerifyJsonOpts);
        return canonicalA == canonicalB;
#endif
    }

    /// <summary>
    /// Validates MemoryPack setup at startup. Aborts the benchmark if TestOrder is not [MemoryPackable].
    /// Without this attribute, MemoryPack falls back to runtime resolver (slower) — comparison would be INVALID.
    /// </summary>
    /// <remarks>
    /// On failure two lines are written to standard error and the process exits with code 1.
    /// </remarks>
    private static void ValidateMemoryPackSetup()
    {
        // Array form so more root types can be added later without touching the loop.
        Type[] requiredTypes = { typeof(TestOrder) };

        foreach (var candidate in requiredTypes)
        {
            var attributes = candidate.GetCustomAttributes(typeof(MemoryPackableAttribute), inherit: true);
            if (attributes.Length > 0) continue;

            System.Console.Error.WriteLine($"❌ FATAL: {candidate.FullName} is not [MemoryPackable] — MemoryPack would fall back to runtime resolver, comparison is INVALID for SGen-vs-SGen claim.");
            System.Console.Error.WriteLine("Add [MemoryPackable] to the type and any nested types referenced from it.");
            Environment.Exit(1);
        }
    }

    /// <summary>
    /// Interactive menu shown when no CLI args. Returns the selected (layer, serializerMode) pair, or
    /// null on Quit. Keys 1/2/3/A pick a layer with the "standard" serializer set; F and P run all
    /// layers with the "fastestbyte" / "asyncpipe" serializer set; any unrecognized key runs all
    /// layers in "standard" mode.
    /// Loops on settings-changes ([S]) — user is returned to this menu after modifying settings.
    /// </summary>
    private static (string layer, string serializerMode)?
ShowInteractiveMenu()
    {
        while (true)
        {
            // Rebuilt on every pass so the [S] row reflects the wire mode currently selected.
            string[] menuRows =
            {
                "",
                "╔══════════════════════════════════════════════════════════╗",
                "║ AcBinary Benchmark Suite ║",
                "╚══════════════════════════════════════════════════════════╝",
                "",
                "Select benchmark layer:",
                "",
                " [1] Core — daily iteration",
                " [2] Comprehensive — release validation",
                " [3] Edge cases — refactor verification",
                " [A] All layers",
                " [F] FastestByte — AcBinary FastMode Byte[] vs MemoryPack Byte[] only (tight optimization loop)",
                " [P] AsyncPipe — streaming I/O isolation (only AsyncPipe, all test data)",
                $" [S] Settings — Iteration / WireMode (current: {SelectedWireMode})",
                " [Q] Quit",
            };

            foreach (var row in menuRows)
                System.Console.WriteLine(row);

            System.Console.Write("\nSelection: ");
            var pressed = System.Console.ReadKey(intercept: false).KeyChar;
            System.Console.WriteLine();

            var choice = char.ToLower(pressed);
            if (choice == 's')
            {
                // Settings changed (or not) — show the main menu again.
                ShowSettingsMenu();
                continue;
            }

            switch (choice)
            {
                case 'q':
                    return null;
                case '1':
                    return ("core", "standard");
                case '2':
                    return ("comprehensive", "standard");
                case '3':
                    return ("edge", "standard");
                case 'f':
                    return ("all", "fastestbyte");
                case 'p':
                    return ("all", "asyncpipe");
                default:
                    // 'a' and every unrecognized key: full suite, standard serializer set.
                    return ("all", "standard");
            }
        }
    }

    ///
    /// Settings sub-menu — dispatches to the Iteration, WireMode and Charset sub-menus and keeps
    /// looping until the user selects Back. Iteration values are prompted and validated by
    /// ShowIterationSettingsMenu (WarmupIterations ≥ 0; TestIterations ≥ 1; BenchmarkSamples ≥ 1).
    /// Returns to the caller (which re-displays the main menu).
///
    private static void ShowSettingsMenu()
    {
        while (true)
        {
            // Header and current-state rows; redrawn after every sub-menu so the values stay current.
            System.Console.WriteLine();
            System.Console.WriteLine("─────────────────────────────────────────────");
            System.Console.WriteLine("Settings");
            System.Console.WriteLine("─────────────────────────────────────────────");
            System.Console.WriteLine(" [1] Iteration — Warmup / Iterations / Samples");
            System.Console.WriteLine($" [2] WireMode — current: {SelectedWireMode}");
            System.Console.WriteLine($" [3] Charset — current: {GetCurrentCharsetName()}");
            System.Console.WriteLine(" [B] Back");
            System.Console.Write("\nSelection: ");

            var pressed = System.Console.ReadKey(intercept: false).KeyChar;
            System.Console.WriteLine();

            var choice = char.ToLower(pressed);
            if (choice == 'b') return;

            // Any other key (including unknown ones) ends up back at the top of the loop.
            if (choice == '1') ShowIterationSettingsMenu();
            else if (choice == '2') ShowWireModeSettingsMenu();
            else if (choice == '3') ShowCharsetSettingsMenu();
        }
    }

    ///
    /// Returns a human-readable name for the currently-active BenchmarkTestDataProvider.LongStringSuffix
    /// charset. Returns "Custom" when the suffix doesn't match any of the predefined CharsetSuffixes
    /// constants. Used in menu state display, console run header, and
    /// the .LLM markdown output header so per-charset bench files are self-documenting.
///
    private static string GetCurrentCharsetName()
    {
        var active = BenchmarkTestDataProvider.LongStringSuffix;

        // Checked in this order; the first matching profile wins.
        (string Suffix, string Name)[] profiles =
        {
            (CharsetSuffixes.Latin1FixAscii, "Latin1FixAscii"),
            (CharsetSuffixes.Latin1Short, "Latin1Short"),
            (CharsetSuffixes.Latin1Long, "Latin1Long"),
            (CharsetSuffixes.CjkBmp, "CjkBmp"),
            (CharsetSuffixes.Cyrillic, "Cyrillic"),
            (CharsetSuffixes.Mixed, "Mixed"),
        };

        foreach (var profile in profiles)
        {
            if (active == profile.Suffix) return profile.Name;
        }

        return "Custom";
    }

    /// <summary>
    /// Charset sub-menu: shows the active long-string suffix profile and lets the user pick one of six
    /// predefined profiles. A valid choice assigns BenchmarkTestDataProvider.LongStringSuffix, prints a
    /// confirmation and returns; [B] returns without changes; any other key re-displays the menu.
    /// </summary>
    private static void ShowCharsetSettingsMenu()
    {
        while (true)
        {
            System.Console.WriteLine();
            System.Console.WriteLine("─────────────────────────────────────────────");
            System.Console.WriteLine("Charset settings — long-string suffix profile");
            System.Console.WriteLine("─────────────────────────────────────────────");
            System.Console.WriteLine($"Current: {GetCurrentCharsetName()}");
            System.Console.WriteLine();
            System.Console.WriteLine(" [1] Latin1FixAscii — empty suffix; short FixStr-fast-path stress (Latin1 baseline values stay short)");
            System.Console.WriteLine(" [2] Latin1Short — \" árvíztűrő tükörfúrógép\" (~24 char Hungarian mixed)");
            System.Console.WriteLine(" [3] Latin1Long — ~47-char Latin1 mixed (default; exceeds FixStr boundary)");
            System.Console.WriteLine(" [4] CjkBmp — CJK BMP (long 3-byte runs)");
            System.Console.WriteLine(" [5] Cyrillic — Russian Cyrillic (long 2-byte runs)");
            System.Console.WriteLine(" [6] Mixed — Hungarian + CJK + Cyrillic + emoji (full-spectrum + surrogate pairs)");
            System.Console.WriteLine(" [B] Back");
            System.Console.Write("\nSelection: ");

            var pressed = System.Console.ReadKey(intercept: false).KeyChar;
            System.Console.WriteLine();

            switch (char.ToLower(pressed))
            {
                case '1':
                    Select(CharsetSuffixes.Latin1FixAscii, "✓ Charset set to Latin1FixAscii");
                    return;
                case '2':
                    Select(CharsetSuffixes.Latin1Short, "✓ Charset set to Latin1Short");
                    return;
                case '3':
                    Select(CharsetSuffixes.Latin1Long, "✓ Charset set to Latin1Long");
                    return;
                case '4':
                    Select(CharsetSuffixes.CjkBmp, "✓ Charset set to CjkBmp");
                    return;
                case '5':
                    Select(CharsetSuffixes.Cyrillic, "✓ Charset set to Cyrillic");
                    return;
                case '6':
                    Select(CharsetSuffixes.Mixed, "✓ Charset set to Mixed");
                    return;
                case 'b':
                    return;
                default:
                    continue;
            }
        }

        // Stores the chosen suffix and prints the confirmation line.
        static void Select(string suffix, string confirmation)
        {
            BenchmarkTestDataProvider.LongStringSuffix = suffix;
            System.Console.WriteLine(confirmation);
        }
    }

    /// <summary>
    /// Prompts for WarmupIterations, TestIterations and BenchmarkSamples in turn (via PromptInt, with
    /// minimums 0 / 1 / 1), stores the results in the corresponding static fields and prints a summary.
    /// </summary>
    private static void ShowIterationSettingsMenu()
    {
        System.Console.WriteLine();
        System.Console.WriteLine("─────────────────────────────────────────────");
        System.Console.WriteLine("Iteration settings — press Enter to keep current value");
        System.Console.WriteLine("─────────────────────────────────────────────");
        System.Console.WriteLine();

        var warmup = PromptInt("WarmupIterations", WarmupIterations, min: 0);
        WarmupIterations = warmup;

        var iterations = PromptInt("TestIterations ", TestIterations, min: 1);
        TestIterations = iterations;

        var sampleCount = PromptInt("BenchmarkSamples", BenchmarkSamples, min: 1);
        BenchmarkSamples = sampleCount;

        System.Console.WriteLine();
        System.Console.WriteLine($"✓ Iteration settings updated: Warmup={WarmupIterations} | Iterations={TestIterations} | Samples={BenchmarkSamples}");
    }

    /// <summary>
    /// WireMode sub-menu: [1] selects Compact, [2] selects Fast, [B] goes back without changes;
    /// any other key re-displays the menu.
    /// </summary>
    private static void ShowWireModeSettingsMenu()
    {
        while (true)
        {
            System.Console.WriteLine();
            System.Console.WriteLine("─────────────────────────────────────────────");
            System.Console.WriteLine("WireMode settings");
            System.Console.WriteLine("─────────────────────────────────────────────");
            System.Console.WriteLine($"Current: {SelectedWireMode}");
            System.Console.WriteLine(" [1] Compact");
            System.Console.WriteLine(" [2] Fast");
            System.Console.WriteLine(" [B] Back");
            System.Console.Write("\nSelection: ");

            var key = System.Console.ReadKey(intercept: false).KeyChar;
            System.Console.WriteLine();

            switch (char.ToLower(key))
            {
                case '1':
                    SelectedWireMode =
WireMode.Compact;
                    System.Console.WriteLine("✓ WireMode set to Compact");
                    return;
                case '2':
                    SelectedWireMode = WireMode.Fast;
                    System.Console.WriteLine("✓ WireMode set to Fast");
                    return;
                case 'b':
                    return;
                default:
                    continue;
            }
        }
    }

    /// <summary>
    /// Prompts the user for an integer with a default (current value). Returns the current value if
    /// the user presses Enter on empty input or if parsing fails / value is below the minimum.
    /// </summary>
    /// <param name="name">Setting name shown in the prompt.</param>
    /// <param name="currentValue">Value shown in brackets and returned when the input is empty or invalid.</param>
    /// <param name="min">Smallest accepted value (inclusive).</param>
    /// <returns>The parsed value when valid, otherwise <paramref name="currentValue"/>.</returns>
    private static int PromptInt(string name, int currentValue, int min)
    {
        System.Console.Write($" {name} [{currentValue}]: ");

        // ReadLine returns null at end of input; treat that like an empty answer.
        var input = System.Console.ReadLine()?.Trim() ?? "";
        if (input.Length == 0) return currentValue;

        if (int.TryParse(input, out var newValue) && newValue >= min) return newValue;

        System.Console.WriteLine($" ! Invalid value (need int ≥ {min}) — keeping {currentValue}");
        return currentValue;
    }

    /// <summary>
    /// Filters test data sets by layer keyword. Layered approach lets you run only what's needed for the iteration cadence.
    /// P1: only "Core" data exists (Small/Medium/Large/Repeated/Deep). Comprehensive and Edge layers will be expanded in P2.
    /// </summary>
    /// <param name="all">All available test data sets; never modified.</param>
    /// <param name="layer">
    /// "all", "core", "comprehensive", "edge", or a single-cell keyword ("small", "medium", "large",
    /// "repeated", "deep"). Unknown keywords behave like "all".
    /// </param>
    /// <returns>A new list containing the matching data sets in their original order.</returns>
    private static List<TestDataSet> FilterByLayer(List<TestDataSet> all, string layer)
    {
        if (layer == "all") return all.ToList();

        var coreNames = new[] { "Small", "Medium", "Large", "Repeated", "Deep" };
        // P2 will add: "Flat", "Polymorphic", "Collection", "Numeric", "NonAscii", etc.
        var comprehensiveExtras = new string[] { /* P2 */ };
        // P3 will add: "ColdStart", "VeryLarge", "PathologicalString", etc.
        var edgeExtras = new string[] { /* P3 */ };

        return layer switch
        {
            "core" => all.Where(t => StartsWithAny(t.Name, coreNames)).ToList(),
            "comprehensive" => all.Where(t => StartsWithAny(t.Name, coreNames) || StartsWithAny(t.Name, comprehensiveExtras)).ToList(),
            "edge" => all.Where(t => StartsWithAny(t.Name, coreNames) || StartsWithAny(t.Name, comprehensiveExtras) || StartsWithAny(t.Name, edgeExtras)).ToList(),

            // Single-cell A/B mini-suite filters — match by case-insensitive prefix on Name.
            // Use case: tight optimization-iteration loop on one specific cell (e.g. `dotnet run -- repeated`
            // or interactive menu shortcut), avoiding the full ~110 sec suite when only one cell is in scope.
            "small" => all.Where(t => t.Name.StartsWith("Small", StringComparison.OrdinalIgnoreCase)).ToList(),
            "medium" => all.Where(t => t.Name.StartsWith("Medium", StringComparison.OrdinalIgnoreCase)).ToList(),
            "large" => all.Where(t => t.Name.StartsWith("Large", StringComparison.OrdinalIgnoreCase)).ToList(),
            "repeated" => all.Where(t => t.Name.StartsWith("Repeated", StringComparison.OrdinalIgnoreCase)).ToList(),
            "deep" => all.Where(t => t.Name.StartsWith("Deep", StringComparison.OrdinalIgnoreCase)).ToList(),

            _ => all.ToList()
        };

        // Layer prefixes are fixed identifiers, so compare ordinally (case-sensitive) instead of relying
        // on the culture-sensitive string.StartsWith(string) overload.
        static bool StartsWithAny(string name, string[] prefixes) =>
            prefixes.Any(prefix => name.StartsWith(prefix, StringComparison.Ordinal));
    }

    #endregion

    #region Serializer Implementations

    private interface ISerializerBenchmark
    {
        /// <summary>Serializer engine — e.g. "AcBinary", "MemoryPack", "MessagePack".</summary>
        string Engine { get; }

        /// <summary>I/O mode — e.g. "Byte[]", "BufWr reuse", "BufWr new", "NamedPipe", "FileStream".</summary>
        string IoMode { get; }

        /// <summary>Dispatch mode — "SGen", "Runtime", or "Hybrid". For AcBinary derived from UseGeneratedCode + child-type SGen coverage; non-AcBinary engines report their own native dispatch model.</summary>
        string DispatchMode { get; }

        /// <summary>Options preset name — e.g. "FastMode", "Default", "NoIntern", "WithCompression".</summary>
        string OptionsPreset { get; }

        /// <summary>Synthesized display name from Engine + IoMode + OptionsPreset.</summary>
        string Name => $"{Engine} ({IoMode}, {OptionsPreset})";

        int SerializedSize { get; }

        string? OptionsDescription => null;

        /// <summary>One-time SERIALIZER-side setup allocation cost (e.g., pre-allocated ArrayBufferWriter with internal buffer). Captured in constructor; 0 for byte[] API and Fresh-BufWriter variants.</summary>
long SetupSerializeAllocBytes { get; }

        /// <summary>One-time DESERIALIZER-side setup allocation cost (e.g., long-lived AsyncPipeReaderInput's ArrayPool rent + ManualResetEventSlim, drain-task scaffolding). Captured in constructor; 0 for byte[] API and any setup-free deserialize path.</summary>
        long SetupDeserializeAllocBytes { get; }

        /// <summary>
        /// True when Serialize() does a full round-trip (e.g. NamedPipe) and Deserialize() is a no-op.
        /// Used by the SUMMARY: WINNERS section to skip such cells from "Fastest Serialize" and "Fastest Deserialize"
        /// rankings (because both metrics are misleading there) — they still participate in "Fastest Round-trip".
        /// Default false for in-memory IO modes which measure Ser and Des separately.
        /// </summary>
        bool IsRoundTripOnly => false;

        /// <summary>Runs the cell's operations <paramref name="iterations"/> times before measurement.</summary>
        void Warmup(int iterations);

        /// <summary>The timed serialize operation.</summary>
        void Serialize();

        /// <summary>The timed deserialize operation.</summary>
        void Deserialize();

        /// <summary>Round-trip correctness check — called once per cell before warmup. Returns true if Serialize+Deserialize preserves data.</summary>
        bool VerifyRoundTrip();
    }

    /// <summary>
    /// AcBinary over the plain byte[] API: Serialize produces a fresh array each call,
    /// Deserialize reads the payload captured once in the constructor.
    /// </summary>
    private sealed class AcBinaryBenchmark : ISerializerBenchmark
    {
        private readonly TestOrder _order;
        private readonly AcBinarySerializerOptions _options;
        private readonly byte[] _serialized;

        public AcBinaryBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
        {
            OptionsPreset = optionsPreset;
            _options = options;
            _order = order;
            // Payload for the Deserialize cell is produced once, outside any timed region.
            _serialized = AcBinarySerializer.Serialize(order, options);
            //_options.UseCompression = Lz4CompressionMode.Block;
        }

        public string Engine => EngineAcBinary;

        public string IoMode => IoByteArray;

        public string DispatchMode
        {
            get
            {
                if (_options.UseGeneratedCode) return ModeSGen;
                return ModeRuntime;
            }
        }

        public string OptionsPreset { get; }

        public int SerializedSize => _serialized.Length;

        // byte[] API has no long-lived setup objects on either side.
        public long SetupSerializeAllocBytes => 0;

        public long SetupDeserializeAllocBytes => 0;

        public string OptionsDescription => BuildAcBinaryOptionsDescription(_options);

        public void Warmup(int iterations)
        {
            for (var remaining = iterations; remaining > 0; remaining--)
            {
                Serialize();
                Deserialize();
            }
        }

        // NoInlining keeps the call a distinct frame so the measured call shape stays stable.
        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize()
        {
            AcBinarySerializer.Serialize(_order, _options);
            //if (_options.ReferenceHandling != ReferenceHandlingMode.None || _options.UseStringInterning != StringInterningMode.None)
            //{
            //    AcBinarySerializer.ScanOnly(_order, _options);
            //}
            //else AcBinarySerializer.Serialize(_order, _options);
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            AcBinaryDeserializer.Deserialize(_serialized, _options);
        }

        public bool VerifyRoundTrip()
        {
            var payload = AcBinarySerializer.Serialize(_order, _options);
            return DeepEqualsViaJson(_order, AcBinaryDeserializer.Deserialize(payload, _options));
        }
    }

    /// <summary>
    /// MemoryPack over the plain byte[] API, using the options returned by GetMemPackOptions().
    /// </summary>
    private sealed class MemoryPackBenchmark : ISerializerBenchmark
    {
        private readonly TestOrder _order;
        private readonly MemoryPackSerializerOptions _options;
        private readonly byte[] _serialized;

        public MemoryPackBenchmark(TestOrder order, string optionsPreset)
        {
            _order = order;
            OptionsPreset = optionsPreset;
            _options = GetMemPackOptions();
            // Payload for the Deserialize cell is produced once, outside any timed region.
            _serialized = MemoryPackSerializer.Serialize(order, _options);
        }

        public string Engine => EngineMemoryPack;

        public string IoMode => IoByteArray;

        // MemoryPack always uses [MemoryPackable] source-generated formatters
        public string DispatchMode => ModeSGen;

        public string OptionsPreset { get; }

        public int SerializedSize => _serialized.Length;

        public long SetupSerializeAllocBytes => 0;

        public long SetupDeserializeAllocBytes => 0;

        public string? OptionsDescription => $"StringEncoding={_options.StringEncoding}";

        public void Warmup(int iterations)
        {
            for (var remaining = iterations; remaining > 0; remaining--)
            {
                Serialize();
                Deserialize();
            }
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize()
        {
            MemoryPackSerializer.Serialize(_order, _options);
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            MemoryPackSerializer.Deserialize(_serialized, _options);
        }

        public bool VerifyRoundTrip()
        {
            var payload = MemoryPackSerializer.Serialize(_order, _options);
            return DeepEqualsViaJson(_order, MemoryPackSerializer.Deserialize(payload, _options));
        }
    }

#if !AYCODE_NATIVEAOT
    // MessagePack benchmark — excluded from NativeAOT build because v3's StandardResolver falls back
    // to DynamicGenericResolver for closed-generic types (List et al.), which uses
    // Activator.CreateInstance on formatter types the AOT trimmer drops → MissingMethodException at runtime.
    // Available for regular JIT runs (`dotnet run`) only.
private sealed class MessagePackBenchmark : ISerializerBenchmark
    {
        private readonly TestOrder _order;
        private readonly MessagePackSerializerOptions _options;
        private readonly byte[] _serialized;

        // Builds the options, derives the description string from them, then captures the payload
        // used by the Deserialize cell (all outside any timed region).
        public MessagePackBenchmark(TestOrder order, string optionsPreset)
        {
            _order = order;
            OptionsPreset = optionsPreset;

            //_options = ContractlessStandardResolver.Options.WithCompression(MessagePackCompression.None);
            //_options = ContractlessStandardResolver.Options.WithCompression(MessagePackCompression.Lz4Block);
            _options = MessagePackSerializerOptions.Standard.WithCompression(MessagePackCompression.None);

            // Report which resolver family is active so the results table shows the actual mode.
            var modeLabel = _options.Resolver is ContractlessStandardResolver ? "Contractless" : "ContractBased";
            OptionsDescription = $"Mode={modeLabel}, Compression={_options.Compression}";

            _serialized = MessagePackSerializer.Serialize(order, _options);
        }

        public string Engine => EngineMessagePack;

        public string IoMode => IoByteArray;

        // MessagePack uses [MessagePackObject] source-generated formatters (StandardResolver)
        public string DispatchMode => ModeSGen;

        public string OptionsPreset { get; }

        public int SerializedSize => _serialized.Length;

        public long SetupSerializeAllocBytes => 0;

        public long SetupDeserializeAllocBytes => 0;

        public string OptionsDescription { get; }

        public void Warmup(int iterations)
        {
            for (var remaining = iterations; remaining > 0; remaining--)
            {
                Serialize();
                Deserialize();
            }
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize()
        {
            MessagePackSerializer.Serialize(_order, _options);
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            MessagePackSerializer.Deserialize(_serialized, _options);
        }

        public bool VerifyRoundTrip()
        {
            var payload = MessagePackSerializer.Serialize(_order, _options);
            return DeepEqualsViaJson(_order, MessagePackSerializer.Deserialize(payload, _options));
        }
    }
#endif

    ///
    /// Benchmarks AcBinary via the IBufferWriter overload with a pre-allocated, reused ArrayBufferWriter.
/// Realistic IBufferWriter usage pattern: caller owns + reuses the writer (zero alloc per call after warmup).
    /// NOTE(review): the paragraph above describes a reused-writer variant, but the class declared below
    /// allocates a fresh writer on every call — it looks like a leftover summary; confirm and remove.
    ///
    /// Benchmarks AcBinary via the IBufferWriter overload, allocating a FRESH ArrayBufferWriter on EVERY call.
    /// One-shot scenario — represents code that doesn't reuse a writer across calls.
    /// Uses BufferWriterChunkSize=4096 (production-realistic, SignalR-aligned) instead of the 65535 default —
    /// otherwise AcBinary would request 64KB upfront via GetSpan(), forcing the fresh ABW to allocate 64KB
    /// regardless of payload size (heavy over-allocation for small payloads).
    /// NOTE(review): the chunk size is whatever the caller's options instance carries; the 4096 figure
    /// is not set in this class — verify against CreateSerializers.
    ///
    private sealed class AcBinaryFreshBufferWriterBenchmark : ISerializerBenchmark
    {
        private readonly TestOrder _order;
        private readonly AcBinarySerializerOptions _options;
        private readonly byte[] _serialized;

        public AcBinaryFreshBufferWriterBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
        {
            _order = order;
            // BufferWriterChunkSize comes from the caller (central source of truth in CreateSerializers
            // — the binaryFastMode4KbChunk options instance). Do NOT mutate _options here; tune the chunk
            // size in CreateSerializers only.
            _options = options;
            OptionsPreset = optionsPreset;
            // Payload for the Deserialize cell is produced once via the byte[] API.
            _serialized = AcBinarySerializer.Serialize(order, _options);
        }

        public string Engine => EngineAcBinary;

        public string IoMode => IoBufWrNew;

        public string DispatchMode
        {
            get
            {
                if (_options.UseGeneratedCode) return ModeSGen;
                return ModeRuntime;
            }
        }

        public string OptionsPreset { get; }

        public int SerializedSize => _serialized.Length;

        // The writer is created per call, so there is no one-time setup cost to report.
        public long SetupSerializeAllocBytes => 0;

        public long SetupDeserializeAllocBytes => 0;

        public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B");

        public void Warmup(int iterations)
        {
            for (var remaining = iterations; remaining > 0; remaining--)
            {
                Serialize();
                Deserialize();
            }
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize()
        {
            // FRESH writer every call — its allocation + growth is part of the measured cost.
            var freshWriter = new ArrayBufferWriter();
            AcBinarySerializer.Serialize(_order, freshWriter, _options);
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            AcBinaryDeserializer.Deserialize(_serialized, _options);
        }

        public bool VerifyRoundTrip()
        {
            var freshWriter = new ArrayBufferWriter();
            AcBinarySerializer.Serialize(_order, freshWriter, _options);
            // Copy the written span into an array so the byte[] deserialize path is exercised.
            var payload = freshWriter.WrittenSpan.ToArray();
            return DeepEqualsViaJson(_order, AcBinaryDeserializer.Deserialize(payload, _options));
        }
    }

    ///
    /// Benchmarks AcBinary over a long-lived NamedPipe IPC connection using the AcBinary native streaming API
    /// (AcBinarySerializer.SerializeChunkedFramed
    /// + AsyncPipeReaderInput + AcBinaryDeserializer.Deserialize).
    /// Mirrors what a real consumer (e.g. DeserializeFromPipeReaderAsync) does per message:
    /// long-lived AsyncPipeReaderInput with multi-message wire framing on top of a long-lived NamedPipe.
    ///
    /// Architecture:
    ///
    /// Constructor (NOT timed): sets up NamedPipeServerStream + NamedPipeClientStream,
    /// waits for connection, creates one long-lived PipeWriter / PipeReader
    /// pair, ONE long-lived AsyncPipeReaderInput
    /// in multiMessage = true mode, ONE drain Task that pumps the PipeReader into the input
    /// forever, and ONE deserialize Task that loops AcBinaryDeserializer.Deserialize(input, opts)
    /// and signals completion of each message through a ManualResetEventSlim.
    /// Per-iteration (timed): sender writes via
    /// AcBinarySerializer.SerializeChunkedFramed
    /// — multi-message wire ([201][UINT16][data]...[202]); the [202] end marker arms the input's
    /// _readPos = -1 sentinel, so the next message's first AppendToBuffer recycles the buffer to 0.
    /// Then the calling thread waits for the consumer task to signal that Deserialize returned.
    /// Deserialize() is a no-op (full round-trip captured in Serialize());
    /// IsRoundTripOnly=true → Ser ms / SerAlloc columns N/A, RT ms = full round-trip.
    ///
    ///
    /// Per-iter overhead: 0 new Task.Run, 0 new AsyncPipeReaderInput, 0 new CancellationTokenSource.
/// Pure cost = SerializeChunkedFramed (CPU + per-chunk flush) + kernel write/read syscalls + 1 sync barrier
    /// (the _consumeRequest/_consumeDone event pair) + deserialized graph alloc. The "multi-message reuse" pattern enabled by Q4T8 fix (R5K2 minimum: _readPos = -1
    /// sentinel + AppendToBuffer sliding-window cycling).
    ///
    /// Approximation note: single-process loopback NamedPipe. Real cross-process / cross-machine SignalR
    /// adds further transport latency (TCP, WebSocket framing) on top. The benchmark gives a lower bound.
    ///
    private sealed class AcBinaryNamedPipeBenchmark : ISerializerBenchmark, IDisposable
    {
        private readonly TestOrder _order;
        private readonly AcBinarySerializerOptions _options;
        private readonly byte[] _serialized; // for SerializedSize reporting only

        // Long-lived pipe lifecycle (set up once in ctor — NOT timed).
        private readonly NamedPipeServerStream _pipeServer;
        private readonly NamedPipeClientStream _pipeClient;
        private readonly PipeWriter _pipeWriter;
        private readonly PipeReader _pipeReader;

        // Long-lived multi-message receive infrastructure (set up once in ctor).
        private readonly AsyncPipeReaderInput _input;
        private readonly CancellationTokenSource _cts;
        private readonly Task _drainTask; // BG: PipeReader → input.Feed (continuous pump)
        private readonly Task _consumerTask; // BG: per-iter Deserialize(input) loop, signaled by calling thread
        private readonly ManualResetEventSlim _consumeRequest = new(false);
        private readonly ManualResetEventSlim _consumeDone = new(false);
        private object? _lastResult; // captured during VerifyRoundTrip; null in benchmark iters
        private bool _captureResult; // toggle: when true, ConsumeLoop stores result; otherwise discards
        private bool _disposed;

        public string Engine => EngineAcBinary;

        public string IoMode => IoNamedPipe;

        public string DispatchMode
        {
            get
            {
                if (_options.UseGeneratedCode) return ModeSGen;
                return ModeRuntime;
            }
        }

        public string OptionsPreset { get; }

        public int SerializedSize => _serialized.Length;

        public long SetupSerializeAllocBytes { get; }

        public long SetupDeserializeAllocBytes { get; }

        public bool IsRoundTripOnly => true;

        public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B, Transport=NamedPipe(long-lived,multiMessage,2-task)");

        public AcBinaryNamedPipeBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
        {
            _order = order;
            // BufferWriterChunkSize comes from the caller (central source of truth in CreateSerializers
            // — the binaryFastMode4KbChunk options instance). Do NOT mutate _options here; tune the chunk
            // size in CreateSerializers only.
            _options = options;
            OptionsPreset = optionsPreset;
            _serialized = AcBinarySerializer.Serialize(order, _options);

            // 1× pipe setup. Kernel-side pipe buffer (inBufferSize / outBufferSize on the server ctor — the
            // client inherits the server-defined buffer size at connect time) matches BufferWriterChunkSize
            // exactly: AsyncPipeWriterOutput now treats chunkSize as the chunk-on-wire total size (header +
            // data), so one WriteFile(chunkSize) syscall lands in exactly one kernel-page slot — page-aligned,
            // no fragmentation, no IRP reordering. _options.BufferWriterChunkSize is the single tunable source.
            var uniquePipeName = $"AcBinaryBench-{Guid.NewGuid():N}";

            // === SERIALIZE-side setup measurement ===
            // pipe-pair (server + client) + connect handshake + writer-side PipeWriter wrapper.
            SettleHeap();
            var serBaseline = GC.GetAllocatedBytesForCurrentThread();

            _pipeServer = new NamedPipeServerStream(
                uniquePipeName,
                PipeDirection.In,
                1,
                PipeTransmissionMode.Byte,
                System.IO.Pipes.PipeOptions.Asynchronous,
                inBufferSize: _options.BufferWriterChunkSize,
                outBufferSize: _options.BufferWriterChunkSize);
            _pipeClient = new NamedPipeClientStream(".", uniquePipeName, PipeDirection.Out, System.IO.Pipes.PipeOptions.Asynchronous);

            // Start the server-side accept first, connect the client, then block until the accept completes.
            var pendingAccept = _pipeServer.WaitForConnectionAsync();
            _pipeClient.Connect();
            pendingAccept.GetAwaiter().GetResult();

            _pipeWriter = PipeWriter.Create(_pipeClient);
            SetupSerializeAllocBytes = GC.GetAllocatedBytesForCurrentThread() - serBaseline;

            // === DESERIALIZE-side setup measurement ===
            // PipeReader wrapper + AsyncPipeReaderInput (ArrayPool rent + ManualResetEventSlim) + drain
            // task + consumer task scaffolding. Two long-lived BG tasks total: drain pumps bytes from the
            // kernel pipe into input; consumer drives Deserialize(input) per iter on signal.
            SettleHeap();
            var desBaseline = GC.GetAllocatedBytesForCurrentThread();

            _pipeReader = PipeReader.Create(_pipeServer);
            _input = new AsyncPipeReaderInput(_options.BufferWriterChunkSize * 2, multiMessage: true);
            _cts = new CancellationTokenSource();

            // Drain task: pumps PipeReader → input.Feed forever (or until cancel). Single Task.Run for
            // the full benchmark lifetime — its overhead is amortised across all messages.
            _drainTask = Task.Run(() => _input.DrainFromAsync(_pipeReader, _cts.Token));

            // Consumer task: per-iter Deserialize(input) loop. Started here once; signaled per-iter via
            // _consumeRequest. Enables Ser↔Des streaming overlap — calling thread runs SerializeChunkedFramed
            // while THIS task simultaneously runs Deserialize, both consuming/producing through the
            // sliding-window buffer pipelined by the drain task.
            _consumerTask = Task.Run(ConsumeLoop);

            SetupDeserializeAllocBytes = GC.GetAllocatedBytesForCurrentThread() - desBaseline;

            // Full blocking collection before each allocation baseline is taken.
            static void SettleHeap()
            {
                GC.Collect();
                GC.WaitForPendingFinalizers();
                GC.Collect();
            }
        }

        // BG consumer: parks on _consumeRequest, runs Deserialize(_input) when signaled, signals _consumeDone.
        // The Deserialize call internally blocks on the input's MRES whenever the drain hasn't yet fed enough
        // bytes for the next read — that's where the streaming-pipeline overlap with the calling thread (Ser)
        // happens.
        private void ConsumeLoop()
        {
            var token = _cts.Token;
            try
            {
                for (;;)
                {
                    _consumeRequest.Wait(token);
                    if (token.IsCancellationRequested) break;
                    _consumeRequest.Reset();

                    try
                    {
                        var deserialized = AcBinaryDeserializer.Deserialize(_input, _options);
                        if (_captureResult) _lastResult = deserialized;
                    }
                    catch
                    {
                        // Swallow — calling thread sees the failure via missing/incorrect _lastResult during VerifyRoundTrip,
                        // or the benchmark loop just continues (timing impacted). Production teardown handled in Dispose.
                    }
                    finally
                    {
                        // Always release the calling thread, even when Deserialize failed.
                        _consumeDone.Set();
                    }
                }
            }
            catch (OperationCanceledException)
            {
                // Cooperative cancel — Dispose path. Swallow.
            }
        }

        public void Warmup(int iterations)
        {
            for (var remaining = iterations; remaining > 0; remaining--)
            {
                Serialize();
            }
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize()
        {
            // 2-task streaming pipeline:
            // 1. Calling thread signals consumer task to begin Deserialize(input). Consumer immediately
            //    starts; first read blocks on input's MRES because no bytes flowed yet.
            // 2. Calling thread starts SerializeChunkedFramed → chunks flow through PipeWriter → kernel pipe →
            //    drain task (BG) feeds input.Feed → MRES pulses → consumer's Deserialize consumes bytes
            //    chunk by chunk. Ser↔Des truly overlap here.
            // 3. Calling thread waits for _consumeDone (signaling Deserialize returned).
            // The order below is load-bearing: reset "done" BEFORE raising "request".
            _consumeDone.Reset();
            _consumeRequest.Set();
            AcBinarySerializer.SerializeChunkedFramed(_order, _pipeWriter, _options);
            _consumeDone.Wait();
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            // No-op: per-iter round-trip is captured in Serialize(). See IsRoundTripOnly contract.
        }

        public bool VerifyRoundTrip()
        {
            // Use the same 2-task streaming path as the benchmark, but capture the result for graph-equality.
            _captureResult = true;
            try
            {
                Serialize();
                return _lastResult is TestOrder received && DeepEqualsViaJson(_order, received);
            }
            finally
            {
                _captureResult = false;
                _lastResult = null;
            }
        }

        public void Dispose()
        {
            if (_disposed) return;
            _disposed = true;

            var grace = TimeSpan.FromSeconds(2);

            // Cancel drain + consumer tasks → both exit. Pulse _consumeRequest in case consumer is parked.
            BestEffort(() => _cts.Cancel());
            BestEffort(() => _consumeRequest.Set());
            BestEffort(() => _drainTask.Wait(grace));
            BestEffort(() => _consumerTask.Wait(grace));

            // Complete writer + dispose pipe lifecycle.
            BestEffort(() => _pipeWriter.CompleteAsync().AsTask().Wait(grace));
            BestEffort(() => _pipeReader.Complete());
            BestEffort(() => _pipeClient.Dispose());
            BestEffort(() => _pipeServer.Dispose());
            BestEffort(() => _input.Dispose());
            BestEffort(() => _consumeRequest.Dispose());
            BestEffort(() => _consumeDone.Dispose());
            BestEffort(() => _cts.Dispose());

            // Teardown is best-effort: each step runs in isolation and any exception is swallowed,
            // so one failing step never prevents the following ones.
            static void BestEffort(Action step)
            {
                try { step(); } catch { /* swallow on teardown */ }
            }
        }
    }

    ///
    /// Same chunked-framed AsyncPipe code path as AcBinaryNamedPipeBenchmark, but the transport
    /// is an in-memory System.IO.Pipelines.Pipe instead of a kernel NamedPipe. The Pipe's
    /// Writer/Reader pair is a managed-only zero-copy slab handoff — no syscalls, no kernel
    /// buffer copy, no IRP queueing.
///
    /// Why this benchmark matters: by holding ALL other variables constant (same SerializeChunkedFramed,
    /// same AsyncPipeReaderInput, same drain task, same consumer task, same multi-message wire format), this
    /// row isolates the kernel-NamedPipe transport overhead from the chunked-streaming framework's pure
    /// CPU cost. The expected delta vs the NamedPipe benchmark: per-chunk overhead drops from
    /// ~25-30 µs (kernel-syscall pair + IRP) to ~1-2 µs (managed slab handoff). Multi-chunk Large-message rows
    /// should converge dramatically toward the plain Byte[] row (presumably AcBinaryBenchmark — confirm).
    /// NOTE(review): Serialize() below calls the Pipe overload with FlushPolicy.Coalesced, while its own
    /// comment states the PipeWriter overload (used by the NamedPipe variant) routes to FlushPolicy.PerChunk —
    /// so flush policy is a second variable besides the transport; confirm this is intended.
    ///
    /// Real-world relevance: in-memory Pipe is the typical primitive used for cross-thread serializer
    /// pipelines inside a single process (e.g. SignalR's Kestrel transport adapter, gRPC framework internals,
    /// custom message brokers). The numbers from this row reflect that scenario, NOT the kernel-pipe loopback
    /// of the NamedPipe benchmark.
    ///
    private sealed class AcBinaryInMemoryPipeBenchmark : ISerializerBenchmark, IDisposable
    {
        private readonly TestOrder _order;
        private readonly AcBinarySerializerOptions _options;
        private readonly byte[] _serialized; // for SerializedSize reporting only

        // Long-lived in-memory pipe lifecycle (set up once in ctor — NOT timed).
        private readonly Pipe _pipe;
        private readonly PipeWriter _pipeWriter;
        private readonly PipeReader _pipeReader;

        // Long-lived multi-message receive infrastructure (set up once in ctor) — same pattern as the NamedPipe
        // variant: drain pumps reader into AsyncPipeReaderInput, consumer task drives Deserialize(input).
        private readonly AsyncPipeReaderInput _input;
        private readonly CancellationTokenSource _cts;
        private readonly Task _drainTask;
        private readonly Task _consumerTask;
        private readonly ManualResetEventSlim _consumeRequest = new(false);
        private readonly ManualResetEventSlim _consumeDone = new(false);
        private object? _lastResult;
        private bool _captureResult;
        private bool _disposed;

        public string Engine => EngineAcBinary;

        public string IoMode => IoInMemoryPipe;

        public string DispatchMode
        {
            get
            {
                if (_options.UseGeneratedCode) return ModeSGen;
                return ModeRuntime;
            }
        }

        public string OptionsPreset { get; }

        public int SerializedSize => _serialized.Length;

        public long SetupSerializeAllocBytes { get; }

        public long SetupDeserializeAllocBytes { get; }

        public bool IsRoundTripOnly => true;

        public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B, Transport=Pipe(in-memory,multiMessage,2-task)");

        public AcBinaryInMemoryPipeBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
        {
            _order = order;
            _options = options;
            OptionsPreset = optionsPreset;
            _serialized = AcBinarySerializer.Serialize(order, _options);

            // === SERIALIZE-side setup measurement ===
            // In-memory Pipe construction. NO kernel-pipe pair, NO Connect handshake — just a managed Pipe object
            // and a reference to its Writer side. PipeWriterImpl (parallel-flush capable, NOT StreamPipeWriter).
            SettleHeap();
            var serBaseline = GC.GetAllocatedBytesForCurrentThread();

            _pipe = new Pipe();
            _pipeWriter = _pipe.Writer;

            SetupSerializeAllocBytes = GC.GetAllocatedBytesForCurrentThread() - serBaseline;

            // === DESERIALIZE-side setup measurement ===
            // PipeReader reference + AsyncPipeReaderInput (ArrayPool rent + ManualResetEventSlim) + drain task +
            // consumer task scaffolding. Identical to the NamedPipe variant on the receive side.
            SettleHeap();
            var desBaseline = GC.GetAllocatedBytesForCurrentThread();

            _pipeReader = _pipe.Reader;
            _input = new AsyncPipeReaderInput(_options.BufferWriterChunkSize * 2, multiMessage: true);
            _cts = new CancellationTokenSource();
            _drainTask = Task.Run(() => _input.DrainFromAsync(_pipeReader, _cts.Token));
            _consumerTask = Task.Run(ConsumeLoop);

            SetupDeserializeAllocBytes = GC.GetAllocatedBytesForCurrentThread() - desBaseline;

            // Full blocking collection before each allocation baseline is taken.
            static void SettleHeap()
            {
                GC.Collect();
                GC.WaitForPendingFinalizers();
                GC.Collect();
            }
        }

        // BG consumer: parks on _consumeRequest, runs Deserialize(_input) when signaled, signals _consumeDone.
        // Mirror of AcBinaryNamedPipeBenchmark.ConsumeLoop — same pattern, same MRES protocol.
        private void ConsumeLoop()
        {
            var token = _cts.Token;
            try
            {
                for (;;)
                {
                    _consumeRequest.Wait(token);
                    if (token.IsCancellationRequested) break;
                    _consumeRequest.Reset();

                    try
                    {
                        var deserialized = AcBinaryDeserializer.Deserialize(_input, _options);
                        if (_captureResult) _lastResult = deserialized;
                    }
                    catch
                    {
                        // Swallow — see ConsumeLoop in NamedPipe variant for rationale.
                    }
                    finally
                    {
                        // Always release the calling thread, even when Deserialize failed.
                        _consumeDone.Set();
                    }
                }
            }
            catch (OperationCanceledException)
            {
                // Cooperative cancel — Dispose path. Swallow.
            }
        }

        public void Warmup(int iterations)
        {
            for (var remaining = iterations; remaining > 0; remaining--)
            {
                Serialize();
            }
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Serialize()
        {
            // Same 2-task streaming pipeline as NamedPipe variant — only the transport differs (in-memory Pipe
            // instead of kernel NamedPipe). Per-chunk SerializeChunkedFramed → PipeWriter slab → drain task
            // reads from PipeReader → input.Feed → consumer Deserialize consumes byte-by-byte.
            //
            // Uses the Pipe-overload (instead of the PipeWriter-overload) so the FlushPolicy parameter is
            // exposed for tuning. Toggle between FlushPolicy.PerChunk (bounded peak memory, per-chunk await
            // FlushAsync) and FlushPolicy.Coalesced (fire-and-forget per chunk, pipe-coalesced flushes up to
            // PauseWriterThreshold ~64 KB) to A/B-test the streaming-pipeline overhead. FlushPolicy.PerChunk
            // is functionally equivalent to the PipeWriter-overload (both internally route to
            // SerializeToPipeWriterCore with FlushPolicy.PerChunk).
            // The order below is load-bearing: reset "done" BEFORE raising "request".
            _consumeDone.Reset();
            _consumeRequest.Set();
            AcBinarySerializer.SerializeChunkedFramed(_order, _pipe, _options, FlushPolicy.Coalesced);
            _consumeDone.Wait();
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        public void Deserialize()
        {
            // No-op: per-iter round-trip is captured in Serialize(). See IsRoundTripOnly contract.
        }

        public bool VerifyRoundTrip()
        {
            // Same streaming path as the timed iterations, with result capture switched on for the comparison.
            _captureResult = true;
            try
            {
                Serialize();
                return _lastResult is TestOrder received && DeepEqualsViaJson(_order, received);
            }
            finally
            {
                _captureResult = false;
                _lastResult = null;
            }
        }

        public void Dispose()
        {
            if (_disposed) return;
            _disposed = true;

            var grace = TimeSpan.FromSeconds(2);

            // Cancel drain + consumer tasks → both exit. Pulse _consumeRequest in case consumer is parked.
            BestEffort(() => _cts.Cancel());
            BestEffort(() => _consumeRequest.Set());
            BestEffort(() => _drainTask.Wait(grace));
            BestEffort(() => _consumerTask.Wait(grace));

            // Complete writer + reader (in-memory Pipe — no underlying stream to dispose).
            BestEffort(() => _pipeWriter.CompleteAsync().AsTask().Wait(grace));
            BestEffort(() => _pipeReader.Complete());
            BestEffort(() => _input.Dispose());
            BestEffort(() => _consumeRequest.Dispose());
            BestEffort(() => _consumeDone.Dispose());
            BestEffort(() => _cts.Dispose());

            // Teardown is best-effort: each step runs in isolation and any exception is swallowed,
            // so one failing step never prevents the following ones.
            static void BestEffort(Action step)
            {
                try { step(); } catch { /* swallow on teardown */ }
            }
        }
    }

    ///
    /// Raw byte[] over a long-lived NamedPipe — NO chunk-framing, NO AsyncPipeReaderInput,
    /// NO sliding-window buffer. Calling thread serialises + writes; a long-lived background consumer task
    /// reads and deserialises.
Two-task pattern enables Ser↔Read overlap (kernel-pipe-pipelined) AND /// avoids the kernel-buffer-full deadlock when bytes.Length > inBufferSize. /// /// Side-by-side with (chunked-framed AsyncPipe stack) this /// isolates two cost components on the SAME kernel-pipe transport with the SAME inBufferSize: /// /// This row vs (Byte[]) — pure kernel-NamedPipe /// overhead (WriteFile / ReadFile syscalls + IRP queueing + buffer-copy + thread-handoff). /// This row vs (chunked-framed) — pure /// AsyncPipe-framework overhead (chunk header writes + sliding-window Feed + MRES wait inside /// AsyncPipeReaderInput) AND the streaming-pipeline benefit of intra-message Ser↔Des overlap (which /// raw lacks — raw can only Ser↔Read overlap, with Des sequential after Read completes). /// /// Per-iter byte[] allocation from AcBinarySerializer.Serialize is part of the cost (matches /// 's API contract); the receive-side scratch buffer is also allocated per-iter /// on the consumer-task (counted via GC.GetTotalAllocatedBytes in MeasureAllocationTotal). /// private sealed class AcBinaryNamedPipeRawByteArrayBenchmark : ISerializerBenchmark, IDisposable { private readonly TestOrder _order; private readonly AcBinarySerializerOptions _options; private readonly byte[] _serialized; // for SerializedSize reporting + receive-side size known upfront // Long-lived pipe lifecycle (set up once in ctor — NOT timed). private readonly NamedPipeServerStream _pipeServer; private readonly NamedPipeClientStream _pipeClient; // Long-lived consumer-task infrastructure (Read + Deserialize on BG thread, signaled per iter). // Mirrors AcBinaryNamedPipeBenchmark's drain+consumer pair, but raw byte[] doesn't have an // intermediate sliding-window buffer, so Read+Des happen sequentially in one BG task: Read N bytes // → Deserialize(bytes) → signal done. Calling thread's Ser↔Write overlaps with this BG Read+Des // through kernel-pipe pipelining. 
private readonly CancellationTokenSource _cts; private readonly Task _consumerTask; private readonly ManualResetEventSlim _consumeRequest = new(false); private readonly ManualResetEventSlim _consumeDone = new(false); private int _pendingReadSize; private object? _lastResult; // captured during VerifyRoundTrip; null in benchmark iters private bool _captureResult; // toggle: when true, ConsumerLoop stores result; otherwise discards private bool _disposed; public string Engine => EngineAcBinary; public string IoMode => IoNamedPipeRaw; public string DispatchMode => _options.UseGeneratedCode ? ModeSGen : ModeRuntime; public string OptionsPreset { get; } public int SerializedSize => _serialized.Length; public long SetupSerializeAllocBytes { get; } public long SetupDeserializeAllocBytes { get; } public bool IsRoundTripOnly => true; public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B, Transport=NamedPipe(raw,2-task)"); public AcBinaryNamedPipeRawByteArrayBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset) { _order = order; // BufferWriterChunkSize comes from the caller — same source-of-truth contract as // AcBinaryNamedPipeBenchmark. The kernel pipe-buffer (inBufferSize) is wired to it so the // raw-vs-chunked comparison runs on identical transport conditions. _options = options; OptionsPreset = optionsPreset; _serialized = AcBinarySerializer.Serialize(order, _options); var pipeName = $"AcBinaryBenchRaw-{Guid.NewGuid():N}"; // === SERIALIZE-side setup measurement === // pipe-pair (server + client) + connect handshake. NO PipeWriter wrapper — we use the raw // Stream.Write API directly, matching the no-framing semantics of this benchmark. 
GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect(); var beforeSer = GC.GetAllocatedBytesForCurrentThread(); _pipeServer = new NamedPipeServerStream(pipeName, PipeDirection.In, 1, PipeTransmissionMode.Byte, System.IO.Pipes.PipeOptions.Asynchronous, inBufferSize: _options.BufferWriterChunkSize, outBufferSize: _options.BufferWriterChunkSize); _pipeClient = new NamedPipeClientStream(".", pipeName, PipeDirection.Out, System.IO.Pipes.PipeOptions.Asynchronous); var serverWait = _pipeServer.WaitForConnectionAsync(); _pipeClient.Connect(); serverWait.GetAwaiter().GetResult(); var afterSer = GC.GetAllocatedBytesForCurrentThread(); SetupSerializeAllocBytes = afterSer - beforeSer; // === DESERIALIZE-side setup measurement === // 1× background consumer-task + 2× MRES (request / done) + cancellation source. Matches the // chunked benchmark's deserialize-side setup cost shape. GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect(); var beforeDes = GC.GetAllocatedBytesForCurrentThread(); _cts = new CancellationTokenSource(); _consumerTask = Task.Run(ConsumerLoop); var afterDes = GC.GetAllocatedBytesForCurrentThread(); SetupDeserializeAllocBytes = afterDes - beforeDes; } // BG consumer: parks on _consumeRequest, reads N bytes from pipe, runs Deserialize(bytes), signals // _consumeDone. The Read overlaps with the calling thread's Write through the kernel-pipe; Des happens // sequentially after Read completes (raw byte[] needs the full message to deserialize). 
private void ConsumerLoop()
{
    var ct = _cts.Token;
    try
    {
        while (true)
        {
            // Park until Serialize() announces a message (or Dispose cancels the token / nudges the event).
            _consumeRequest.Wait(ct);
            if (ct.IsCancellationRequested) return;
            // Re-arm the request event for the next iteration before doing the work.
            _consumeRequest.Reset();
            try
            {
                var size = _pendingReadSize;
                var bytes = new byte[size]; // per-iter alloc — counted by MeasureAllocationTotal
                var totalRead = 0;
                // A single Read may return fewer bytes than requested; loop until the announced size arrives.
                while (totalRead < size)
                {
                    var n = _pipeServer.Read(bytes, totalRead, size - totalRead);
                    if (n == 0) break; // pipe closed / EOF — partial read swallowed
                    totalRead += n;
                }
                // NOTE: after an early EOF the buffer is only partially filled and is still handed to Deserialize;
                // any resulting exception is swallowed by the catch below.
                var result = AcBinaryDeserializer.Deserialize(bytes, _options);
                if (_captureResult) _lastResult = result;
            }
            catch
            {
                // Swallow — calling thread sees the failure via missing/incorrect _lastResult during VerifyRoundTrip,
                // or the benchmark loop just continues (timing impacted). Production teardown handled in Dispose.
            }
            finally
            {
                // Always release the caller, success or failure, so Serialize() cannot wait forever on this iteration.
                _consumeDone.Set();
            }
        }
    }
    catch (OperationCanceledException)
    {
        // Cooperative cancel — Dispose path. Swallow.
    }
}

// Runs `iterations` full round-trips. Deserialize() is a no-op for this benchmark, so only Serialize() is called.
public void Warmup(int iterations)
{
    for (var i = 0; i < iterations; i++)
    {
        Serialize();
    }
}

// One complete round-trip: serialise, write to the pipe, and block until the consumer has read and deserialised.
[MethodImpl(MethodImplOptions.NoInlining)]
public void Serialize()
{
    // 2-task streaming pipeline:
    // 1. Calling thread serialises → fresh byte[] (per-iter alloc, matches AcBinaryBenchmark contract).
    // 2. Calling thread hands off expected size + signals consumer task. Consumer task starts Read loop
    //    on the pipe (BG thread). Calling thread proceeds to Write the bytes — Read and Write overlap
    //    through the kernel-pipe (kernel buffer fills, drains as consumer reads, sender resumes).
    // 3. Calling thread waits for _consumeDone (consumer task finished Read+Des).
    //
    // Note: unlike chunked, raw byte[] cannot do Ser↔Des overlap (Des needs the full bytes before
    // starting). Only Write↔Read overlaps here. The Des sequence on BG thread is: Read full bytes →
    // Des the full graph → signal done. This is the architectural difference between raw and chunked.
    var bytes = AcBinarySerializer.Serialize(_order, _options);
    // Publish the size first, clear the previous "done" signal, and only then wake the consumer.
    _pendingReadSize = bytes.Length;
    _consumeDone.Reset();
    _consumeRequest.Set();
    _pipeClient.Write(bytes, 0, bytes.Length);
    _pipeClient.Flush();
    // No timeout: relies on ConsumerLoop's finally block always setting _consumeDone.
    _consumeDone.Wait();
}

[MethodImpl(MethodImplOptions.NoInlining)]
public void Deserialize()
{
    // No-op: per-iter round-trip is captured in Serialize(). See IsRoundTripOnly contract.
}

// Runs one round-trip with result capture switched on and compares the deserialised graph with the
// original via DeepEqualsViaJson. Returns false when nothing was captured or the captured object is not a TestOrder.
public bool VerifyRoundTrip()
{
    // Use the same 2-task streaming path as the benchmark, but capture the result for graph-equality.
    _captureResult = true;
    try
    {
        Serialize();
        var result = _lastResult as TestOrder;
        return result != null && DeepEqualsViaJson(_order, result);
    }
    finally
    {
        // Restore benchmark mode and drop the captured graph so it is not kept alive.
        _captureResult = false;
        _lastResult = null;
    }
}

// Stops the consumer task and releases the pipes and synchronisation primitives. Safe to call more than once;
// every step is individually guarded so one failing step does not prevent the remaining cleanup.
public void Dispose()
{
    if (_disposed) return;
    _disposed = true;
    // Cancel the consumer task → ConsumerLoop exits its Wait via OperationCanceledException.
    try { _cts.Cancel(); } catch { /* swallow on teardown */ }
    try { _consumeRequest.Set(); } catch { /* nudge in case consumer Wait is parked */ }
    // Bounded wait: give the consumer up to 2 seconds to leave its loop before tearing down what it uses.
    try { _consumerTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
    // Symmetric teardown — close client first (writer side), then server.
    try { _pipeClient.Dispose(); } catch { /* swallow on teardown */ }
    try { _pipeServer.Dispose(); } catch { /* swallow on teardown */ }
    try { _consumeRequest.Dispose(); } catch { /* swallow on teardown */ }
    try { _consumeDone.Dispose(); } catch { /* swallow on teardown */ }
    try { _cts.Dispose(); } catch { /* swallow on teardown */ }
}
} // end of the NamedPipe raw byte[] benchmark class

///
/// Raw byte[] over an in-memory cross-thread handoff — NO transport (no NamedPipe, no Pipe, no
/// Channel). Calling thread serialises into a fresh byte[], hands it to a
/// background consumer task via a single byte[] slot + MRES pair; the consumer deserialises and signals done.
///
/// Why this benchmark matters: completes the 2x2 transport × wire-format matrix:
///
/// NamedPipe + Chunked = the chunked NamedPipe benchmark (presumably AcBinaryNamedPipeBenchmark — the reference was lost in this listing; confirm)
/// NamedPipe + Raw = AcBinaryNamedPipeRawByteArrayBenchmark
/// In-memory Pipe + Chunked = the in-memory chunked Pipe benchmark (class name not visible in this listing)
/// In-memory + Raw = THIS row — apples-to-apples baseline for the in-memory chunked row
///
/// Side-by-side with the in-memory chunked row this isolates the chunked-streaming
/// framework's pure CPU cost, with the same in-memory transport (zero kernel involvement) on both sides.
/// Side-by-side with the NamedPipe raw row this isolates the kernel-NamedPipe
/// overhead on the raw-byte[] side.
///
private sealed class AcBinaryInMemoryRawByteArrayBenchmark : ISerializerBenchmark, IDisposable
{
    private readonly TestOrder _order;
    private readonly AcBinarySerializerOptions _options;
    private readonly byte[] _serialized; // for SerializedSize reporting only

    // Long-lived consumer-task infrastructure (Deserialize on BG thread, signaled per iter).
    // No transport — just a byte[] slot for handoff between calling thread and consumer task.
    private readonly CancellationTokenSource _cts;
    private readonly Task _consumerTask;
    private readonly ManualResetEventSlim _consumeRequest = new(false); // set by Serialize: a byte[] is ready
    private readonly ManualResetEventSlim _consumeDone = new(false);    // set by ConsumerLoop when it has finished
    private byte[]? _pendingBytes; // calling thread → consumer task handoff slot
    private object? _lastResult; // captured during VerifyRoundTrip; null in benchmark iters
    private bool _captureResult; // when true, ConsumerLoop stores the deserialised object in _lastResult
    private bool _disposed;      // makes Dispose idempotent

    public string Engine => EngineAcBinary;
    public string IoMode => IoInMemoryRaw;
    public string DispatchMode => _options.UseGeneratedCode ? ModeSGen : ModeRuntime;
    public string OptionsPreset { get; }
    public int SerializedSize => _serialized.Length;
    public long SetupSerializeAllocBytes { get; }
    public long SetupDeserializeAllocBytes { get; }
    // Serialize() performs the whole serialise → handoff → deserialise round-trip; Deserialize() is a no-op.
    public bool IsRoundTripOnly => true;
    public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B, Transport=in-memory(raw,2-task)");

    // Serialises `order` once (only to report SerializedSize) and starts the background consumer task.
    // SetupDeserializeAllocBytes is a GC.GetAllocatedBytesForCurrentThread() delta, so it only reflects
    // allocations made on the constructing thread.
    public AcBinaryInMemoryRawByteArrayBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
    {
        _order = order;
        _options = options;
        OptionsPreset = optionsPreset;
        _serialized = AcBinarySerializer.Serialize(order, _options);

        // === SERIALIZE-side setup measurement ===
        // Nothing to set up — calling thread allocates byte[] per iter via AcBinarySerializer.Serialize.
        SetupSerializeAllocBytes = 0;

        // === DESERIALIZE-side setup measurement ===
        // 1× background consumer-task + 2× MRES (request / done) + cancellation source.
        // NOTE(review): the two MRES instances come from the field initializers, which run before this
        // constructor body, so they are not inside the window measured here.
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();
        var beforeDes = GC.GetAllocatedBytesForCurrentThread();
        _cts = new CancellationTokenSource();
        _consumerTask = Task.Run(ConsumerLoop);
        var afterDes = GC.GetAllocatedBytesForCurrentThread();
        SetupDeserializeAllocBytes = afterDes - beforeDes;
    }

    // BG consumer: parks on _consumeRequest, picks up the byte[] from _pendingBytes, runs Deserialize(bytes),
    // signals _consumeDone. Direct in-process handoff — no transport syscall, no buffer copy beyond the byte[]
    // reference itself (zero-copy by reference).
    private void ConsumerLoop()
    {
        var ct = _cts.Token;
        try
        {
            while (true)
            {
                // Park until Serialize() publishes a buffer (or Dispose cancels the token / nudges the event).
                _consumeRequest.Wait(ct);
                if (ct.IsCancellationRequested) return;
                _consumeRequest.Reset();
                try
                {
                    var bytes = _pendingBytes;
                    // A null slot is skipped without deserialising; _consumeDone is still set below.
                    if (bytes != null)
                    {
                        var result = AcBinaryDeserializer.Deserialize(bytes, _options);
                        if (_captureResult) _lastResult = result;
                    }
                }
                catch
                {
                    // Swallow — see ConsumerLoop in NamedPipe variant for rationale.
                }
                finally
                {
                    // Always release the caller so Serialize() cannot wait forever on this iteration.
                    _consumeDone.Set();
                }
            }
        }
        catch (OperationCanceledException)
        {
            // Cooperative cancel — Dispose path. Swallow.
        }
    }

    // Runs `iterations` full round-trips (Deserialize() is a no-op here, so only Serialize() is called).
    public void Warmup(int iterations)
    {
        for (var i = 0; i < iterations; i++) Serialize();
    }

    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Serialize()
    {
        // 2-task in-memory pipeline:
        // 1. Calling thread serialises → fresh byte[] (per-iter alloc, matches AcBinaryBenchmark contract).
        // 2. Calling thread parks the byte[] into _pendingBytes and signals consumer task. Consumer task
        //    picks up the reference (zero-copy) and runs Deserialize(bytes).
        // 3. Calling thread waits for _consumeDone (consumer task finished Des).
        //
        // Same architectural limitation as the NamedPipe-raw variant: Des cannot start until full bytes
        // are available. Only the per-iter Ser↔Des thread-handoff overlaps slightly (calling thread starts
        // signalling and waiting while consumer thread takes the byte[]).
        var bytes = AcBinarySerializer.Serialize(_order, _options);
        // Publish the buffer first, clear the previous "done" signal, and only then wake the consumer.
        _pendingBytes = bytes;
        _consumeDone.Reset();
        _consumeRequest.Set();
        // No timeout: relies on ConsumerLoop's finally block always setting _consumeDone.
        _consumeDone.Wait();
    }

    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Deserialize()
    {
        // No-op: per-iter round-trip is captured in Serialize(). See IsRoundTripOnly contract.
    }

    // Runs one round-trip with result capture switched on and compares the deserialised graph with the
    // original via DeepEqualsViaJson. Returns false when nothing was captured or it is not a TestOrder.
    public bool VerifyRoundTrip()
    {
        _captureResult = true;
        try
        {
            Serialize();
            var result = _lastResult as TestOrder;
            return result != null && DeepEqualsViaJson(_order, result);
        }
        finally
        {
            _captureResult = false;
            _lastResult = null;
        }
    }

    // Stops the consumer task and releases the synchronisation primitives. Safe to call more than once;
    // each step is individually guarded so one failing step does not prevent the remaining cleanup.
    public void Dispose()
    {
        if (_disposed) return;
        _disposed = true;
        try { _cts.Cancel(); } catch { /* swallow on teardown */ }
        try { _consumeRequest.Set(); } catch { /* nudge in case consumer Wait is parked */ }
        try { _consumerTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
        try { _consumeRequest.Dispose(); } catch { /* swallow on teardown */ }
        try { _consumeDone.Dispose(); } catch { /* swallow on teardown */ }
        try { _cts.Dispose(); } catch { /* swallow on teardown */ }
    }
}

/// <summary>
/// Benchmarks MemoryPack via the IBufferWriter overload, allocating a FRESH ArrayBufferWriter on EVERY call.
/// Apples-to-apples counterpart to AcBinaryFreshBufferWriterBenchmark.
/// </summary>
// NOTE(review): ArrayBufferWriter and MemoryPackSerializer.Deserialize appear below without type arguments;
// the generic arguments were presumably stripped from this listing — confirm against the real file.
private sealed class MemoryPackFreshBufferWriterBenchmark : ISerializerBenchmark
{
    private readonly TestOrder _order;
    private readonly MemoryPackSerializerOptions _options;
    private readonly byte[] _serialized; // pre-serialised payload: input for Deserialize() and source of SerializedSize

    public string Engine => EngineMemoryPack;
    public string IoMode => IoBufWrNew;
    public string DispatchMode => ModeSGen; // MemoryPack always uses [MemoryPackable] source-generated formatters
    public string OptionsPreset { get; }
    public int SerializedSize => _serialized.Length;
    // Nothing is pre-allocated for this benchmark, so both setup figures are fixed at zero.
    public long SetupSerializeAllocBytes => 0;
    public long SetupDeserializeAllocBytes => 0;
    public string? OptionsDescription => $"StringEncoding={_options.StringEncoding}";

    // Stores the test graph, resolves the shared MemoryPack options, and pre-serialises the payload once.
    public MemoryPackFreshBufferWriterBenchmark(TestOrder order, string optionsPreset)
    {
        _order = order;
        OptionsPreset = optionsPreset;
        _options = GetMemPackOptions();
        _serialized = MemoryPackSerializer.Serialize(order, _options);
    }

    public void Warmup(int iterations)
    {
        for (var i = 0; i < iterations; i++)
        {
            Serialize();
            Deserialize();
        }
    }

    // Serialises into a brand-new writer each call; the written bytes are discarded.
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Serialize()
    {
        var abw = new ArrayBufferWriter();
        MemoryPackSerializer.Serialize(abw, _order, _options);
    }

    // Deserialises the pre-serialised payload; the result is discarded.
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Deserialize() => MemoryPackSerializer.Deserialize(_serialized, _options);

    // Serialises through a fresh writer, deserialises a copy of the written bytes, and compares with the original.
    public bool VerifyRoundTrip()
    {
        var abw = new ArrayBufferWriter();
        MemoryPackSerializer.Serialize(abw, _order, _options);
        var roundTripped = MemoryPackSerializer.Deserialize(abw.WrittenSpan.ToArray(), _options);
        return DeepEqualsViaJson(_order, roundTripped);
    }
}

// Benchmarks AcBinary through a single pre-allocated ArrayBufferWriter that is reset and reused on every
// Serialize() call; Deserialize() reads the pre-serialised byte[].
// NOTE(review): ArrayBufferWriter appears below without a type argument; the generic argument was
// presumably stripped from this listing — confirm against the real file.
private sealed class AcBinaryBufferWriterBenchmark : ISerializerBenchmark
{
    private readonly TestOrder _order;
    private readonly AcBinarySerializerOptions _options;
    private readonly byte[] _serialized;             // pre-serialised payload: input for Deserialize() and source of SerializedSize
    private readonly ArrayBufferWriter _bufferWriter; // reused across iterations; sized in the constructor

    public string Engine => EngineAcBinary;
    public string IoMode => IoBufWrReuse;
    public string DispatchMode => _options.UseGeneratedCode ? ModeSGen : ModeRuntime;
    public string OptionsPreset { get; }
    public int SerializedSize => _serialized.Length;
    public long SetupSerializeAllocBytes { get; }
    public long SetupDeserializeAllocBytes => 0;
    public string OptionsDescription => BuildAcBinaryOptionsDescription(_options);

    // Pre-serialises the payload, then allocates the reusable writer inside a measured window.
    public AcBinaryBufferWriterBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
    {
        _order = order;
        _options = options;
        OptionsPreset = optionsPreset;
        _serialized = AcBinarySerializer.Serialize(order, options);

        // Measure ONLY the BufferWriter infrastructure setup on the serialize side (excluding the
        // helper Serialize above). Deserialize side reads directly from `_serialized` byte[] — no
        // dedicated setup allocation, hence SetupDeserializeAllocBytes = 0.
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();
        var beforeSetup = GC.GetAllocatedBytesForCurrentThread();
        // Initial capacity is twice the known payload size.
        _bufferWriter = new ArrayBufferWriter(_serialized.Length * 2);
        var afterSetup = GC.GetAllocatedBytesForCurrentThread();
        SetupSerializeAllocBytes = afterSetup - beforeSetup;
    }

    public void Warmup(int iterations)
    {
        for (var i = 0; i < iterations; i++)
        {
            Serialize();
            Deserialize();
        }
    }

    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Serialize()
    {
        _bufferWriter.ResetWrittenCount(); // reuse — no alloc, no zeroing
        AcBinarySerializer.Serialize(_order, _bufferWriter, _options);
    }

    // Deserialises the pre-serialised payload; the result is discarded.
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Deserialize() => AcBinaryDeserializer.Deserialize(_serialized, _options);

    // Serialises through the reused writer, deserialises a copy of the written bytes, and compares with the original.
    public bool VerifyRoundTrip()
    {
        _bufferWriter.ResetWrittenCount();
        AcBinarySerializer.Serialize(_order, _bufferWriter, _options);
        var roundTripped = AcBinaryDeserializer.Deserialize(_bufferWriter.WrittenSpan.ToArray(), _options);
        return DeepEqualsViaJson(_order, roundTripped);
    }
}

///
/// Benchmarks MemoryPack via the IBufferWriter overload with a pre-allocated, reused ArrayBufferWriter.
/// Apples-to-apples counterpart to AcBinaryBufferWriterBenchmark — MemoryPack's IBufferWriter is the path it's designed for.
///
// NOTE(review): ArrayBufferWriter and MemoryPackSerializer.Deserialize appear below without type arguments;
// the generic arguments were presumably stripped from this listing — confirm against the real file.
private sealed class MemoryPackBufferWriterBenchmark : ISerializerBenchmark
{
    private readonly TestOrder _order;
    private readonly MemoryPackSerializerOptions _options;
    private readonly byte[] _serialized;             // pre-serialised payload: input for Deserialize() and source of SerializedSize
    private readonly ArrayBufferWriter _bufferWriter; // reused across iterations; sized in the constructor

    public string Engine => EngineMemoryPack;
    public string IoMode => IoBufWrReuse;
    public string DispatchMode => ModeSGen; // MemoryPack always uses [MemoryPackable] source-generated formatters
    public string OptionsPreset { get; }
    public int SerializedSize => _serialized.Length;
    public long SetupSerializeAllocBytes { get; }
    public long SetupDeserializeAllocBytes => 0;
    public string? OptionsDescription => $"StringEncoding={_options.StringEncoding}";

    // Pre-serialises the payload, then allocates the reusable writer inside a measured window.
    public MemoryPackBufferWriterBenchmark(TestOrder order, string optionsPreset)
    {
        _order = order;
        OptionsPreset = optionsPreset;
        _options = GetMemPackOptions();
        _serialized = MemoryPackSerializer.Serialize(order, _options);

        // Serialize-side setup only — see AcBinaryBufferWriterBenchmark for the full rationale.
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();
        var beforeSetup = GC.GetAllocatedBytesForCurrentThread();
        // Initial capacity is twice the known payload size.
        _bufferWriter = new ArrayBufferWriter(_serialized.Length * 2);
        var afterSetup = GC.GetAllocatedBytesForCurrentThread();
        SetupSerializeAllocBytes = afterSetup - beforeSetup;
    }

    public void Warmup(int iterations)
    {
        for (var i = 0; i < iterations; i++)
        {
            Serialize();
            Deserialize();
        }
    }

    // Resets the reused writer and serialises into it.
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Serialize()
    {
        _bufferWriter.ResetWrittenCount();
        MemoryPackSerializer.Serialize(_bufferWriter, _order, _options);
    }

    // Deserialises the pre-serialised payload; the result is discarded.
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Deserialize() => MemoryPackSerializer.Deserialize(_serialized, _options);

    // Serialises through the reused writer, deserialises a copy of the written bytes, and compares with the original.
    public bool VerifyRoundTrip()
    {
        _bufferWriter.ResetWrittenCount();
        MemoryPackSerializer.Serialize(_bufferWriter, _order, _options);
        var roundTripped = MemoryPackSerializer.Deserialize(_bufferWriter.WrittenSpan.ToArray(), _options);
        return DeepEqualsViaJson(_order, roundTripped);
    }
}

// Benchmarks System.Text.Json with string input/output. The reported size is the UTF-8 byte length of the JSON text.
// NOTE(review): JsonSerializer.Deserialize appears below without a type argument; presumably stripped
// from this listing — confirm against the real file.
private sealed class SystemTextJsonBenchmark : ISerializerBenchmark
{
    private readonly TestOrder _order;
    private readonly JsonSerializerOptions _options;
    private readonly string _serialized;      // pre-serialised JSON text: input for Deserialize()
    private readonly byte[] _serializedUtf8;  // same JSON encoded via Utf8NoBom; used only for SerializedSize

    public string Engine => EngineSystemTextJson;
    public string IoMode => IoString;
    public string DispatchMode => ModeRuntime; // System.Text.Json default uses reflection-based metadata (no source generator opt-in here)
    public string OptionsPreset { get; }
    public int SerializedSize => _serializedUtf8.Length;
    public long SetupSerializeAllocBytes => 0;
    public long SetupDeserializeAllocBytes => 0;

    // Configures compact JSON (no indentation, nulls omitted, cycles ignored) and pre-serialises the payload.
    public SystemTextJsonBenchmark(TestOrder order, string optionsPreset)
    {
        _order = order;
        OptionsPreset = optionsPreset;
        _options = new JsonSerializerOptions
        {
            WriteIndented = false,
            DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull,
            ReferenceHandler = System.Text.Json.Serialization.ReferenceHandler.IgnoreCycles
        };
        _serialized = JsonSerializer.Serialize(order, _options);
        _serializedUtf8 = Utf8NoBom.GetBytes(_serialized);
    }

    public void Warmup(int iterations)
    {
        for (var i = 0; i < iterations; i++)
        {
            Serialize();
            Deserialize();
        }
    }

    // Serialises to a JSON string; the result is discarded.
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Serialize() => JsonSerializer.Serialize(_order, _options);

    // Deserialises the pre-serialised JSON string; the result is discarded.
    [MethodImpl(MethodImplOptions.NoInlining)]
    public void Deserialize() => JsonSerializer.Deserialize(_serialized, _options);

    // Serialises, deserialises the fresh JSON, and compares the result with the original.
    public bool VerifyRoundTrip()
    {
        var json = JsonSerializer.Serialize(_order, _options);
        var roundTripped = JsonSerializer.Deserialize(json, _options);
        return DeepEqualsViaJson(_order, roundTripped);
    }
}

#endregion

#region Results

// One measured row: a single serializer configuration run against a single test data set.
private sealed class BenchmarkResult
{
    public string TestDataName { get; set; } = "";
    public string Engine { get; set; } = "";
    public string IoMode { get; set; } = "";
    public string DispatchMode { get; set; } = "";
    public string OptionsPreset { get; set; } = "";

    /// <summary>True if Serialize() captures a full round-trip and Deserialize() is a no-op
    /// (single-use streaming transports like NamedPipe). Excluded from "Fastest Serialize" / "Fastest Deserialize"
    /// winners rankings; still ranked in "Fastest Round-trip". Display-side: Ser µs/op / SerAlloc / Des µs/op / DesAlloc
    /// all show "N/A" since they were never measured separately; RT µs/op / RT Alloc carry the full round-trip values.</summary>
    public bool IsRoundTripOnly { get; set; }

    /// <summary>Synthesized display name for backwards compatibility / single-string-row scenarios. Includes DispatchMode so SGen and Runtime variants of the same preset don't collide in grouping (e.g. SUMMARY: WINNERS).</summary>
    public string SerializerName => $"{Engine} ({IoMode}, {OptionsPreset}, {DispatchMode})";
    public string? OptionsDescription { get; set; }
    public int SerializedSize { get; set; }
    public double SerializeTimeMs { get; set; }
    public double DeserializeTimeMs { get; set; }

    // Per-sample min/max alongside the median (median is the *Time*Ms field above). Surfaces
    // inter-sample range — the visible noise floor for the row. 0 when the operation was skipped
    // (mode != "all"/"ser"/"des") or when a single-sample fast path was used (min == max == median).
    public double SerializeTimeMinMs { get; set; }
    public double SerializeTimeMaxMs { get; set; }
    public double DeserializeTimeMinMs { get; set; }
    public double DeserializeTimeMaxMs { get; set; }

    // Sample-population stddev (ms). Used by FormatMicrosWithRange to compute CV (stddev/mean)
    // and emit the ⚠️ marker on rows above UnstableCVThreshold. 0 in single-sample mode.
    public double SerializeTimeStdDevMs { get; set; }
    public double DeserializeTimeStdDevMs { get; set; }

    // Per-row adaptive iteration count (post-CalibrateIterations). Each Ser and Des function calibrates
    // independently to land its sample window at ~TargetSampleMs; per-op µs is then iter-independent
    // (`SerializeTimeMs / SerializeIterations * 1000`). For round-trip-only rows (NamedPipe etc.),
    // RoundTripIterations carries the calibrated iter count; SerializeIterations and DeserializeIterations
    // stay 0 (Ser and Des are not separately measurable on those rows).
    public int SerializeIterations { get; set; }
    public int DeserializeIterations { get; set; }
    public int RoundTripIterations { get; set; }
    public long SerializeAllocBytesPerOp { get; set; }
    public long DeserializeAllocBytesPerOp { get; set; }
    public long SetupSerializeAllocBytes { get; set; }
    public long SetupDeserializeAllocBytes { get; set; }

    /// <summary>Total round-trip time. For in-memory benchmarks: synthesized so that
    /// RoundTripTimeMs / RoundTripIterations yields the correct SerPerOp + DesPerOp µs/op
    /// (necessary because Ser and Des may have different iter counts post-calibration).
    /// For round-trip-only benchmarks (NamedPipe etc.): the directly-measured pipe round-trip time.</summary>
    public double RoundTripTimeMs { get; set; }

    // Round-trip min/max + stddev — only populated for round-trip-only benchmarks (NamedPipe etc.) where
    // RT is directly measured. For in-memory rows RT = Ser + Des, which has no single-sample
    // distribution; surface Ser/Des range separately instead.
    public double RoundTripTimeMinMs { get; set; }
    public double RoundTripTimeMaxMs { get; set; }
    public double RoundTripTimeStdDevMs { get; set; }

    /// <summary>Total round-trip allocation per op. For in-memory benchmarks: SerializeAlloc + DeserializeAlloc.
    /// For round-trip-only benchmarks: process-wide allocation measured via a total-allocation helper
    /// (the reference was lost in this listing; presumably MeasureAllocationTotal — confirm)
    /// (covers ALL threads — client, server-drain, channel internals — not just the caller).</summary>
    public long RoundTripAllocBytesPerOp { get; set; }
}

// Writes a single-line summary of one result to the console. Time and allocation cells show "N/A" when the
// corresponding time field is not greater than zero, i.e. that operation was not measured.
private static void PrintResult(BenchmarkResult result)
{
    // Numbers-only per-row entries; the column-headers carry units (µs/op, KB/op).
    var ser = result.SerializeTimeMs > 0 ? $"{SerPerOp(result),7:F2}" : " N/A";
    var des = result.DeserializeTimeMs > 0 ? $"{DesPerOp(result),7:F2}" : " N/A";
    var serAlloc = result.SerializeTimeMs > 0 ? $"{ToKilobytes(result.SerializeAllocBytesPerOp),7:F2}" : " N/A";
    var desAlloc = result.DeserializeTimeMs > 0 ? $"{ToKilobytes(result.DeserializeAllocBytesPerOp),7:F2}" : " N/A";
    System.Console.WriteLine($" {result.SerializerName,-40} | Size: {result.SerializedSize,8:N0} B | Ser: {ser} µs/op ({serAlloc} KB/op) | Des: {des} µs/op ({desAlloc} KB/op)");
}

// Prints the full report in three parts:
//   1. the options string of every serializer that provides one,
//   2. one ranked table per test data set (ordered by round-trip µs/op) with an AcBinary-vs-MemoryPack footer,
//   3. the "SUMMARY: WINNERS" table followed by the overall AcBinary (Byte[], SGen) vs MemoryPack (Byte[]) lines.
// NOTE(review): both List parameters appear without element types; the generic arguments were presumably
// stripped from this listing — confirm against the real file.
private static void PrintGroupedResults(List results, List testDataSets)
{
    System.Console.WriteLine("\n");
    System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════════════════════════════════════╗");
    System.Console.WriteLine("║ GROUPED RESULTS BY TEST DATA ║");
    System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════════════════════════════════════╝");

    // Print serializer options
    var optionsMap = results
        .Where(r => r.OptionsDescription != null)
        .Select(r => (r.SerializerName, r.OptionsDescription!))
        .Distinct()
        .ToList();
    if (optionsMap.Count > 0)
    {
        System.Console.WriteLine();
        System.Console.WriteLine(" Serializer Options:");
        foreach (var (name, opts) in optionsMap)
            System.Console.WriteLine($" {name}: {opts}");
    }

    foreach (var testData in testDataSets)
    {
        // Order by per-op µs (iter-independent) — rows may have different iter counts post-calibration.
        var testResults = results.Where(r => r.TestDataName == testData.DisplayName).OrderBy(r => RtPerOp(r)).ToList();

        // Baseline switched MessagePack → MemoryPack: MemoryPack is the SOTA performance leader.
        // The list is already sorted, so FirstOrDefault picks the fastest matching row.
        var memPackResult = testResults.FirstOrDefault(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray));

        // Pin the comparison to AcBinary's SGen variant — apples-to-apples vs MemoryPack (also source-generated).
        // The Runtime variant is shown alongside in the table for context, not used as the headline number.
        var acBinaryResult = testResults.FirstOrDefault(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen));

        System.Console.WriteLine($"\n┌─ {testData.DisplayName} ─".PadRight(172, '─') + "┐");

        // Header-only units; per-row entries are numbers (µs/op for time, KB/op for alloc, KB pair "ser / des" for Setup, B for Size).
        System.Console.WriteLine($"│ {"#",-4} │ {"Engine",-11} │ {"Options",-22} │ {"IO",-12} │ {"Mode",-8} │ {"Setup S/D KB",-14} │ {"Size B",-8} │ {"Ser µs/op",-10} │ {"SerAlc KB",-10} │ {"Des µs/op",-10} │ {"DesAlc KB",-10} │ {"RT µs/op",-10} │ {"RTAlc KB",-10} │");
        System.Console.WriteLine($"├{"─".PadRight(6, '─')}┼{"─".PadRight(13, '─')}┼{"─".PadRight(24, '─')}┼{"─".PadRight(14, '─')}┼{"─".PadRight(10, '─')}┼{"─".PadRight(16, '─')}┼{"─".PadRight(10, '─')}┼{"─".PadRight(12, '─')}┼{"─".PadRight(12, '─')}┼{"─".PadRight(12, '─')}┼{"─".PadRight(12, '─')}┼{"─".PadRight(12, '─')}┼{"─".PadRight(12, '─')}┤");

        var rank = 1;
        foreach (var result in testResults)
        {
            // Cells fall back to "N/A" when the corresponding measurement is not greater than zero.
            var size = $"{result.SerializedSize:N0}";
            var setup = $"{ToKilobytes(result.SetupSerializeAllocBytes):F2} / {ToKilobytes(result.SetupDeserializeAllocBytes):F2}";
            var ser = result.SerializeTimeMs > 0 ? $"{SerPerOp(result):F2}" : "N/A";
            var des = result.DeserializeTimeMs > 0 ? $"{DesPerOp(result):F2}" : "N/A";
            var rt = result.RoundTripTimeMs > 0 ? $"{RtPerOp(result):F2}" : "N/A";
            var serAlloc = result.SerializeTimeMs > 0 ? $"{ToKilobytes(result.SerializeAllocBytesPerOp):F2}" : "N/A";
            var desAlloc = result.DeserializeTimeMs > 0 ? $"{ToKilobytes(result.DeserializeAllocBytesPerOp):F2}" : "N/A";
            var rtAlloc = result.RoundTripAllocBytesPerOp > 0 ? $"{ToKilobytes(result.RoundTripAllocBytesPerOp):F2}" : "N/A";

            // Highlight MemoryPack baseline (any Byte[]) and AcBinary headline contender (Byte[] + SGen) with win/lose colors.
            // The AcBinary Byte[]+Runtime variant is shown unhighlighted — it's contextual (SGen speed-up reference), not the headline.
            var isHighlighted = (result.Engine == EngineMemoryPack && result.IoMode == IoByteArray) || (result.Engine == EngineAcBinary && result.IoMode == IoByteArray && result.DispatchMode == ModeSGen);
            var prefix = isHighlighted ? "│►" : "│ ";
            var suffix = isHighlighted ? "◄│" : " │";

            // Color logic: Green = winner (faster), Red = loser (slower)
            // Colouring is applied only when both headline rows exist for this data set; a tie colours
            // MemoryPack red and AcBinary green because the comparison is a strict "<".
            if (isHighlighted && memPackResult != null && acBinaryResult != null)
            {
                var isMemPack = (result.Engine == EngineMemoryPack && result.IoMode == IoByteArray);
                var memPackFaster = RtPerOp(memPackResult) < RtPerOp(acBinaryResult);
                if (isMemPack)
                {
                    System.Console.ForegroundColor = memPackFaster ? ConsoleColor.Green : ConsoleColor.Red;
                }
                else
                {
                    System.Console.ForegroundColor = memPackFaster ? ConsoleColor.Red : ConsoleColor.Green;
                }
            }

            System.Console.WriteLine($"{prefix}{rank++,4} │ {result.Engine,-11} │ {result.OptionsPreset,-22} │ {result.IoMode,-12} │ {result.DispatchMode,-8} │ {setup,14} │ {size,8} │ {ser,10} │ {serAlloc,10} │ {des,10} │ {desAlloc,10} │ {rt,10} │ {rtAlloc,10}{suffix}");

            if (isHighlighted)
            {
                System.Console.ResetColor();
            }
        }

        // Footer row: AcBinary (Byte[]) vs MemoryPack (Byte[]) comparison per column
        if (memPackResult != null && acBinaryResult != null)
        {
            // Each value is the AcBinary figure relative to MemoryPack, in percent: negative = AcBinary is smaller/faster.
            // NOTE: sizePct has no zero guard, unlike the ratios below.
            var sizePct = (acBinaryResult.SerializedSize / (double)memPackResult.SerializedSize - 1) * 100;

            // Per-op µs ratio (iter-independent) — Ser/Des may have different iter counts on the two rows.
            var serPct = SerPerOp(memPackResult) > 0 ? (SerPerOp(acBinaryResult) / SerPerOp(memPackResult) - 1) * 100 : 0;
            var desPct = DesPerOp(memPackResult) > 0 ? (DesPerOp(acBinaryResult) / DesPerOp(memPackResult) - 1) * 100 : 0;
            var rtPct = RtPerOp(memPackResult) > 0 ? (RtPerOp(acBinaryResult) / RtPerOp(memPackResult) - 1) * 100 : 0;
            var serAllocPct = memPackResult.SerializeAllocBytesPerOp > 0 ? (acBinaryResult.SerializeAllocBytesPerOp / (double)memPackResult.SerializeAllocBytesPerOp - 1) * 100 : 0;
            var desAllocPct = memPackResult.DeserializeAllocBytesPerOp > 0 ? (acBinaryResult.DeserializeAllocBytesPerOp / (double)memPackResult.DeserializeAllocBytesPerOp - 1) * 100 : 0;
            var rtAllocPct = memPackResult.RoundTripAllocBytesPerOp > 0 ? (acBinaryResult.RoundTripAllocBytesPerOp / (double)memPackResult.RoundTripAllocBytesPerOp - 1) * 100 : 0;

            // Footer separator: merge first 5 cols (#, Engine, Options, IO, Mode) → comparison label;
            // remaining 8 cols stay aligned (Setup S/D KB, Size, Ser µs/op, SerAlc KB, Des µs/op, DesAlc KB, RT µs/op, RTAlc KB).
            System.Console.WriteLine($"├{"─".PadRight(6, '─')}┴{"─".PadRight(13, '─')}┴{"─".PadRight(24, '─')}┴{"─".PadRight(14, '─')}┴{"─".PadRight(10, '─')}┼{"─".PadRight(16, '─')}┼{"─".PadRight(10, '─')}┼{"─".PadRight(12, '─')}┼{"─".PadRight(12, '─')}┼{"─".PadRight(12, '─')}┼{"─".PadRight(12, '─')}┼{"─".PadRight(12, '─')}┼{"─".PadRight(12, '─')}┤");

            // Merged label cell width = 4 + 11 + 22 + 12 + 8 + 4*3 (dropped separators) = 69
            System.Console.Write($"│ {"► AcBinary (Byte[]) vs MemoryPack (Byte[])",-69} │ ");

            // Setup S/D KB (n/a for Byte[] vs Byte[] — neither pre-allocates)
            System.Console.Write($"{"—",14}");
            System.Console.Write(" │ ");

            // Every cell below follows the same pattern: green when AcBinary is equal or better (pct <= 0), red otherwise.
            // Size
            System.Console.ForegroundColor = sizePct <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
            System.Console.Write($"{sizePct,+7:+0;-0}%");
            System.Console.ResetColor();
            System.Console.Write(" │ ");

            // Serialize
            System.Console.ForegroundColor = serPct <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
            System.Console.Write($"{serPct,+9:+0;-0}%");
            System.Console.ResetColor();
            System.Console.Write(" │ ");

            // Serialize Alloc
            System.Console.ForegroundColor = serAllocPct <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
            System.Console.Write($"{serAllocPct,+9:+0;-0}%");
            System.Console.ResetColor();
            System.Console.Write(" │ ");

            // Deserialize
            System.Console.ForegroundColor = desPct <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
            System.Console.Write($"{desPct,+9:+0;-0}%");
            System.Console.ResetColor();
            System.Console.Write(" │ ");

            // Deserialize Alloc
            System.Console.ForegroundColor = desAllocPct <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
            System.Console.Write($"{desAllocPct,+9:+0;-0}%");
            System.Console.ResetColor();
            System.Console.Write(" │ ");

            // Round-trip
            System.Console.ForegroundColor = rtPct <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
            System.Console.Write($"{rtPct,+9:+0;-0}%");
            System.Console.ResetColor();
            System.Console.Write(" │ ");

            // Round-trip Alloc
            System.Console.ForegroundColor = rtAllocPct <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
            System.Console.Write($"{rtAllocPct,+9:+0;-0}%");
            System.Console.ResetColor();
            System.Console.WriteLine(" │");
        }

        // Closing line: merged on left (─ between cols 1-5), ┴ on the right (cols 6-13 boundary, 8 unmerged cells).
        System.Console.WriteLine($"└{"─".PadRight(6, '─')}─{"─".PadRight(13, '─')}─{"─".PadRight(24, '─')}─{"─".PadRight(14, '─')}─{"─".PadRight(10, '─')}┴{"─".PadRight(16, '─')}┴{"─".PadRight(10, '─')}┴{"─".PadRight(12, '─')}┴{"─".PadRight(12, '─')}┴{"─".PadRight(12, '─')}┴{"─".PadRight(12, '─')}┴{"─".PadRight(12, '─')}┴{"─".PadRight(12, '─')}┘");
        //System.Console.WriteLine($"GrowBufferCount: {AcBinarySerializer.GrowBufferCount}");
        //System.Console.WriteLine($"GrowBufferTotalBytes: {AcBinarySerializer.GrowBufferTotalBytes:N0} bytes");
    }

    // Summary: Best serializer for each category
    System.Console.WriteLine("\n");
    System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════════════════════════════════════╗");
    System.Console.WriteLine("║ SUMMARY: WINNERS ║");
    System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════════════════════════════════════╝");
    System.Console.WriteLine($"\n{"Category",-20} │ {"Winner",-40} │ {"Avg Value",-18}");
    System.Console.WriteLine($"{"─".PadRight(20, '─')}─┼─{"─".PadRight(40, '─')}─┼─{"─".PadRight(18, '─')}");

    // Fastest Serialize — round-trip-only serializers (NamedPipe etc.) excluded:
    // their Serialize() captures the full round-trip and isn't comparable to a pure Ser metric.
    // Average is over per-op µs (iter-independent) instead of batch-time, since rows may now
    // have different iter counts post-calibration.
    var fastestSer = results.Where(r => r.SerializeTimeMs > 0 && !r.IsRoundTripOnly)
        .GroupBy(r => r.SerializerName)
        .Select(g => new { Name = g.Key, AvgPerOp = g.Average(r => SerPerOp(r)) })
        .OrderBy(x => x.AvgPerOp)
        .FirstOrDefault();
    if (fastestSer != null)
        System.Console.WriteLine($"{"Fastest Serialize",-20} │ {fastestSer.Name,-40} │ {fastestSer.AvgPerOp,12:F2} µs/op");

    // Fastest Deserialize — round-trip-only serializers excluded (their Deserialize() is a no-op).
    var fastestDes = results.Where(r => r.DeserializeTimeMs > 0 && !r.IsRoundTripOnly)
        .GroupBy(r => r.SerializerName)
        .Select(g => new { Name = g.Key, AvgPerOp = g.Average(r => DesPerOp(r)) })
        .OrderBy(x => x.AvgPerOp)
        .FirstOrDefault();
    if (fastestDes != null)
        System.Console.WriteLine($"{"Fastest Deserialize",-20} │ {fastestDes.Name,-40} │ {fastestDes.AvgPerOp,12:F2} µs/op");

    // Smallest Size
    // No filter here: every row takes part, including round-trip-only ones.
    var smallestSize = results
        .GroupBy(r => r.SerializerName)
        .Select(g => new { Name = g.Key, AvgSize = g.Average(r => r.SerializedSize) })
        .OrderBy(x => x.AvgSize)
        .FirstOrDefault();
    if (smallestSize != null)
        System.Console.WriteLine($"{"Smallest Size",-20} │ {smallestSize.Name,-40} │ {smallestSize.AvgSize,15:F0} B");

    // Fastest Round-trip — iter-independent per-op average.
    var fastestRt = results.Where(r => r.RoundTripTimeMs > 0)
        .GroupBy(r => r.SerializerName)
        .Select(g => new { Name = g.Key, AvgPerOp = g.Average(r => RtPerOp(r)) })
        .OrderBy(x => x.AvgPerOp)
        .FirstOrDefault();
    if (fastestRt != null)
        System.Console.WriteLine($"{"Fastest Round-trip",-20} │ {fastestRt.Name,-40} │ {fastestRt.AvgPerOp,12:F2} µs/op");

    // Overall AcBinary (SGen) vs MemoryPack comparison (baseline switched MessagePack → MemoryPack as SOTA reference).
    // AcBinary side is restricted to DispatchMode == SGen — apples-to-apples vs MemoryPack which is also source-generated.
    // The Runtime variant is shown side-by-side in each per-test fancy table for SGen-speedup context, but excluded from this headline.
    var memPackSerResults = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray) && r.SerializeTimeMs > 0).ToList();
    var memPackDesResults = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray) && r.DeserializeTimeMs > 0).ToList();
    var memPackRtResults = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray) && r.RoundTripTimeMs > 0).ToList();
    var acBinarySerResults = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen) && r.SerializeTimeMs > 0).ToList();
    var acBinaryDesResults = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen) && r.DeserializeTimeMs > 0).ToList();
    var acBinaryRtResults = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen) && r.RoundTripTimeMs > 0).ToList();

    // Skip comparison if no data available
    // Only the round-trip lists are checked by this guard.
    if (memPackRtResults.Count == 0 || acBinaryRtResults.Count == 0)
    {
        System.Console.WriteLine();
        System.Console.WriteLine($"── {"AcBinary (Byte[], SGen)"} vs {"MemoryPack (Byte[])"} (Overall) ──");
        System.Console.WriteLine(" (Comparison requires both serialize and deserialize data)");
        return;
    }

    // All averages are over per-op µs (iter-independent). Batch-time averaging would mix rows
    // measured with different iter counts (post-calibration), producing meaningless numbers.
    // Three aggregations per metric:
    // - Arithmetic mean (current behavior) — magnitude-weighted, biased toward Large cell.
    // - Geometric mean of per-cell ratios — magnitude-neutral, each cell weighted equally.
    // - Median of per-cell ratios — outlier-resistant.
    // The geo/median variants surface when a single cell dominates the arithmetic average
    // (typical when one cell's µs-per-op is an order of magnitude larger than the others).
    var sizeAcResults = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen)).ToList();
    var sizeMpResults = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray)).ToList();
    var serStats = ComputeOverallStats(acBinarySerResults, memPackSerResults, SerPerOp);
    var desStats = ComputeOverallStats(acBinaryDesResults, memPackDesResults, DesPerOp);
    var rtStats = ComputeOverallStats(acBinaryRtResults, memPackRtResults, RtPerOp);
    var sizeStats = ComputeOverallStats(sizeAcResults, sizeMpResults, r => r.SerializedSize);
    var serAllocStats = ComputeOverallStats(acBinarySerResults, memPackSerResults, r => r.SerializeAllocBytesPerOp);
    var desAllocStats = ComputeOverallStats(acBinaryDesResults, memPackDesResults, r => r.DeserializeAllocBytesPerOp);

    System.Console.WriteLine();
    System.Console.WriteLine($"── {"AcBinary (Byte[], SGen)"} vs {"MemoryPack (Byte[])"} (Overall) ──");
    WriteOverallLine("Serialize", "µs/op", serStats);
    WriteOverallLine("Deserialize", "µs/op", desStats);
    WriteOverallLine("Round-trip", "µs/op", rtStats);
    WriteOverallLine("Size", "B", sizeStats, "F0");
    WriteOverallLine("Ser Alloc", "B/op", serAllocStats, "F0");
    WriteOverallLine("Des Alloc", "B/op", desAllocStats, "F0");
}

///
/// Formats a signed percent delta with explicit sign for positive values (`+1.5%`, `-3.0%`, `0.0%`).
/// Padded to 7 chars (e.g.
/// <summary>
    /// Formats a signed percent delta with one decimal and an explicit sign for non-zero values
    /// (<c>+1.5%</c>, <c>-3.0%</c>, <c>0.0%</c>). The number is left-padded to 6 characters before the
    /// <c>%</c> is appended, so short values occupy 7 characters (e.g. <c> +12.3%</c>, <c>-100.0%</c>).
    /// Invariant culture is used, so the decimal separator is always <c>.</c>.
    /// </summary>
    /// <param name="pct">Percent delta to format.</param>
    /// <returns>The formatted percentage, at least 7 characters wide.</returns>
    private static string FormatPctSigned(double pct) =>
        pct.ToString("+0.0;-0.0;0.0", System.Globalization.CultureInfo.InvariantCulture).PadLeft(6) + "%";

    /// <summary>
    /// Builds the text of one "Overall" row: arithmetic / geometric / median deltas followed by the two
    /// absolute averages and the cell count. Shared by <see cref="WriteOverallLine"/> (console) and
    /// <see cref="AppendOverallLine"/> (file writers) so both outputs stay in lock-step.
    /// </summary>
    /// <param name="label">Row label (left-aligned in a 12-character field).</param>
    /// <param name="unit">Unit printed after each absolute average.</param>
    /// <param name="stats">Aggregated statistics to render.</param>
    /// <param name="fmt">Numeric format string applied to the absolute averages.</param>
    /// <param name="separator">Column separator placed between the three delta columns.</param>
    /// <returns>The formatted row, without a trailing newline.</returns>
    private static string FormatOverallStatsLine(string label, string unit, OverallStats stats, string fmt, char separator)
    {
        var inv = System.Globalization.CultureInfo.InvariantCulture;
        var acAvg = stats.AcAvg.ToString(fmt, inv);
        var mpAvg = stats.MpAvg.ToString(fmt, inv);

        return $" {label,-12} arith {FormatPctSigned(stats.ArithMeanPct)} {separator} geo {FormatPctSigned(stats.GeoMeanPct)} {separator} median {FormatPctSigned(stats.MedianPct)} ({acAvg} {unit} vs {mpAvg} {unit}, {stats.CellCount} cells)";
    }

    /// <summary>
    /// Writes one "Overall" row to the console. The row is green when the geometric-mean delta is
    /// less than or equal to zero and red otherwise. Does nothing when <paramref name="stats"/> is null.
    /// </summary>
    /// <param name="label">Row label.</param>
    /// <param name="unit">Unit printed after each absolute average.</param>
    /// <param name="stats">Aggregated statistics, or null to skip the row.</param>
    /// <param name="fmt">Numeric format string applied to the absolute averages.</param>
    private static void WriteOverallLine(string label, string unit, OverallStats? stats, string fmt = "F2")
    {
        if (stats is null)
        {
            return;
        }

        // Color follows the geo-mean; the arith-mean column may carry a different sign when a single
        // large cell dominates, and that disagreement is deliberately left visible.
        System.Console.ForegroundColor = stats.GeoMeanPct <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
        System.Console.WriteLine(FormatOverallStatsLine(label, unit, stats, fmt, '│'));
        System.Console.ResetColor();
    }

    /// <summary>
    /// Same row as <see cref="WriteOverallLine"/>, but appended to a <see cref="StringBuilder"/> with a
    /// plain <c>|</c> separator and no color. Does nothing when <paramref name="stats"/> is null.
    /// </summary>
    /// <param name="sb">Target builder.</param>
    /// <param name="label">Row label.</param>
    /// <param name="unit">Unit printed after each absolute average.</param>
    /// <param name="stats">Aggregated statistics, or null to skip the row.</param>
    /// <param name="fmt">Numeric format string applied to the absolute averages.</param>
    private static void AppendOverallLine(StringBuilder sb, string label, string unit, OverallStats? stats, string fmt = "F2")
    {
        if (stats is null)
        {
            return;
        }

        sb.AppendLine(FormatOverallStatsLine(label, unit, stats, fmt, '|'));
    }

    /// <summary>
    /// Renders one key/value line of the boxed report header: the content is prefixed with the left
    /// border, padded to 100 characters and closed with the right border.
    /// </summary>
    /// <param name="content">Text shown inside the box.</param>
    /// <returns>The framed header line.</returns>
    private static string BoxedHeaderLine(string content) => ("║ " + content).PadRight(100) + "║";

    /// <summary>
    /// Persists one benchmark run under <c>ResultsDirectory</c>:
    /// a <c>.output</c> file with a hex dump of the serialized "Large" test object (when such a data set
    /// exists), a <c>.log</c> file with header, options, raw CSV, formatted per-test tables and the overall
    /// AcBinary (SGen, Byte[]) vs MemoryPack (Byte[]) comparison, and finally the <c>.LLM</c> file via
    /// <see cref="SaveLlmResults"/>. All three files share one timestamped base name.
    /// </summary>
    /// <param name="results">Benchmark result rows to report.</param>
    /// <param name="testDataSets">Test data sets; they drive grouping and ordering of the rows.</param>
    private static void SaveResults(List results, List testDataSets)
    {
        var inv = System.Globalization.CultureInfo.InvariantCulture;

        Directory.CreateDirectory(ResultsDirectory);

        // Read the clock once so the file name and every "Generated" stamp agree, and format with the
        // invariant culture so the file name does not depend on the machine's calendar/separators.
        var now = DateTime.Now;
        var generatedAt = now.ToString("yyyy-MM-dd HH:mm:ss", inv);
        var timestamp = now.ToString("yyyy-MM-dd_HH-mm-ss", inv);
        var baseFileName = $"Console.FullBenchmark_{BuildConfiguration}_{timestamp}";
        var logFilePath = Path.Combine(ResultsDirectory, baseFileName + ".log");
        var outputFilePath = Path.Combine(ResultsDirectory, baseFileName + ".output");
        var llmFilePath = Path.Combine(ResultsDirectory, baseFileName + ".LLM");

        const string topBorder = "╔══════════════════════════════════════════════════════════════════════════════════════════════════════╗";
        const string bottomBorder = "╚══════════════════════════════════════════════════════════════════════════════════════════════════════╝";

        // ── .output: hex dump of the serialized "Large" object ──
        var largeTestData = testDataSets.FirstOrDefault(t => t.Name.StartsWith("Large", StringComparison.Ordinal));
        if (largeTestData != null)
        {
            var serializedBytes = AcBinarySerializer.Serialize(largeTestData.Order, AcBinarySerializerOptions.Default);

            var outputSb = new StringBuilder();
            outputSb.AppendLine(topBorder);
            outputSb.AppendLine("║ SERIALIZED BINARY OUTPUT ║");
            outputSb.AppendLine(BoxedHeaderLine($"Generated: {generatedAt}"));
            outputSb.AppendLine(bottomBorder);
            outputSb.AppendLine();
            outputSb.AppendLine("=== SERIALIZED BYTES: Large (5x5x5x10) - AcBinary (Default) ===");
            outputSb.AppendLine($"Size: {serializedBytes.Length:N0} bytes");
            outputSb.AppendLine();
            outputSb.AppendLine("Hex dump:");
            outputSb.AppendLine(FormatHexDump(serializedBytes));

            File.WriteAllText(outputFilePath, outputSb.ToString(), Utf8NoBom);
            System.Console.WriteLine($"✓ Binary output saved to: {outputFilePath}");
        }

        // ── .log: header ──
        var testTypeName = testDataSets.FirstOrDefault()?.TypeName ?? "unknown";
        var sb = new StringBuilder();
        sb.AppendLine(topBorder);
        sb.AppendLine("║ SERIALIZER BENCHMARK RESULTS ║");
        sb.AppendLine(BoxedHeaderLine($"Generated: {generatedAt}"));
        sb.AppendLine(BoxedHeaderLine($"Build: {BuildConfiguration}"));
        sb.AppendLine(BoxedHeaderLine($"Charset: {GetCurrentCharsetName()}"));
        sb.AppendLine(BoxedHeaderLine($"Iterations: per-cell adaptive (~{TargetSampleMs} ms target)"));
        sb.AppendLine(BoxedHeaderLine($"Samples: {BenchmarkSamples} (median) + 1 pilot discarded"));
        sb.AppendLine(BoxedHeaderLine($"Test Type: {testTypeName}"));
        sb.AppendLine(bottomBorder);
        sb.AppendLine();

        // ── .log: serializer options (one line per distinct name/description pair) ──
        var optionsMap = results
            .Where(r => r.OptionsDescription != null)
            .Select(r => (r.SerializerName, r.OptionsDescription!))
            .Distinct()
            .ToList();

        if (optionsMap.Count > 0)
        {
            sb.AppendLine("=== SERIALIZER OPTIONS ===");
            foreach (var (name, opts) in optionsMap)
            {
                sb.AppendLine($" {name}: {opts}");
            }

            sb.AppendLine();
        }

        // ── .log: raw CSV ──
        // Byte counts stay raw integers (no KB rounding). Rows are formatted with the invariant culture:
        // a comma decimal separator would otherwise split one value into two CSV columns.
        sb.AppendLine("=== RAW DATA (CSV) ===");
        sb.AppendLine("TestData,Engine,IO,Mode,Options,Size,SerializeMicrosPerOp,DeserializeMicrosPerOp,RoundTripMicrosPerOp,SerializeAllocBytesPerOp,DeserializeAllocBytesPerOp,RoundTripAllocBytesPerOp,SetupSerializeAllocBytes,SetupDeserializeAllocBytes");
        foreach (var testData in testDataSets)
        {
            foreach (var result in results.Where(r => r.TestDataName == testData.DisplayName))
            {
                sb.AppendLine(FormattableString.Invariant(
                    $"{result.TestDataName},{result.Engine},{result.IoMode},{result.DispatchMode},{result.OptionsPreset},{result.SerializedSize},{SerPerOp(result):F2},{DesPerOp(result):F2},{RtPerOp(result):F2},{result.SerializeAllocBytesPerOp},{result.DeserializeAllocBytesPerOp},{result.RoundTripAllocBytesPerOp},{result.SetupSerializeAllocBytes},{result.SetupDeserializeAllocBytes}"));
            }
        }

        sb.AppendLine();

        // ── .log: formatted per-test tables ──
        sb.AppendLine("=== FORMATTED RESULTS BY TEST DATA ===");
        sb.AppendLine("(►) = Highlighted: MemoryPack (Byte[]) (baseline) and AcBinary (Byte[])");
        sb.AppendLine();

        var tableHeader = $"{"#",-4} {"Serializer",-42} {"Size B",-12} {"Setup S/D KB",-14} {"Ser µs/op",-12} {"Des µs/op",-12} {"RT µs/op",-12} {"SerAlc KB",-11} {"DesAlc KB",-11}";
        var tableRule = new string('-', 140);

        // Relative delta in percent; 0 when the baseline is not positive (avoids ∞/NaN in the report).
        static double PercentDelta(double candidate, double baseline) =>
            baseline > 0 ? (candidate / baseline - 1) * 100 : 0;

        foreach (var testData in testDataSets)
        {
            // Ordered by per-op round-trip time, which is independent of each row's iteration count.
            var testResults = results
                .Where(r => r.TestDataName == testData.DisplayName)
                .OrderBy(r => RtPerOp(r))
                .ToList();

            var memPackResult = testResults.FirstOrDefault(r => r.Engine == EngineMemoryPack && r.IoMode == IoByteArray);
            // The comparison is pinned to AcBinary's SGen rows; other dispatch modes appear in the table only.
            var acBinaryResult = testResults.FirstOrDefault(r => r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen);

            sb.AppendLine();
            sb.AppendLine($"--- {testData.DisplayName} ---");
            sb.AppendLine(tableHeader);
            sb.AppendLine(tableRule);

            var rank = 1;
            foreach (var result in testResults)
            {
                var isHighlighted = result.IoMode == IoByteArray
                                    && (result.Engine == EngineMemoryPack || result.Engine == EngineAcBinary);
                // Both prefixes are two characters wide so highlighted and plain rows line up.
                var prefix = isHighlighted ? "► " : "  ";

                var hasSer = result.SerializeTimeMs > 0;
                var hasDes = result.DeserializeTimeMs > 0;

                var size = $"{result.SerializedSize:N0}";
                var setup = $"{ToKilobytes(result.SetupSerializeAllocBytes):F2} / {ToKilobytes(result.SetupDeserializeAllocBytes):F2}";
                var ser = hasSer ? $"{SerPerOp(result):F2}" : "N/A";
                var des = hasDes ? $"{DesPerOp(result):F2}" : "N/A";
                var rt = result.RoundTripTimeMs > 0 ? $"{RtPerOp(result):F2}" : "N/A";
                var serAlloc = hasSer ? $"{ToKilobytes(result.SerializeAllocBytesPerOp):F2}" : "N/A";
                var desAlloc = hasDes ? $"{ToKilobytes(result.DeserializeAllocBytesPerOp):F2}" : "N/A";

                // Field widths mirror tableHeader: 4 (rank) + 1, then prefix (2) + name (40) = 42.
                sb.AppendLine($"{rank,-4} {prefix}{result.SerializerName,-40} {size,-12} {setup,-14} {ser,-12} {des,-12} {rt,-12} {serAlloc,-11} {desAlloc,-11}");
                rank++;
            }

            // Per-test summary row (AcBinary SGen vs the MemoryPack baseline).
            if (memPackResult != null && acBinaryResult != null)
            {
                var sizePct = PercentDelta(acBinaryResult.SerializedSize, memPackResult.SerializedSize);
                var serPct = PercentDelta(SerPerOp(acBinaryResult), SerPerOp(memPackResult));
                var desPct = PercentDelta(DesPerOp(acBinaryResult), DesPerOp(memPackResult));
                var rtPct = PercentDelta(RtPerOp(acBinaryResult), RtPerOp(memPackResult));

                sb.AppendLine($" AcBinary (Byte[]) vs MemoryPack (Byte[]): Size {sizePct:+0;-0}% │ Ser {serPct:+0;-0}% │ Des {desPct:+0;-0}% │ RT {rtPct:+0;-0}%");
            }
        }

        // ── .log: overall comparison ──
        sb.AppendLine();
        sb.AppendLine("=== AcBinary (Byte[], SGen) vs MemoryPack (Byte[]) (Overall) ===");

        var memPackRows = results.Where(r => r.Engine == EngineMemoryPack && r.IoMode == IoByteArray).ToList();
        var acBinaryRows = results.Where(r => r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen).ToList();

        var memPackSer = memPackRows.Where(r => r.SerializeTimeMs > 0).ToList();
        var memPackDes = memPackRows.Where(r => r.DeserializeTimeMs > 0).ToList();
        var memPackRt = memPackRows.Where(r => r.RoundTripTimeMs > 0).ToList();
        var acBinarySer = acBinaryRows.Where(r => r.SerializeTimeMs > 0).ToList();
        var acBinaryDes = acBinaryRows.Where(r => r.DeserializeTimeMs > 0).ToList();
        var acBinaryRt = acBinaryRows.Where(r => r.RoundTripTimeMs > 0).ToList();

        if (memPackRt.Count == 0 || acBinaryRt.Count == 0)
        {
            // Either side has no Byte[] round-trip rows, so there is nothing to pair up.
            sb.AppendLine(" (Comparison requires both serialize and deserialize data)");
        }
        else
        {
            AppendOverallLine(sb, "Serialize", "µs/op", ComputeOverallStats(acBinarySer, memPackSer, SerPerOp));
            AppendOverallLine(sb, "Ser Alloc", "B/op", ComputeOverallStats(acBinarySer, memPackSer, r => r.SerializeAllocBytesPerOp), "F0");
            AppendOverallLine(sb, "Deserialize", "µs/op", ComputeOverallStats(acBinaryDes, memPackDes, DesPerOp));
            AppendOverallLine(sb, "Des Alloc", "B/op", ComputeOverallStats(acBinaryDes, memPackDes, r => r.DeserializeAllocBytesPerOp), "F0");
            AppendOverallLine(sb, "Round-trip", "µs/op", ComputeOverallStats(acBinaryRt, memPackRt, RtPerOp));
            AppendOverallLine(sb, "Size", "B", ComputeOverallStats(acBinaryRows, memPackRows, r => r.SerializedSize), "F0");
        }

        File.WriteAllText(logFilePath, sb.ToString(), Utf8NoBom);
        System.Console.WriteLine($"✓ Results saved to: {logFilePath}");

        // The LLM-oriented file is written on both paths (with or without the overall comparison).
        SaveLlmResults(llmFilePath, results, testDataSets);
    }

    /// <summary>
    /// Writes the compact, markdown-style report: a run header, the distinct serializer options, one flat
    /// pipe-separated results table (grouped by test data, ordered by per-op round-trip time) and, when both
    /// sides have Byte[] round-trip rows, the overall AcBinary (SGen) vs MemoryPack comparison.
    /// Numeric cells and the header are formatted with the invariant culture.
    /// </summary>
    /// <param name="filePath">Destination file path.</param>
    /// <param name="results">Benchmark result rows to report.</param>
    /// <param name="testDataSets">Test data sets; they drive grouping and ordering of the rows.</param>
    private static void SaveLlmResults(string filePath, List results, List testDataSets)
    {
        var inv = System.Globalization.CultureInfo.InvariantCulture;
        var sb = new StringBuilder();
        var testTypeName = testDataSets.FirstOrDefault()?.TypeName ?? "unknown";

        sb.AppendLine(FormattableString.Invariant($"# AcBinary Benchmark {BuildConfiguration} {DateTime.Now:yyyy-MM-dd HH:mm:ss}"));
        sb.AppendLine(FormattableString.Invariant($"Charset: {GetCurrentCharsetName()} | Iterations: per-cell adaptive (target ~{TargetSampleMs} ms/sample) | Warmup: {WarmupIterations} | Samples: {BenchmarkSamples} (median) + 1 pilot discarded | .NET: {Environment.Version} | TestType: {testTypeName} | UnstableCV threshold: {UnstableCVThreshold * 100:F0}%"));
        sb.AppendLine("Baseline: MemoryPack (Byte[]) (SOTA reference) | Verified: round-trip correctness checked once per cell before warmup");

        // Options summary
        var optionsMap = results
            .Where(r => r.OptionsDescription != null)
            .Select(r => (r.SerializerName, r.OptionsDescription!))
            .Distinct()
            .ToList();

        if (optionsMap.Count > 0)
        {
            sb.AppendLine();
            sb.AppendLine("## Options");
            sb.AppendLine();
            foreach (var (name, opts) in optionsMap)
            {
                sb.AppendLine($"- **{name}**: {opts}");
            }
        }

        // Flat results table. The last column lists the per-row iteration counts so a reader can
        // relate each timing to the number of operations it was measured over.
        sb.AppendLine();
        sb.AppendLine("## Results");
        sb.AppendLine();
        sb.AppendLine("TestData | Engine | IO | Mode | Options | Size(B) | Ser(µs/op) | Deser(µs/op) | RT(µs/op) | SerAlloc(KB/op) | DesAlloc(KB/op) | RTAlloc(KB/op) | Setup S/D(KB) | Iter Ser/Des");
        sb.AppendLine("---|---|---|---|---|---|---|---|---|---|---|---|---|---");

        foreach (var testData in testDataSets)
        {
            var testResults = results
                .Where(r => r.TestDataName == testData.DisplayName)
                .OrderBy(r => RtPerOp(r))
                .ToList();

            foreach (var r in testResults)
            {
                var hasSer = r.SerializeTimeMs > 0;
                var hasDes = r.DeserializeTimeMs > 0;

                // Timing cells are produced by FormatMicrosWithRange from the median, min, max and
                // std-dev batch times plus the iteration count. NOTE(review): the exact cell layout
                // (range / instability marker) lives in that helper — confirm there.
                var ser = hasSer
                    ? FormatMicrosWithRange(r.SerializeTimeMs, r.SerializeTimeMinMs, r.SerializeTimeMaxMs, r.SerializeTimeStdDevMs, r.SerializeIterations, inv)
                    : "-";
                var des = hasDes
                    ? FormatMicrosWithRange(r.DeserializeTimeMs, r.DeserializeTimeMinMs, r.DeserializeTimeMaxMs, r.DeserializeTimeStdDevMs, r.DeserializeIterations, inv)
                    : "-";

                // Round-trip: rows flagged round-trip-only carry their own samples; all other rows
                // show the plain per-op value.
                string rt;
                if (r.RoundTripTimeMs <= 0)
                {
                    rt = "-";
                }
                else if (r.IsRoundTripOnly)
                {
                    rt = FormatMicrosWithRange(r.RoundTripTimeMs, r.RoundTripTimeMinMs, r.RoundTripTimeMaxMs, r.RoundTripTimeStdDevMs, r.RoundTripIterations, inv);
                }
                else
                {
                    rt = RtPerOp(r).ToString("F2", inv);
                }

                var serAlloc = hasSer ? ToKilobytes(r.SerializeAllocBytesPerOp).ToString("F2", inv) : "-";
                var desAlloc = hasDes ? ToKilobytes(r.DeserializeAllocBytesPerOp).ToString("F2", inv) : "-";
                var rtAlloc = r.RoundTripAllocBytesPerOp > 0 ? ToKilobytes(r.RoundTripAllocBytesPerOp).ToString("F2", inv) : "-";
                var setupAlloc = $"{ToKilobytes(r.SetupSerializeAllocBytes).ToString("F2", inv)} / {ToKilobytes(r.SetupDeserializeAllocBytes).ToString("F2", inv)}";

                // Iteration column: a single count for round-trip-only rows, otherwise "ser / des".
                string iterCol;
                if (r.IsRoundTripOnly)
                {
                    iterCol = r.RoundTripIterations.ToString(inv);
                }
                else
                {
                    var serIter = r.SerializeIterations > 0 ? r.SerializeIterations.ToString(inv) : "-";
                    var desIter = r.DeserializeIterations > 0 ? r.DeserializeIterations.ToString(inv) : "-";
                    iterCol = $"{serIter} / {desIter}";
                }

                sb.AppendLine($"{r.TestDataName} | {r.Engine} | {r.IoMode} | {r.DispatchMode} | {r.OptionsPreset} | {r.SerializedSize} | {ser} | {des} | {rt} | {serAlloc} | {desAlloc} | {rtAlloc} | {setupAlloc} | {iterCol}");
            }
        }

        // Overall AcBinary (SGen, Byte[]) vs MemoryPack (Byte[]) — the same six rows as the .log file.
        var memPackRows = results.Where(r => r.Engine == EngineMemoryPack && r.IoMode == IoByteArray).ToList();
        var acBinaryRows = results.Where(r => r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen).ToList();

        var memPackSer = memPackRows.Where(r => r.SerializeTimeMs > 0).ToList();
        var memPackDes = memPackRows.Where(r => r.DeserializeTimeMs > 0).ToList();
        var memPackRt = memPackRows.Where(r => r.RoundTripTimeMs > 0).ToList();
        var acBinarySer = acBinaryRows.Where(r => r.SerializeTimeMs > 0).ToList();
        var acBinaryDes = acBinaryRows.Where(r => r.DeserializeTimeMs > 0).ToList();
        var acBinaryRt = acBinaryRows.Where(r => r.RoundTripTimeMs > 0).ToList();

        if (memPackRt.Count > 0 && acBinaryRt.Count > 0)
        {
            sb.AppendLine();
            sb.AppendLine("## Overall: AcBinary (Byte[], SGen) vs MemoryPack (Byte[])");
            sb.AppendLine();
            sb.AppendLine("Three aggregations of per-cell results: **arith** = arithmetic mean of µs/op (magnitude-weighted, Large cell dominates); **geo** = geometric mean of per-cell ratios (each cell weighted equally); **median** = median of per-cell ratios (outlier-resistant). Negative % = AcBinary faster/smaller; positive % = MemPack faster/smaller. The geo/median variants surface when a single big cell skews the arithmetic mean.");
            sb.AppendLine();
            sb.AppendLine("```");
            AppendOverallLine(sb, "Serialize", "µs/op", ComputeOverallStats(acBinarySer, memPackSer, SerPerOp));
            AppendOverallLine(sb, "Ser Alloc", "B/op", ComputeOverallStats(acBinarySer, memPackSer, r => r.SerializeAllocBytesPerOp), "F0");
            AppendOverallLine(sb, "Deserialize", "µs/op", ComputeOverallStats(acBinaryDes, memPackDes, DesPerOp));
            AppendOverallLine(sb, "Des Alloc", "B/op", ComputeOverallStats(acBinaryDes, memPackDes, r => r.DeserializeAllocBytesPerOp), "F0");
            AppendOverallLine(sb, "Round-trip", "µs/op", ComputeOverallStats(acBinaryRt, memPackRt, RtPerOp));
            AppendOverallLine(sb, "Size", "B", ComputeOverallStats(acBinaryRows, memPackRows, r => r.SerializedSize), "F0");
            sb.AppendLine("```");
        }

        File.WriteAllText(filePath, sb.ToString(), Utf8NoBom);
        System.Console.WriteLine($"✓ LLM results saved to: {filePath}");
    }

    /// <summary>
    /// Formats a byte array as a hex dump: one row per <paramref name="bytesPerLine"/> bytes consisting of
    /// the 8-digit hexadecimal offset, the bytes as two-digit hex values (with one extra space after the
    /// middle column) and the printable-ASCII rendering between <c>|</c> markers. Bytes outside 32..126
    /// are shown as <c>.</c>. A short final row is padded so the ASCII column stays aligned.
    /// </summary>
    /// <param name="bytes">Data to dump; an empty array yields an empty string.</param>
    /// <param name="bytesPerLine">Number of bytes per row; must be positive.</param>
    /// <returns>The hex dump, each row terminated by a newline.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="bytesPerLine"/> is zero or negative.</exception>
    private static string FormatHexDump(byte[] bytes, int bytesPerLine = 16)
    {
        ArgumentNullException.ThrowIfNull(bytes);
        if (bytesPerLine <= 0)
        {
            // A non-positive stride would never advance the outer loop.
            throw new ArgumentOutOfRangeException(nameof(bytesPerLine), bytesPerLine, "Bytes per line must be positive.");
        }

        // Column after which the extra mid-row gap is inserted (7 for the default 16 bytes per row).
        var midGapColumn = bytesPerLine / 2 - 1;
        var sb = new StringBuilder();

        for (var offset = 0; offset < bytes.Length; offset += bytesPerLine)
        {
            var rowLength = Math.Min(bytesPerLine, bytes.Length - offset);

            // Offset
            sb.Append(offset.ToString("X8")).Append(' ');

            // Hex bytes; absent cells get three spaces, the width of an "XX " cell.
            for (var col = 0; col < bytesPerLine; col++)
            {
                if (col < rowLength)
                {
                    sb.Append(bytes[offset + col].ToString("X2")).Append(' ');
                }
                else
                {
                    sb.Append("   ");
                }

                if (col == midGapColumn)
                {
                    sb.Append(' ');
                }
            }

            // ASCII representation
            sb.Append(" |");
            for (var col = 0; col < rowLength; col++)
            {
                var b = bytes[offset + col];
                sb.Append(b is >= 32 and < 127 ? (char)b : '.');
            }

            sb.AppendLine("|");
        }

        return sb.ToString();
    }

    #endregion
}