diff --git a/AyCode.Core.Serializers.Console/Program.cs b/AyCode.Core.Serializers.Console/Program.cs index 103679b..c954b11 100644 --- a/AyCode.Core.Serializers.Console/Program.cs +++ b/AyCode.Core.Serializers.Console/Program.cs @@ -47,7 +47,7 @@ public static class Program private static int TestIterations = 1; private static int BenchmarkSamples = 1; // Debug: single sample, fast iteration #else - private static int WarmupIterations = 10000; //5000 + private static int WarmupIterations = 5000; //10000 — per-phase (Ser AND Des get their own warmup separately) private static int TestIterations = 1000; //1000 private static int BenchmarkSamples = 10; #endif @@ -462,7 +462,7 @@ public static class Program var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets(); var testDataSets = FilterByLayer(allTestDataSets, layer); - System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{TargetSampleMs} ms target) | Warmup: {WarmupIterations} | Samples: {BenchmarkSamples} (median) + pilot discard"); + System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{TargetSampleMs} ms target) | Warmup: {WarmupIterations} per phase (Ser/Des isolated) | Samples: {BenchmarkSamples} (median) + pilot discard"); System.Console.WriteLine($"Build: {BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}"); System.Console.WriteLine(); @@ -589,6 +589,22 @@ public static class Program #region Benchmark Execution + /// + /// Forces a full GC cycle at a phase boundary in the benchmark loop. 
Two-pass collect with finalizer drain + /// in between: the first pass queues unreachable finalizable objects for finalization, WaitForPendingFinalizers + /// runs the finalizers, the second pass reclaims any objects the finalizers released. After this returns the + /// heap is in a known-quiescent state — the next warmup/measurement phase starts on a clean slate, isolated + /// from the previous phase's residual allocations (write-buffer pools, intern cache, write-plan arrays, etc.). + /// Called between every Ser-phase / Des-phase boundary in <see cref="RunBenchmarksForTestData"/>. + /// + [MethodImpl(MethodImplOptions.NoInlining)] + private static void ForceGcCollect() + { + GC.Collect(2, GCCollectionMode.Forced, blocking: true); + GC.WaitForPendingFinalizers(); + GC.Collect(2, GCCollectionMode.Forced, blocking: true); + } + private static List RunBenchmarksForTestData(TestDataSet testData, string mode, string serializerMode) { var results = new List(); @@ -610,39 +626,25 @@ public static class Program System.Console.WriteLine("✓ All serializers passed round-trip verification."); - // Per-serializer (warmup → calibrate → measurement) cycle: each serializer warms up IMMEDIATELY - // before its own bench, then calibrates iter per-function (Ser and Des independently) so each - // sample lands at ~TargetSampleMs wall-clock. This avoids cache pollution AND equalizes sample - // window length across cells of vastly different per-op cost. - System.Console.WriteLine($"Running benchmarks (target ~{TargetSampleMs} ms/sample × {BenchmarkSamples} samples median, per-serializer warmup + adaptive iter)...\n"); + // Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: each serializer's Ser-path and + // Des-path get COMPLETELY ISOLATED warmup→measure rounds, with a GC.Collect at every phase boundary. + // + // Why phase-isolation: a combined warmup (Ser+Des interleaved) leaves the CPU I-cache + branch-predictor + // in a "compromise state" — neither Ser nor Des code-set dominates.
The first phase to measure pays a + // cache-miss penalty as its code-set displaces the leftover-warmup-state. Isolated warmup→measure pairs + // keep the I-cache HOT for ONLY the measured path, both in the warmup (priming) and the measurement + // (steady-state). Branch-predictor history also stays clean per path. + // + // GC.Collect at every boundary: removes residual allocation pressure from the previous phase (write-buffer + // pool churn from Ser, deserialized object graph from Des) so the next phase starts with a quiescent + // heap — GC tier-promotion timing during measurement is then driven only by THAT phase's allocations. + // + // JitSleep per-phase: tiered JIT background promotion drain after each warmup (mode-aware: 0 ms in AOT). + // Each phase's freshly-promoted methods settle before its timing starts. + System.Console.WriteLine($"Running benchmarks (target ~{TargetSampleMs} ms/sample × {BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n"); foreach (var serializer in serializers) { - // Warmup THIS serializer right before benching it — keeps its hot code/data in cache. - serializer.Warmup(WarmupIterations); - - // Wait for tiered JIT background compilation to drain (mode-aware: 0ms in AOT). - // Per-serializer instead of once globally — guarantees this serializer's freshly-promoted - // methods are settled before timing, regardless of when it appears in the iteration order. - if (JitSleep > 0) Thread.Sleep(JitSleep); - - // Adaptive iter calibration — per Ser/Des/RT function, post-warmup. Each function gets its - // own iter count tuned to TargetSampleMs (typically 250 ms). The 100-iter calibration cost - // is amortized over the BenchmarkSamples + 1 (pilot) recorded measurements that follow. 
- int serIter = TestIterations, desIter = TestIterations, rtIter = TestIterations; - if (serializer.IsRoundTripOnly) - { - if (mode is "all" or "serialize" or "ser") - rtIter = CalibrateIterations(() => serializer.Serialize(), TargetSampleMs); - } - else - { - if (mode is "all" or "serialize" or "ser") - serIter = CalibrateIterations(() => serializer.Serialize(), TargetSampleMs); - if (mode is "all" or "deserialize" or "des") - desIter = CalibrateIterations(() => serializer.Deserialize(), TargetSampleMs); - } - var result = new BenchmarkResult { TestDataName = testData.DisplayName, // Use DisplayName for IId% info @@ -663,26 +665,38 @@ public static class Program if (serializer.IsRoundTripOnly) { - // Round-trip-only benchmarks (NamedPipe etc.): measure the full pipe round-trip directly into the RT - // columns. Ser ms / SerAlloc / Des ms / DesAlloc stay 0 → display as "N/A". Allocation uses the - // process-wide measurement so the server-drain-thread allocations (e.g. server-side new byte[len]) - // also show up — otherwise current-thread alloc would only count the client side and look ~halved. + // Round-trip-only benchmarks (NamedPipe etc.): single phase — Serialize() performs the full RT, + // Deserialize() is a no-op. We use the Ser-phase entry-points (WarmupSerialize) to warm the + // entire round-trip path, then record into the RT result columns. 
if (mode is "all" or "serialize" or "ser") { + ForceGcCollect(); + serializer.WarmupSerialize(WarmupIterations); + if (JitSleep > 0) Thread.Sleep(JitSleep); + + var rtIter = CalibrateIterations(() => serializer.Serialize(), TargetSampleMs); var (rtMed, rtMin, rtMax, rtStd) = RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]"); result.RoundTripTimeMs = rtMed; result.RoundTripTimeMinMs = rtMin; result.RoundTripTimeMaxMs = rtMax; result.RoundTripTimeStdDevMs = rtStd; result.RoundTripIterations = rtIter; + // Process-wide allocation measurement: server-drain-thread allocations (server-side new byte[len]) + // also show up — otherwise current-thread alloc would only count the client side and look ~halved. result.RoundTripAllocBytesPerOp = MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]"); } // mode == "deserialize" alone is meaningless for a round-trip-only benchmark; skip silently. } else { + // ── Ser phase ── isolated warmup → JitSleep → calibrate → time → alloc; preceded by GC.Collect. if (mode is "all" or "serialize" or "ser") { + ForceGcCollect(); + serializer.WarmupSerialize(WarmupIterations); + if (JitSleep > 0) Thread.Sleep(JitSleep); + + var serIter = CalibrateIterations(() => serializer.Serialize(), TargetSampleMs); var (serMed, serMin, serMax, serStd) = RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]"); result.SerializeTimeMs = serMed; result.SerializeTimeMinMs = serMin; @@ -693,8 +707,16 @@ public static class Program result.SerializeAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]"); } + // ── Des phase ── isolated warmup → JitSleep → calibrate → time → alloc; preceded by GC.Collect. + // The GC.Collect here is critical: it discards the Ser-phase's write-buffer pool churn so the + // Des-phase's allocation measurement reflects ONLY Des-side allocations (deserialized object graph). 
if (mode is "all" or "deserialize" or "des") { + ForceGcCollect(); + serializer.WarmupDeserialize(WarmupIterations); + if (JitSleep > 0) Thread.Sleep(JitSleep); + + var desIter = CalibrateIterations(() => serializer.Deserialize(), TargetSampleMs); var (desMed, desMin, desMax, desStd) = RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]"); result.DeserializeTimeMs = desMed; result.DeserializeTimeMinMs = desMin; @@ -708,10 +730,10 @@ public static class Program // batch-time addition would be misleading. Instead: compute per-op µs (iter-independent), // then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that // RoundTripTimeMs / RoundTripIterations * 1000 == SerPerOp + DesPerOp. - var serPerOp = ToPerOpMicros(result.SerializeTimeMs, serIter); - var desPerOp = ToPerOpMicros(result.DeserializeTimeMs, desIter); + var serPerOp = ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations); + var desPerOp = ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations); var rtPerOp = serPerOp + desPerOp; - result.RoundTripIterations = Math.Max(serIter, desIter); + result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations); result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations; result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp; } @@ -1469,7 +1491,28 @@ public static class Program /// rankings (because both metrics are misleading there) — they still participate in "Fastest Round-trip". /// Default false for in-memory IO modes which measure Ser and Des separately. bool IsRoundTripOnly => false; + /// Combined warmup (Ser + Deser interleaved). Kept for backward-compat with ProfilerMode + /// and other callers that don't need phase-separated warmup. The benchmark loop prefers the split + /// <see cref="WarmupSerialize"/> + <see cref="WarmupDeserialize"/> pair for cache-isolated measurements. void Warmup(int iterations); + + /// Warm only the Serialize path.
Default body iterates <see cref="Serialize"/> N times. + /// Overrides are only needed when the implementor wants Ser-specific warmup-state (e.g. pre-allocate buffers). + /// On <see cref="IsRoundTripOnly"/> benchmarks (NamedPipe-style) <see cref="Serialize"/> performs the full RT, + /// so this warms the entire round-trip path. + void WarmupSerialize(int iterations) + { + for (var i = 0; i < iterations; i++) Serialize(); + } + + /// Warm only the Deserialize path. Default body iterates <see cref="Deserialize"/> N times. + /// On <see cref="IsRoundTripOnly"/> benchmarks <see cref="Deserialize"/> is a no-op, so the bench loop + /// skips the Des-phase entirely for those cells. + void WarmupDeserialize(int iterations) + { + for (var i = 0; i < iterations; i++) Deserialize(); + } + void Serialize(); void Deserialize(); /// Round-trip correctness check — called once per cell before warmup. Returns true if Serialize+Deserialize preserves data. @@ -3274,7 +3317,7 @@ public static class Program var sb = new StringBuilder(); var testTypeName = testDataSets.FirstOrDefault()?.TypeName ?? "unknown"; sb.AppendLine($"# AcBinary Benchmark {BuildConfiguration} {DateTime.Now:yyyy-MM-dd HH:mm:ss}"); - sb.AppendLine($"Charset: {GetCurrentCharsetName()} | Iterations: per-cell adaptive (target ~{TargetSampleMs} ms/sample) | Warmup: {WarmupIterations} | Samples: {BenchmarkSamples} (median) + 1 pilot discarded | .NET: {Environment.Version} | TestType: {testTypeName} | UnstableCV threshold: {UnstableCVThreshold * 100:F0}%"); + sb.AppendLine($"Charset: {GetCurrentCharsetName()} | Iterations: per-cell adaptive (target ~{TargetSampleMs} ms/sample) | Warmup: {WarmupIterations} per phase (Ser/Des isolated) | Samples: {BenchmarkSamples} (median) + 1 pilot discarded | .NET: {Environment.Version} | TestType: {testTypeName} | UnstableCV threshold: {UnstableCVThreshold * 100:F0}%"); sb.AppendLine($"Baseline: {"MemoryPack (Byte[])"} (SOTA reference) | Verified: round-trip correctness checked once per cell before warmup"); // Options summary