[LOADED_DOCS: 2 files, no new loads]

Phase-isolated Ser/Des warmup & GC in benchmarks

Refactored benchmark loop to perform separate warmup and measurement for serialization and deserialization phases, with forced GC.Collect at each phase boundary for heap and cache isolation. Added ForceGcCollect() and new WarmupSerialize/WarmupDeserialize interface methods (with defaults). Updated output, documentation, and per-phase iteration handling for improved accuracy and clarity. Added detailed comments explaining rationale and effects.
This commit is contained in:
Loretta 2026-05-11 13:52:38 +02:00
parent 73d81ea580
commit 969fa550b5
1 changed files with 83 additions and 40 deletions

View File

@ -47,7 +47,7 @@ public static class Program
private static int TestIterations = 1;
private static int BenchmarkSamples = 1; // Debug: single sample, fast iteration
#else
private static int WarmupIterations = 10000; //5000
private static int WarmupIterations = 5000; //10000 — per-phase (Ser AND Des get their own warmup separately)
private static int TestIterations = 1000; //1000
private static int BenchmarkSamples = 10;
#endif
@ -462,7 +462,7 @@ public static class Program
var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
var testDataSets = FilterByLayer(allTestDataSets, layer);
System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{TargetSampleMs} ms target) | Warmup: {WarmupIterations} | Samples: {BenchmarkSamples} (median) + pilot discard");
System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{TargetSampleMs} ms target) | Warmup: {WarmupIterations} per phase (Ser/Des isolated) | Samples: {BenchmarkSamples} (median) + pilot discard");
System.Console.WriteLine($"Build: {BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
System.Console.WriteLine();
@ -589,6 +589,22 @@ public static class Program
#region Benchmark Execution
/// <summary>
/// Phase-boundary heap reset for the benchmark loop: a forced, blocking generation-2 collection, a wait for
/// all pending finalizers, then a second forced, blocking generation-2 collection.
/// <para>
/// Why two passes: objects with finalizers that become unreachable are not reclaimed by the first collection;
/// they are queued for finalization instead. <c>GC.WaitForPendingFinalizers</c> blocks until that queue has
/// been processed, and the second collection can then reclaim those objects and anything they kept alive.
/// </para>
/// <para>
/// The intent is that the following warmup/measurement phase starts from a settled heap instead of
/// inheriting residual allocations from the previous phase. Invoked before each Ser / Des / round-trip phase
/// in <see cref="RunBenchmarksForTestData"/>.
/// </para>
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
private static void ForceGcCollect()
{
    FullBlockingCollect();
    GC.WaitForPendingFinalizers();
    FullBlockingCollect();

    // One forced, blocking collection of generations 0 through 2.
    static void FullBlockingCollect() => GC.Collect(2, GCCollectionMode.Forced, blocking: true);
}
private static List<BenchmarkResult> RunBenchmarksForTestData(TestDataSet testData, string mode, string serializerMode)
{
var results = new List<BenchmarkResult>();
@ -610,39 +626,25 @@ public static class Program
System.Console.WriteLine("✓ All serializers passed round-trip verification.");
// Per-serializer (warmup → calibrate → measurement) cycle: each serializer warms up IMMEDIATELY
// before its own bench, then calibrates iter per-function (Ser and Des independently) so each
// sample lands at ~TargetSampleMs wall-clock. This avoids cache pollution AND equalizes sample
// window length across cells of vastly different per-op cost.
System.Console.WriteLine($"Running benchmarks (target ~{TargetSampleMs} ms/sample × {BenchmarkSamples} samples median, per-serializer warmup + adaptive iter)...\n");
// Per-serializer, PER-PHASE (warmup → calibrate → measurement) cycle: each serializer's Ser-path and
// Des-path get COMPLETELY ISOLATED warmup→measure rounds, with a GC.Collect at every phase boundary.
//
// Why phase-isolation: a combined warmup (Ser+Des interleaved) leaves the CPU I-cache + branch-predictor
// in a "compromise state" — neither Ser nor Des code-set dominates. The first phase to measure pays a
// cache-miss penalty as its code-set displaces the leftover-warmup-state. Isolated warmup→measure pairs
// keep the I-cache HOT for ONLY the measured path, both in the warmup (priming) and the measurement
// (steady-state). Branch-predictor history also stays clean per path.
//
// GC.Collect at every boundary: removes residual allocation pressure from the previous phase (write-buffer
// pool churn from Ser, deserialized object graph from Des) so the next phase starts with a quiescent
// heap — GC generation-promotion timing during measurement is then driven only by THAT phase's allocations.
//
// JitSleep per-phase: tiered JIT background promotion drain after each warmup (mode-aware: 0 ms in AOT).
// Each phase's freshly-promoted methods settle before its timing starts.
System.Console.WriteLine($"Running benchmarks (target ~{TargetSampleMs} ms/sample × {BenchmarkSamples} samples median, phase-isolated warmup/measure per Ser/Des)...\n");
foreach (var serializer in serializers)
{
// Warmup THIS serializer right before benching it — keeps its hot code/data in cache.
serializer.Warmup(WarmupIterations);
// Wait for tiered JIT background compilation to drain (mode-aware: 0ms in AOT).
// Per-serializer instead of once globally — guarantees this serializer's freshly-promoted
// methods are settled before timing, regardless of when it appears in the iteration order.
if (JitSleep > 0) Thread.Sleep(JitSleep);
// Adaptive iter calibration — per Ser/Des/RT function, post-warmup. Each function gets its
// own iter count tuned to TargetSampleMs (typically 250 ms). The 100-iter calibration cost
// is amortized over the BenchmarkSamples + 1 (pilot) recorded measurements that follow.
int serIter = TestIterations, desIter = TestIterations, rtIter = TestIterations;
if (serializer.IsRoundTripOnly)
{
if (mode is "all" or "serialize" or "ser")
rtIter = CalibrateIterations(() => serializer.Serialize(), TargetSampleMs);
}
else
{
if (mode is "all" or "serialize" or "ser")
serIter = CalibrateIterations(() => serializer.Serialize(), TargetSampleMs);
if (mode is "all" or "deserialize" or "des")
desIter = CalibrateIterations(() => serializer.Deserialize(), TargetSampleMs);
}
var result = new BenchmarkResult
{
TestDataName = testData.DisplayName, // Use DisplayName for IId% info
@ -663,26 +665,38 @@ public static class Program
if (serializer.IsRoundTripOnly)
{
// Round-trip-only benchmarks (NamedPipe etc.): measure the full pipe round-trip directly into the RT
// columns. Ser ms / SerAlloc / Des ms / DesAlloc stay 0 → display as "N/A". Allocation uses the
// process-wide measurement so the server-drain-thread allocations (e.g. server-side new byte[len])
// also show up — otherwise current-thread alloc would only count the client side and look ~halved.
// Round-trip-only benchmarks (NamedPipe etc.): single phase — Serialize() performs the full RT,
// Deserialize() is a no-op. We use the Ser-phase entry-points (WarmupSerialize) to warm the
// entire round-trip path, then record into the RT result columns.
if (mode is "all" or "serialize" or "ser")
{
ForceGcCollect();
serializer.WarmupSerialize(WarmupIterations);
if (JitSleep > 0) Thread.Sleep(JitSleep);
var rtIter = CalibrateIterations(() => serializer.Serialize(), TargetSampleMs);
var (rtMed, rtMin, rtMax, rtStd) = RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");
result.RoundTripTimeMs = rtMed;
result.RoundTripTimeMinMs = rtMin;
result.RoundTripTimeMaxMs = rtMax;
result.RoundTripTimeStdDevMs = rtStd;
result.RoundTripIterations = rtIter;
// Process-wide allocation measurement: server-drain-thread allocations (server-side new byte[len])
// also show up — otherwise current-thread alloc would only count the client side and look ~halved.
result.RoundTripAllocBytesPerOp = MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]");
}
// mode == "deserialize" alone is meaningless for a round-trip-only benchmark; skip silently.
}
else
{
// ── Ser phase ── isolated warmup → JitSleep → calibrate → time → alloc; preceded by GC.Collect.
if (mode is "all" or "serialize" or "ser")
{
ForceGcCollect();
serializer.WarmupSerialize(WarmupIterations);
if (JitSleep > 0) Thread.Sleep(JitSleep);
var serIter = CalibrateIterations(() => serializer.Serialize(), TargetSampleMs);
var (serMed, serMin, serMax, serStd) = RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");
result.SerializeTimeMs = serMed;
result.SerializeTimeMinMs = serMin;
@ -693,8 +707,16 @@ public static class Program
result.SerializeAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
}
// ── Des phase ── isolated warmup → JitSleep → calibrate → time → alloc; preceded by GC.Collect.
// The GC.Collect here is critical: it discards the Ser-phase's write-buffer pool churn so the
// Des-phase's allocation measurement reflects ONLY Des-side allocations (deserialized object graph).
if (mode is "all" or "deserialize" or "des")
{
ForceGcCollect();
serializer.WarmupDeserialize(WarmupIterations);
if (JitSleep > 0) Thread.Sleep(JitSleep);
var desIter = CalibrateIterations(() => serializer.Deserialize(), TargetSampleMs);
var (desMed, desMin, desMax, desStd) = RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");
result.DeserializeTimeMs = desMed;
result.DeserializeTimeMinMs = desMin;
@ -708,10 +730,10 @@ public static class Program
// batch-time addition would be misleading. Instead: compute per-op µs (iter-independent),
// then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that
// RoundTripTimeMs / RoundTripIterations * 1000 == SerPerOp + DesPerOp.
var serPerOp = ToPerOpMicros(result.SerializeTimeMs, serIter);
var desPerOp = ToPerOpMicros(result.DeserializeTimeMs, desIter);
var serPerOp = ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations);
var desPerOp = ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations);
var rtPerOp = serPerOp + desPerOp;
result.RoundTripIterations = Math.Max(serIter, desIter);
result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations);
result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
}
@ -1469,7 +1491,28 @@ public static class Program
/// rankings (because both metrics are misleading there) — they still participate in "Fastest Round-trip".
/// Default false for in-memory IO modes which measure Ser and Des separately.</summary>
bool IsRoundTripOnly => false;
/// <summary>Combined warmup (Ser + Deser interleaved). Kept for backward-compat with <c>ProfilerMode</c>
/// and other callers that don't need phase-separated warmup. The benchmark loop prefers the split
/// <see cref="WarmupSerialize"/> + <see cref="WarmupDeserialize"/> pair for cache-isolated measurements.</summary>
/// <param name="iterations">Requested number of warmup iterations. This member has no default body, so how
/// the count is applied to the Ser and Deser paths is defined by each implementor.</param>
void Warmup(int iterations);
/// <summary>Serialize-only warmup. Unless an implementor overrides it, this just invokes
/// <see cref="Serialize"/> <paramref name="iterations"/> times in a row. An override is only worthwhile when
/// the implementor needs Ser-specific warmup state (e.g. pre-allocated buffers).
/// For <see cref="IsRoundTripOnly"/> benchmarks (NamedPipe-style), <see cref="Serialize"/> performs the whole
/// round trip, so this call warms the complete round-trip path.</summary>
/// <param name="iterations">How many times to call <see cref="Serialize"/>; zero or a negative value
/// results in no calls.</param>
void WarmupSerialize(int iterations)
{
    var remaining = iterations;
    while (remaining > 0)
    {
        Serialize();
        remaining--;
    }
}
/// <summary>Deserialize-only warmup. Unless an implementor overrides it, this just invokes
/// <see cref="Deserialize"/> <paramref name="iterations"/> times in a row.
/// For <see cref="IsRoundTripOnly"/> benchmarks <see cref="Deserialize"/> is a no-op, which is why the bench
/// loop skips the Des phase entirely for those cells.</summary>
/// <param name="iterations">How many times to call <see cref="Deserialize"/>; zero or a negative value
/// results in no calls.</param>
void WarmupDeserialize(int iterations)
{
    var remaining = iterations;
    while (remaining > 0)
    {
        Deserialize();
        remaining--;
    }
}
void Serialize();
void Deserialize();
/// <summary>Round-trip correctness check — called once per cell before warmup. Returns true if Serialize+Deserialize preserves data.</summary>
@ -3274,7 +3317,7 @@ public static class Program
var sb = new StringBuilder();
var testTypeName = testDataSets.FirstOrDefault()?.TypeName ?? "unknown";
sb.AppendLine($"# AcBinary Benchmark {BuildConfiguration} {DateTime.Now:yyyy-MM-dd HH:mm:ss}");
sb.AppendLine($"Charset: {GetCurrentCharsetName()} | Iterations: per-cell adaptive (target ~{TargetSampleMs} ms/sample) | Warmup: {WarmupIterations} | Samples: {BenchmarkSamples} (median) + 1 pilot discarded | .NET: {Environment.Version} | TestType: {testTypeName} | UnstableCV threshold: {UnstableCVThreshold * 100:F0}%");
sb.AppendLine($"Charset: {GetCurrentCharsetName()} | Iterations: per-cell adaptive (target ~{TargetSampleMs} ms/sample) | Warmup: {WarmupIterations} per phase (Ser/Des isolated) | Samples: {BenchmarkSamples} (median) + 1 pilot discarded | .NET: {Environment.Version} | TestType: {testTypeName} | UnstableCV threshold: {UnstableCVThreshold * 100:F0}%");
sb.AppendLine($"Baseline: {"MemoryPack (Byte[])"} (SOTA reference) | Verified: round-trip correctness checked once per cell before warmup");
// Options summary