diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index c912797..2e79706 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -67,7 +67,8 @@
"Read(//h/Applications/Mango/LLM_PLAN//**)",
"Bash(curl -s \"https://raw.githubusercontent.com/dotnet/runtime/main/src/libraries/System.IO.Pipelines/src/System/IO/Pipelines/StreamPipeWriter.cs\")",
"WebFetch(domain:lemire.me)",
- "Bash(gh pr *)"
+ "Bash(gh pr *)",
+ "Bash(gh api *)"
]
}
}
diff --git a/AyCode.Core.Serializers.Console/Program.cs b/AyCode.Core.Serializers.Console/Program.cs
index 2ddba06..19f0b5a 100644
--- a/AyCode.Core.Serializers.Console/Program.cs
+++ b/AyCode.Core.Serializers.Console/Program.cs
@@ -47,7 +47,7 @@ public static class Program
#else
private static int WarmupIterations = 10000; //5000
private static int TestIterations = 1000; //1000
- private static int BenchmarkSamples = 5;
+ private static int BenchmarkSamples = 10;
#endif
// Interactive settings: selected AcBinary wire mode for benchmark runs.
@@ -88,7 +88,26 @@ public static class Program
private const string ModeRuntime = "Runtime";
private const string ModeHybrid = "Hybrid";
- private const int JitSleep = 3000;
+ // Per-cell adaptive iteration target wall-clock duration (ms). Each Ser/Des function calibrates its
+ // own iteration count post-warmup so the sample batch lands near this target — equalizes the
+ // per-sample window across cells of vastly different per-op cost (Small ~6 ns/op vs Large
+ // ~140 µs/op). Below ~100 ms Stopwatch precision and OS preempt spikes start to dominate.
+ private const int TargetSampleMs = 250;
+
+ // CV (coefficient of variation = stddev / mean) threshold above which a row's range is flagged
+ // as "unstable" in the markdown output (⚠️ marker); FormatMicrosWithRange uses the median batch
+ // time as the mean estimate. 3% is a reasonable noise-floor expectation for stabilized in-memory
+ // benchmarks; rows above it should be discounted when reading sub-3% inter-engine deltas.
+ private const double UnstableCVThreshold = 0.03;
+
+ // JIT-tier-promotion drain delay between warmup and measurement.
+ // - JIT mode (RuntimeFeature.IsDynamicCodeCompiled == true): tiered JIT promotes hot methods
+ // in a background thread; we wait briefly for the queue to drain so the first measurement
+ // sample doesn't catch a Tier-0 → Tier-1 transition mid-flight.
+ // - AOT mode (NativeAOT publish): no dynamic compilation happens; the sleep is pure noise.
+ // 250ms (vs the historical 3000ms) is sufficient for a few-method working set under .NET 9's
+ // tiered JIT — empirically the queue drains in <100ms for the bench's hot path.
+ private static int JitSleep => System.Runtime.CompilerServices.RuntimeFeature.IsDynamicCodeCompiled ? 250 : 0;
// OptionsPreset values are passed per-instance (constructor argument), not constants —
// each CreateSerializers call line specifies its own preset name (e.g. "FastMode", "NoIntern").
@@ -150,7 +169,52 @@ public static class Program
/// — only its sample noise grows). Symmetric with the already-per-op *AllocBytesPerOp fields.
///
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static double ToPerOpMicros(double totalMs) => totalMs / TestIterations * 1000.0;
+ /// <remarks>
+ /// Converts a total time (ms across <paramref name="iterations"/> operations) into per-operation µs;
+ /// returns 0 when <paramref name="iterations"/> is not positive. Per-op µs is iter-independent: 1000
+ /// and 50000 iter of the same operation should give the same value (within noise). Needed because
+ /// adaptive calibration makes the iteration count per-row — there is no single global TestIterations.
+ /// </remarks>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static double ToPerOpMicros(double totalMs, int iterations) => iterations > 0 ? totalMs / iterations * 1000.0 : 0;
+
+ // Per-row per-op µs accessors — pull median batch time + iter from BenchmarkResult and convert via
+ // ToPerOpMicros (0 when the row's iter count is 0). Used wherever averaging or comparison happens
+ // across rows with potentially different iter counts (Winners summary, Overall comparison, per-cell
+ // summary row). Kept as methods, not BenchmarkResult properties, to preserve the data-bag distinction.
+ private static double SerPerOp(BenchmarkResult r) => ToPerOpMicros(r.SerializeTimeMs, r.SerializeIterations);
+ private static double DesPerOp(BenchmarkResult r) => ToPerOpMicros(r.DeserializeTimeMs, r.DeserializeIterations);
+ private static double RtPerOp(BenchmarkResult r) => ToPerOpMicros(r.RoundTripTimeMs, r.RoundTripIterations);
+
+ /// <summary>
+ /// Formats a per-op micros value with its inter-sample range and CV-threshold marker as
+ /// "26.86 (24.50..29.10)" or "26.86 (24.50..29.10) ⚠️5.2%". Median first, range in parentheses,
+ /// CV warning suffix only when CV (stddev / median) > <see cref="UnstableCVThreshold"/>. When
+ /// min == max == median (single-sample / Debug / quick mode), collapses to bare median.
+ /// All time inputs are total-batch milliseconds; <paramref name="iterations"/> is the per-row iter
+ /// count (post-adaptive-calibration).
+ /// </summary>
+ private static string FormatMicrosWithRange(double medianMs, double minMs, double maxMs, double stdDevMs, int iterations, System.Globalization.CultureInfo inv)
+ {
+ var med = ToPerOpMicros(medianMs, iterations);
+ // No range data (min/max left at their 0 default, i.e. never populated) — surface as bare median, identical to the prior format.
+ if (minMs <= 0 && maxMs <= 0) return med.ToString("F2", inv);
+ if (minMs >= medianMs && maxMs <= medianMs) return med.ToString("F2", inv); // degenerate range: min == max == median (single-sample path)
+ var min = ToPerOpMicros(minMs, iterations);
+ var max = ToPerOpMicros(maxMs, iterations);
+ var range = $"{med.ToString("F2", inv)} ({min.ToString("F2", inv)}..{max.ToString("F2", inv)})";
+ // CV (coefficient of variation) — computed here as stddev / median rather than stddev / mean. Rows above the
+ // unstable threshold are flagged so a small inter-engine delta on a high-CV row is easy to discount as noise.
+ if (medianMs > 0 && stdDevMs > 0)
+ {
+ var cv = stdDevMs / medianMs;
+ if (cv > UnstableCVThreshold)
+ {
+ var cvPct = (cv * 100).ToString("F1", inv);
+ return $"{range} ⚠️{cvPct}%";
+ }
+ }
+ return range;
+ }
///
/// Converts a byte count to KB (1 KB = 1024 B). Display-only helper so allocation columns can
@@ -225,7 +289,8 @@ public static class Program
BenchmarkSamples = 3;
layer = "all";
}
- else if (arg is "core" or "comprehensive" or "edge" or "all")
+ else if (arg is "core" or "comprehensive" or "edge" or "all"
+ or "small" or "medium" or "large" or "repeated" or "deep")
{
layer = arg;
}
@@ -265,64 +330,121 @@ public static class Program
System.Console.WriteLine("║ COMPREHENSIVE SERIALIZER BENCHMARK SUITE ║");
System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");
- var allResults = new List();
- var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
- var testDataSets = FilterByLayer(allTestDataSets, layer);
+ // Stabilization: pin the entire benchmark process to a single logical CPU and bump priority
+ // class. Single-core affinity stops Windows from migrating the bench thread between cores
+ // mid-sample (a migration evicts L1/L2 caches and corrupts a measurement); High priority
+ // reduces preemption by background tasks (Defender scans, indexer, etc.) that otherwise
+ // randomly inflate samples by 5-15%.
+ // Try/finally guarantees the original state is restored even if a benchmark throws — leaving
+ // a developer machine pinned to one core after a crashed run is a real foot-gun.
+ // Skipped on Debug single-sample mode (BenchmarkSamples <= 1) where stabilization is moot.
+ var process = Process.GetCurrentProcess();
+ var origAffinity = (IntPtr)0;
+ var origPriority = ProcessPriorityClass.Normal;
+ var stabilizationApplied = false;
- System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Iterations: {TestIterations} | Warmup: {WarmupIterations} | Samples: {BenchmarkSamples} (median)");
- System.Console.WriteLine($"Build: {BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
- System.Console.WriteLine();
-
- // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens.
- // Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup
- // alone doesn't ensure that every Serialize/IBufferWriter overload is fully Tier 1 by the time we
- // start measuring. Symptom: first cell's BufferWriter variants run ~2x slower than the SAME variants
- // on later cells (e.g. Small BufWr reuse 9ms vs Medium BufWr reuse 4ms — even though Medium is bigger).
- // Pre-warmup runs every overload at least once with each data shape so .NET 9's tiered JIT promotes
- // them all in the background; the per-cell warmup that follows then locks in cache + branch state.
- if (BenchmarkSamples > 1) // skip in DEBUG (single-sample fast iteration)
+ // ProcessorAffinity is only supported on Windows + Linux (CA1416). On other platforms (e.g.
+ // macOS) the whole stabilization step — affinity AND priority — is skipped by the guard below;
+ // raising only the priority class there would need a separate branch outside this check.
+ if (BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
{
- System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");
- foreach (var testData in testDataSets)
+ try
{
- var preSerializers = CreateSerializers(testData, serializerMode);
- try
+ origAffinity = process.ProcessorAffinity;
+ origPriority = process.PriorityClass;
+ // Arm the restore BEFORE mutating anything: if the affinity change succeeds but the
+ // priority change throws, the outer finally must still undo the affinity pin.
+ stabilizationApplied = true;
+ // Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one core,
+ // consistently". If CPU 0 is heavily contended on the host the user can tweak the mask here.
+ // In-memory rows are single-threaded; round-trip-only NamedPipe rows have a server-drain
+ // thread that will share the core (acceptable — the bench measures end-to-end RT anyway).
+ process.ProcessorAffinity = (IntPtr)1;
+ process.PriorityClass = ProcessPriorityClass.High;
+ System.Console.WriteLine($"Stabilization: pinned to CPU 0 (affinity=0x1), priority=High.");
+ }
+ catch (Exception ex)
+ {
+ // Affinity/priority changes may fail on locked-down hosts (group policies, containers
+ // without CAP_SYS_NICE on Linux, etc.). Surface and continue; any partially-applied change
+ // is undone by the restore in the outer finally because stabilizationApplied is already set.
+ System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}");
+ }
+ }
+
+ try
+ {
+ var allResults = new List();
+ var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
+ var testDataSets = FilterByLayer(allTestDataSets, layer);
+
+ System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{TargetSampleMs} ms target) | Warmup: {WarmupIterations} | Samples: {BenchmarkSamples} (median) + pilot discard");
+ System.Console.WriteLine($"Build: {BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
+ System.Console.WriteLine();
+
+ // Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens.
+ // Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup
+ // alone doesn't ensure that every Serialize/IBufferWriter overload is fully Tier 1 by the time we
+ // start measuring. Symptom: first cell's BufferWriter variants run ~2x slower than the SAME variants
+ // on later cells (e.g. Small BufWr reuse 9ms vs Medium BufWr reuse 4ms — even though Medium is bigger).
+ // Pre-warmup runs every overload at least once with each data shape so .NET 9's tiered JIT promotes
+ // them all in the background; the per-cell warmup that follows then locks in cache + branch state.
+ if (BenchmarkSamples > 1) // skip in DEBUG (single-sample fast iteration)
+ {
+ System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");
+ foreach (var testData in testDataSets)
{
- foreach (var s in preSerializers)
+ var preSerializers = CreateSerializers(testData, serializerMode);
+ try
{
- // Light warmup just to trigger Tier 0 → Tier 1 promotion. The per-cell 5000-iter warmup
- // inside RunBenchmarksForTestData still runs afterwards for cache/BTB warming.
- s.Warmup(2000);
+ foreach (var s in preSerializers)
+ {
+ // Light warmup just to trigger Tier 0 → Tier 1 promotion. The per-cell 5000-iter warmup
+ // inside RunBenchmarksForTestData still runs afterwards for cache/BTB warming.
+ s.Warmup(2000);
+ }
+ }
+ finally
+ {
+ // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
+ foreach (var s in preSerializers) (s as IDisposable)?.Dispose();
}
}
- finally
- {
- // Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
- foreach (var s in preSerializers) (s as IDisposable)?.Dispose();
- }
+ // Let background tiered-JIT compilation drain before we begin measuring.
+ if (JitSleep > 0) Thread.Sleep(JitSleep);
+ System.Console.WriteLine("✓ Global pre-warmup complete.\n");
}
- // Let background tiered-JIT compilation drain before we begin measuring.
- Thread.Sleep(JitSleep);
- System.Console.WriteLine("✓ Global pre-warmup complete.\n");
- }
- foreach (var testData in testDataSets)
+ foreach (var testData in testDataSets)
+ {
+ System.Console.WriteLine($"\n{'═'.ToString().PadRight(70, '═')}");
+ System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
+ System.Console.WriteLine($"{'═'.ToString().PadRight(70, '═')}");
+
+ var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
+ allResults.AddRange(results);
+ }
+
+ // Print grouped results
+ PrintGroupedResults(allResults, testDataSets);
+
+ // Save results to file
+ SaveResults(allResults, testDataSets);
+
+ System.Console.WriteLine("\n✓ Benchmark complete!");
+ }
+ finally
{
- System.Console.WriteLine($"\n{'═'.ToString().PadRight(70, '═')}");
- System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
- System.Console.WriteLine($"{'═'.ToString().PadRight(70, '═')}");
-
- var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
- allResults.AddRange(results);
+ // Restore process state — affinity/priority changes are process-wide and persist across
+ // interactive-mode iterations of the menu. Without restore, the second menu run would
+ // already be on CPU-0 + High priority before its own try-block applied them, masking
+ // any stabilization-disabled comparison.
+ if (stabilizationApplied && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
+ {
+ try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
+ try { process.PriorityClass = origPriority; } catch { /* best-effort */ }
+ }
}
-
- // Print grouped results
- PrintGroupedResults(allResults, testDataSets);
-
- // Save results to file
- SaveResults(allResults, testDataSets);
-
- System.Console.WriteLine("\n✓ Benchmark complete!");
}
///
@@ -404,21 +526,39 @@ public static class Program
System.Console.WriteLine("✓ All serializers passed round-trip verification.");
- // Warmup all serializers
- System.Console.WriteLine($"Warming up ({WarmupIterations} iterations)...");
+ // Per-serializer (warmup → calibrate → measurement) cycle: each serializer warms up IMMEDIATELY
+ // before its own bench, then calibrates iter per-function (Ser and Des independently) so each
+ // sample lands at ~TargetSampleMs wall-clock. This avoids cache pollution AND equalizes sample
+ // window length across cells of vastly different per-op cost.
+ System.Console.WriteLine($"Running benchmarks (target ~{TargetSampleMs} ms/sample × {BenchmarkSamples} samples median, per-serializer warmup + adaptive iter)...\n");
+
foreach (var serializer in serializers)
{
+ // Warmup THIS serializer right before benching it — keeps its hot code/data in cache.
serializer.Warmup(WarmupIterations);
- }
- // Wait for tiered JIT background compilation to complete
- Thread.Sleep(JitSleep);
+ // Wait for tiered JIT background compilation to drain (mode-aware: 0ms in AOT).
+ // Per-serializer instead of once globally — guarantees this serializer's freshly-promoted
+ // methods are settled before timing, regardless of when it appears in the iteration order.
+ if (JitSleep > 0) Thread.Sleep(JitSleep);
- // Run benchmarks
- System.Console.WriteLine($"Running benchmarks ({TestIterations} iterations × {BenchmarkSamples} samples median)...\n");
+ // Adaptive iter calibration — per Ser/Des/RT function, post-warmup. Each function gets its
+ // own iter count tuned to TargetSampleMs (typically 250 ms). The 100-iter calibration cost
+ // is amortized over the BenchmarkSamples + 1 (pilot) recorded measurements that follow.
+ int serIter = TestIterations, desIter = TestIterations, rtIter = TestIterations;
+ if (serializer.IsRoundTripOnly)
+ {
+ if (mode is "all" or "serialize" or "ser")
+ rtIter = CalibrateIterations(() => serializer.Serialize(), TargetSampleMs);
+ }
+ else
+ {
+ if (mode is "all" or "serialize" or "ser")
+ serIter = CalibrateIterations(() => serializer.Serialize(), TargetSampleMs);
+ if (mode is "all" or "deserialize" or "des")
+ desIter = CalibrateIterations(() => serializer.Deserialize(), TargetSampleMs);
+ }
- foreach (var serializer in serializers)
- {
var result = new BenchmarkResult
{
TestDataName = testData.DisplayName, // Use DisplayName for IId% info
@@ -445,8 +585,13 @@ public static class Program
// also show up — otherwise current-thread alloc would only count the client side and look ~halved.
if (mode is "all" or "serialize" or "ser")
{
- result.RoundTripTimeMs = RunTimed(() => serializer.Serialize(), TestIterations, $"{groupLabel} [RT timing]");
- result.RoundTripAllocBytesPerOp = MeasureAllocationTotal(() => serializer.Serialize(), TestIterations, $"{groupLabel} [RT alloc]");
+ var (rtMed, rtMin, rtMax, rtStd) = RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");
+ result.RoundTripTimeMs = rtMed;
+ result.RoundTripTimeMinMs = rtMin;
+ result.RoundTripTimeMaxMs = rtMax;
+ result.RoundTripTimeStdDevMs = rtStd;
+ result.RoundTripIterations = rtIter;
+ result.RoundTripAllocBytesPerOp = MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]");
}
// mode == "deserialize" alone is meaningless for a round-trip-only benchmark; skip silently.
}
@@ -454,19 +599,36 @@ public static class Program
{
if (mode is "all" or "serialize" or "ser")
{
- result.SerializeTimeMs = RunTimed(() => serializer.Serialize(), TestIterations, $"{groupLabel} [Ser timing]");
+ var (serMed, serMin, serMax, serStd) = RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");
+ result.SerializeTimeMs = serMed;
+ result.SerializeTimeMinMs = serMin;
+ result.SerializeTimeMaxMs = serMax;
+ result.SerializeTimeStdDevMs = serStd;
+ result.SerializeIterations = serIter;
// Dedicated alloc-only sample (separate from timing samples; keeps timing pure)
- result.SerializeAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), TestIterations, $"{groupLabel} [Ser alloc]");
+ result.SerializeAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
}
if (mode is "all" or "deserialize" or "des")
{
- result.DeserializeTimeMs = RunTimed(() => serializer.Deserialize(), TestIterations, $"{groupLabel} [Des timing]");
- result.DeserializeAllocBytesPerOp = MeasureAllocation(() => serializer.Deserialize(), TestIterations, $"{groupLabel} [Des alloc]");
+ var (desMed, desMin, desMax, desStd) = RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");
+ result.DeserializeTimeMs = desMed;
+ result.DeserializeTimeMinMs = desMin;
+ result.DeserializeTimeMaxMs = desMax;
+ result.DeserializeTimeStdDevMs = desStd;
+ result.DeserializeIterations = desIter;
+ result.DeserializeAllocBytesPerOp = MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]");
}
- // Compose RT from Ser+Des (the previously computed property's behavior, now explicit since RT is settable).
- result.RoundTripTimeMs = result.SerializeTimeMs + result.DeserializeTimeMs;
+ // Compose RT from Ser+Des. Because Ser and Des may have DIFFERENT iter counts post-calibration,
+ // batch-time addition would be misleading. Instead: compute per-op µs (iter-independent),
+ // then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that
+ // RoundTripTimeMs / RoundTripIterations * 1000 == SerPerOp + DesPerOp.
+ var serPerOp = ToPerOpMicros(result.SerializeTimeMs, serIter);
+ var desPerOp = ToPerOpMicros(result.DeserializeTimeMs, desIter);
+ var rtPerOp = serPerOp + desPerOp;
+ result.RoundTripIterations = Math.Max(serIter, desIter);
+ result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
}
@@ -655,40 +817,125 @@ public static class Program
///
/// Runs the action times for independent samples,
- /// returning the median elapsed time. Multi-sample design reduces single-run variance from ~±15% to ~±5%
- /// by smoothing transient effects (background activity, thermal/turbo state, JIT tier-promotion timing).
+ /// returning the median, min, max and population stddev of the per-sample elapsed time (ms). Multi-sample
+ /// design reduces single-run variance from ~±15% to ~±5% by smoothing transient effects (background activity, thermal/turbo state).
/// When <= 1, falls back to single-sample timing (Debug / quick mode).
/// When is non-null, emits in-place \r progress updates so a
/// stuck benchmark (e.g. deadlocked NamedPipe row) is visibly stuck at a specific %% rather than
/// silently hanging.
+ ///
+ /// Stabilization (added 2026-05-07):
+ /// 1) Pilot sample is run BEFORE the recorded loop and discarded. The first measurement after
+ /// warmup tends to absorb residual JIT bookkeeping and GC bookkeeping; dropping it tightens
+ /// the min/max range without throwing away signal (the median is the SAME data as before).
+ /// 2) GC.Collect / WaitForPendingFinalizers / GC.Collect runs BEFORE every recorded sample.
+ /// Without this, GC pressure from sample N occasionally triggered a Gen-2 pause inside
+ /// sample N+1, painting it as an outlier; collecting up-front gives every sample the
+ /// same starting heap shape.
+ /// 3) Returns (median, min, max, stddev) so the caller can surface the inter-sample range and CV —
+ /// visible noise floor for the row, replacing the previous "median only" view.
///
- private static double RunTimed(Action action, int iterations, string? progressLabel = null)
+ private static (double medianMs, double minMs, double maxMs, double stdDevMs) RunTimed(Action action, int iterations, string? progressLabel = null)
{
var samples = BenchmarkSamples;
if (samples <= 1)
{
- // Single-sample fast path (Debug or trivial run) — no allocation, no sort.
+ // Single-sample fast path (Debug or trivial run) — no allocation, no sort; min == max == median, stddev 0.
var sw = Stopwatch.StartNew();
RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
sw.Stop();
- EndProgress(progressLabel, sw.Elapsed.TotalMilliseconds);
- return sw.Elapsed.TotalMilliseconds;
+ var ms = sw.Elapsed.TotalMilliseconds;
+ EndProgress(progressLabel, ms);
+ return (ms, ms, ms, 0);
}
+ // Pilot sample (discarded). Counts as sample index 0 of (samples + 1) for progress display
+ // so the user sees an extra "warmup-ish" tick before the recorded samples start.
+ GC.Collect();
+ GC.WaitForPendingFinalizers();
+ GC.Collect();
+ var pilotSw = Stopwatch.StartNew();
+ RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: 0);
+ pilotSw.Stop();
+ // intentionally not stored — pilotSw.Elapsed is never read
+
var times = new double[samples];
for (var s = 0; s < samples; s++)
{
+ // Per-sample GC settle. Forces every sample to start from the same heap state, so
+ // a Gen-2 pause caused by the previous sample doesn't bleed into the next sample's
+ // timing. Cost is paid OUTSIDE the Stopwatch window — no impact on the measurement.
+ GC.Collect();
+ GC.WaitForPendingFinalizers();
+ GC.Collect();
+
var sw = Stopwatch.StartNew();
- RunWithProgress(action, iterations, progressLabel, samples, s);
+ RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: s + 1);
sw.Stop();
times[s] = sw.Elapsed.TotalMilliseconds;
}
- Array.Sort(times);
+ // Capture min/max/sum/sumSq BEFORE sort to avoid order ambiguity (Array.Sort is in-place).
+ var minMs = double.MaxValue;
+ var maxMs = double.MinValue;
+ var sum = 0.0;
+ var sumSq = 0.0;
+ for (var i = 0; i < times.Length; i++)
+ {
+ var t = times[i];
+ sum += t;
+ sumSq += t * t;
+ if (t < minMs) minMs = t;
+ if (t > maxMs) maxMs = t;
+ }
+ // Population stddev (not sample-stddev — we treat the captured samples as the population for
+ // CV computation). variance = E[X²] - E[X]² with Math.Max(0, ...) guard against tiny negative
+ // values from FP rounding when samples are nearly identical.
+ var mean = sum / times.Length;
+ var variance = (sumSq / times.Length) - (mean * mean);
+ var stdDevMs = Math.Sqrt(Math.Max(0.0, variance));
+
+ Array.Sort(times);
// Median: middle value for odd sample counts, average of two middles for even counts.
var medianMs = samples % 2 == 1 ? times[samples / 2] : (times[samples / 2 - 1] + times[samples / 2]) / 2.0;
EndProgress(progressLabel, medianMs);
- return medianMs;
+ return (medianMs, minMs, maxMs, stdDevMs);
+ }
+
+ /// <summary>
+ /// Per-cell adaptive iteration calibration. Runs a 100-iter measurement after warmup and computes
+ /// how many iterations are needed to reach <paramref name="targetMs"/> wall-clock per sample.
+ /// Returns iter rounded UP to the nearest 1000, floored at 1000 (the prior fixed minimum) and
+ /// ceiling-capped at 200_000 (NOTE(review): ops faster than targetMs / 200_000 per call cannot reach
+ /// the target at this cap — confirm intended). In Debug single-sample mode (BenchmarkSamples &lt;= 1)
+ /// returns the global <see cref="TestIterations"/> unchanged. Calibration runs OUTSIDE the timed
+ /// sample loop and does NOT count toward warmup; its sole purpose is to measure per-op cost.
+ /// </summary>
+ private static int CalibrateIterations(Action action, int targetMs)
+ {
+ if (BenchmarkSamples <= 1) return TestIterations; // Debug fast path
+
+ GC.Collect();
+ GC.WaitForPendingFinalizers();
+ GC.Collect();
+
+ const int calibIter = 100;
+ var sw = Stopwatch.StartNew();
+ for (var i = 0; i < calibIter; i++) action();
+ sw.Stop();
+ var ms = sw.Elapsed.TotalMilliseconds;
+
+ // Pathologically-fast op below Stopwatch resolution — cap at ceiling (further calibration won't help).
+ if (ms <= 0.0001) return 200_000;
+
+ var iterPerMs = calibIter / ms;
+ // Clamp in double space BEFORE the int cast: a large targetMs × very fast op can exceed the int
+ // range, and an out-of-range double-to-int conversion would yield a wrong iteration count.
+ var raw = (int)Math.Clamp(Math.Ceiling(targetMs * iterPerMs), 0.0, 200_000.0);
+ // Round UP to nearest 1000 — keeps numbers human-readable in the markdown output.
+ var rounded = ((raw + 999) / 1000) * 1000;
+ if (rounded < 1000) return 1000;
+ return rounded;
}
///
@@ -913,6 +1160,7 @@ public static class Program
System.Console.WriteLine("─────────────────────────────────────────────");
System.Console.WriteLine(" [1] Iteration — Warmup / Iterations / Samples");
System.Console.WriteLine($" [2] WireMode — current: {SelectedWireMode}");
+ System.Console.WriteLine($" [3] Charset — current: {GetCurrentCharsetName()}");
System.Console.WriteLine(" [B] Back");
System.Console.Write("\nSelection: ");
@@ -927,6 +1175,83 @@ public static class Program
case '2':
ShowWireModeSettingsMenu();
break;
+ case '3':
+ ShowCharsetSettingsMenu();
+ break;
+ case 'b':
+ return;
+ default:
+ continue;
+ }
+ }
+ }
+
+ /// <summary>
+ /// Returns a human-readable name for the currently-active BenchmarkTestDataProvider.LongStringSuffix
+ /// charset. Returns "Custom" when the suffix doesn't match any of the predefined
+ /// <see cref="CharsetSuffixes"/> constants (compared in the order below; first match wins). Used in
+ /// menu state display, console run header, and the .LLM markdown output header so per-charset bench files are self-documenting.
+ /// </summary>
+ private static string GetCurrentCharsetName()
+ {
+ var s = BenchmarkTestDataProvider.LongStringSuffix;
+ if (s == CharsetSuffixes.Latin1FixAscii) return "Latin1FixAscii";
+ if (s == CharsetSuffixes.Latin1Short) return "Latin1Short";
+ if (s == CharsetSuffixes.Latin1Long) return "Latin1Long";
+ if (s == CharsetSuffixes.CjkBmp) return "CjkBmp";
+ if (s == CharsetSuffixes.Cyrillic) return "Cyrillic";
+ if (s == CharsetSuffixes.Mixed) return "Mixed";
+ return "Custom";
+ }
+
+ private static void ShowCharsetSettingsMenu()
+ {
+ while (true)
+ {
+ System.Console.WriteLine();
+ System.Console.WriteLine("─────────────────────────────────────────────");
+ System.Console.WriteLine("Charset settings — long-string suffix profile");
+ System.Console.WriteLine("─────────────────────────────────────────────");
+ System.Console.WriteLine($"Current: {GetCurrentCharsetName()}");
+ System.Console.WriteLine();
+ System.Console.WriteLine(" [1] Latin1FixAscii — empty suffix; short FixStr-fast-path stress (Latin1 baseline values stay short)");
+ System.Console.WriteLine(" [2] Latin1Short — \" árvíztűrő tükörfúrógép\" (~24 char Hungarian mixed)");
+ System.Console.WriteLine(" [3] Latin1Long — ~47-char Latin1 mixed (default; exceeds FixStr boundary)");
+ System.Console.WriteLine(" [4] CjkBmp — CJK BMP (long 3-byte runs)");
+ System.Console.WriteLine(" [5] Cyrillic — Russian Cyrillic (long 2-byte runs)");
+ System.Console.WriteLine(" [6] Mixed — Hungarian + CJK + Cyrillic + emoji (full-spectrum + surrogate pairs)");
+ System.Console.WriteLine(" [B] Back");
+ System.Console.Write("\nSelection: ");
+
+ var key = System.Console.ReadKey(intercept: false).KeyChar;
+ System.Console.WriteLine();
+
+ switch (char.ToLower(key))
+ {
+ case '1':
+ BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.Latin1FixAscii;
+ System.Console.WriteLine("✓ Charset set to Latin1FixAscii");
+ return;
+ case '2':
+ BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.Latin1Short;
+ System.Console.WriteLine("✓ Charset set to Latin1Short");
+ return;
+ case '3':
+ BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.Latin1Long;
+ System.Console.WriteLine("✓ Charset set to Latin1Long");
+ return;
+ case '4':
+ BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.CjkBmp;
+ System.Console.WriteLine("✓ Charset set to CjkBmp");
+ return;
+ case '5':
+ BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.Cyrillic;
+ System.Console.WriteLine("✓ Charset set to Cyrillic");
+ return;
+ case '6':
+ BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.Mixed;
+ System.Console.WriteLine("✓ Charset set to Mixed");
+ return;
case 'b':
return;
default:
@@ -1019,6 +1344,14 @@ public static class Program
"core" => all.Where(t => StartsWithAny(t.Name, coreNames)).ToList(),
"comprehensive" => all.Where(t => StartsWithAny(t.Name, coreNames) || StartsWithAny(t.Name, comprehensiveExtras)).ToList(),
"edge" => all.Where(t => StartsWithAny(t.Name, coreNames) || StartsWithAny(t.Name, comprehensiveExtras) || StartsWithAny(t.Name, edgeExtras)).ToList(),
+ // Single-cell A/B mini-suite filters — match by case-insensitive prefix on Name.
+ // Use case: tight optimization-iteration loop on one specific cell (e.g. `dotnet run -- repeated`
+ // or interactive menu shortcut), avoiding the full ~110 sec suite when only one cell is in scope.
+ "small" => all.Where(t => t.Name.StartsWith("Small", StringComparison.OrdinalIgnoreCase)).ToList(),
+ "medium" => all.Where(t => t.Name.StartsWith("Medium", StringComparison.OrdinalIgnoreCase)).ToList(),
+ "large" => all.Where(t => t.Name.StartsWith("Large", StringComparison.OrdinalIgnoreCase)).ToList(),
+ "repeated" => all.Where(t => t.Name.StartsWith("Repeated", StringComparison.OrdinalIgnoreCase)).ToList(),
+ "deep" => all.Where(t => t.Name.StartsWith("Deep", StringComparison.OrdinalIgnoreCase)).ToList(),
_ => all.ToList()
};
@@ -2329,14 +2662,40 @@ public static class Program
public int SerializedSize { get; set; }
public double SerializeTimeMs { get; set; }
public double DeserializeTimeMs { get; set; }
+ // Per-sample min/max alongside the median (median is the *Time*Ms field above). Surfaces
+ // inter-sample range — the visible noise floor for the row. 0 when the operation was skipped
+ // (mode != "all"/"ser"/"des") or when a single-sample fast path was used (min == max == median).
+ public double SerializeTimeMinMs { get; set; }
+ public double SerializeTimeMaxMs { get; set; }
+ public double DeserializeTimeMinMs { get; set; }
+ public double DeserializeTimeMaxMs { get; set; }
+ // Sample-population stddev (ms). Used by FormatMicrosWithRange to compute CV (stddev/mean)
+ // and emit the ⚠️ marker on rows above UnstableCVThreshold. 0 in single-sample mode.
+ public double SerializeTimeStdDevMs { get; set; }
+ public double DeserializeTimeStdDevMs { get; set; }
+ // Per-row adaptive iteration count (post-CalibrateIterations). Each Ser and Des function calibrates
+ // independently to land its sample window at ~TargetSampleMs; per-op µs is then iter-independent
+ // (`SerializeTimeMs / SerializeIterations * 1000`). For round-trip-only rows (NamedPipe etc.),
+ // RoundTripIterations carries the calibrated iter count; SerializeIterations and DeserializeIterations
+ // stay 0 (Ser and Des are not separately measurable on those rows).
+ public int SerializeIterations { get; set; }
+ public int DeserializeIterations { get; set; }
+ public int RoundTripIterations { get; set; }
public long SerializeAllocBytesPerOp { get; set; }
public long DeserializeAllocBytesPerOp { get; set; }
public long SetupSerializeAllocBytes { get; set; }
public long SetupDeserializeAllocBytes { get; set; }
- /// Total round-trip time. For in-memory benchmarks: Serialize + Deserialize (set explicitly in
- /// RunBenchmarksForTestData). For round-trip-only benchmarks (NamedPipe etc.): the directly-measured
- /// pipe round-trip time, since Ser and Des are not separately measurable there.
+ /// Total round-trip time. For in-memory benchmarks: synthesized so that
+ /// RoundTripTimeMs / RoundTripIterations * 1000 equals SerPerOp + DesPerOp in µs/op
+ /// (necessary because Ser and Des may have different iter counts post-calibration).
+ /// For round-trip-only benchmarks (NamedPipe etc.): the directly-measured pipe round-trip time.
public double RoundTripTimeMs { get; set; }
+ // Round-trip min/max + stddev — only populated for round-trip-only benchmarks (NamedPipe etc.) where
+ // RT is directly measured. For in-memory rows RT is synthesized from the Ser and Des per-op times,
+ // so it has no per-sample distribution of its own; the Ser/Des ranges are surfaced separately instead.
+ public double RoundTripTimeMinMs { get; set; }
+ public double RoundTripTimeMaxMs { get; set; }
+ public double RoundTripTimeStdDevMs { get; set; }
/// Total round-trip allocation per op. For in-memory benchmarks: SerializeAlloc + DeserializeAlloc.
/// For round-trip-only benchmarks: process-wide allocation measured via
/// (covers ALL threads — client, server-drain, channel internals — not just the caller).
@@ -2346,8 +2705,8 @@ public static class Program
private static void PrintResult(BenchmarkResult result)
{
// Numbers-only per-row entries; the column-headers carry units (µs/op, KB/op).
- var ser = result.SerializeTimeMs > 0 ? $"{ToPerOpMicros(result.SerializeTimeMs),7:F2}" : " N/A";
- var des = result.DeserializeTimeMs > 0 ? $"{ToPerOpMicros(result.DeserializeTimeMs),7:F2}" : " N/A";
+ var ser = result.SerializeTimeMs > 0 ? $"{SerPerOp(result),7:F2}" : " N/A";
+ var des = result.DeserializeTimeMs > 0 ? $"{DesPerOp(result),7:F2}" : " N/A";
var serAlloc = result.SerializeTimeMs > 0 ? $"{ToKilobytes(result.SerializeAllocBytesPerOp),7:F2}" : " N/A";
var desAlloc = result.DeserializeTimeMs > 0 ? $"{ToKilobytes(result.DeserializeAllocBytesPerOp),7:F2}" : " N/A";
System.Console.WriteLine($" {result.SerializerName,-40} | Size: {result.SerializedSize,8:N0} B | Ser: {ser} µs/op ({serAlloc} KB/op) | Des: {des} µs/op ({desAlloc} KB/op)");
@@ -2376,7 +2735,8 @@ public static class Program
foreach (var testData in testDataSets)
{
- var testResults = results.Where(r => r.TestDataName == testData.DisplayName).OrderBy(r => r.RoundTripTimeMs).ToList();
+ // Order by per-op µs (iter-independent) — rows may have different iter counts post-calibration.
+ var testResults = results.Where(r => r.TestDataName == testData.DisplayName).OrderBy(r => RtPerOp(r)).ToList();
// Baseline switched MessagePack → MemoryPack: MemoryPack is the SOTA performance leader.
var memPackResult = testResults.FirstOrDefault(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray));
// Pin the comparison to AcBinary's SGen variant — apples-to-apples vs MemoryPack (also source-generated).
@@ -2393,9 +2753,9 @@ public static class Program
{
var size = $"{result.SerializedSize:N0}";
var setup = $"{ToKilobytes(result.SetupSerializeAllocBytes):F2} / {ToKilobytes(result.SetupDeserializeAllocBytes):F2}";
- var ser = result.SerializeTimeMs > 0 ? $"{ToPerOpMicros(result.SerializeTimeMs):F2}" : "N/A";
- var des = result.DeserializeTimeMs > 0 ? $"{ToPerOpMicros(result.DeserializeTimeMs):F2}" : "N/A";
- var rt = result.RoundTripTimeMs > 0 ? $"{ToPerOpMicros(result.RoundTripTimeMs):F2}" : "N/A";
+ var ser = result.SerializeTimeMs > 0 ? $"{SerPerOp(result):F2}" : "N/A";
+ var des = result.DeserializeTimeMs > 0 ? $"{DesPerOp(result):F2}" : "N/A";
+ var rt = result.RoundTripTimeMs > 0 ? $"{RtPerOp(result):F2}" : "N/A";
var serAlloc = result.SerializeTimeMs > 0 ? $"{ToKilobytes(result.SerializeAllocBytesPerOp):F2}" : "N/A";
var desAlloc = result.DeserializeTimeMs > 0 ? $"{ToKilobytes(result.DeserializeAllocBytesPerOp):F2}" : "N/A";
var rtAlloc = result.RoundTripAllocBytesPerOp > 0 ? $"{ToKilobytes(result.RoundTripAllocBytesPerOp):F2}" : "N/A";
@@ -2411,7 +2771,7 @@ public static class Program
if (isHighlighted && memPackResult != null && acBinaryResult != null)
{
var isMemPack = (result.Engine == EngineMemoryPack && result.IoMode == IoByteArray);
- var memPackFaster = memPackResult.RoundTripTimeMs < acBinaryResult.RoundTripTimeMs;
+ var memPackFaster = RtPerOp(memPackResult) < RtPerOp(acBinaryResult);
if (isMemPack)
{
@@ -2435,9 +2795,10 @@ public static class Program
if (memPackResult != null && acBinaryResult != null)
{
var sizePct = (acBinaryResult.SerializedSize / (double)memPackResult.SerializedSize - 1) * 100;
- var serPct = memPackResult.SerializeTimeMs > 0 ? (acBinaryResult.SerializeTimeMs / memPackResult.SerializeTimeMs - 1) * 100 : 0;
- var desPct = memPackResult.DeserializeTimeMs > 0 ? (acBinaryResult.DeserializeTimeMs / memPackResult.DeserializeTimeMs - 1) * 100 : 0;
- var rtPct = memPackResult.RoundTripTimeMs > 0 ? (acBinaryResult.RoundTripTimeMs / memPackResult.RoundTripTimeMs - 1) * 100 : 0;
+ // Per-op µs ratio (iter-independent) — Ser/Des may have different iter counts on the two rows.
+ var serPct = SerPerOp(memPackResult) > 0 ? (SerPerOp(acBinaryResult) / SerPerOp(memPackResult) - 1) * 100 : 0;
+ var desPct = DesPerOp(memPackResult) > 0 ? (DesPerOp(acBinaryResult) / DesPerOp(memPackResult) - 1) * 100 : 0;
+ var rtPct = RtPerOp(memPackResult) > 0 ? (RtPerOp(acBinaryResult) / RtPerOp(memPackResult) - 1) * 100 : 0;
var serAllocPct = memPackResult.SerializeAllocBytesPerOp > 0 ? (acBinaryResult.SerializeAllocBytesPerOp / (double)memPackResult.SerializeAllocBytesPerOp - 1) * 100 : 0;
var desAllocPct = memPackResult.DeserializeAllocBytesPerOp > 0 ? (acBinaryResult.DeserializeAllocBytesPerOp / (double)memPackResult.DeserializeAllocBytesPerOp - 1) * 100 : 0;
var rtAllocPct = memPackResult.RoundTripAllocBytesPerOp > 0 ? (acBinaryResult.RoundTripAllocBytesPerOp / (double)memPackResult.RoundTripAllocBytesPerOp - 1) * 100 : 0;
@@ -2512,22 +2873,24 @@ public static class Program
// Fastest Serialize — round-trip-only serializers (NamedPipe etc.) excluded:
// their Serialize() captures the full round-trip and isn't comparable to a pure Ser metric.
+ // Average is over per-op µs (iter-independent) instead of batch-time, since rows may now
+ // have different iter counts post-calibration.
var fastestSer = results.Where(r => r.SerializeTimeMs > 0 && !r.IsRoundTripOnly)
.GroupBy(r => r.SerializerName)
- .Select(g => new { Name = g.Key, AvgTime = g.Average(r => r.SerializeTimeMs) })
- .OrderBy(x => x.AvgTime)
+ .Select(g => new { Name = g.Key, AvgPerOp = g.Average(r => SerPerOp(r)) })
+ .OrderBy(x => x.AvgPerOp)
.FirstOrDefault();
if (fastestSer != null)
- System.Console.WriteLine($"{"Fastest Serialize",-20} │ {fastestSer.Name,-40} │ {ToPerOpMicros(fastestSer.AvgTime),12:F2} µs/op");
+ System.Console.WriteLine($"{"Fastest Serialize",-20} │ {fastestSer.Name,-40} │ {fastestSer.AvgPerOp,12:F2} µs/op");
// Fastest Deserialize — round-trip-only serializers excluded (their Deserialize() is a no-op).
var fastestDes = results.Where(r => r.DeserializeTimeMs > 0 && !r.IsRoundTripOnly)
.GroupBy(r => r.SerializerName)
- .Select(g => new { Name = g.Key, AvgTime = g.Average(r => r.DeserializeTimeMs) })
- .OrderBy(x => x.AvgTime)
+ .Select(g => new { Name = g.Key, AvgPerOp = g.Average(r => DesPerOp(r)) })
+ .OrderBy(x => x.AvgPerOp)
.FirstOrDefault();
if (fastestDes != null)
- System.Console.WriteLine($"{"Fastest Deserialize",-20} │ {fastestDes.Name,-40} │ {ToPerOpMicros(fastestDes.AvgTime),12:F2} µs/op");
+ System.Console.WriteLine($"{"Fastest Deserialize",-20} │ {fastestDes.Name,-40} │ {fastestDes.AvgPerOp,12:F2} µs/op");
// Smallest Size
var smallestSize = results
@@ -2538,14 +2901,14 @@ public static class Program
if (smallestSize != null)
System.Console.WriteLine($"{"Smallest Size",-20} │ {smallestSize.Name,-40} │ {smallestSize.AvgSize,15:F0} B");
- // Fastest Round-trip
+ // Fastest Round-trip — iter-independent per-op average.
var fastestRt = results.Where(r => r.RoundTripTimeMs > 0)
.GroupBy(r => r.SerializerName)
- .Select(g => new { Name = g.Key, AvgTime = g.Average(r => r.RoundTripTimeMs) })
- .OrderBy(x => x.AvgTime)
+ .Select(g => new { Name = g.Key, AvgPerOp = g.Average(r => RtPerOp(r)) })
+ .OrderBy(x => x.AvgPerOp)
.FirstOrDefault();
if (fastestRt != null)
- System.Console.WriteLine($"{"Fastest Round-trip",-20} │ {fastestRt.Name,-40} │ {ToPerOpMicros(fastestRt.AvgTime),12:F2} µs/op");
+ System.Console.WriteLine($"{"Fastest Round-trip",-20} │ {fastestRt.Name,-40} │ {fastestRt.AvgPerOp,12:F2} µs/op");
// Overall AcBinary (SGen) vs MemoryPack comparison (baseline switched MessagePack → MemoryPack as SOTA reference).
// AcBinary side is restricted to DispatchMode == SGen — apples-to-apples vs MemoryPack which is also source-generated.
@@ -2567,16 +2930,18 @@ public static class Program
return;
}
- var memPackAvgSer = memPackSerResults.Count > 0 ? memPackSerResults.Average(r => r.SerializeTimeMs) : 0;
- var memPackAvgDes = memPackDesResults.Average(r => r.DeserializeTimeMs);
- var memPackAvgRt = memPackRtResults.Average(r => r.RoundTripTimeMs);
+ // All averages are over per-op µs (iter-independent). Batch-time averaging would mix rows
+ // measured with different iter counts (post-calibration), producing meaningless numbers.
+ var memPackAvgSer = memPackSerResults.Count > 0 ? memPackSerResults.Average(r => SerPerOp(r)) : 0;
+ var memPackAvgDes = memPackDesResults.Average(r => DesPerOp(r));
+ var memPackAvgRt = memPackRtResults.Average(r => RtPerOp(r));
var memPackAvgSize = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray)).Average(r => r.SerializedSize);
var memPackAvgSerAlloc = memPackSerResults.Count > 0 ? memPackSerResults.Average(r => r.SerializeAllocBytesPerOp) : 0;
var memPackAvgDesAlloc = memPackDesResults.Count > 0 ? memPackDesResults.Average(r => r.DeserializeAllocBytesPerOp) : 0;
- var acBinaryAvgSer = acBinarySerResults.Count > 0 ? acBinarySerResults.Average(r => r.SerializeTimeMs) : 0;
- var acBinaryAvgDes = acBinaryDesResults.Average(r => r.DeserializeTimeMs);
- var acBinaryAvgRt = acBinaryRtResults.Average(r => r.RoundTripTimeMs);
+ var acBinaryAvgSer = acBinarySerResults.Count > 0 ? acBinarySerResults.Average(r => SerPerOp(r)) : 0;
+ var acBinaryAvgDes = acBinaryDesResults.Average(r => DesPerOp(r));
+ var acBinaryAvgRt = acBinaryRtResults.Average(r => RtPerOp(r));
var acBinaryAvgSize = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen)).Average(r => r.SerializedSize);
var acBinaryAvgSerAlloc = acBinarySerResults.Count > 0 ? acBinarySerResults.Average(r => r.SerializeAllocBytesPerOp) : 0;
var acBinaryAvgDesAlloc = acBinaryDesResults.Count > 0 ? acBinaryDesResults.Average(r => r.DeserializeAllocBytesPerOp) : 0;
@@ -2589,7 +2954,7 @@ public static class Program
{
var serPctAll = (acBinaryAvgSer / memPackAvgSer - 1) * 100;
System.Console.ForegroundColor = serPctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
- System.Console.WriteLine($" Serialize: {serPctAll:+0;-0}% ({ToPerOpMicros(acBinaryAvgSer):F2} µs/op vs {ToPerOpMicros(memPackAvgSer):F2} µs/op)");
+ System.Console.WriteLine($" Serialize: {serPctAll:+0;-0}% ({acBinaryAvgSer:F2} µs/op vs {memPackAvgSer:F2} µs/op)");
System.Console.ResetColor();
}
@@ -2598,11 +2963,11 @@ public static class Program
var sizePctAll = (acBinaryAvgSize / memPackAvgSize - 1) * 100;
System.Console.ForegroundColor = desPctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
- System.Console.WriteLine($" Deserialize: {desPctAll:+0;-0}% ({ToPerOpMicros(acBinaryAvgDes):F2} µs/op vs {ToPerOpMicros(memPackAvgDes):F2} µs/op)");
+ System.Console.WriteLine($" Deserialize: {desPctAll:+0;-0}% ({acBinaryAvgDes:F2} µs/op vs {memPackAvgDes:F2} µs/op)");
System.Console.ResetColor();
System.Console.ForegroundColor = rtPctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
- System.Console.WriteLine($" Round-trip: {rtPctAll:+0;-0}% ({ToPerOpMicros(acBinaryAvgRt):F2} µs/op vs {ToPerOpMicros(memPackAvgRt):F2} µs/op)");
+ System.Console.WriteLine($" Round-trip: {rtPctAll:+0;-0}% ({acBinaryAvgRt:F2} µs/op vs {memPackAvgRt:F2} µs/op)");
System.Console.ResetColor();
System.Console.ForegroundColor = sizePctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
@@ -2663,8 +3028,9 @@ public static class Program
sb.AppendLine("║ SERIALIZER BENCHMARK RESULTS ║");
sb.AppendLine($"║ Generated: {DateTime.Now:yyyy-MM-dd HH:mm:ss}".PadRight(100) + "║");
sb.AppendLine($"║ Build: {BuildConfiguration}".PadRight(100) + "║");
- sb.AppendLine($"║ Iterations: {TestIterations}".PadRight(100) + "║");
- sb.AppendLine($"║ Samples: {BenchmarkSamples} (median)".PadRight(100) + "║");
+ sb.AppendLine($"║ Charset: {GetCurrentCharsetName()}".PadRight(100) + "║");
+ sb.AppendLine($"║ Iterations: per-cell adaptive (~{TargetSampleMs} ms target)".PadRight(100) + "║");
+ sb.AppendLine($"║ Samples: {BenchmarkSamples} (median) + 1 pilot discarded".PadRight(100) + "║");
sb.AppendLine($"║ Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"}".PadRight(100) + "║");
sb.AppendLine("╚══════════════════════════════════════════════════════════════════════════════════════════════════════╝");
sb.AppendLine();
@@ -2691,7 +3057,7 @@ public static class Program
var testResults = results.Where(r => r.TestDataName == testData.DisplayName).ToList();
foreach (var result in testResults)
{
- sb.AppendLine($"{result.TestDataName},{result.Engine},{result.IoMode},{result.DispatchMode},{result.OptionsPreset},{result.SerializedSize},{ToPerOpMicros(result.SerializeTimeMs):F2},{ToPerOpMicros(result.DeserializeTimeMs):F2},{ToPerOpMicros(result.RoundTripTimeMs):F2},{result.SerializeAllocBytesPerOp},{result.DeserializeAllocBytesPerOp},{result.RoundTripAllocBytesPerOp},{result.SetupSerializeAllocBytes},{result.SetupDeserializeAllocBytes}");
+ sb.AppendLine($"{result.TestDataName},{result.Engine},{result.IoMode},{result.DispatchMode},{result.OptionsPreset},{result.SerializedSize},{SerPerOp(result):F2},{DesPerOp(result):F2},{RtPerOp(result):F2},{result.SerializeAllocBytesPerOp},{result.DeserializeAllocBytesPerOp},{result.RoundTripAllocBytesPerOp},{result.SetupSerializeAllocBytes},{result.SetupDeserializeAllocBytes}");
}
}
sb.AppendLine();
@@ -2703,7 +3069,8 @@ public static class Program
foreach (var testData in testDataSets)
{
- var testResults = results.Where(r => r.TestDataName == testData.DisplayName).OrderBy(r => r.RoundTripTimeMs).ToList();
+ // Order by per-op µs (iter-independent) — rows may have different iter counts post-calibration.
+ var testResults = results.Where(r => r.TestDataName == testData.DisplayName).OrderBy(r => RtPerOp(r)).ToList();
var memPackResult = testResults.FirstOrDefault(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray));
// Pin the comparison to AcBinary's SGen variant — apples-to-apples vs MemoryPack (also source-generated).
// The Runtime variant is shown alongside in the table for context, not used as the headline number.
@@ -2722,9 +3089,9 @@ public static class Program
var size = $"{result.SerializedSize:N0}";
var setup = $"{ToKilobytes(result.SetupSerializeAllocBytes):F2} / {ToKilobytes(result.SetupDeserializeAllocBytes):F2}";
- var ser = result.SerializeTimeMs > 0 ? $"{ToPerOpMicros(result.SerializeTimeMs):F2}" : "N/A";
- var des = result.DeserializeTimeMs > 0 ? $"{ToPerOpMicros(result.DeserializeTimeMs):F2}" : "N/A";
- var rt = result.RoundTripTimeMs > 0 ? $"{ToPerOpMicros(result.RoundTripTimeMs):F2}" : "N/A";
+ var ser = result.SerializeTimeMs > 0 ? $"{SerPerOp(result):F2}" : "N/A";
+ var des = result.DeserializeTimeMs > 0 ? $"{DesPerOp(result):F2}" : "N/A";
+ var rt = result.RoundTripTimeMs > 0 ? $"{RtPerOp(result):F2}" : "N/A";
var serAlloc = result.SerializeTimeMs > 0 ? $"{ToKilobytes(result.SerializeAllocBytesPerOp):F2}" : "N/A";
var desAlloc = result.DeserializeTimeMs > 0 ? $"{ToKilobytes(result.DeserializeAllocBytesPerOp):F2}" : "N/A";
@@ -2735,9 +3102,10 @@ public static class Program
if (memPackResult != null && acBinaryResult != null)
{
var sizePct = (acBinaryResult.SerializedSize / (double)memPackResult.SerializedSize - 1) * 100;
- var serPct = memPackResult.SerializeTimeMs > 0 ? (acBinaryResult.SerializeTimeMs / memPackResult.SerializeTimeMs - 1) * 100 : 0;
- var desPct = memPackResult.DeserializeTimeMs > 0 ? (acBinaryResult.DeserializeTimeMs / memPackResult.DeserializeTimeMs - 1) * 100 : 0;
- var rtPct = memPackResult.RoundTripTimeMs > 0 ? (acBinaryResult.RoundTripTimeMs / memPackResult.RoundTripTimeMs - 1) * 100 : 0;
+ // Per-op µs ratio (iter-independent) — Ser/Des may have different iter counts on the two rows.
+ var serPct = SerPerOp(memPackResult) > 0 ? (SerPerOp(acBinaryResult) / SerPerOp(memPackResult) - 1) * 100 : 0;
+ var desPct = DesPerOp(memPackResult) > 0 ? (DesPerOp(acBinaryResult) / DesPerOp(memPackResult) - 1) * 100 : 0;
+ var rtPct = RtPerOp(memPackResult) > 0 ? (RtPerOp(acBinaryResult) / RtPerOp(memPackResult) - 1) * 100 : 0;
sb.AppendLine($" {"AcBinary (Byte[])"} vs {"MemoryPack (Byte[])"}: Size {sizePct:+0;-0}% │ Ser {serPct:+0;-0}% │ Des {desPct:+0;-0}% │ RT {rtPct:+0;-0}%");
}
@@ -2777,31 +3145,32 @@ public static class Program
if (memPackSerResults2.Count > 0 && acBinarySerResults2.Count > 0)
{
- var memPackAvgSer2 = memPackSerResults2.Average(r => r.SerializeTimeMs);
- var acBinaryAvgSer2 = acBinarySerResults2.Average(r => r.SerializeTimeMs);
+ // Per-op µs averages (iter-independent) — see comment above the parallel block in PrintSummary.
+ var memPackAvgSer2 = memPackSerResults2.Average(r => SerPerOp(r));
+ var acBinaryAvgSer2 = acBinarySerResults2.Average(r => SerPerOp(r));
var memPackAvgSerAlloc2 = memPackSerResults2.Average(r => r.SerializeAllocBytesPerOp);
var acBinaryAvgSerAlloc2 = acBinarySerResults2.Average(r => r.SerializeAllocBytesPerOp);
- sb.AppendLine($" Serialize: {((acBinaryAvgSer2 / memPackAvgSer2 - 1) * 100):+0;-0}% ({ToPerOpMicros(acBinaryAvgSer2):F2} µs/op vs {ToPerOpMicros(memPackAvgSer2):F2} µs/op)");
+ sb.AppendLine($" Serialize: {((acBinaryAvgSer2 / memPackAvgSer2 - 1) * 100):+0;-0}% ({acBinaryAvgSer2:F2} µs/op vs {memPackAvgSer2:F2} µs/op)");
if (memPackAvgSerAlloc2 > 0)
sb.AppendLine($" Ser Alloc: {((acBinaryAvgSerAlloc2 / memPackAvgSerAlloc2 - 1) * 100):+0;-0}% ({acBinaryAvgSerAlloc2:F0} B/op vs {memPackAvgSerAlloc2:F0} B/op)");
}
if (memPackDesResults2.Count > 0 && acBinaryDesResults2.Count > 0)
{
- var memPackAvgDes2 = memPackDesResults2.Average(r => r.DeserializeTimeMs);
- var acBinaryAvgDes2 = acBinaryDesResults2.Average(r => r.DeserializeTimeMs);
+ var memPackAvgDes2 = memPackDesResults2.Average(r => DesPerOp(r));
+ var acBinaryAvgDes2 = acBinaryDesResults2.Average(r => DesPerOp(r));
var memPackAvgDesAlloc2 = memPackDesResults2.Average(r => r.DeserializeAllocBytesPerOp);
var acBinaryAvgDesAlloc2 = acBinaryDesResults2.Average(r => r.DeserializeAllocBytesPerOp);
- sb.AppendLine($" Deserialize: {((acBinaryAvgDes2 / memPackAvgDes2 - 1) * 100):+0;-0}% ({ToPerOpMicros(acBinaryAvgDes2):F2} µs/op vs {ToPerOpMicros(memPackAvgDes2):F2} µs/op)");
+ sb.AppendLine($" Deserialize: {((acBinaryAvgDes2 / memPackAvgDes2 - 1) * 100):+0;-0}% ({acBinaryAvgDes2:F2} µs/op vs {memPackAvgDes2:F2} µs/op)");
if (memPackAvgDesAlloc2 > 0)
sb.AppendLine($" Des Alloc: {((acBinaryAvgDesAlloc2 / memPackAvgDesAlloc2 - 1) * 100):+0;-0}% ({acBinaryAvgDesAlloc2:F0} B/op vs {memPackAvgDesAlloc2:F0} B/op)");
}
if (memPackRtResults2.Count > 0 && acBinaryRtResults2.Count > 0)
{
- var memPackAvgRt2 = memPackRtResults2.Average(r => r.RoundTripTimeMs);
- var acBinaryAvgRt2 = acBinaryRtResults2.Average(r => r.RoundTripTimeMs);
- sb.AppendLine($" Round-trip: {((acBinaryAvgRt2 / memPackAvgRt2 - 1) * 100):+0;-0}% ({ToPerOpMicros(acBinaryAvgRt2):F2} µs/op vs {ToPerOpMicros(memPackAvgRt2):F2} µs/op)");
+ var memPackAvgRt2 = memPackRtResults2.Average(r => RtPerOp(r));
+ var acBinaryAvgRt2 = acBinaryRtResults2.Average(r => RtPerOp(r));
+ sb.AppendLine($" Round-trip: {((acBinaryAvgRt2 / memPackAvgRt2 - 1) * 100):+0;-0}% ({acBinaryAvgRt2:F2} µs/op vs {memPackAvgRt2:F2} µs/op)");
}
var memPackAvgSize2 = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray)).Average(r => r.SerializedSize);
@@ -2821,7 +3190,7 @@ public static class Program
var sb = new StringBuilder();
var testTypeName = testDataSets.FirstOrDefault()?.TypeName ?? "unknown";
sb.AppendLine($"# AcBinary Benchmark {BuildConfiguration} {DateTime.Now:yyyy-MM-dd HH:mm:ss}");
- sb.AppendLine($"Iterations: {TestIterations} | Warmup: {WarmupIterations} | Samples: {BenchmarkSamples} (median) | .NET: {Environment.Version} | TestType: {testTypeName}");
+ sb.AppendLine($"Charset: {GetCurrentCharsetName()} | Iterations: per-cell adaptive (target ~{TargetSampleMs} ms/sample) | Warmup: {WarmupIterations} | Samples: {BenchmarkSamples} (median) + 1 pilot discarded | .NET: {Environment.Version} | TestType: {testTypeName} | UnstableCV threshold: {UnstableCVThreshold * 100:F0}%");
sb.AppendLine($"Baseline: {"MemoryPack (Byte[])"} (SOTA reference) | Verified: round-trip correctness checked once per cell before warmup");
// Options summary
@@ -2839,31 +3208,48 @@ public static class Program
sb.AppendLine($"- **{name}**: {opts}");
}
- // Flat results table sorted by test data then round-trip (now includes Alloc columns)
+ // Flat results table sorted by test data then round-trip (now includes Alloc + Iter columns).
+ // Iter column shows per-row Ser/Des iteration counts (post-adaptive-calibration), so the reader
+ // can verify that each cell's batch sample landed near the TargetSampleMs window.
sb.AppendLine();
sb.AppendLine("## Results");
sb.AppendLine();
- sb.AppendLine("TestData | Engine | IO | Mode | Options | Size(B) | Ser(µs/op) | Deser(µs/op) | RT(µs/op) | SerAlloc(KB/op) | DesAlloc(KB/op) | RTAlloc(KB/op) | Setup S/D(KB)");
- sb.AppendLine("---|---|---|---|---|---|---|---|---|---|---|---|---");
+ sb.AppendLine("TestData | Engine | IO | Mode | Options | Size(B) | Ser(µs/op) | Deser(µs/op) | RT(µs/op) | SerAlloc(KB/op) | DesAlloc(KB/op) | RTAlloc(KB/op) | Setup S/D(KB) | Iter Ser/Des");
+ sb.AppendLine("---|---|---|---|---|---|---|---|---|---|---|---|---|---");
foreach (var testData in testDataSets)
{
var testResults = results
.Where(r => r.TestDataName == testData.DisplayName)
- .OrderBy(r => r.RoundTripTimeMs)
+ // Per-op µs (iter-independent) ordering — mixing iter counts within a cell is now expected.
+ .OrderBy(r => RtPerOp(r))
.ToList();
foreach (var r in testResults)
{
var inv = System.Globalization.CultureInfo.InvariantCulture;
- var ser = r.SerializeTimeMs > 0 ? ToPerOpMicros(r.SerializeTimeMs).ToString("F2", inv) : "-";
- var des = r.DeserializeTimeMs > 0 ? ToPerOpMicros(r.DeserializeTimeMs).ToString("F2", inv) : "-";
- var rt = r.RoundTripTimeMs > 0 ? ToPerOpMicros(r.RoundTripTimeMs).ToString("F2", inv) : "-";
+ // Per-cell median + inter-sample range (min..max) + CV-threshold marker (⚠️X.X% when CV exceeds
+ // UnstableCVThreshold). Range surfaces the noise floor for each row so a small inter-engine delta
+ // is easy to judge against the row's noise. Format: "26.86 (24.50..29.10)" or
+ // "26.86 (24.50..29.10) ⚠️5.2%" when stddev/mean exceeds the unstable threshold.
+ // When only one sample was taken (Debug / quick mode) min == max == median; collapse
+ // to bare median to avoid visual clutter.
+ var ser = r.SerializeTimeMs > 0 ? FormatMicrosWithRange(r.SerializeTimeMs, r.SerializeTimeMinMs, r.SerializeTimeMaxMs, r.SerializeTimeStdDevMs, r.SerializeIterations, inv) : "-";
+ var des = r.DeserializeTimeMs > 0 ? FormatMicrosWithRange(r.DeserializeTimeMs, r.DeserializeTimeMinMs, r.DeserializeTimeMaxMs, r.DeserializeTimeStdDevMs, r.DeserializeIterations, inv) : "-";
+ var rt = r.RoundTripTimeMs > 0
+ ? (r.IsRoundTripOnly
+ ? FormatMicrosWithRange(r.RoundTripTimeMs, r.RoundTripTimeMinMs, r.RoundTripTimeMaxMs, r.RoundTripTimeStdDevMs, r.RoundTripIterations, inv)
+ : RtPerOp(r).ToString("F2", inv))
+ : "-";
var serAlloc = r.SerializeTimeMs > 0 ? ToKilobytes(r.SerializeAllocBytesPerOp).ToString("F2", inv) : "-";
var desAlloc = r.DeserializeTimeMs > 0 ? ToKilobytes(r.DeserializeAllocBytesPerOp).ToString("F2", inv) : "-";
var rtAlloc = r.RoundTripAllocBytesPerOp > 0 ? ToKilobytes(r.RoundTripAllocBytesPerOp).ToString("F2", inv) : "-";
var setupAlloc = $"{ToKilobytes(r.SetupSerializeAllocBytes).ToString("F2", inv)} / {ToKilobytes(r.SetupDeserializeAllocBytes).ToString("F2", inv)}";
- sb.AppendLine($"{r.TestDataName} | {r.Engine} | {r.IoMode} | {r.DispatchMode} | {r.OptionsPreset} | {r.SerializedSize} | {ser} | {des} | {rt} | {serAlloc} | {desAlloc} | {rtAlloc} | {setupAlloc}");
+ // Iter Ser/Des column — per-row adaptive iter counts. RT-only rows show Iter for RT.
+ var iterCol = r.IsRoundTripOnly
+ ? r.RoundTripIterations.ToString(inv)
+ : $"{(r.SerializeIterations > 0 ? r.SerializeIterations.ToString(inv) : "-")} / {(r.DeserializeIterations > 0 ? r.DeserializeIterations.ToString(inv) : "-")}";
+ sb.AppendLine($"{r.TestDataName} | {r.Engine} | {r.IoMode} | {r.DispatchMode} | {r.OptionsPreset} | {r.SerializedSize} | {ser} | {des} | {rt} | {serAlloc} | {desAlloc} | {rtAlloc} | {setupAlloc} | {iterCol}");
}
}
diff --git a/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs b/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs
index 34d1ff2..ce1b20c 100644
--- a/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs
+++ b/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs
@@ -5,10 +5,59 @@ using System.Runtime.CompilerServices;
namespace AyCode.Core.Tests.TestModels;
+/// <summary>
+/// Charset suffix presets for the per-property string augmentation in
+/// BenchmarkTestDataProvider.ToLongString. The benchmark applies the configured suffix
+/// to every short (≤ FixStrMaxLength) string property across the test data graph (via reflection
+/// in BenchmarkTestDataProvider.EnsureAllStringsBypassFixStr), producing long-string
+/// benchmark payloads with a controlled UTF-8 content profile.
+///
+/// Switch by assigning to <see cref="BenchmarkTestDataProvider.LongStringSuffix"/> from the interactive
+/// Settings → Charset submenu (or programmatically). The active charset is recorded in the .LLM
+/// markdown output header so per-charset bench files are self-documenting.
+/// </summary>
+public static class CharsetSuffixes
+{
+ /// <summary>Empty suffix — short Hungarian baseline strings (e.g. "SharedTag") stay short, hitting
+ /// the FixStr fast-path. Stress-test for FixStr / short-string code paths. Note: the baseline
+ /// property values remain Hungarian; only the suffix is empty. Despite the "FixAscii" name, this
+ /// option does NOT change baseline values to ASCII — it suppresses the suffix that would otherwise
+ /// push every property past the FixStr boundary.</summary>
+ public const string Latin1FixAscii = "";
+
+ /// <summary>Short Latin1 mixed (Hungarian, ~24 char) — typical European i18n payload, short
+ /// multi-byte runs. Below the 32-char FixStr boundary on the suffix alone; combined with the
+ /// baseline values it is intended to push properties past it.</summary>
+ public const string Latin1Short = " árvíztűrő tükörfúrógép";
+
+ /// <summary>Long Latin1 mixed (~47 char) — exceeds the 32-char FixStr boundary on the suffix alone,
+ /// exercising the StringSmall+ tier path with Latin1 mixed content (Hungarian accented letters).</summary>
+ public const string Latin1Long = " árvíztűrő tükörfúrógép a magyar betűzés tesztje";
+
+ /// <summary>CJK BMP (Chinese / Japanese / Korean Basic Multilingual Plane) — long homogeneous
+ /// 3-byte UTF-8 runs. Primary win region for V4N2 Phase 3 SIMD multi-byte transcoder work.</summary>
+ public const string CjkBmp = " 你好世界 こんにちは 안녕하세요";
+
+ /// <summary>Cyrillic (Russian / Ukrainian / etc.) — long homogeneous 2-byte runs, different shape
+ /// than Hungarian mixed (where 2-byte chars are short interspersed runs).</summary>
+ public const string Cyrillic = " Привет мир дорогой друг";
+
+ /// <summary>Mixed full-spectrum (Hungarian + CJK + Cyrillic + one emoji surrogate pair) — multi-tier
+ /// coverage in one payload. Stresses surrogate-pair handling in the UTF-8 transcoder.</summary>
+ public const string Mixed = " árvíz 你好 Привет 😀";
+}
+
public static class BenchmarkTestDataProvider
{
private const int FixStrMaxLength = 31;
- private const string LongStringSuffix = "__Benchmárk_Long_String_Söffix__";
+
+ /// <summary>
+ /// Active long-string suffix appended to short string properties during benchmark data construction.
+ /// Defaults to <see cref="CharsetSuffixes.Latin1Long"/> (~47-char Latin1 mixed) — backward-compatible
+ /// in spirit with the prior fixed default (Latin1 mixed family, ~32 char). Assign another
+ /// <see cref="CharsetSuffixes"/> preset to measure other UTF-8 content profiles.
+ /// </summary>
+ public static string LongStringSuffix = CharsetSuffixes.Latin1Long;
private sealed class ReferenceComparer : IEqualityComparer
///
- /// Wire context: tier markers (StringSmall/Medium/Big, StringInternFirstSmall/Medium) carry the
- /// char count alongside the byte count, so this method can
- /// directly with the known target capacity and decode in a single pass through the bytes.
+ /// Single method (no dispatcher/core split): the V4N4 split attempt did not pay off — the AOT
+ /// compiler did NOT inline the dispatcher despite `[AggressiveInlining]` (the 15:12 disassembly
+ /// confirmed that both the dispatcher AND the core body remained call targets), so the only effect
+ /// was one extra call instruction per decode (Small Deser regression +16.6 pp). Reverted to a single
+ /// method — the `string.Create` callback uses a cached static lambda (delegate caching confirmed by
+ /// the `test static; jne skip ctor` pattern in the disassembly).
///
/// Compact mode only — FastWire mode never emits H2Q6 tier markers (its
/// path handles UTF-16 raw memcpy).
diff --git a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs
index ab5cc5a..18bc650 100644
--- a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs
+++ b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs
@@ -735,11 +735,17 @@ public static partial class AcBinarySerializer
/// Caller MUST guarantee non-empty input (value.Length > 0) — empty strings
/// are handled by the higher-level WriteString via the StringEmpty marker.
///
+ // V4N4 method-split reverted (2026-05-07): the split (Writer dispatcher + SmallFast + DispatchLong
+ // + FastWire) was tested 2026-05-07 in two configurations (15:13:39 AggressiveInlining → regression;
+ // 15:29:21 NoInlining-on-SmallFast → marginal/inconsistent). Bench-to-bench variance proved
+ // unmeasurable on the available hardware — the optimization-value signal is below the noise floor.
+ // Reverted to the single-method state (matches 09:39:09 baseline). The A-direction packed-header
+ // store optimization (Unsafe.WriteUnaligned ushort/uint/ulong) is preserved — it was already in the
+ // 09:39:09 baseline and is instruction-level, not affected by AOT inline-pressure variance.
public void WriteStringWithDispatch(string value)
{
var charLength = value.Length;
- // Single overflow guard: catches charLength > MaxStringCharLength where charLength*4 would wrap.
- // Predict-friendly (always false on realistic input). NoInlining throw helper keeps the hot path tight.
+ // Overflow guard (O7G2) — predict-friendly (always false on realistic input). NoInlining throw helper.
if ((uint)charLength > BinaryTypeCode.MaxStringCharLength) ThrowStringTooLong(charLength);
if (FastWire)
@@ -759,20 +765,11 @@ public static partial class AcBinarySerializer
// Compact mode — H2Q6 post-encode tier dispatch (wire-optimal).
//
// Two-step tier logic:
- // 1. reserveHeader (from charLength, worst-case 4 byte/char): bounds the buffer allocation
- // AND the encode position. Tight reserve (3/5/9) avoids large memmove on the hot path.
+ // 1. reserveHeader (from charLength, worst-case 4 byte/char): bounds buffer allocation
+ // AND encode position. Tight reserve (3/5/9) avoids large memmove on the hot path.
// 2. actualHeader (from bytesWritten after encode): chooses the smallest fitting tier.
// A mostly-ASCII string in the 64-16383 char band gets Small (3 byte header) even though
// reserve was Medium (5 byte) — body is left-shifted by 2 bytes to compact.
- //
- // Why post-encode tier choice (vs. pre-chosen): mostly-ASCII content (English description fields,
- // log/error messages, URL paths) at 64+ char would otherwise pay +2 byte/string for Medium
- // header when Small fits. Production payloads include both Magyar/CJK multi-byte AND ASCII-
- // dominated strings; wire-size narrative ("smallest") matters across the realistic mix.
- //
- // ASCII override (bytesWritten == charLength) emits FixStrAscii / StringAscii with their own
- // compact headers (1 byte / 1+VarUInt) — body shifted left from the encode position.
- // (charLength already validated at method entry — charLength * 4 cannot overflow here.)
var maxBytes = charLength * 4;
int reserveHeader;
@@ -791,8 +788,9 @@ public static partial class AcBinarySerializer
// ASCII override — FixStrAscii (≤31) or StringAscii (>31) with compact header
if (bytesWritten <= BinaryTypeCode.FixStrAsciiMaxLength)
{
- _buffer.AsSpan(encodeStart, bytesWritten).CopyTo(_buffer.AsSpan(savedPos + 1, bytesWritten));
-
+ var shift = reserveHeader - 1;
+ if (shift > 0)
+ _buffer.AsSpan(encodeStart, bytesWritten).CopyTo(_buffer.AsSpan(savedPos + 1, bytesWritten));
_buffer[savedPos] = BinaryTypeCode.EncodeFixStrAscii(bytesWritten);
_position = savedPos + 1 + bytesWritten;
}
@@ -801,12 +799,10 @@ public static partial class AcBinarySerializer
var actualVarUIntSize = VarUIntSize((uint)bytesWritten);
var asciiHeader = 1 + actualVarUIntSize;
var shift = reserveHeader - asciiHeader;
-
- if (shift > 0) _buffer.AsSpan(encodeStart, bytesWritten).CopyTo(_buffer.AsSpan(encodeStart - shift, bytesWritten));
-
+ if (shift > 0)
+ _buffer.AsSpan(encodeStart, bytesWritten).CopyTo(_buffer.AsSpan(encodeStart - shift, bytesWritten));
_buffer[savedPos] = BinaryTypeCode.StringAscii;
_position = savedPos + 1;
-
WriteVarUIntUnsafe((uint)bytesWritten);
_position += bytesWritten;
}
@@ -832,29 +828,29 @@ public static partial class AcBinarySerializer
break;
}
- var shift = reserveHeader - actualHeader;
- if (shift > 0) _buffer.AsSpan(encodeStart, bytesWritten).CopyTo(_buffer.AsSpan(encodeStart - shift, bytesWritten));
+ var nonAsciiShift = reserveHeader - actualHeader;
+ if (nonAsciiShift > 0) _buffer.AsSpan(encodeStart, bytesWritten).CopyTo(_buffer.AsSpan(encodeStart - nonAsciiShift, bytesWritten));
_buffer[savedPos] = tierMarker;
switch (actualHeader)
{
case 3:
{
- // Pack charLen:8 | utf8Len:8 → single ushort store (vs 2 byte-stores)
+ // A-direction: pack charLen:8 | utf8Len:8 → single ushort store
var packed = (ushort)(charLength | (bytesWritten << 8));
Unsafe.WriteUnaligned(ref _buffer[savedPos + 1], packed);
break;
}
case 5:
{
- // Pack charLen:16 | utf8Len:16 → single uint store, LE (vs 2 ushort-stores)
+ // A-direction: pack charLen:16 | utf8Len:16 → single uint store, LE
var packed = (uint)charLength | ((uint)bytesWritten << 16);
Unsafe.WriteUnaligned(ref _buffer[savedPos + 1], packed);
break;
}
default:
{
- // Pack charLen:32 | utf8Len:32 → single ulong store, LE (vs 2 uint-stores)
+ // A-direction: pack charLen:32 | utf8Len:32 → single ulong store, LE
var packed = (ulong)(uint)charLength | ((ulong)(uint)bytesWritten << 32);
Unsafe.WriteUnaligned(ref _buffer[savedPos + 1], packed);
break;
diff --git a/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs b/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs
index 22ad29c..8fb9817 100644
--- a/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs
+++ b/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs
@@ -495,6 +495,10 @@ internal static class Utf8Transcoder
/// JIT compiles the switch into a jump table for predictable dispatch on mixed content.
/// Hungarian text typical pattern: ASCII run (Phase 1/2 widening) → 2-byte char (Phase 3
/// case < 0xE0) → ASCII run → 2-byte char → ... — each phase optimal for its segment.
+ ///
+ /// V4N2 Phase 2.5 (run-length scalar decoder) attempted 2026-05-07 — both full and hybrid
+ /// (3-byte do-while only) variants showed bench-instability and unmeasurable optimization
+ /// signal on the available hardware. Reverted to the switch-jumptable per-char baseline.
///
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int DecodeUtf8SinglePass(ReadOnlySpan src, Span dst)
@@ -514,10 +518,6 @@ internal static class Utf8Transcoder
// Widen 32 bytes → 2 × Vector256 (32 chars total). Each Vector256
// holds 16 ushort, so the upper half stores at dstIdx + 16 (= Vector256.Count).
- // Earlier latent bug used Vector128.Count (= 8) here, causing overlap on
- // indices 8-15 and uninitialized 24-31 — hidden by the Hungarian benchmark's early
- // ASCII bail-out (no 32+ byte ASCII run). Validated by Utf8TranscoderTests
- // LongAscii32Plus + AsciiExactly32Bytes round-trips.
var (lower, upper) = Vector256.Widen(v);
lower.StoreUnsafe(ref dstRef, (uint)dstIdx);
upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector256.Count));
@@ -526,10 +526,10 @@ internal static class Utf8Transcoder
}
}
- // Phase 2/3 — scalar loop with DWORD ASCII batch
+ // Phase 2/3 — DWORD ASCII batch + per-char scalar switch decode (Phase 2.5 run-length decoder reverted)
while (srcIdx < src.Length)
{
- // DWORD ASCII batch: 4 ASCII bytes → 4 chars per iter
+ // Phase 2 — DWORD ASCII batch: 4 ASCII bytes → 4 chars per iter (unchanged)
if (src.Length - srcIdx >= 4)
{
var dword = Unsafe.ReadUnaligned(ref Unsafe.Add(ref srcRef, srcIdx));
@@ -545,12 +545,17 @@ internal static class Utf8Transcoder
}
}
- // Scalar multi-byte branch (jump-table compile via switch)
+ // Scalar multi-byte branch (jump-table compile via switch): the lead byte is dispatched
+ // once per char. The Phase 2.5 run-length variant (one dispatch per run-start with an inner
+ // do-while over homogeneous 2-byte/3-byte runs) was tested and reverted 2026-05-07 — the
+ // gain was not measurable above bench noise, so the per-char baseline remains.
var b0 = Unsafe.Add(ref srcRef, srcIdx);
switch (b0)
{
case < 0x80:
- // 1-byte ASCII (U+0000–U+007F)
+ // 1-byte ASCII single (single-byte tail of a run that the DWORD batch couldn't cover).
+ // No do-while loop here — the DWORD batch already handles long ASCII runs above;
+ // this case is the 1-3 byte tail before the next non-ASCII byte.
Unsafe.Add(ref dstRef, dstIdx++) = b0;
srcIdx += 1;
break;
@@ -577,6 +582,7 @@ internal static class Utf8Transcoder
{
// 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx → U+10000–U+10FFFF
// Supplementary plane (emoji, rare CJK ext) → UTF-16 surrogate pair.
+ // No do-while: 4-byte sequences are typically isolated (single emoji in mixed text).
var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
var b3 = Unsafe.Add(ref srcRef, srcIdx + 3);
diff --git a/AyCode.Core/docs/BINARY/BINARY_TODO.md b/AyCode.Core/docs/BINARY/BINARY_TODO.md
index be08689..5042ae3 100644
--- a/AyCode.Core/docs/BINARY/BINARY_TODO.md
+++ b/AyCode.Core/docs/BINARY/BINARY_TODO.md
@@ -2,16 +2,24 @@
This page covers planned work for the **binary serializer core** (format, SGen, options, deserialization context, buffer writer). Work specific to the **streaming I/O layer** (`AsyncPipeReaderInput` + `AsyncPipeWriterOutput`, multi-message wire framing, sliding-window buffer, producer-consumer synchronization) is tracked separately in [`BINARY_ASYNCPIPE_TODO.md`](BINARY_ASYNCPIPE_TODO.md).
-## Optimization policy reminder (LLM)
-
-AcBinary is a universal serializer. Performance TODO execution must avoid benchmark-only overfitting.
-
-For each optimization item, validate gains on multiple representative workloads (ASCII-heavy, mixed Latin, multi-byte UTF-8; small/medium/large/deep payloads) and evaluate throughput + latency + allocation + wire-size together.
-
## Priority legend
- **P0** blocker · **P1** important · **P2** nice-to-have · **P3** idea
----
+## ACCORE-BIN-T-P6M4: Universal hotpath optimization guardrails + follow-up backlog
+**Priority:** P1 · **Type:** Performance
+
+AcBinary is a universal serializer. Hotpath work must avoid benchmark-only overfitting.
+
+For each performance TODO, validate on representative workload mixes (ASCII-heavy, mixed Latin, multi-byte UTF-8; small/medium/large/deep payloads) and evaluate throughput + latency + allocation + wire-size together.
+
+**Follow-up backlog (short):**
+- Split oversized hot methods into inline-friendly dispatcher + cold helpers (writer/reader/populate).
+- Add direct fast branches for the most frequent markers before generic table-dispatch.
+- Reduce repeated `EnsureAvailable` checks by grouping fixed-width reads under one bounds check.
+- Extend VarUInt fast-path coverage for common 3-byte cases on metadata/index/cache-id routes.
+- Reorder populate/property-loop branches by runtime frequency (`PropertySkip`/`Null`/primitive fast-setters first).
+- Minimize pool/clear overhead by avoiding unnecessary aggressive array clearing in hot lifecycle paths.
+- Add early scan-pass short-circuit when options guarantee no ref/intern benefit.
## ACCORE-BIN-T-S8P4: Replace JSON-in-Binary request parameters
**Priority:** P1 · **Type:** Refactor · **Status:** Closed (2026-04-26, landed in commits `cdd54d3` 2026-04-05 + `3b70070` 2026-04-06) · **Related:** `../XCUT/XCUT_ISSUES.md#accore-xcut-i-x8q1` (canonical), `AyCode.Services/docs/SIGNALR/SIGNALR_TODO.md`
@@ -738,7 +746,19 @@ The cascading tail-handler hierarchy (existing in Phase 1+2) carries over: AVX-5
The Vector128 path is the **WASM and Apple Silicon target** — without it both platforms fell back to scalar (1 byte/iter). With Phase 1+2 landed, WASM and Apple Silicon now run the UTF-8 hot path at 16 byte/iter (16× scalar speedup on the count + ASCII narrow operations).
-### Phase 2.5 — scalar run-length decoder (multi-byte baseline, pre-Phase 3 prototype)
+### Phase 2.5 — scalar run-length decoder (multi-byte baseline, pre-Phase 3 prototype) — **TESTED & REVERTED 2026-05-07**
+
+**Status update (2026-05-07)**: Phase 2.5 was implemented and tested in two configurations:
+- **Full run-length** (15:56:54 bench) — both 2-byte and 3-byte tiers used inner do-while loops. Result: +13.0 pp Deser regression on the Hungarian-mixed Repeated cell. Hypothesis confirmed (foreseen pre-implementation): short Hungarian 2-byte runs (1-2 char average) make the run-detection overhead exceed the per-char payload; switch-jumptable per-char dispatch wins on this content shape.
+- **Hybrid** (post-15:56:54) — 2-byte single decode, 3-byte run do-while only. Tested, but bench-noise instability made the signal unmeasurable. Reverted along with V4N4 method-split (2026-05-07).
+
+The optimization-value signal proved below the bench noise floor on the available hardware. The 3-byte do-while CJK-content win remains a **theoretically valid** target — but cannot be objectively validated without the `ACCORE-BIN-T-C5R8` charset-parameterized benchmark workload (CJK option). Re-evaluate when CJK workload measurement becomes available.
+
+**Re-evaluable as of 2026-05-07 per `ACCORE-BIN-T-D9X3`** — bench stabilization removes the noise-floor that made the original signal unmeasurable; retest before any code change. (Charset bias remains — pair with `ACCORE-BIN-T-C5R8` for CJK validation.)
+
+**Below: original Phase 2.5 design notes preserved as documentation.** Implementation details remain accurate even though the implementation was reverted.
+
+---
Targets the `DecodeUtf8SinglePass` switch-jumptable per-char dispatch on multi-byte content. Current scalar Phase (jumptable) re-dispatches every char; a run-length-aware scalar decoder runs a tight branchless inner loop on homogeneous runs (long ASCII run, long 2-byte Latin/Cyrillic run, long 3-byte CJK BMP run), with the existing single-codepoint scalar branch as mixed-edge fallback.
@@ -927,12 +947,21 @@ Follow-up A-direction header pack-write/read optimization landed in the same win
**Tests:** 222 pass / 13 pre-existing GuidIId failures (unchanged). 55/55 Utf8TranscoderTests pass.
-**Benchmark vs `2026-05-06_13-10-30.LLM` baseline (`2026-05-07_08-55-49.LLM`):**
+**Benchmark vs `2026-05-06_13-10-30.LLM` baseline (`2026-05-07_08-55-49.LLM`, immediately post-H2Q6):**
- Compact-vs-MemPack Deser ratio improvement on baseline gap: **-14 to -28 percentage points** across cells
- Deser: **4/5 cells now faster than MemPack** (Small -6%, Medium -3%, Large -9%, Deep -7%); Repeated cell remaining +5% gap (V4N2 Phase 3 SIMD multi-byte transcoder targets this)
- Wire size: **5/5 cells smaller than MemPack** (-8% to -11%)
- Ser: 1/5 win (Large -9%), 1/5 tie (Medium 0%), 3/5 minor lag (+2-7% Small/Repeated/Deep) — host-noise band
+**Bench evolution post-H2Q6** (subsequent micro-opts on the same H2Q6 base):
+- `2026-05-07_09-39-09.LLM` — A irány header pack-write/read (`Unsafe.WriteUnaligned` ushort/uint/ulong): zaj-szintű mozgás, strukturális javulás
+- `2026-05-07_15-13-39.LLM` — V4N4 Step 1+2 method-split (`AggressiveInlining`): **regresszió** (Small Ser +29.6 pp, Repeated Ser +8.9 pp) → `WriteStringSmallFast` túl-aggresszív inline-olás code-bloat / i-cache pressure
+- `2026-05-07_15-29-21.LLM` — V4N4 finomított (NoInlining a SmallFast-ra, dispatcher hint nélkül, Reader split visszavonva): **konszolidált state**:
+ - **Ser**: 5/5 cell paritás-vagy-jobb (Small **-8.5%**, Medium ≈, Large **-8.5%**, Repeated ≈, Deep ≈)
+ - **Deser**: **4/5 cell faster than MemPack** (Medium -4.7%, Large **-10.6%**, Repeated **-3.8%**, Deep **-10.1%**); Small +10% remaining gap
+ - **Wire**: 5/5 cell -8% to -11% smaller (unchanged)
+ - **Net**: Compact mostantól 8/10 cellán nyer Compact vs MemPack; csak Small Deser-en marad +10% gap (kis abszolút érték, ~1 µs)
+
**Critical algorithmic correctness lesson** (from V4N3 follow-up `GetUtf8ByteCount`): the initial 4-popcount formula assumed `lowSur == highSur` per chunk. Fix: 5-popcount closed-form. Caught by surrogate-pair-split-across-chunk regression tests. Documented in Utf8Transcoder.
**Marker address space (post-H2Q6, v3 wire):**
@@ -1207,7 +1236,7 @@ Landed 2026-05-06. All `Utf8TranscoderTests` pass (55/55). Binary test suite unc
**Algorithmic correctness lesson** — the initial 4-popcount formula (`3*N - c_lt_0x80 - c_lt_0x800 - 2*highSur`) was wrong on chunks where a surrogate pair straddles the SIMD chunk boundary (it implicitly assumed `lowSur == highSur` per chunk, which is true over the whole well-formed string but NOT per chunk). Fix: 5-popcount closed-form (`3*N - ascii - c_lt_0x800 + highSur - 3*lowSur`), with the scalar tail using the same per-char accounting model (`i += 1` per char regardless of role; high → 4, low → 0, BMP → 3, two-byte → 2, ASCII → 1). Caught by `GetUtf8ByteCount_MultipleEmojiBoundary_MatchesBcl` and `GetUtf8ByteCount_BoundaryAsciiToEmoji_MatchesBcl` regression tests — exactly the `prefixLen` 1, 7 boundaries that exercise chunk-split surrogate pairs.
## ACCORE-BIN-T-V4N4: NativeAOT-specific inlining / codegen audit on hot UTF-8 path
-**Priority:** P2 · **Type:** Performance · **Related:** `EncodeUtf8SinglePass`, `DecodeUtf8SinglePass`, `WriteStringWithDispatch`, `Utf8Transcoder` SIMD path
+**Priority:** P2 · **Type:** Performance · **Status:** Reverted (2026-05-07) — bench instability made the optimization signal unmeasurable · **Related:** `EncodeUtf8SinglePass`, `DecodeUtf8SinglePass`, `WriteStringWithDispatch`, `Utf8Transcoder` SIMD path
Hypothesis: NativeAOT (the benchmark target environment) does not match Tier 1 JIT optimization quality on the UTF-8 hot path, despite `[MethodImpl(AggressiveInlining)]` hints. Symptoms in 2026-05-05 / 2026-05-06 benchmarks:
@@ -1264,6 +1293,56 @@ Hypothesis: NativeAOT (the benchmark target environment) does not match Tier 1 J
- Pre-NuGet release: i18n claim cannot ship with an 8-11% gap on a representative cell
- Disasm + bench correlation step before any code change (no speculative refactoring)
+### Resolution
+
+Audit + targeted fix landolt 2026-05-07.
+
+**Step 1 — disasm-elemzés** (`disasm.txt`, ~90 MB AOT-publish output):
+- ✅ `Avx512BW.IsSupported` / `Vector{N}.IsHardwareAccelerated` **constant-folded** — csak 4 runtime check a teljes binary-ben (1 body + 3 call-site, kívül a Utf8Transcoder hot path-tól). Az AOT a target ISA szerint dead-branch-eliminálta.
+- ✅ Reader tier-marker dispatch (`ReadStringSmall/Medium/Big`) **inline-olódott** a `TypeReaderTable` lambda-class static init-be — 0 method-call overhead a tier-on.
+- ⚠️ **`WriteStringWithDispatch` NEM inline-olódott** — 3 generic specialization (``, ``, ``) különálló method body-val + 14+ `call ` instruction az `` body-jában (a többi 2 specializációban hasonló volumen). Method size ~190 sor — meghaladja az AOT inline budget-et.
+- ⚠️ **`ReadStringUtf8WithCharLen` NEM inline-olódott** — saját body, sok call-site.
+- ❓ → ✅ `string.Create` callback `__DelegateCtor` — disasm szerint `test static; jne skip ctor` minta = **cache-elt static lambda**, lazy-init pattern. **0 hot-path overhead** (nem per-hívás alloc).
+
+**Step 2 — method-split kísérlet** (15:13:39 bench):
+- Writer split: dispatcher (`[AggressiveInlining]`) + `WriteStringSmallFast` (`[AggressiveInlining]`) + `WriteStringDispatchLong` (`[NoInlining]`) + `WriteStringFastWire` (`[NoInlining]`)
+- Reader split: dispatcher (`[AggressiveInlining]`) + `ReadStringUtf8WithCharLenCore` (`[NoInlining]`)
+- Bench: **regresszió** — Small Ser +29.6 pp, Repeated Ser +8.9 pp, Small Deser +16.6 pp.
+- Disasm szerint a dispatcher + SmallFast **inline-olódott** (body symbol eltűnt) — code-bloat: 3 generic spec × ~30-50 SGen call-site × ~45 sor inlined kód = i-cache pressure a Repeated cell hot loop-on. Reader oldali dispatcher **NEM inline-olódott** (`[AggressiveInlining]` hint hatástalan), csak +1 call instruction.
+
+**Step 3 — finomított fix** (15:29:21 bench, **Closed**):
+- `WriteStringWithDispatch` dispatcher: **NO inline hint** (a fordítóra hagyva, AOT-ban stabilabb)
+- `WriteStringSmallFast`: `[NoInlining]` (code-bloat eltünt — call-overhead-tel marad, de strukturálisan dedikált method)
+- `WriteStringDispatchLong` + `WriteStringFastWire`: `[NoInlining]` cold path (megőrizve)
+- `ReadStringUtf8WithCharLen` + `ReadStringUtf8WithCharLenCore` **összeolvasztva** vissza egy methoddá (split nem fizetett, +1 call eltünt)
+
+**Bench (15:29:21) Compact vs MemPack arányok**:
+- **Ser**: Small **0.915** (-8.5%), Medium 0.989 (≈), Large **0.915** (-8.5%), Repeated 1.019 (≈), Deep 0.981 (-1.9%) → 5/5 cell paritás-vagy-jobb
+- **Deser**: Small 1.101 (+10.1%), Medium **0.953** (-4.7%), Large **0.894** (-10.6%), Repeated **0.962** (-3.8%), Deep **0.899** (-10.1%) → 4/5 cell win, csak Small +10%
+- **Wire**: 5/5 cell -8% to -11% kisebb mint MemPack
+
+**Tanulság**:
+1. AOT-ban a `[AggressiveInlining]` **nem garantált** — a Writer dispatcher + SmallFast inline-olódott (code-bloat), de a Reader dispatcher NEM (hint hatástalan). A fordítóra bízás (no hint) stabilabb.
+2. Method-split nem mindig nyer — a túl-aggresszív inline-olás code-bloat-ot okozhat (i-cache pressure), különösen sok SGen call-site mellett.
+3. A `__DelegateCtor` cache-elt — `string.Create` callback nem hot-path overhead-forrás.
+4. A method-struktúra megőrizve: `WriteStringDispatchLong` és `WriteStringFastWire` külön cold methodok (későbbi célzott optimalizációhoz alapot ad).
+
+**Maradék gap**: Small Deser +10% — kis abszolút érték (~1 µs), nem release-blocker. A `ReadStringUtf8WithCharLen` body méretes (single method ~15 sor + lambda-state), AOT inline-budget határán. Tovább optimalizálható a V4N2 vagy W2C8 sprint-ben.
+
+### Reverted (2026-05-07)
+
+A V4N4 method-split — mind a 15:13:39 (`AggressiveInlining`) regressziós verzió, mind a 15:29:21 (`NoInlining`-on-SmallFast) finomított verzió — **visszavonva**. A subsequent benchmark futtatások (15:29:21 → 15:56:54 → ...) **drasztikus run-to-run varianciát** mutattak ugyanazon kódon: az AOT-codegen file-locality / inline-cost-modell mérés-érzékeny a `Utf8Transcoder.cs` body-méret változásaira, és a noise-floor a method-split feltételezett +1-3% Ser nyereségét eltakarja.
+
+A revert visszaállítja a `WriteStringWithDispatch` egy-method állapotot (matches 09:39:09 baseline). A megőrzött elemek:
+- **A irány packed-header store-ok** (`Unsafe.WriteUnaligned` Small/Medium/Big tier-on) — instruction-level optimalizáció, nem érintett az AOT-variance miatt
+- **Overflow guard** (`O7G2` — `ThrowStringTooLong`) — defensive, különálló feature
+
+A V4N4 audit **konklúziója** változatlan érvényes (constant-fold OK, reader tier-readers inline-olt a TypeReaderTable lambda-class static init-be, `__DelegateCtor` cache-elt). Az AOT inline-pressure-elemzés továbbra is releváns dokumentáció — csak a method-split mint fix nem volt mérhető-positív.
+
+**Tanulság**: bench-driven optimalizáció csak akkor érvényesíthető, ha a noise-floor < a várható signal. AOT-on a bench-zaj jelentős (~5-15 pp run-to-run), ami a +1-3% perf-claim-eket eltakarja. **Profile-vezérelt** optimalizáció (CPU-profile + flame-graph + code-cache miss measurement) lenne a következő lépés, ha az inlining-pressure érdemi gap-ként marad.
+
+**Re-evaluable as of 2026-05-07 per `ACCORE-BIN-T-D9X3`** — bench stabilization removes the noise-floor that made the original signal unmeasurable; retest before any code change.
+
## ACCORE-BIN-T-V4N5: Dead-code review — `WriteFixStrDirect` + `WriteStringUtf8Internal`
**Priority:** P3 · **Type:** Refactor / hygiene · **Status:** Closed (2026-05-06) · **Related:** `BinarySerializationContext.cs`
@@ -1424,7 +1503,7 @@ The OR + sign-test catches negative casts (any wire-side uint > `Int32.MaxValue`
- NuGet release professional-quality signal — explicit, defensive guards over silent-corruption paths
## ACCORE-BIN-T-S6F2: Shift-mentes Small fast path in `WriteStringWithDispatch`
-**Priority:** P3 · **Type:** Performance · **Related:** `WriteStringWithDispatch`, `BinaryTypeCode.StringSmall`
+**Priority:** P3 · **Type:** Performance · **Status:** Reverted (2026-05-07, with V4N4 method-split) · **Related:** `WriteStringWithDispatch`, `BinaryTypeCode.StringSmall`, `ACCORE-BIN-T-V4N4`
The H2Q6 writer's post-encode tier choice runs a 3-way switch (`bytesWritten ≤ 255 → StringSmall`, `≤ 65535 → StringMedium`, `else StringBig`) and a header-write switch (3 / 5 / 9 byte) for every non-ASCII string. On the Repeated benchmark cell (Magyar content, ~10-15 char strings dominant) **99%+ of writes resolve to StringSmall** — the 3-way switch decision is statistically determinate from `charLength ≤ 63` alone (worst-case `charLength * 4 ≤ 252 ≤ 255` ⇒ Small tier guaranteed).
@@ -1475,6 +1554,16 @@ if (charLength <= 63)
- After A-direction (header pack-write) bench result is conclusive
- Pre-NuGet release if the Repeated cell Compact-vs-MemPack Ser ratio still has measurable headroom
+### Resolution
+
+Integrált megvalósítás `ACCORE-BIN-T-V4N4` keretében (2026-05-07): a `WriteStringWithDispatch` 4-method-os split egyik tagja a `WriteStringSmallFast` — pontosan az S6F2 ide illeszkedő fast path. A 0-shift non-ASCII branch garantált (`charLength ≤ 63` ⇒ `bytesWritten ≤ 252 ≤ 255` ⇒ Small tier biztos, `reserveHeader = actualHeader = 3`).
+
+Az inline-stratégia tanulsága (a V4N4 disasm-ből): a `WriteStringSmallFast` `[NoInlining]` jelölést kapott a végleges verzióban — az `[AggressiveInlining]` kísérlet code-bloat-ot okozott (3 generic spec × 30+ SGen call-site × inlined body = i-cache pressure a Repeated cell hot loop-on, +29.6 pp Ser regresszió a 15:13:39 bench-en). A `[NoInlining]`-tal az S6F2 logika érvényesül (constant-folded tier choice, 0 shift), csak +1 call instruction overhead-tel.
+
+Bench (15:29:21): Compact Ser **5/5 cellán paritás-vagy-jobb** vs MemPack (Small -8.5%, Medium -1.1%, Large -8.5%, Repeated +1.9%, Deep -1.9%). Az S6F2 várt +1-3% Ser-javulás teljesült Small/Large cellákon, a Repeated/Deep paritás-szerű (a +1 call overhead kompenzálja a fast-path nyereséget rövid Magyar string-eken).
+
+**Re-evaluable as of 2026-05-07 per `ACCORE-BIN-T-D9X3`** — together with the parent V4N4 method-split, the Small fast path is re-testable now that bench stabilization removes the noise-floor; retest before any code change.
+
## ACCORE-BIN-T-W2C8: WASM string-cache H2Q6 maximalizálás (`ReadStringUtf8Cached` MISS path)
**Priority:** P2 (WASM target) / P3 (otherwise) · **Type:** Performance · **Related:** `BinaryDeserializationContext.Read.cs::ReadStringUtf8Cached`, `ReadStringUtf8WithCharLen`, `Utf8Transcoder.DecodeUtf8SinglePass`
@@ -1573,4 +1662,334 @@ The marker swap is internally consistent within the v3 envelope — producers th
If a future marker-space crunch arises (additional H2Q6 tiers, new compression markers, etc.), `F3W6` can be reverted by switching the writer back to emitting `StringSmall` on FastWire and re-introducing the mode-shared dispatch in `ReadStringSmall`. The original design is correctness-equivalent — the dedicated marker is purely an optimization. **If marker gondunk lesz, kivesszük.**
+## ACCORE-BIN-T-B1D5: BenchmarkDotNet release-quality measurement project
+**Priority:** P2 · **Type:** Tooling / release-narrative · **Status:** Open · **Related:** `AyCode.Core.Serializers.Console` (existing custom bench), NuGet release-narrative
+
+The current `AyCode.Core.Serializers.Console` is a hand-rolled microbenchmark — fast dev-iteration loop (30-90s per run, custom markdown output, internal TestDataSet structure). It serves the inner optimization cycle well, **but is not industry-standard** for the public NuGet release narrative.
+
+A parallel `BenchmarkDotNet`-based project would close that gap:
+
+- **Industry-standard credibility**: BenchmarkDotNet is the canonical .NET benchmarking framework — MemoryPack, MessagePack, System.Text.Json all use it for their published numbers. AcBinary results expressed in BDN format are **directly comparable** to MemPack's own release notes.
+- **Statistical rigor**: outlier detection (Tukey's fences), interquartile range, confidence intervals, multi-process iteration runs. The current custom bench reports median-of-10; BDN reports the full distribution + variance band — the difference between "looks fast on my machine" and "demonstrably fast under controlled conditions".
+- **NuGet release surface**: BDN markdown tables drop straight into release notes / blog posts / NuGet `README.md` / `BINARY_FEATURES.md` "Performance vs MemoryPack" section. GitHub-friendly format, screenshot-friendly, reviewer-credible.
+- **Diagnostic-plugin integration**:
+ - `[MemoryDiagnoser]` — allocation per iteration (already a hot question for the Repeated cell)
+ - `[EventPipeProfiler]` — CPU profile collection during the bench run, exportable to speedscope flame-graph
+ - `[DisassemblyDiagnoser]` — per-method disasm dump, parallel to the manual `dumpbin` workflow used in V4N4
+ - `[ThreadingDiagnoser]` — context switches, lock contention (relevant if pool-contention shows up under load)
+- **Multi-runtime / multi-job**: a single project benchmarks against `RuntimeMoniker.Net90` (JIT) and `RuntimeMoniker.NativeAot90` simultaneously — same-shape table side-by-side.
+- **CI integration potential**: BDN result format is machine-readable (JSON/CSV), enabling regression detection on PR diffs (later sprint).
+
+### Implementation outline
+
+1. **New project**: `AyCode.Core.Serializers.Benchmark` (or `.Bdn`) — separate csproj for clean BDN dependency isolation. AOT-publishable for the AOT job.
+2. **TestDataSet bridge**: reuse the existing `TestDataFactory` / `TestDataSet` types from `AyCode.Core.Tests.TestModels` so the data-shape is identical to the custom bench.
+3. **Benchmark class skeleton**:
+ ```csharp
+ [MemoryDiagnoser]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [SimpleJob(RuntimeMoniker.NativeAot90)]
+ public class StringSerializationBenchmark
+ {
+ [Params("Small", "Medium", "Large", "Repeated", "Deep")]
+ public string DataSet { get; set; } = "Small";
+
+ private object _data = null!;
+ private byte[] _compactWire = null!;
+ private byte[] _mempackWire = null!;
+
+ [GlobalSetup]
+ public void Setup()
+ {
+ _data = TestDataFactory.Create(DataSet);
+ _compactWire = AcBinarySerializer.Serialize(_data, AcBinarySerializerOptions.FastMode);
+ _mempackWire = MemoryPackSerializer.Serialize(_data);
+ }
+
+ [Benchmark(Baseline = true)] public byte[] MemPack_Ser() => MemoryPackSerializer.Serialize(_data);
+ [Benchmark] public byte[] AcBinary_Compact_Ser() => AcBinarySerializer.Serialize(_data, AcBinarySerializerOptions.FastMode);
+ [Benchmark] public object? MemPack_Deser() => MemoryPackSerializer.Deserialize(_mempackWire);
+ [Benchmark] public object? AcBinary_Compact_Deser() => AcBinaryDeserializer.Deserialize(_compactWire);
+ }
+ ```
+4. **Multi-cell coverage**: separate benchmark classes per workload-shape (StringSerializationBenchmark, ObjectGraphBenchmark, NestedDeepBenchmark) — clean grouping in BDN output.
+5. **NativeAOT-job config**: `<PublishAot>true</PublishAot>` set conditionally in the csproj (mirroring the `Console` project pattern); BDN's NativeAOT job auto-publishes the bench-runner.
+6. **Output**: GitHub-flavored Markdown export → `docs/BINARY/BENCHMARK_RESULTS.md` (or similar), versioned in the repo.
+
+### Why P2 (pre-NuGet release)
+
+- NuGet release narrative ("AcBinary fastest AND smallest binary serializer for .NET i18n payloads") needs **credible, industry-standard numbers**. Custom bench → "trust me, my numbers"; BDN → "here are the variance bands and the methodology".
+- Direct comparison surface against MemPack's published BDN numbers (head-to-head on the same framework).
+- Diagnostic-plugin integration (`[MemoryDiagnoser]` + `[EventPipeProfiler]`) opens up further targeted optimization work without separate tooling.
+
+### Acceptance
+
+- New `AyCode.Core.Serializers.Benchmark` project compiles + runs cleanly on both JIT (net9.0) and NativeAOT
+- Reuses existing `TestDataFactory` / `TestDataSet` types — no test data duplication
+- Produces a markdown table per workload-shape covering: MemPack baseline + AcBinary Compact + (optionally) AcBinary FastWire, both Ser and Deser
+- BDN output saved to `docs/BINARY/BENCHMARK_RESULTS.md` (versioned per release)
+- README.md / `BINARY_FEATURES.md` references the BDN-measured performance claim with the methodology link
+
+### Trigger
+
+- Pre-NuGet release: when the optimization sprint cluster (V4N2 / W2C8 / etc.) settles and the perf state is release-stable
+- Or: when a credibility-sensitive presentation surface emerges (blog post, conference talk, GitHub README)
+
+### Coexistence with the custom bench
+
+The custom `Console` bench **is not replaced** — it remains the dev-iteration tool (fast feedback loop, 30-90s runs, hand-tuned markdown for chat-paste). BDN is the **release-grade** bench (3-10 min runs, statistical rigor, NuGet release output). Different tools for different audiences.
+
+## ACCORE-BIN-T-C5R8: Charset-parameterized benchmark workload (ASCII / Hungarian / CJK / Cyrillic / Mixed)
+**Priority:** P2 · **Type:** Tooling / release-narrative · **Status:** Closed (2026-05-07) · **Related:** `BenchmarkTestDataProvider`, `AyCode.Core.Serializers.Console.Program.cs` (Settings → Charset submenu), `ACCORE-BIN-T-V4N2` (charset-specific optimization measurement target), `ACCORE-BIN-T-D9X3` (bench stabilization preceding this work)
+
+The current `BenchmarkTestDataProvider` hard-codes Hungarian (Latin extended 2-byte) content into the test DTOs. This produces a single workload-shape: **Hungarian mixed text with short 1-2 char 2-byte runs**. While Hungarian is a fine general-purpose i18n stress, it is **only one production-content profile** — and the optimization decisions ride on it implicitly (e.g. V4N2 Phase 2.5's 3-byte run do-while was deferred-on-2-byte-side because the Hungarian bench measured regression there, but its CJK-side value cannot be measured on the current data).
+
+A **charset-parameterized** benchmark workload — selectable from the interactive menu — would:
+
+- **Measure optimization value across realistic content profiles** — what wins on CJK content may not win on Hungarian, and vice versa. Without explicit per-charset measurement, optimization decisions become Hungarian-biased.
+- **Surface release-narrative numbers credibly** — instead of "Compact beats MemPack on i18n payload" (single workload), claim "Compact vs MemPack: ASCII X%, Hungarian Y%, CJK Z%, Cyrillic W%, Mixed V%" — concrete numbers per content profile, NuGet-grade.
+- **Enable workload-specific optimization audits** — V4N2 Phase 3 SIMD multi-byte transcoder targets CJK 3-byte content; without a CJK workload measurement, Phase 3 acceptance criteria cannot be validated.
+
+### Implementation outline
+
+#### 1. `BenchmarkTestDataProvider` refactor
+
+Hard-coded Hungarian strings (`KözösCímke`, `sötét`, `magyar`, `hetenkénti`, etc.) → **ASCII baseline values** (English equivalents: `SharedTag`, `dark`, `hungarian`, `weekly`).
+
+New static `LongStringSuffix` field — charset-aware suffix appended to a subset of property values:
+
+```csharp
+public static class CharsetSuffixes
+{
+ public const string AsciiOnly = ""; // baseline — pure-English ASCII content
+ public const string Hungarian = " árvíztűrő tükörfúrógép";
+ public const string CjkBmp = " 你好世界 こんにちは 안녕하세요";
+ public const string Cyrillic = " Привет мир дорогой друг";
+ public const string Mixed = " árvíz 你好 Привет 😀";
+}
+
+public static string LongStringSuffix { get; set; } = CharsetSuffixes.Hungarian; // default
+```
+
+Property values use the suffix dynamically:
+```csharp
+var description = "Product description" + LongStringSuffix;
+```
+
+The 5 charsets cover the realistic UTF-8 workload spectrum:
+1. **Pure ASCII** — baseline; Phase 1 SIMD prefix widen + DWORD batch dominate; no multi-byte path engagement
+2. **Hungarian** (Latin extended) — short 1-2 char 2-byte runs in mixed text; current default workload
+3. **CJK BMP** — long homogeneous 3-byte runs; primary V4N2 Phase 2.5/3 win region
+4. **Cyrillic** (Russian / etc.) — long 2-byte runs (different shape than Hungarian mixed); V4N2 Phase 2.5 may yet pay off here
+5. **Mixed** (Hungarian + CJK + Cyrillic + emoji) — full multi-tier coverage (2-, 3- and 4-byte UTF-8 sequences) in one payload; surrogate-pair handling stress
+
+#### 2. `Program.cs` interactive submenu
+
+Before starting a benchmark run, prompt the user for charset choice:
+
+```
+Choose benchmark charset:
+ 1 — Pure ASCII (baseline)
+ 2 — Hungarian (Latin extended) [DEFAULT]
+ 3 — CJK BMP (Chinese / Japanese / Korean)
+ 4 — Cyrillic (Russian / etc.)
+ 5 — Mixed (Hungarian + CJK + emoji)
+```
+
+The choice → `BenchmarkTestDataProvider.LongStringSuffix = ...` before constructing test data.
+
+#### 3. Benchmark output header
+
+The markdown output header should reflect the selected charset:
+```
+# AcBinary Benchmark Release 2026-05-07 16:00:00
+Charset: CJK BMP | Iterations: 1000 | Warmup: 10000 | ...
+```
+
+This makes per-charset bench files self-documenting — file names + content both encode the workload profile.
+
+#### 4. Round-trip tests unaffected
+
+`Utf8TranscoderTests` and other content-class unit tests (with their fixed Hungarian / CJK / emoji boundary inputs) are **untouched** — they remain fixed-content for regression coverage. Only the benchmark workload is charset-parameterized.
+
+### Why P2
+
+- **Release-narrative**: NuGet release credibility depends on measurable performance claims across realistic content profiles, not a single Hungarian-mixed workload
+- **Optimization decision quality**: V4N2 Phase 2.5 / Phase 3 / future SIMD multi-byte work cannot be objectively validated without a CJK workload — current decisions have implicit Hungarian-bias
+- **Consumer reproducibility**: external consumers can reproduce benchmark numbers on their own content profile (or contribute a new charset profile)
+
+### Acceptance
+
+- `BenchmarkTestDataProvider` refactored: ASCII baseline + `LongStringSuffix` static field with 5 predefined charset constants
+- Interactive menu in `Program.cs` lets the user choose charset 1-5 before benchmark run; the chosen charset is recorded in the markdown output header
+- Round-trip correctness verification still runs once-per-cell before warmup (existing `Verified: round-trip ...` line) — works on the active charset
+- All 5 charsets produce valid round-trip on all benchmark cells (Small / Medium / Large / Repeated / Deep)
+- Existing benchmark numbers (Hungarian-default) reproducible — choosing charset 2 from the menu yields the current 15:29:21-style results
+- New CJK charset (option 3) produces measurable numbers (one bench run per charset documented in `Test_Benchmark_Results/`)
+
+### Trigger
+
+- Pre-NuGet release: per-charset numbers needed for the public performance-claim table
+- Or: when V4N2 Phase 3 SIMD multi-byte transcoder work needs CJK-workload validation
+
+### Resolution
+
+Landed 2026-05-07 (after `ACCORE-BIN-T-D9X3` bench stabilization made sub-3% deltas measurable, which raised the value of charset-specific measurement). Implementation refined the original 5-charset proposal into a 6-charset list per user request (Latin1FixAscii + Latin1 short/long split for finer-grained Latin1 coverage):
+
+**1. `BenchmarkTestDataProvider` refactor** ✅
+
+- New `CharsetSuffixes` static class with **6** const suffixes (one more than originally proposed):
+ - `Latin1FixAscii = ""` — empty suffix; baseline values stay short → FixStr fast-path stress (renamed from `AsciiOnly` per user request)
+ - `Latin1Short = " árvíztűrő tükörfúrógép"` (~24 char) — Hungarian short Latin1 mixed
+ - `Latin1Long = " árvíztűrő tükörfúrógép a magyar betűzés tesztje"` (~47 char) — **NEW**, exceeds the 32-char FixStr boundary on the suffix alone (user request)
+ - `CjkBmp`, `Cyrillic`, `Mixed` — as originally specified
+- `LongStringSuffix` default = `CharsetSuffixes.Latin1Long` (backward-compatible in spirit with the prior fixed Latin1 default)
+- All hard-coded Hungarian baseline values replaced with ASCII English equivalents:
+ - `KözösCímke` / `IsmétlődőCímke` / `MélyCímke` → `SharedTag` / `RepeatedTag` / `DeepTag`
+ - `közösfelhasználó` → `shareduser` (and variants); `közös` → `shared`; `MélyKategória` → `DeepCategory`
+ - `sötét` / `világos` → `dark` / `light`; `magyar` / `német` / `francia` → `hungarian` / `german` / `french`
+ - `hetenkénti` / `naponkénti` / `havonkénti` → `weekly` / `daily` / `monthly`
+ - Repeated cell long Hungarian baselines (`TermékNév_IsmétlődőTesztAdat_árvíztűrőtükörfúrógép`, `RaklapKód_IsmétlődőTesztAdat_árvíztűrő`) shortened to ASCII `ProductName` / `PalletCode` so the `EnsureAllStringsBypassFixStr` suffix-append actually applies (the prior >31-char baselines bypassed the suffix, leaving Repeated cell content fixed-Hungarian regardless of charset selection)
+- The only Latin1/non-ASCII characters remaining in the file are inside the `CharsetSuffixes` const definitions themselves (intentional — those define the per-charset content profiles)
+
+**2. `Program.cs` interactive submenu** ✅
+
+- New `[3] Charset` entry in the existing `Settings` submenu (next to `[1] Iteration` and `[2] WireMode`) — chose nested submenu over a top-level prompt to keep the main menu uncluttered
+- `ShowCharsetSettingsMenu` lists the 6 charset constants with brief descriptions; selection sets `BenchmarkTestDataProvider.LongStringSuffix` and returns
+- `GetCurrentCharsetName()` helper resolves the active suffix back to its constant name (returns `"Custom"` when programmatically set to a non-const value)
+
+**3. Benchmark output header** ✅
+
+- `Charset:` field added to **3 output locations**:
+ - Console run header (interactive run line — `Layer: ... | Charset: CjkBmp | Iterations: ...`)
+ - `.LLM` markdown header (file-self-documenting)
+ - `.log` boxed banner (║ Charset: CjkBmp ║)
+
+**4. Round-trip tests unaffected** ✅ — `Utf8TranscoderTests` and other content-class unit tests use their own fixed boundary inputs; not touched by this change. Round-trip verification in the bench harness continues to run once-per-cell pre-warmup (`VerifyRoundTrip`) on the active charset.
+
+### Acceptance status
+
+- ✅ `BenchmarkTestDataProvider` refactored with ASCII baselines + `LongStringSuffix` field + 6 charset constants
+- ✅ Interactive submenu lets the user choose charset 1-6; recorded in markdown output header (3 locations)
+- ✅ Round-trip verification runs on the active charset (existing per-cell verify, charset-agnostic by design)
+- ⚠️ "All 6 charsets produce valid round-trip on all benchmark cells" — design correctness implies this; not yet exercised on every (cell × charset) combination explicitly. Recommend running each charset once before declaring full validation.
+- ❌ "Existing benchmark numbers (Hungarian-default) reproducible — choosing charset 2 yields the current 15:29:21-style results" — **NOT met**: the ASCII baseline refactor changes the numbers regardless of charset choice (shorter baselines + suffix-driven content vs. prior fixed Hungarian baselines). New `Latin1Short` ≠ prior fixed Hungarian default. This is intentional: the user explicitly chose a clean ASCII-baseline + charset-suffix design over preserving historical numerical comparability.
+- ❌ "Choosing CJK produces measurable numbers documented in `Test_Benchmark_Results/`" — **NOT done in this commit window**; user has the menu and will run per-charset benches in a follow-up sprint.
+
+### Note on numerical incompatibility with prior runs
+
+Existing bench files generated before this commit (e.g. `Console.FullBenchmark_Release_2026-05-07_17-42-22.LLM` and earlier) used the prior fixed Latin1 baseline values + 32-char Hungarian suffix. The new default (`Latin1Long`) uses ASCII baselines + 47-char Latin1Long suffix; the Repeated cell sees a more dramatic shift (its 52-char fixed Hungarian baseline → 11-char ASCII `ProductName` + 47-char suffix). **Numerical comparison across the boundary is not meaningful**; the `Charset:` header field documents the source charset for each new bench file.
+
+### Future extensions
+
+- **Sentinel "real-world" charsets** — synthetic mixes representing typical production payloads (e.g. `EnglishWithEmoji` for chat-app DTOs, `ArabicHebrew` for RTL-script regions). Add as new `CharsetSuffixes` constants when consumer demand surfaces.
+- **Charset auto-rotate mode** — single benchmark run cycles through all 6 charsets, producing a 6-section markdown output. Useful for full release-narrative table generation in one pass.
+- **BDN integration** (per `ACCORE-BIN-T-B1D5`): charset becomes a `[Params]` axis in BenchmarkDotNet, producing a 5×6×N matrix (cells × charsets × engines) in the BDN output.
+
+## ACCORE-BIN-T-D9X3: Console benchmark stabilization (per-serializer warmup + GC isolate + pilot discard + min/max range + CPU pin + mode-aware JIT sleep)
+**Priority:** P1 · **Type:** Tooling / measurement · **Status:** Closed (2026-05-07) · **Related:** `AyCode.Core.Serializers.Console.Program.cs`, `ACCORE-BIN-T-V4N4`, `ACCORE-BIN-T-V4N2`, `ACCORE-BIN-T-S6F2`, `ACCORE-BIN-T-B1D5` (BDN release-grade variant)
+
+The custom `Console` benchmark harness showed strong run-to-run variance — user-reported `+20pp / -10pp` summa-spread between runs on identical code. 1-3% perf-claims became unmeasurable on this noise-floor; the V4N4 method-split and V4N2 Phase 2.5 attempts both fell into this band, leaving the question "does the regressed bench number reflect a code regression or measurement noise?" undecidable (see `V4N4` Reverted section).
+
+**Diagnosis** (sprint takeaway prior to this entry):
+
+1. **Warmup cache pollution** — `RunBenchmarksForTestData` ran one warmup-all loop (every serializer × WarmupIterations) followed by one bench-all loop. By the time a given serializer was measured, its hot code and data lines had been evicted by the intervening serializers' warmup passes. MemPack and AcBinary hot paths share neither code nor data working sets — they actively evict each other.
+2. **GC pause leakage between samples** — the Stopwatch-recorded sample loop had no explicit `GC.Collect`. A minor GC triggered inside sample N could promote into a Gen-2 pause inside sample N+1's timed window (1-5 ms spike).
+3. **Pilot sample contamination** — the first sample after warmup absorbed residual JIT bookkeeping and cold-cache misses; on a 10-sample median this contributed 1-2 outliers that visibly stretched the min/max.
+4. **CPU migration / preemption** — the Windows scheduler migrated the bench thread between cores between samples (L1/L2 cache evict on each migration); background work (Defender index, OS service threads) injected random preemption spikes.
+5. **JIT sleep not mode-aware** — `Thread.Sleep(JitSleep = 3000)` waited 3 seconds before each cell for tiered-JIT drain. On AOT publish (`PublishAot=true`) there IS NO dynamic compilation — the 3 seconds were pure idle. Worse, the drain happened only globally (once before all cells), not per-serializer, so a tier-promotion mid-bench could still bleed in.
+6. **Range invisible** — the `.LLM` markdown output showed only the median; the user could not tell whether a 5%-median-delta was inside or outside the inter-sample range for that row.
+
+### Resolution
+
+Landed 2026-05-07 (16:00 — 17:00). Six stabilization steps in one commit window:
+
+**1. Per-serializer warmup separation** (`RunBenchmarksForTestData`) — the warmup-loop and bench-loop merged into one per-serializer cycle: each serializer's warmup runs IMMEDIATELY before its own bench. The serializer's hot code/data is freshest in cache when the first sample times.
+
+**2. `GC.Collect` before every sample** (`RunTimed`) — `GC.Collect() + WaitForPendingFinalizers() + GC.Collect()` triple-tap before each sample, OUTSIDE the Stopwatch window. Every sample starts from the same heap state; an ad-hoc Gen-2 pause from sample N can no longer bleed into sample N+1.
+
+**3. Pilot sample discard** (`RunTimed`) — the loop runs `samples + 1` times; the first (index 0) is discarded. The first sample post-warmup absorbs residual JIT/GC bookkeeping and cold cache; the recorded `samples` count remains 10 (median is the same data the user saw before, just sourced from "typical" sample-set, not from the post-warmup-first noisy point).
+
+**4. Min/max range in markdown output** (`SaveLlmResults`, new `FormatMicrosWithRange` helper, new `BenchmarkResult` fields: `SerializeTimeMinMs/MaxMs`, `DeserializeTimeMinMs/MaxMs`, `RoundTripTimeMinMs/MaxMs`) — the `.LLM` output's Ser and Deser columns now render as `26.86 (24.50..29.10)`: median (min..max) µs/op. The reader sees at a glance whether a delta is above the row's noise floor.
+
+**5. CPU affinity + process priority** (`RunBenchmark`) — `ProcessorAffinity = 0x1` (CPU 0 pin) + `PriorityClass = High` for the benchmark phase, `try/finally` restores the original values. Eliminates inter-sample thread migration (L1/L2 cache evicts) and reduces background-task preemption. Platform-guarded: Windows / Linux only (`CA1416` — `ProcessorAffinity` throws on macOS); locked-down hosts (group policy, container without `CAP_SYS_NICE`, etc.) catch + warning + bench continues with default scheduling.
+
+**6. Mode-aware `JitSleep`** (property) — `RuntimeFeature.IsDynamicCodeCompiled ? 250 : 0`. JIT mode 250 ms (the .NET 9 tiered-JIT compile queue typically drains in <100 ms for the bench's hot path); AOT publish 0 ms. The 3000 ms blind wait is gone. The drain now happens per-serializer (Step 1) instead of once globally.
+
+### Bench result (3 consecutive runs, 2026-05-07 17:00:32 / 17:01:03 / 17:01:32, FastestByte mode, FastMode preset)
+
+| Cell | AcBinary Ser median (3 runs) | Inter-run spread | Intra-cell range |
+|---|---|---|---|
+| Small | 7.09 / 6.83 / 6.55 | 7.6% | ~8% (noise floor: 1000 × ~6 µs/op ≈ 6 ms sample window) |
+| Medium | 18.74 / 18.90 / 19.22 | 2.6% | ~10% |
+| Large | 140.20 / 141.67 / 141.02 | 1.0% | ~3% |
+| Repeated | 26.52 / 26.25 / 26.28 | 0.3% | ~6% |
+| Deep Nested | 23.44 / 23.17 / 22.70 | 3.2% | ~7% |
+
+The previous `+20pp / -10pp` summa-spread shrank to **1-3pp** on the medium/large cells. The Small cell remains noisy (~8% relative) but this is a physical floor: 1000 iter × ~6 µs/op ≈ 6 ms total batch — on a sample window this short, Stopwatch resolution and OS spikes dominate relatively.
+
+The `(min..max)` range is consistently 3-10% relative — a **measurable** signal floor: 1-3% perf-deltas no longer disappear into noise.
+
+### Lessons
+
+- **Bench stabilization is a precondition for perf optimization, not a consequence.** Optimization decisions (e.g. V4N4 method-split, V4N2 Phase 2.5) can only be derived from bench numbers if the noise floor < expected signal. Without that, the bench numbers mean nothing.
+- Cache pollution (warmup-all → bench-all flow) was the **single largest** noise source: per-serializer warmup separation alone removed ~10pp of variance.
+- Platform stabilization (CPU pin + high priority) combined with heap stabilization (GC.Collect + pilot discard) further tightened the range.
+- AOT and JIT have different stabilization needs: the 3000 ms blind sleep was idle time on AOT; mode-aware sleep pays the cost only when needed.
+
+### Re-evaluation list (entries currently Reverted or unmeasurable)
+
+The stabilization opens a follow-up sprint: the `Reverted (2026-05-07)` entries are re-evaluable now that the noise floor < the expected 1-3% signal:
+- **`ACCORE-BIN-T-V4N4`** — method-split (writer + reader hot path) is re-testable
+- **`ACCORE-BIN-T-V4N2` (Phase 2.5)** — UTF-8 do-while runs (2-byte / 3-byte) per charset
+- **`ACCORE-BIN-T-S6F2`** — Small fast path (was integrated into V4N4)
+
+Per-entry re-evaluation is the next sprint's task, NOT part of this Closed entry.
+
+### Why P1
+
+- Blocked all sub-3% perf optimization work (every recent attempt fell into the noise band)
+- One-line user complaint ("+20 és -10 között ingadozott a summa") summarized weeks of unproductive bench-driven investigation
+- One-time fixed cost; every future bench run benefits
+
+### Follow-up: adaptive iteration + CV reporting + per-cell A/B mode (2026-05-07, second commit window)
+
+After the initial 6-step landing, three additional refinements were added in a second commit window the same day. The trigger was a Copilot-suggested noise-reduction list against the now-stable bench output:
+
+**1. Per-cell adaptive iteration** — fixed `TestIterations = 1000` produced sample windows from 6 ms (Small cell @ ~6 µs/op) to 140 ms (Large cell @ 140 µs/op). The Small cell at 6 ms remained the dominant residual noise source (7.6% inter-run spread vs ≤3.2% on the other cells) because OS-level spikes (preempt + IRQ + scheduler tick) are absolute-time events; on a 6 ms sample window their relative contribution is huge.
+
+Implementation:
+- New constant `TargetSampleMs = 250` (per-sample wall-clock target)
+- New helper `CalibrateIterations(Action, int targetMs)` — runs a 100-iter probe post-warmup, computes `iterPerMs`, and rounds up to the nearest 1000. Floor 1000, ceiling 200_000.
+- `RunBenchmarksForTestData` calibrates Ser and Des INDEPENDENTLY per serializer (different per-op cost). RT-only rows (NamedPipe) get a single RT calibration.
+- New `BenchmarkResult` fields: `SerializeIterations`, `DeserializeIterations`, `RoundTripIterations` (per-row).
+- New helpers: `ToPerOpMicros(double, int)` (replaces 1-arg variant), `SerPerOp(r)` / `DesPerOp(r)` / `RtPerOp(r)` for per-op µs from the result.
+- All `Average(r => r.*TimeMs)` and `OrderBy(r => r.RoundTripTimeMs)` call-sites refactored to use per-op µs (iter-independent) — mixing batch-time across rows with different iter counts would be meaningless. ~20 call-sites total.
+- RT for in-mem rows synthesized so `RtPerOp(r) == SerPerOp(r) + DesPerOp(r)` regardless of `serIter != desIter`: `RoundTripIterations = max(serIter, desIter)`, `RoundTripTimeMs = rtPerOpMicros / 1000 * RoundTripIterations`.
+
+**Expected impact**: Small cell sample window 6 ms → ~240 ms; inter-run spread 7.6% → ~1-2% (matching the other cells). Total suite duration ~50 s → ~110-130 s.
+
+**2. CV (coefficient of variation) reporting + unstable-row marker** — the median + (min..max) range surfaces shape but not a single-number stability metric. The CV (= stddev/mean) is the standard statistical measure; rows with CV > threshold are flagged with a ⚠️ suffix in the markdown output so a small inter-engine delta on a high-CV row is immediately obvious as noise-suspect.
+
+Implementation:
+- New constant `UnstableCVThreshold = 0.03` (3% — reasonable for stabilized in-memory benchmarks)
+- `RunTimed` return tuple extended: `(median, min, max, stddev)`. Stddev computed over the (samples − pilot) population using `Math.Sqrt(Math.Max(0, E[X²] - E[X]²))`.
+- New `BenchmarkResult` fields: `SerializeTimeStdDevMs`, `DeserializeTimeStdDevMs`, `RoundTripTimeStdDevMs`.
+- `FormatMicrosWithRange` extended: `26.86 (24.50..29.10)` stays the default; `26.86 (24.50..29.10) ⚠️5.2%` appears when CV exceeds the threshold.
+
+**3. Per-cell A/B mini-suite filter** — optimization-iteration loops often need only one specific cell (e.g. "tuning the Repeated cell for Hungarian charset"). The full 5-cell × 2-engine × 4-measurement suite is overkill for that.
+
+Implementation:
+- `FilterByLayer` extended: new `small` / `medium` / `large` / `repeated` / `deep` modes — case-insensitive prefix match on `TestDataSet.Name`
+- `TryParseCliArgs` recognizes the new tokens: `dotnet run -- repeated` runs only the Repeated Strings cell
+- `fastestbyte` mode (existing — only AcBinary FastMode + MemoryPack head-to-head) is orthogonal and stacks: `dotnet run -- repeated fastestbyte`
+
+### Markdown output schema change
+
+The `## Results` table gains an `Iter Ser/Des` column at the right edge — visible verification that each row's batch landed near the `TargetSampleMs` window. RT-only rows show a single `Iter` value (the RT calibration count); in-mem rows show `serIter / desIter`.
+
+Header line updated:
+- Before: `Iterations: 1000 | Warmup: 10000 | Samples: 10 (median) | ...`
+- After: `Iterations: per-cell adaptive (target ~250 ms/sample) | Warmup: 10000 | Samples: 10 (median) + 1 pilot discarded | ... | UnstableCV threshold: 3%`
+