[LOADED_DOCS: 3 files, no new loads]

Benchmark stabilization & charset-param workload support

Major overhaul of the custom benchmark harness:
- Per-serializer warmup, GC isolation, pilot discard, and CPU pinning for stable, reproducible results
- Adaptive per-cell iteration targeting (~250ms/sample) and statistical reporting (min/max/stddev/CV)
- CLI/menu support for single-cell A/B runs
- Test data refactored to ASCII baselines with configurable charset suffix (6 presets), selectable via menu; charset recorded in all outputs
- Markdown/console output now includes per-op µs, inter-sample range, CV warnings, and iteration counts
- Documentation updated with rationale, methodology, and notes on reverted/experimental optimizations

Enables reliable, cross-charset, release-grade performance measurement for AcBinary.
This commit is contained in:
Loretta 2026-05-07 19:13:19 +02:00
parent 17ef0904d9
commit 8eaae4dda3
7 changed files with 1076 additions and 215 deletions

View File

@ -67,7 +67,8 @@
"Read(//h/Applications/Mango/LLM_PLAN//**)",
"Bash(curl -s \"https://raw.githubusercontent.com/dotnet/runtime/main/src/libraries/System.IO.Pipelines/src/System/IO/Pipelines/StreamPipeWriter.cs\")",
"WebFetch(domain:lemire.me)",
"Bash(gh pr *)"
"Bash(gh pr *)",
"Bash(gh api *)"
]
}
}

View File

@ -47,7 +47,7 @@ public static class Program
#else
private static int WarmupIterations = 10000; //5000
private static int TestIterations = 1000; //1000
private static int BenchmarkSamples = 5;
private static int BenchmarkSamples = 10;
#endif
// Interactive settings: selected AcBinary wire mode for benchmark runs.
@ -88,7 +88,26 @@ public static class Program
private const string ModeRuntime = "Runtime";
private const string ModeHybrid = "Hybrid";
private const int JitSleep = 3000;
// Per-cell adaptive iteration target wall-clock duration. Each Ser/Des function calibrates its
// own iteration count post-warmup so the sample batch lands in this range — equalizes the
// per-sample window across cells of vastly different per-op cost (Small ~6 ns/op vs Large
// ~140 µs/op). Below ~100 ms Stopwatch precision and OS preempt spikes start to dominate.
private const int TargetSampleMs = 250;
// CV (coefficient of variation = stddev / mean) threshold above which a row's range is flagged
// as "unstable" in the markdown output (⚠️ marker). 3% is a reasonable noise-floor expectation
// for stabilized in-memory benchmarks; rows above it should be discounted when reading
// sub-3% inter-engine deltas.
private const double UnstableCVThreshold = 0.03;
// JIT-tier-promotion drain delay between warmup and measurement.
// - JIT mode (RuntimeFeature.IsDynamicCodeCompiled == true): tiered JIT promotes hot methods
// in a background thread; we wait briefly for the queue to drain so the first measurement
// sample doesn't catch a Tier-0 → Tier-1 transition mid-flight.
// - AOT mode (NativeAOT publish): no dynamic compilation happens; the sleep is pure noise.
// 250ms (vs the historical 3000ms) is sufficient for a few-method working set under .NET 9's
// tiered JIT — empirically the queue drains in <100ms for the bench's hot path.
private static int JitSleep => System.Runtime.CompilerServices.RuntimeFeature.IsDynamicCodeCompiled ? 250 : 0;
// OptionsPreset values are passed per-instance (constructor argument), not constants —
// each CreateSerializers call line specifies its own preset name (e.g. "FastMode", "NoIntern").
@ -150,7 +169,52 @@ public static class Program
/// — only its sample noise grows). Symmetric with the already-per-op <c>*AllocBytesPerOp</c> fields.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static double ToPerOpMicros(double totalMs) => totalMs / TestIterations * 1000.0;
/// <summary>
/// Normalizes a batch duration to a per-operation cost in microseconds: the batch total
/// (<paramref name="totalMs"/>, in milliseconds) is divided by <paramref name="iterations"/> and scaled by 1000.
/// The iteration count is an explicit argument because adaptive calibration gives every row its own
/// count, so a single global divisor can no longer be assumed.
/// </summary>
/// <param name="totalMs">Elapsed time of the whole batch, in milliseconds.</param>
/// <param name="iterations">Number of operations executed in that batch.</param>
/// <returns>Microseconds per operation, or <c>0</c> when <paramref name="iterations"/> is zero or negative.</returns>
private static double ToPerOpMicros(double totalMs, int iterations)
{
    // No meaningful divisor (skipped operation / unset row) — report 0 instead of dividing.
    if (iterations <= 0)
    {
        return 0;
    }

    var perOpMs = totalMs / iterations;
    return perOpMs * 1000.0;
}
// Per-row per-op µs helpers. Each one pairs a batch time on the BenchmarkResult with the iteration
// count recorded for that same measurement and converts the pair through ToPerOpMicros, so rows
// measured with different iteration counts can be averaged or compared in the same unit.
// They are kept as static helpers here (not members of BenchmarkResult) so the result type stays a
// plain data bag.

/// <summary>Per-operation serialize cost (µs) for <paramref name="r"/>.</summary>
private static double SerPerOp(BenchmarkResult r)
{
    return ToPerOpMicros(r.SerializeTimeMs, r.SerializeIterations);
}

/// <summary>Per-operation deserialize cost (µs) for <paramref name="r"/>.</summary>
private static double DesPerOp(BenchmarkResult r)
{
    return ToPerOpMicros(r.DeserializeTimeMs, r.DeserializeIterations);
}

/// <summary>Per-operation round-trip cost (µs) for <paramref name="r"/>.</summary>
private static double RtPerOp(BenchmarkResult r)
{
    return ToPerOpMicros(r.RoundTripTimeMs, r.RoundTripIterations);
}
/// <summary>
/// Renders a per-op microsecond figure together with its inter-sample spread. Every number is
/// printed with two decimals: <c>"26.86 (24.50..29.10)"</c>, or <c>"26.86 (24.50..29.10) ⚠️5.2%"</c>
/// when the row is noisy. The median comes first, the min..max range follows in parentheses, and the
/// CV suffix (one decimal, percent) is appended only when CV exceeds <see cref="UnstableCVThreshold"/>.
/// CV is computed here as <paramref name="stdDevMs"/> divided by <paramref name="medianMs"/>.
/// </summary>
/// <remarks>
/// The output collapses to the bare median when there is no range to show: either min and max are
/// both non-positive, or min is not below the median and max is not above it (single-sample / Debug /
/// quick mode).
/// </remarks>
/// <param name="medianMs">Median batch time, in total-batch milliseconds.</param>
/// <param name="minMs">Fastest recorded batch time, in total-batch milliseconds.</param>
/// <param name="maxMs">Slowest recorded batch time, in total-batch milliseconds.</param>
/// <param name="stdDevMs">Standard deviation of the recorded batch times, in milliseconds.</param>
/// <param name="iterations">Per-row iteration count (post-adaptive-calibration) behind each batch.</param>
/// <param name="inv">Culture passed to every numeric <c>ToString</c> call.</param>
/// <returns>The formatted cell text.</returns>
private static string FormatMicrosWithRange(double medianMs, double minMs, double maxMs, double stdDevMs, int iterations, System.Globalization.CultureInfo inv)
{
    string TwoDecimals(double value) => value.ToString("F2", inv);

    var medianText = TwoDecimals(ToPerOpMicros(medianMs, iterations));

    // Nothing to show beyond the median: range fields were never populated, or they coincide with it.
    var rangeMissing = minMs <= 0 && maxMs <= 0;
    var rangeCollapsed = minMs >= medianMs && maxMs <= medianMs;
    if (rangeMissing || rangeCollapsed)
    {
        return medianText;
    }

    var lowText = TwoDecimals(ToPerOpMicros(minMs, iterations));
    var highText = TwoDecimals(ToPerOpMicros(maxMs, iterations));
    var cell = medianText + " (" + lowText + ".." + highText + ")";

    // The CV marker needs a positive median and a positive stddev; otherwise the plain range is the answer.
    var cvAvailable = medianMs > 0 && stdDevMs > 0;
    if (!cvAvailable)
    {
        return cell;
    }

    // Flag rows whose relative spread is above the threshold, so a small inter-engine delta on such a
    // row is easy to discount as noise.
    var relativeSpread = stdDevMs / medianMs;
    if (relativeSpread > UnstableCVThreshold)
    {
        var percentText = (relativeSpread * 100).ToString("F1", inv);
        cell = cell + " ⚠️" + percentText + "%";
    }

    return cell;
}
/// <summary>
/// Converts a byte count to KB (1 KB = 1024 B). Display-only helper so allocation columns can
@ -225,7 +289,8 @@ public static class Program
BenchmarkSamples = 3;
layer = "all";
}
else if (arg is "core" or "comprehensive" or "edge" or "all")
else if (arg is "core" or "comprehensive" or "edge" or "all"
or "small" or "medium" or "large" or "repeated" or "deep")
{
layer = arg;
}
@ -265,64 +330,121 @@ public static class Program
System.Console.WriteLine("║ COMPREHENSIVE SERIALIZER BENCHMARK SUITE ║");
System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");
var allResults = new List<BenchmarkResult>();
var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
var testDataSets = FilterByLayer(allTestDataSets, layer);
// Stabilization: pin the entire benchmark process to a single logical CPU and bump priority
// class. Single-core affinity stops Windows from migrating the bench thread between cores
// mid-sample (a migration evicts L1/L2 caches and corrupts a measurement); High priority
// reduces preemption by background tasks (Defender scans, indexer, etc.) that otherwise
// randomly inflate samples by 5-15%.
// Try/finally guarantees the original state is restored even if a benchmark throws — leaving
// a developer machine pinned to one core after a crashed run is a real foot-gun.
// Skipped on Debug single-sample mode (BenchmarkSamples <= 1) where stabilization is moot.
var process = Process.GetCurrentProcess();
var origAffinity = (IntPtr)0;
var origPriority = ProcessPriorityClass.Normal;
var stabilizationApplied = false;
System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Iterations: {TestIterations} | Warmup: {WarmupIterations} | Samples: {BenchmarkSamples} (median)");
System.Console.WriteLine($"Build: {BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
System.Console.WriteLine();
// Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens.
// Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup
// alone doesn't ensure that every Serialize<T>/IBufferWriter overload is fully Tier 1 by the time we
// start measuring. Symptom: first cell's BufferWriter variants run ~2x slower than the SAME variants
// on later cells (e.g. Small BufWr reuse 9ms vs Medium BufWr reuse 4ms — even though Medium is bigger).
// Pre-warmup runs every overload at least once with each data shape so .NET 9's tiered JIT promotes
// them all in the background; the per-cell warmup that follows then locks in cache + branch state.
if (BenchmarkSamples > 1) // skip in DEBUG (single-sample fast iteration)
// ProcessorAffinity is only supported on Windows + Linux (CA1416). macOS would throw at
// runtime; skip the affinity step there but still raise priority class (which IS supported
// on macOS, just less effective for stabilization than affinity pinning).
if (BenchmarkSamples > 1 && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
{
System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");
foreach (var testData in testDataSets)
try
{
var preSerializers = CreateSerializers(testData, serializerMode);
try
origAffinity = process.ProcessorAffinity;
origPriority = process.PriorityClass;
// Pin to CPU 0 (mask = 1). Choosing CPU 0 is arbitrary; what matters is "exactly one
// core, consistently" — not which one. If CPU 0 is heavily contended on the host
// (e.g. dedicated to system-wide IRQs on some Windows configs), the user can tweak
// the mask here. The benchmark is single-threaded for the in-memory rows so single
// core is sufficient; round-trip-only NamedPipe rows have a server-drain thread
// that will share the core (acceptable — the bench measures end-to-end RT anyway).
process.ProcessorAffinity = (IntPtr)1;
process.PriorityClass = ProcessPriorityClass.High;
stabilizationApplied = true;
System.Console.WriteLine($"Stabilization: pinned to CPU 0 (affinity=0x1), priority=High.");
}
catch (Exception ex)
{
// Affinity/priority changes may fail on locked-down hosts (group policies, containers
// without CAP_SYS_NICE on Linux, etc.). Surface and continue — the benchmark still
// works, just with the platform default scheduling.
System.Console.WriteLine($"Stabilization SKIPPED: {ex.GetType().Name}: {ex.Message}");
}
}
try
{
var allResults = new List<BenchmarkResult>();
var allTestDataSets = BenchmarkTestDataProvider.CreateTestDataSets();
var testDataSets = FilterByLayer(allTestDataSets, layer);
System.Console.WriteLine($"Layer: {layer} | OpMode: {opMode} | SerializerMode: {serializerMode} | Charset: {GetCurrentCharsetName()} | Iterations: per-cell adaptive (~{TargetSampleMs} ms target) | Warmup: {WarmupIterations} | Samples: {BenchmarkSamples} (median) + pilot discard");
System.Console.WriteLine($"Build: {BuildConfiguration} | .NET: {Environment.Version} | Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"} | Test Cells: {testDataSets.Count}/{allTestDataSets.Count}");
System.Console.WriteLine();
// Global JIT pre-warmup — touches every (testdata × serializer) code path BEFORE any timing happens.
// Without this, the FIRST test data measured carries JIT-tier-promotion latency: the per-cell warmup
// alone doesn't ensure that every Serialize<T>/IBufferWriter overload is fully Tier 1 by the time we
// start measuring. Symptom: first cell's BufferWriter variants run ~2x slower than the SAME variants
// on later cells (e.g. Small BufWr reuse 9ms vs Medium BufWr reuse 4ms — even though Medium is bigger).
// Pre-warmup runs every overload at least once with each data shape so .NET 9's tiered JIT promotes
// them all in the background; the per-cell warmup that follows then locks in cache + branch state.
if (BenchmarkSamples > 1) // skip in DEBUG (single-sample fast iteration)
{
System.Console.WriteLine($"Global JIT pre-warmup ({testDataSets.Count} cells × all serializers, light pass)...");
foreach (var testData in testDataSets)
{
foreach (var s in preSerializers)
var preSerializers = CreateSerializers(testData, serializerMode);
try
{
// Light warmup just to trigger Tier 0 → Tier 1 promotion. The per-cell 5000-iter warmup
// inside RunBenchmarksForTestData still runs afterwards for cache/BTB warming.
s.Warmup(2000);
foreach (var s in preSerializers)
{
// Light warmup just to trigger Tier 0 → Tier 1 promotion. The per-cell 5000-iter warmup
// inside RunBenchmarksForTestData still runs afterwards for cache/BTB warming.
s.Warmup(2000);
}
}
finally
{
// Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
foreach (var s in preSerializers) (s as IDisposable)?.Dispose();
}
}
finally
{
// Dispose any IDisposable serializers (NamedPipe / FileStream variants own OS resources).
foreach (var s in preSerializers) (s as IDisposable)?.Dispose();
}
// Let background tiered-JIT compilation drain before we begin measuring.
if (JitSleep > 0) Thread.Sleep(JitSleep);
System.Console.WriteLine("✓ Global pre-warmup complete.\n");
}
// Let background tiered-JIT compilation drain before we begin measuring.
Thread.Sleep(JitSleep);
System.Console.WriteLine("✓ Global pre-warmup complete.\n");
}
foreach (var testData in testDataSets)
foreach (var testData in testDataSets)
{
System.Console.WriteLine($"\n{'═'.ToString().PadRight(70, '═')}");
System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
System.Console.WriteLine($"{'═'.ToString().PadRight(70, '═')}");
var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
allResults.AddRange(results);
}
// Print grouped results
PrintGroupedResults(allResults, testDataSets);
// Save results to file
SaveResults(allResults, testDataSets);
System.Console.WriteLine("\n✓ Benchmark complete!");
}
finally
{
System.Console.WriteLine($"\n{'═'.ToString().PadRight(70, '═')}");
System.Console.WriteLine($"TEST DATA: {testData.DisplayName}");
System.Console.WriteLine($"{'═'.ToString().PadRight(70, '═')}");
var results = RunBenchmarksForTestData(testData, opMode, serializerMode);
allResults.AddRange(results);
// Restore process state — affinity/priority changes are process-wide and persist across
// interactive-mode iterations of the menu. Without restore, the second menu run would
// already be on CPU-0 + High priority before its own try-block applied them, masking
// any stabilization-disabled comparison.
if (stabilizationApplied && (OperatingSystem.IsWindows() || OperatingSystem.IsLinux()))
{
try { process.ProcessorAffinity = origAffinity; } catch { /* best-effort */ }
try { process.PriorityClass = origPriority; } catch { /* best-effort */ }
}
}
// Print grouped results
PrintGroupedResults(allResults, testDataSets);
// Save results to file
SaveResults(allResults, testDataSets);
System.Console.WriteLine("\n✓ Benchmark complete!");
}
/// <summary>
@ -404,21 +526,39 @@ public static class Program
System.Console.WriteLine("✓ All serializers passed round-trip verification.");
// Warmup all serializers
System.Console.WriteLine($"Warming up ({WarmupIterations} iterations)...");
// Per-serializer (warmup → calibrate → measurement) cycle: each serializer warms up IMMEDIATELY
// before its own bench, then calibrates iter per-function (Ser and Des independently) so each
// sample lands at ~TargetSampleMs wall-clock. This avoids cache pollution AND equalizes sample
// window length across cells of vastly different per-op cost.
System.Console.WriteLine($"Running benchmarks (target ~{TargetSampleMs} ms/sample × {BenchmarkSamples} samples median, per-serializer warmup + adaptive iter)...\n");
foreach (var serializer in serializers)
{
// Warmup THIS serializer right before benching it — keeps its hot code/data in cache.
serializer.Warmup(WarmupIterations);
}
// Wait for tiered JIT background compilation to complete
Thread.Sleep(JitSleep);
// Wait for tiered JIT background compilation to drain (mode-aware: 0ms in AOT).
// Per-serializer instead of once globally — guarantees this serializer's freshly-promoted
// methods are settled before timing, regardless of when it appears in the iteration order.
if (JitSleep > 0) Thread.Sleep(JitSleep);
// Run benchmarks
System.Console.WriteLine($"Running benchmarks ({TestIterations} iterations × {BenchmarkSamples} samples median)...\n");
// Adaptive iter calibration — per Ser/Des/RT function, post-warmup. Each function gets its
// own iter count tuned to TargetSampleMs (typically 250 ms). The 100-iter calibration cost
// is amortized over the BenchmarkSamples + 1 (pilot) recorded measurements that follow.
int serIter = TestIterations, desIter = TestIterations, rtIter = TestIterations;
if (serializer.IsRoundTripOnly)
{
if (mode is "all" or "serialize" or "ser")
rtIter = CalibrateIterations(() => serializer.Serialize(), TargetSampleMs);
}
else
{
if (mode is "all" or "serialize" or "ser")
serIter = CalibrateIterations(() => serializer.Serialize(), TargetSampleMs);
if (mode is "all" or "deserialize" or "des")
desIter = CalibrateIterations(() => serializer.Deserialize(), TargetSampleMs);
}
foreach (var serializer in serializers)
{
var result = new BenchmarkResult
{
TestDataName = testData.DisplayName, // Use DisplayName for IId% info
@ -445,8 +585,13 @@ public static class Program
// also show up — otherwise current-thread alloc would only count the client side and look ~halved.
if (mode is "all" or "serialize" or "ser")
{
result.RoundTripTimeMs = RunTimed(() => serializer.Serialize(), TestIterations, $"{groupLabel} [RT timing]");
result.RoundTripAllocBytesPerOp = MeasureAllocationTotal(() => serializer.Serialize(), TestIterations, $"{groupLabel} [RT alloc]");
var (rtMed, rtMin, rtMax, rtStd) = RunTimed(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT timing]");
result.RoundTripTimeMs = rtMed;
result.RoundTripTimeMinMs = rtMin;
result.RoundTripTimeMaxMs = rtMax;
result.RoundTripTimeStdDevMs = rtStd;
result.RoundTripIterations = rtIter;
result.RoundTripAllocBytesPerOp = MeasureAllocationTotal(() => serializer.Serialize(), rtIter, $"{groupLabel} [RT alloc]");
}
// mode == "deserialize" alone is meaningless for a round-trip-only benchmark; skip silently.
}
@ -454,19 +599,36 @@ public static class Program
{
if (mode is "all" or "serialize" or "ser")
{
result.SerializeTimeMs = RunTimed(() => serializer.Serialize(), TestIterations, $"{groupLabel} [Ser timing]");
var (serMed, serMin, serMax, serStd) = RunTimed(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser timing]");
result.SerializeTimeMs = serMed;
result.SerializeTimeMinMs = serMin;
result.SerializeTimeMaxMs = serMax;
result.SerializeTimeStdDevMs = serStd;
result.SerializeIterations = serIter;
// Dedicated alloc-only sample (separate from timing samples; keeps timing pure)
result.SerializeAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), TestIterations, $"{groupLabel} [Ser alloc]");
result.SerializeAllocBytesPerOp = MeasureAllocation(() => serializer.Serialize(), serIter, $"{groupLabel} [Ser alloc]");
}
if (mode is "all" or "deserialize" or "des")
{
result.DeserializeTimeMs = RunTimed(() => serializer.Deserialize(), TestIterations, $"{groupLabel} [Des timing]");
result.DeserializeAllocBytesPerOp = MeasureAllocation(() => serializer.Deserialize(), TestIterations, $"{groupLabel} [Des alloc]");
var (desMed, desMin, desMax, desStd) = RunTimed(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des timing]");
result.DeserializeTimeMs = desMed;
result.DeserializeTimeMinMs = desMin;
result.DeserializeTimeMaxMs = desMax;
result.DeserializeTimeStdDevMs = desStd;
result.DeserializeIterations = desIter;
result.DeserializeAllocBytesPerOp = MeasureAllocation(() => serializer.Deserialize(), desIter, $"{groupLabel} [Des alloc]");
}
// Compose RT from Ser+Des (the previously computed property's behavior, now explicit since RT is settable).
result.RoundTripTimeMs = result.SerializeTimeMs + result.DeserializeTimeMs;
// Compose RT from Ser+Des. Because Ser and Des may have DIFFERENT iter counts post-calibration,
// batch-time addition would be misleading. Instead: compute per-op µs (iter-independent),
// then synthesize RoundTripTimeMs against RoundTripIterations = max(serIter, desIter) so that
// RoundTripTimeMs / RoundTripIterations * 1000 == SerPerOp + DesPerOp.
var serPerOp = ToPerOpMicros(result.SerializeTimeMs, serIter);
var desPerOp = ToPerOpMicros(result.DeserializeTimeMs, desIter);
var rtPerOp = serPerOp + desPerOp;
result.RoundTripIterations = Math.Max(serIter, desIter);
result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations;
result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp;
}
@ -655,40 +817,125 @@ public static class Program
/// <summary>
/// Runs the action <paramref name="iterations"/> times for <see cref="BenchmarkSamples"/> independent samples,
/// returning the median elapsed time. Multi-sample design reduces single-run variance from ~±15% to ~±5%
/// by smoothing transient effects (background activity, thermal/turbo state, JIT tier-promotion timing).
/// returning the median, min, and max elapsed time. Multi-sample design reduces single-run variance
/// from ~±15% to ~±5% by smoothing transient effects (background activity, thermal/turbo state).
/// When <see cref="BenchmarkSamples"/> &lt;= 1, falls back to single-sample timing (Debug / quick mode).
/// When <paramref name="progressLabel"/> is non-null, emits in-place <c>\r</c> progress updates so a
/// stuck benchmark (e.g. deadlocked NamedPipe row) is visibly stuck at a specific %% rather than
/// silently hanging.
///
/// Stabilization (added 2026-05-07):
/// 1) Pilot sample is run BEFORE the recorded loop and discarded. The first measurement after
/// warmup tends to absorb residual JIT bookkeeping and GC bookkeeping; dropping it tightens
/// the min/max range without throwing away signal (the median is the SAME data as before).
/// 2) GC.Collect / WaitForPendingFinalizers / GC.Collect runs BEFORE every recorded sample.
/// Without this, GC pressure from sample N occasionally triggered a Gen-2 pause inside
/// sample N+1, painting it as an outlier; collecting up-front gives every sample the
/// same starting heap shape.
/// 3) Returns (median, min, max) so the caller can surface the inter-sample range — visible
/// noise floor for the row, replacing the previous "median only" view.
/// </summary>
private static double RunTimed(Action action, int iterations, string? progressLabel = null)
private static (double medianMs, double minMs, double maxMs, double stdDevMs) RunTimed(Action action, int iterations, string? progressLabel = null)
{
var samples = BenchmarkSamples;
if (samples <= 1)
{
// Single-sample fast path (Debug or trivial run) — no allocation, no sort.
// Single-sample fast path (Debug or trivial run) — no allocation, no sort, no stddev.
var sw = Stopwatch.StartNew();
RunWithProgress(action, iterations, progressLabel, samples: 1, sampleIndex: 0);
sw.Stop();
EndProgress(progressLabel, sw.Elapsed.TotalMilliseconds);
return sw.Elapsed.TotalMilliseconds;
var ms = sw.Elapsed.TotalMilliseconds;
EndProgress(progressLabel, ms);
return (ms, ms, ms, 0);
}
// Pilot sample (discarded). Counts as sample index 0 of (samples + 1) for progress display
// so the user sees an extra "warmup-ish" tick before the recorded samples start.
GC.Collect();
GC.WaitForPendingFinalizers();
GC.Collect();
var pilotSw = Stopwatch.StartNew();
RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: 0);
pilotSw.Stop();
// intentionally not stored
var times = new double[samples];
for (var s = 0; s < samples; s++)
{
// Per-sample GC settle. Forces every sample to start from the same heap state, so
// a Gen-2 pause caused by the previous sample doesn't bleed into the next sample's
// timing. Cost is paid OUTSIDE the Stopwatch window — no impact on the measurement.
GC.Collect();
GC.WaitForPendingFinalizers();
GC.Collect();
var sw = Stopwatch.StartNew();
RunWithProgress(action, iterations, progressLabel, samples, s);
RunWithProgress(action, iterations, progressLabel, samples + 1, sampleIndex: s + 1);
sw.Stop();
times[s] = sw.Elapsed.TotalMilliseconds;
}
Array.Sort(times);
// Capture min/max/sum/sumSq BEFORE sort to avoid order ambiguity (Array.Sort is in-place).
var minMs = double.MaxValue;
var maxMs = double.MinValue;
var sum = 0.0;
var sumSq = 0.0;
for (var i = 0; i < times.Length; i++)
{
var t = times[i];
sum += t;
sumSq += t * t;
if (t < minMs) minMs = t;
if (t > maxMs) maxMs = t;
}
// Population stddev (not sample-stddev — we treat the captured samples as the population for
// CV computation). variance = E[X²] - E[X]² with Math.Max(0, ...) guard against tiny negative
// values from FP rounding when samples are nearly identical.
var mean = sum / times.Length;
var variance = (sumSq / times.Length) - (mean * mean);
var stdDevMs = Math.Sqrt(Math.Max(0.0, variance));
Array.Sort(times);
// Median: middle value for odd sample counts, average of two middles for even counts.
var medianMs = samples % 2 == 1 ? times[samples / 2] : (times[samples / 2 - 1] + times[samples / 2]) / 2.0;
EndProgress(progressLabel, medianMs);
return medianMs;
return (medianMs, minMs, maxMs, stdDevMs);
}
/// <summary>
/// Per-cell adaptive iteration calibration. Times a fixed 100-call probe of <paramref name="action"/>
/// and extrapolates how many iterations one sample needs to last about <paramref name="targetMs"/>
/// milliseconds of wall-clock time.
/// </summary>
/// <remarks>
/// The estimate is rounded UP to the next multiple of 1000, never goes below 1000 and never above
/// 200_000. When <c>BenchmarkSamples &lt;= 1</c> the method returns <see cref="TestIterations"/>
/// unchanged and does not invoke <paramref name="action"/> at all. A probe that finishes in
/// 0.0001 ms or less is treated as unmeasurable and yields the 200_000 cap.
/// The bounds are applied BEFORE the estimate is narrowed to <c>int</c>, so a very large
/// <paramref name="targetMs"/> combined with a very fast operation saturates at the cap instead of
/// overflowing the conversion.
/// </remarks>
/// <param name="action">The operation to probe; invoked 100 times when calibration runs.</param>
/// <param name="targetMs">Desired wall-clock duration of one sample, in milliseconds.</param>
/// <returns>The iteration count to use for each sample of this operation.</returns>
private static int CalibrateIterations(Action action, int targetMs)
{
    const int calibIter = 100;       // size of the timing probe
    const int minIter = 1000;        // lower bound of the result
    const int maxIter = 200_000;     // upper bound of the result
    const int roundTo = 1000;        // results are multiples of this, for readable output

    if (BenchmarkSamples <= 1) return TestIterations; // Debug fast path

    // Settle the heap before the probe so the timing starts from a freshly collected state.
    GC.Collect();
    GC.WaitForPendingFinalizers();
    GC.Collect();

    var sw = Stopwatch.StartNew();
    for (var i = 0; i < calibIter; i++) action();
    sw.Stop();
    var ms = sw.Elapsed.TotalMilliseconds;

    // Probe too fast to measure — no usable per-op estimate, so use the cap.
    if (ms <= 0.0001) return maxIter;

    var iterPerMs = calibIter / ms;
    var estimate = Math.Ceiling(targetMs * iterPerMs);

    // Clamp while still a double: casting an out-of-range double to int is not a safe operation,
    // and the round-up arithmetic below could wrap as well.
    if (estimate <= minIter) return minIter;
    if (estimate >= maxIter) return maxIter;

    // minIter < estimate < maxIter here, so the cast and the round-up both stay in range.
    var raw = (int)estimate;
    return ((raw + roundTo - 1) / roundTo) * roundTo;
}
/// <summary>
@ -913,6 +1160,7 @@ public static class Program
System.Console.WriteLine("─────────────────────────────────────────────");
System.Console.WriteLine(" [1] Iteration — Warmup / Iterations / Samples");
System.Console.WriteLine($" [2] WireMode — current: {SelectedWireMode}");
System.Console.WriteLine($" [3] Charset — current: {GetCurrentCharsetName()}");
System.Console.WriteLine(" [B] Back");
System.Console.Write("\nSelection: ");
@ -927,6 +1175,83 @@ public static class Program
case '2':
ShowWireModeSettingsMenu();
break;
case '3':
ShowCharsetSettingsMenu();
break;
case 'b':
return;
default:
continue;
}
}
}
/// <summary>
/// Maps the active <c>BenchmarkTestDataProvider.LongStringSuffix</c> to a display name by comparing it
/// against the predefined <see cref="CharsetSuffixes"/> values in a fixed order; the first match wins.
/// </summary>
/// <returns>
/// The name of the matching preset (<c>"Latin1FixAscii"</c>, <c>"Latin1Short"</c>, <c>"Latin1Long"</c>,
/// <c>"CjkBmp"</c>, <c>"Cyrillic"</c> or <c>"Mixed"</c>), or <c>"Custom"</c> when the suffix equals none of them.
/// </returns>
private static string GetCurrentCharsetName()
{
    var activeSuffix = BenchmarkTestDataProvider.LongStringSuffix;

    // Checked top to bottom; evaluation stops at the first preset equal to the active suffix.
    return activeSuffix == CharsetSuffixes.Latin1FixAscii ? "Latin1FixAscii"
        : activeSuffix == CharsetSuffixes.Latin1Short ? "Latin1Short"
        : activeSuffix == CharsetSuffixes.Latin1Long ? "Latin1Long"
        : activeSuffix == CharsetSuffixes.CjkBmp ? "CjkBmp"
        : activeSuffix == CharsetSuffixes.Cyrillic ? "Cyrillic"
        : activeSuffix == CharsetSuffixes.Mixed ? "Mixed"
        : "Custom";
}
private static void ShowCharsetSettingsMenu()
{
while (true)
{
System.Console.WriteLine();
System.Console.WriteLine("─────────────────────────────────────────────");
System.Console.WriteLine("Charset settings — long-string suffix profile");
System.Console.WriteLine("─────────────────────────────────────────────");
System.Console.WriteLine($"Current: {GetCurrentCharsetName()}");
System.Console.WriteLine();
System.Console.WriteLine(" [1] Latin1FixAscii — empty suffix; short FixStr-fast-path stress (Latin1 baseline values stay short)");
System.Console.WriteLine(" [2] Latin1Short — \" árvíztűrő tükörfúrógép\" (~24 char Hungarian mixed)");
System.Console.WriteLine(" [3] Latin1Long — ~47-char Latin1 mixed (default; exceeds FixStr boundary)");
System.Console.WriteLine(" [4] CjkBmp — CJK BMP (long 3-byte runs)");
System.Console.WriteLine(" [5] Cyrillic — Russian Cyrillic (long 2-byte runs)");
System.Console.WriteLine(" [6] Mixed — Hungarian + CJK + Cyrillic + emoji (full-spectrum + surrogate pairs)");
System.Console.WriteLine(" [B] Back");
System.Console.Write("\nSelection: ");
var key = System.Console.ReadKey(intercept: false).KeyChar;
System.Console.WriteLine();
switch (char.ToLower(key))
{
case '1':
BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.Latin1FixAscii;
System.Console.WriteLine("✓ Charset set to Latin1FixAscii");
return;
case '2':
BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.Latin1Short;
System.Console.WriteLine("✓ Charset set to Latin1Short");
return;
case '3':
BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.Latin1Long;
System.Console.WriteLine("✓ Charset set to Latin1Long");
return;
case '4':
BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.CjkBmp;
System.Console.WriteLine("✓ Charset set to CjkBmp");
return;
case '5':
BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.Cyrillic;
System.Console.WriteLine("✓ Charset set to Cyrillic");
return;
case '6':
BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.Mixed;
System.Console.WriteLine("✓ Charset set to Mixed");
return;
case 'b':
return;
default:
@ -1019,6 +1344,14 @@ public static class Program
"core" => all.Where(t => StartsWithAny(t.Name, coreNames)).ToList(),
"comprehensive" => all.Where(t => StartsWithAny(t.Name, coreNames) || StartsWithAny(t.Name, comprehensiveExtras)).ToList(),
"edge" => all.Where(t => StartsWithAny(t.Name, coreNames) || StartsWithAny(t.Name, comprehensiveExtras) || StartsWithAny(t.Name, edgeExtras)).ToList(),
// Single-cell A/B mini-suite filters — match by case-insensitive prefix on Name.
// Use case: tight optimization-iteration loop on one specific cell (e.g. `dotnet run -- repeated`
// or interactive menu shortcut), avoiding the full ~110 sec suite when only one cell is in scope.
"small" => all.Where(t => t.Name.StartsWith("Small", StringComparison.OrdinalIgnoreCase)).ToList(),
"medium" => all.Where(t => t.Name.StartsWith("Medium", StringComparison.OrdinalIgnoreCase)).ToList(),
"large" => all.Where(t => t.Name.StartsWith("Large", StringComparison.OrdinalIgnoreCase)).ToList(),
"repeated" => all.Where(t => t.Name.StartsWith("Repeated", StringComparison.OrdinalIgnoreCase)).ToList(),
"deep" => all.Where(t => t.Name.StartsWith("Deep", StringComparison.OrdinalIgnoreCase)).ToList(),
_ => all.ToList()
};
@ -2329,14 +2662,40 @@ public static class Program
public int SerializedSize { get; set; }
public double SerializeTimeMs { get; set; }
public double DeserializeTimeMs { get; set; }
// Per-sample min/max alongside the median (median is the *Time*Ms field above). Surfaces
// inter-sample range — the visible noise floor for the row. 0 when the operation was skipped
// (mode != "all"/"ser"/"des") or when a single-sample fast path was used (min == max == median).
public double SerializeTimeMinMs { get; set; }
public double SerializeTimeMaxMs { get; set; }
public double DeserializeTimeMinMs { get; set; }
public double DeserializeTimeMaxMs { get; set; }
// Sample-population stddev (ms). Used by FormatMicrosWithRange to compute CV (stddev/mean)
// and emit the ⚠️ marker on rows above UnstableCVThreshold. 0 in single-sample mode.
public double SerializeTimeStdDevMs { get; set; }
public double DeserializeTimeStdDevMs { get; set; }
// Per-row adaptive iteration count (post-CalibrateIterations). Each Ser and Des function calibrates
// independently to land its sample window at ~TargetSampleMs; per-op µs is then iter-independent
// (`SerializeTimeMs / SerializeIterations * 1000`). For round-trip-only rows (NamedPipe etc.),
// RoundTripIterations carries the calibrated iter count; SerializeIterations and DeserializeIterations
// stay 0 (Ser and Des are not separately measurable on those rows).
public int SerializeIterations { get; set; }
public int DeserializeIterations { get; set; }
public int RoundTripIterations { get; set; }
public long SerializeAllocBytesPerOp { get; set; }
public long DeserializeAllocBytesPerOp { get; set; }
public long SetupSerializeAllocBytes { get; set; }
public long SetupDeserializeAllocBytes { get; set; }
/// <summary>Total round-trip time. For in-memory benchmarks: <c>Serialize + Deserialize</c> (set explicitly in
/// <c>RunBenchmarksForTestData</c>). For round-trip-only benchmarks (NamedPipe etc.): the directly-measured
/// pipe round-trip time, since Ser and Des are not separately measurable there.</summary>
/// <summary>Total round-trip time. For in-memory benchmarks: synthesized so that
/// <c>RoundTripTimeMs / RoundTripIterations</c> yields the correct <c>SerPerOp + DesPerOp</c> µs/op
/// (necessary because Ser and Des may have different iter counts post-calibration).
/// For round-trip-only benchmarks (NamedPipe etc.): the directly-measured pipe round-trip time.</summary>
public double RoundTripTimeMs { get; set; }
// Round-trip min/max + stddev — only populated for round-trip-only benchmarks (NamedPipe etc.) where
// RT is directly measured. For in-memory rows RT = Ser + Des, which has no single-sample
// distribution; surface Ser/Des range separately instead.
public double RoundTripTimeMinMs { get; set; }
public double RoundTripTimeMaxMs { get; set; }
public double RoundTripTimeStdDevMs { get; set; }
/// <summary>Total round-trip allocation per op. For in-memory benchmarks: <c>SerializeAlloc + DeserializeAlloc</c>.
/// For round-trip-only benchmarks: process-wide allocation measured via <see cref="GC.GetTotalAllocatedBytes"/>
/// (covers ALL threads — client, server-drain, channel internals — not just the caller).</summary>
@ -2346,8 +2705,8 @@ public static class Program
private static void PrintResult(BenchmarkResult result)
{
// Numbers-only per-row entries; the column-headers carry units (µs/op, KB/op).
var ser = result.SerializeTimeMs > 0 ? $"{ToPerOpMicros(result.SerializeTimeMs),7:F2}" : " N/A";
var des = result.DeserializeTimeMs > 0 ? $"{ToPerOpMicros(result.DeserializeTimeMs),7:F2}" : " N/A";
var ser = result.SerializeTimeMs > 0 ? $"{SerPerOp(result),7:F2}" : " N/A";
var des = result.DeserializeTimeMs > 0 ? $"{DesPerOp(result),7:F2}" : " N/A";
var serAlloc = result.SerializeTimeMs > 0 ? $"{ToKilobytes(result.SerializeAllocBytesPerOp),7:F2}" : " N/A";
var desAlloc = result.DeserializeTimeMs > 0 ? $"{ToKilobytes(result.DeserializeAllocBytesPerOp),7:F2}" : " N/A";
System.Console.WriteLine($" {result.SerializerName,-40} | Size: {result.SerializedSize,8:N0} B | Ser: {ser} µs/op ({serAlloc} KB/op) | Des: {des} µs/op ({desAlloc} KB/op)");
@ -2376,7 +2735,8 @@ public static class Program
foreach (var testData in testDataSets)
{
var testResults = results.Where(r => r.TestDataName == testData.DisplayName).OrderBy(r => r.RoundTripTimeMs).ToList();
// Order by per-op µs (iter-independent) — rows may have different iter counts post-calibration.
var testResults = results.Where(r => r.TestDataName == testData.DisplayName).OrderBy(r => RtPerOp(r)).ToList();
// Baseline switched MessagePack → MemoryPack: MemoryPack is the SOTA performance leader.
var memPackResult = testResults.FirstOrDefault(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray));
// Pin the comparison to AcBinary's SGen variant — apples-to-apples vs MemoryPack (also source-generated).
@ -2393,9 +2753,9 @@ public static class Program
{
var size = $"{result.SerializedSize:N0}";
var setup = $"{ToKilobytes(result.SetupSerializeAllocBytes):F2} / {ToKilobytes(result.SetupDeserializeAllocBytes):F2}";
var ser = result.SerializeTimeMs > 0 ? $"{ToPerOpMicros(result.SerializeTimeMs):F2}" : "N/A";
var des = result.DeserializeTimeMs > 0 ? $"{ToPerOpMicros(result.DeserializeTimeMs):F2}" : "N/A";
var rt = result.RoundTripTimeMs > 0 ? $"{ToPerOpMicros(result.RoundTripTimeMs):F2}" : "N/A";
var ser = result.SerializeTimeMs > 0 ? $"{SerPerOp(result):F2}" : "N/A";
var des = result.DeserializeTimeMs > 0 ? $"{DesPerOp(result):F2}" : "N/A";
var rt = result.RoundTripTimeMs > 0 ? $"{RtPerOp(result):F2}" : "N/A";
var serAlloc = result.SerializeTimeMs > 0 ? $"{ToKilobytes(result.SerializeAllocBytesPerOp):F2}" : "N/A";
var desAlloc = result.DeserializeTimeMs > 0 ? $"{ToKilobytes(result.DeserializeAllocBytesPerOp):F2}" : "N/A";
var rtAlloc = result.RoundTripAllocBytesPerOp > 0 ? $"{ToKilobytes(result.RoundTripAllocBytesPerOp):F2}" : "N/A";
@ -2411,7 +2771,7 @@ public static class Program
if (isHighlighted && memPackResult != null && acBinaryResult != null)
{
var isMemPack = (result.Engine == EngineMemoryPack && result.IoMode == IoByteArray);
var memPackFaster = memPackResult.RoundTripTimeMs < acBinaryResult.RoundTripTimeMs;
var memPackFaster = RtPerOp(memPackResult) < RtPerOp(acBinaryResult);
if (isMemPack)
{
@ -2435,9 +2795,10 @@ public static class Program
if (memPackResult != null && acBinaryResult != null)
{
var sizePct = (acBinaryResult.SerializedSize / (double)memPackResult.SerializedSize - 1) * 100;
var serPct = memPackResult.SerializeTimeMs > 0 ? (acBinaryResult.SerializeTimeMs / memPackResult.SerializeTimeMs - 1) * 100 : 0;
var desPct = memPackResult.DeserializeTimeMs > 0 ? (acBinaryResult.DeserializeTimeMs / memPackResult.DeserializeTimeMs - 1) * 100 : 0;
var rtPct = memPackResult.RoundTripTimeMs > 0 ? (acBinaryResult.RoundTripTimeMs / memPackResult.RoundTripTimeMs - 1) * 100 : 0;
// Per-op µs ratio (iter-independent) — Ser/Des may have different iter counts on the two rows.
var serPct = SerPerOp(memPackResult) > 0 ? (SerPerOp(acBinaryResult) / SerPerOp(memPackResult) - 1) * 100 : 0;
var desPct = DesPerOp(memPackResult) > 0 ? (DesPerOp(acBinaryResult) / DesPerOp(memPackResult) - 1) * 100 : 0;
var rtPct = RtPerOp(memPackResult) > 0 ? (RtPerOp(acBinaryResult) / RtPerOp(memPackResult) - 1) * 100 : 0;
var serAllocPct = memPackResult.SerializeAllocBytesPerOp > 0 ? (acBinaryResult.SerializeAllocBytesPerOp / (double)memPackResult.SerializeAllocBytesPerOp - 1) * 100 : 0;
var desAllocPct = memPackResult.DeserializeAllocBytesPerOp > 0 ? (acBinaryResult.DeserializeAllocBytesPerOp / (double)memPackResult.DeserializeAllocBytesPerOp - 1) * 100 : 0;
var rtAllocPct = memPackResult.RoundTripAllocBytesPerOp > 0 ? (acBinaryResult.RoundTripAllocBytesPerOp / (double)memPackResult.RoundTripAllocBytesPerOp - 1) * 100 : 0;
@ -2512,22 +2873,24 @@ public static class Program
// Fastest Serialize — round-trip-only serializers (NamedPipe etc.) excluded:
// their Serialize() captures the full round-trip and isn't comparable to a pure Ser metric.
// Average is over per-op µs (iter-independent) instead of batch-time, since rows may now
// have different iter counts post-calibration.
var fastestSer = results.Where(r => r.SerializeTimeMs > 0 && !r.IsRoundTripOnly)
.GroupBy(r => r.SerializerName)
.Select(g => new { Name = g.Key, AvgTime = g.Average(r => r.SerializeTimeMs) })
.OrderBy(x => x.AvgTime)
.Select(g => new { Name = g.Key, AvgPerOp = g.Average(r => SerPerOp(r)) })
.OrderBy(x => x.AvgPerOp)
.FirstOrDefault();
if (fastestSer != null)
System.Console.WriteLine($"{"Fastest Serialize",-20} │ {fastestSer.Name,-40} │ {ToPerOpMicros(fastestSer.AvgTime),12:F2} µs/op");
System.Console.WriteLine($"{"Fastest Serialize",-20} │ {fastestSer.Name,-40} │ {fastestSer.AvgPerOp,12:F2} µs/op");
// Fastest Deserialize — round-trip-only serializers excluded (their Deserialize() is a no-op).
var fastestDes = results.Where(r => r.DeserializeTimeMs > 0 && !r.IsRoundTripOnly)
.GroupBy(r => r.SerializerName)
.Select(g => new { Name = g.Key, AvgTime = g.Average(r => r.DeserializeTimeMs) })
.OrderBy(x => x.AvgTime)
.Select(g => new { Name = g.Key, AvgPerOp = g.Average(r => DesPerOp(r)) })
.OrderBy(x => x.AvgPerOp)
.FirstOrDefault();
if (fastestDes != null)
System.Console.WriteLine($"{"Fastest Deserialize",-20} │ {fastestDes.Name,-40} │ {ToPerOpMicros(fastestDes.AvgTime),12:F2} µs/op");
System.Console.WriteLine($"{"Fastest Deserialize",-20} │ {fastestDes.Name,-40} │ {fastestDes.AvgPerOp,12:F2} µs/op");
// Smallest Size
var smallestSize = results
@ -2538,14 +2901,14 @@ public static class Program
if (smallestSize != null)
System.Console.WriteLine($"{"Smallest Size",-20} │ {smallestSize.Name,-40} │ {smallestSize.AvgSize,15:F0} B");
// Fastest Round-trip
// Fastest Round-trip — iter-independent per-op average.
var fastestRt = results.Where(r => r.RoundTripTimeMs > 0)
.GroupBy(r => r.SerializerName)
.Select(g => new { Name = g.Key, AvgTime = g.Average(r => r.RoundTripTimeMs) })
.OrderBy(x => x.AvgTime)
.Select(g => new { Name = g.Key, AvgPerOp = g.Average(r => RtPerOp(r)) })
.OrderBy(x => x.AvgPerOp)
.FirstOrDefault();
if (fastestRt != null)
System.Console.WriteLine($"{"Fastest Round-trip",-20} │ {fastestRt.Name,-40} │ {ToPerOpMicros(fastestRt.AvgTime),12:F2} µs/op");
System.Console.WriteLine($"{"Fastest Round-trip",-20} │ {fastestRt.Name,-40} │ {fastestRt.AvgPerOp,12:F2} µs/op");
// Overall AcBinary (SGen) vs MemoryPack comparison (baseline switched MessagePack → MemoryPack as SOTA reference).
// AcBinary side is restricted to DispatchMode == SGen — apples-to-apples vs MemoryPack which is also source-generated.
@ -2567,16 +2930,18 @@ public static class Program
return;
}
var memPackAvgSer = memPackSerResults.Count > 0 ? memPackSerResults.Average(r => r.SerializeTimeMs) : 0;
var memPackAvgDes = memPackDesResults.Average(r => r.DeserializeTimeMs);
var memPackAvgRt = memPackRtResults.Average(r => r.RoundTripTimeMs);
// All averages are over per-op µs (iter-independent). Batch-time averaging would mix rows
// measured with different iter counts (post-calibration), producing meaningless numbers.
var memPackAvgSer = memPackSerResults.Count > 0 ? memPackSerResults.Average(r => SerPerOp(r)) : 0;
var memPackAvgDes = memPackDesResults.Average(r => DesPerOp(r));
var memPackAvgRt = memPackRtResults.Average(r => RtPerOp(r));
var memPackAvgSize = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray)).Average(r => r.SerializedSize);
var memPackAvgSerAlloc = memPackSerResults.Count > 0 ? memPackSerResults.Average(r => r.SerializeAllocBytesPerOp) : 0;
var memPackAvgDesAlloc = memPackDesResults.Count > 0 ? memPackDesResults.Average(r => r.DeserializeAllocBytesPerOp) : 0;
var acBinaryAvgSer = acBinarySerResults.Count > 0 ? acBinarySerResults.Average(r => r.SerializeTimeMs) : 0;
var acBinaryAvgDes = acBinaryDesResults.Average(r => r.DeserializeTimeMs);
var acBinaryAvgRt = acBinaryRtResults.Average(r => r.RoundTripTimeMs);
var acBinaryAvgSer = acBinarySerResults.Count > 0 ? acBinarySerResults.Average(r => SerPerOp(r)) : 0;
var acBinaryAvgDes = acBinaryDesResults.Average(r => DesPerOp(r));
var acBinaryAvgRt = acBinaryRtResults.Average(r => RtPerOp(r));
var acBinaryAvgSize = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen)).Average(r => r.SerializedSize);
var acBinaryAvgSerAlloc = acBinarySerResults.Count > 0 ? acBinarySerResults.Average(r => r.SerializeAllocBytesPerOp) : 0;
var acBinaryAvgDesAlloc = acBinaryDesResults.Count > 0 ? acBinaryDesResults.Average(r => r.DeserializeAllocBytesPerOp) : 0;
@ -2589,7 +2954,7 @@ public static class Program
{
var serPctAll = (acBinaryAvgSer / memPackAvgSer - 1) * 100;
System.Console.ForegroundColor = serPctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
System.Console.WriteLine($" Serialize: {serPctAll:+0;-0}% ({ToPerOpMicros(acBinaryAvgSer):F2} µs/op vs {ToPerOpMicros(memPackAvgSer):F2} µs/op)");
System.Console.WriteLine($" Serialize: {serPctAll:+0;-0}% ({acBinaryAvgSer:F2} µs/op vs {memPackAvgSer:F2} µs/op)");
System.Console.ResetColor();
}
@ -2598,11 +2963,11 @@ public static class Program
var sizePctAll = (acBinaryAvgSize / memPackAvgSize - 1) * 100;
System.Console.ForegroundColor = desPctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
System.Console.WriteLine($" Deserialize: {desPctAll:+0;-0}% ({ToPerOpMicros(acBinaryAvgDes):F2} µs/op vs {ToPerOpMicros(memPackAvgDes):F2} µs/op)");
System.Console.WriteLine($" Deserialize: {desPctAll:+0;-0}% ({acBinaryAvgDes:F2} µs/op vs {memPackAvgDes:F2} µs/op)");
System.Console.ResetColor();
System.Console.ForegroundColor = rtPctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
System.Console.WriteLine($" Round-trip: {rtPctAll:+0;-0}% ({ToPerOpMicros(acBinaryAvgRt):F2} µs/op vs {ToPerOpMicros(memPackAvgRt):F2} µs/op)");
System.Console.WriteLine($" Round-trip: {rtPctAll:+0;-0}% ({acBinaryAvgRt:F2} µs/op vs {memPackAvgRt:F2} µs/op)");
System.Console.ResetColor();
System.Console.ForegroundColor = sizePctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
@ -2663,8 +3028,9 @@ public static class Program
sb.AppendLine("║ SERIALIZER BENCHMARK RESULTS ║");
sb.AppendLine($"║ Generated: {DateTime.Now:yyyy-MM-dd HH:mm:ss}".PadRight(100) + "║");
sb.AppendLine($"║ Build: {BuildConfiguration}".PadRight(100) + "║");
sb.AppendLine($"║ Iterations: {TestIterations}".PadRight(100) + "║");
sb.AppendLine($"║ Samples: {BenchmarkSamples} (median)".PadRight(100) + "║");
sb.AppendLine($"║ Charset: {GetCurrentCharsetName()}".PadRight(100) + "║");
sb.AppendLine($"║ Iterations: per-cell adaptive (~{TargetSampleMs} ms target)".PadRight(100) + "║");
sb.AppendLine($"║ Samples: {BenchmarkSamples} (median) + 1 pilot discarded".PadRight(100) + "║");
sb.AppendLine($"║ Test Type: {testDataSets.FirstOrDefault()?.TypeName ?? "unknown"}".PadRight(100) + "║");
sb.AppendLine("╚══════════════════════════════════════════════════════════════════════════════════════════════════════╝");
sb.AppendLine();
@ -2691,7 +3057,7 @@ public static class Program
var testResults = results.Where(r => r.TestDataName == testData.DisplayName).ToList();
foreach (var result in testResults)
{
sb.AppendLine($"{result.TestDataName},{result.Engine},{result.IoMode},{result.DispatchMode},{result.OptionsPreset},{result.SerializedSize},{ToPerOpMicros(result.SerializeTimeMs):F2},{ToPerOpMicros(result.DeserializeTimeMs):F2},{ToPerOpMicros(result.RoundTripTimeMs):F2},{result.SerializeAllocBytesPerOp},{result.DeserializeAllocBytesPerOp},{result.RoundTripAllocBytesPerOp},{result.SetupSerializeAllocBytes},{result.SetupDeserializeAllocBytes}");
sb.AppendLine($"{result.TestDataName},{result.Engine},{result.IoMode},{result.DispatchMode},{result.OptionsPreset},{result.SerializedSize},{SerPerOp(result):F2},{DesPerOp(result):F2},{RtPerOp(result):F2},{result.SerializeAllocBytesPerOp},{result.DeserializeAllocBytesPerOp},{result.RoundTripAllocBytesPerOp},{result.SetupSerializeAllocBytes},{result.SetupDeserializeAllocBytes}");
}
}
sb.AppendLine();
@ -2703,7 +3069,8 @@ public static class Program
foreach (var testData in testDataSets)
{
var testResults = results.Where(r => r.TestDataName == testData.DisplayName).OrderBy(r => r.RoundTripTimeMs).ToList();
// Order by per-op µs (iter-independent) — rows may have different iter counts post-calibration.
var testResults = results.Where(r => r.TestDataName == testData.DisplayName).OrderBy(r => RtPerOp(r)).ToList();
var memPackResult = testResults.FirstOrDefault(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray));
// Pin the comparison to AcBinary's SGen variant — apples-to-apples vs MemoryPack (also source-generated).
// The Runtime variant is shown alongside in the table for context, not used as the headline number.
@ -2722,9 +3089,9 @@ public static class Program
var size = $"{result.SerializedSize:N0}";
var setup = $"{ToKilobytes(result.SetupSerializeAllocBytes):F2} / {ToKilobytes(result.SetupDeserializeAllocBytes):F2}";
var ser = result.SerializeTimeMs > 0 ? $"{ToPerOpMicros(result.SerializeTimeMs):F2}" : "N/A";
var des = result.DeserializeTimeMs > 0 ? $"{ToPerOpMicros(result.DeserializeTimeMs):F2}" : "N/A";
var rt = result.RoundTripTimeMs > 0 ? $"{ToPerOpMicros(result.RoundTripTimeMs):F2}" : "N/A";
var ser = result.SerializeTimeMs > 0 ? $"{SerPerOp(result):F2}" : "N/A";
var des = result.DeserializeTimeMs > 0 ? $"{DesPerOp(result):F2}" : "N/A";
var rt = result.RoundTripTimeMs > 0 ? $"{RtPerOp(result):F2}" : "N/A";
var serAlloc = result.SerializeTimeMs > 0 ? $"{ToKilobytes(result.SerializeAllocBytesPerOp):F2}" : "N/A";
var desAlloc = result.DeserializeTimeMs > 0 ? $"{ToKilobytes(result.DeserializeAllocBytesPerOp):F2}" : "N/A";
@ -2735,9 +3102,10 @@ public static class Program
if (memPackResult != null && acBinaryResult != null)
{
var sizePct = (acBinaryResult.SerializedSize / (double)memPackResult.SerializedSize - 1) * 100;
var serPct = memPackResult.SerializeTimeMs > 0 ? (acBinaryResult.SerializeTimeMs / memPackResult.SerializeTimeMs - 1) * 100 : 0;
var desPct = memPackResult.DeserializeTimeMs > 0 ? (acBinaryResult.DeserializeTimeMs / memPackResult.DeserializeTimeMs - 1) * 100 : 0;
var rtPct = memPackResult.RoundTripTimeMs > 0 ? (acBinaryResult.RoundTripTimeMs / memPackResult.RoundTripTimeMs - 1) * 100 : 0;
// Per-op µs ratio (iter-independent) — Ser/Des may have different iter counts on the two rows.
var serPct = SerPerOp(memPackResult) > 0 ? (SerPerOp(acBinaryResult) / SerPerOp(memPackResult) - 1) * 100 : 0;
var desPct = DesPerOp(memPackResult) > 0 ? (DesPerOp(acBinaryResult) / DesPerOp(memPackResult) - 1) * 100 : 0;
var rtPct = RtPerOp(memPackResult) > 0 ? (RtPerOp(acBinaryResult) / RtPerOp(memPackResult) - 1) * 100 : 0;
sb.AppendLine($" {"AcBinary (Byte[])"} vs {"MemoryPack (Byte[])"}: Size {sizePct:+0;-0}% │ Ser {serPct:+0;-0}% │ Des {desPct:+0;-0}% │ RT {rtPct:+0;-0}%");
}
@ -2777,31 +3145,32 @@ public static class Program
if (memPackSerResults2.Count > 0 && acBinarySerResults2.Count > 0)
{
var memPackAvgSer2 = memPackSerResults2.Average(r => r.SerializeTimeMs);
var acBinaryAvgSer2 = acBinarySerResults2.Average(r => r.SerializeTimeMs);
// Per-op µs averages (iter-independent) — see comment above the parallel block in PrintSummary.
var memPackAvgSer2 = memPackSerResults2.Average(r => SerPerOp(r));
var acBinaryAvgSer2 = acBinarySerResults2.Average(r => SerPerOp(r));
var memPackAvgSerAlloc2 = memPackSerResults2.Average(r => r.SerializeAllocBytesPerOp);
var acBinaryAvgSerAlloc2 = acBinarySerResults2.Average(r => r.SerializeAllocBytesPerOp);
sb.AppendLine($" Serialize: {((acBinaryAvgSer2 / memPackAvgSer2 - 1) * 100):+0;-0}% ({ToPerOpMicros(acBinaryAvgSer2):F2} µs/op vs {ToPerOpMicros(memPackAvgSer2):F2} µs/op)");
sb.AppendLine($" Serialize: {((acBinaryAvgSer2 / memPackAvgSer2 - 1) * 100):+0;-0}% ({acBinaryAvgSer2:F2} µs/op vs {memPackAvgSer2:F2} µs/op)");
if (memPackAvgSerAlloc2 > 0)
sb.AppendLine($" Ser Alloc: {((acBinaryAvgSerAlloc2 / memPackAvgSerAlloc2 - 1) * 100):+0;-0}% ({acBinaryAvgSerAlloc2:F0} B/op vs {memPackAvgSerAlloc2:F0} B/op)");
}
if (memPackDesResults2.Count > 0 && acBinaryDesResults2.Count > 0)
{
var memPackAvgDes2 = memPackDesResults2.Average(r => r.DeserializeTimeMs);
var acBinaryAvgDes2 = acBinaryDesResults2.Average(r => r.DeserializeTimeMs);
var memPackAvgDes2 = memPackDesResults2.Average(r => DesPerOp(r));
var acBinaryAvgDes2 = acBinaryDesResults2.Average(r => DesPerOp(r));
var memPackAvgDesAlloc2 = memPackDesResults2.Average(r => r.DeserializeAllocBytesPerOp);
var acBinaryAvgDesAlloc2 = acBinaryDesResults2.Average(r => r.DeserializeAllocBytesPerOp);
sb.AppendLine($" Deserialize: {((acBinaryAvgDes2 / memPackAvgDes2 - 1) * 100):+0;-0}% ({ToPerOpMicros(acBinaryAvgDes2):F2} µs/op vs {ToPerOpMicros(memPackAvgDes2):F2} µs/op)");
sb.AppendLine($" Deserialize: {((acBinaryAvgDes2 / memPackAvgDes2 - 1) * 100):+0;-0}% ({acBinaryAvgDes2:F2} µs/op vs {memPackAvgDes2:F2} µs/op)");
if (memPackAvgDesAlloc2 > 0)
sb.AppendLine($" Des Alloc: {((acBinaryAvgDesAlloc2 / memPackAvgDesAlloc2 - 1) * 100):+0;-0}% ({acBinaryAvgDesAlloc2:F0} B/op vs {memPackAvgDesAlloc2:F0} B/op)");
}
if (memPackRtResults2.Count > 0 && acBinaryRtResults2.Count > 0)
{
var memPackAvgRt2 = memPackRtResults2.Average(r => r.RoundTripTimeMs);
var acBinaryAvgRt2 = acBinaryRtResults2.Average(r => r.RoundTripTimeMs);
sb.AppendLine($" Round-trip: {((acBinaryAvgRt2 / memPackAvgRt2 - 1) * 100):+0;-0}% ({ToPerOpMicros(acBinaryAvgRt2):F2} µs/op vs {ToPerOpMicros(memPackAvgRt2):F2} µs/op)");
var memPackAvgRt2 = memPackRtResults2.Average(r => RtPerOp(r));
var acBinaryAvgRt2 = acBinaryRtResults2.Average(r => RtPerOp(r));
sb.AppendLine($" Round-trip: {((acBinaryAvgRt2 / memPackAvgRt2 - 1) * 100):+0;-0}% ({acBinaryAvgRt2:F2} µs/op vs {memPackAvgRt2:F2} µs/op)");
}
var memPackAvgSize2 = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray)).Average(r => r.SerializedSize);
@ -2821,7 +3190,7 @@ public static class Program
var sb = new StringBuilder();
var testTypeName = testDataSets.FirstOrDefault()?.TypeName ?? "unknown";
sb.AppendLine($"# AcBinary Benchmark {BuildConfiguration} {DateTime.Now:yyyy-MM-dd HH:mm:ss}");
sb.AppendLine($"Iterations: {TestIterations} | Warmup: {WarmupIterations} | Samples: {BenchmarkSamples} (median) | .NET: {Environment.Version} | TestType: {testTypeName}");
sb.AppendLine($"Charset: {GetCurrentCharsetName()} | Iterations: per-cell adaptive (target ~{TargetSampleMs} ms/sample) | Warmup: {WarmupIterations} | Samples: {BenchmarkSamples} (median) + 1 pilot discarded | .NET: {Environment.Version} | TestType: {testTypeName} | UnstableCV threshold: {UnstableCVThreshold * 100:F0}%");
sb.AppendLine($"Baseline: {"MemoryPack (Byte[])"} (SOTA reference) | Verified: round-trip correctness checked once per cell before warmup");
// Options summary
@ -2839,31 +3208,48 @@ public static class Program
sb.AppendLine($"- **{name}**: {opts}");
}
// Flat results table sorted by test data then round-trip (now includes Alloc columns)
// Flat results table sorted by test data then round-trip (now includes Alloc + Iter columns).
// Iter column shows per-row Ser/Des iteration counts (post-adaptive-calibration), so the reader
// can verify that each cell's batch sample landed near the TargetSampleMs window.
sb.AppendLine();
sb.AppendLine("## Results");
sb.AppendLine();
sb.AppendLine("TestData | Engine | IO | Mode | Options | Size(B) | Ser(µs/op) | Deser(µs/op) | RT(µs/op) | SerAlloc(KB/op) | DesAlloc(KB/op) | RTAlloc(KB/op) | Setup S/D(KB)");
sb.AppendLine("---|---|---|---|---|---|---|---|---|---|---|---|---");
sb.AppendLine("TestData | Engine | IO | Mode | Options | Size(B) | Ser(µs/op) | Deser(µs/op) | RT(µs/op) | SerAlloc(KB/op) | DesAlloc(KB/op) | RTAlloc(KB/op) | Setup S/D(KB) | Iter Ser/Des");
sb.AppendLine("---|---|---|---|---|---|---|---|---|---|---|---|---|---");
foreach (var testData in testDataSets)
{
var testResults = results
.Where(r => r.TestDataName == testData.DisplayName)
.OrderBy(r => r.RoundTripTimeMs)
// Per-op µs (iter-independent) ordering — mixing iter counts within a cell is now expected.
.OrderBy(r => RtPerOp(r))
.ToList();
foreach (var r in testResults)
{
var inv = System.Globalization.CultureInfo.InvariantCulture;
var ser = r.SerializeTimeMs > 0 ? ToPerOpMicros(r.SerializeTimeMs).ToString("F2", inv) : "-";
var des = r.DeserializeTimeMs > 0 ? ToPerOpMicros(r.DeserializeTimeMs).ToString("F2", inv) : "-";
var rt = r.RoundTripTimeMs > 0 ? ToPerOpMicros(r.RoundTripTimeMs).ToString("F2", inv) : "-";
// Per-cell median + inter-sample range (min..max) + CV-threshold marker (⚠X.X% when CV exceeds UnstableCVThreshold).
// Range surfaces the noise floor for each row so a small inter-engine delta is easy to
// judge against the row's noise. Format: "26.86 (24.50..29.10)" or
// "26.86 (24.50..29.10) ⚠5.2%" when stddev/mean exceeds the unstable threshold.
// When only one sample was taken (Debug / quick mode) min == max == median; collapse
// to bare median to avoid visual clutter.
var ser = r.SerializeTimeMs > 0 ? FormatMicrosWithRange(r.SerializeTimeMs, r.SerializeTimeMinMs, r.SerializeTimeMaxMs, r.SerializeTimeStdDevMs, r.SerializeIterations, inv) : "-";
var des = r.DeserializeTimeMs > 0 ? FormatMicrosWithRange(r.DeserializeTimeMs, r.DeserializeTimeMinMs, r.DeserializeTimeMaxMs, r.DeserializeTimeStdDevMs, r.DeserializeIterations, inv) : "-";
var rt = r.RoundTripTimeMs > 0
? (r.IsRoundTripOnly
? FormatMicrosWithRange(r.RoundTripTimeMs, r.RoundTripTimeMinMs, r.RoundTripTimeMaxMs, r.RoundTripTimeStdDevMs, r.RoundTripIterations, inv)
: RtPerOp(r).ToString("F2", inv))
: "-";
var serAlloc = r.SerializeTimeMs > 0 ? ToKilobytes(r.SerializeAllocBytesPerOp).ToString("F2", inv) : "-";
var desAlloc = r.DeserializeTimeMs > 0 ? ToKilobytes(r.DeserializeAllocBytesPerOp).ToString("F2", inv) : "-";
var rtAlloc = r.RoundTripAllocBytesPerOp > 0 ? ToKilobytes(r.RoundTripAllocBytesPerOp).ToString("F2", inv) : "-";
var setupAlloc = $"{ToKilobytes(r.SetupSerializeAllocBytes).ToString("F2", inv)} / {ToKilobytes(r.SetupDeserializeAllocBytes).ToString("F2", inv)}";
sb.AppendLine($"{r.TestDataName} | {r.Engine} | {r.IoMode} | {r.DispatchMode} | {r.OptionsPreset} | {r.SerializedSize} | {ser} | {des} | {rt} | {serAlloc} | {desAlloc} | {rtAlloc} | {setupAlloc}");
// Iter Ser/Des column — per-row adaptive iter counts. RT-only rows show Iter for RT.
var iterCol = r.IsRoundTripOnly
? r.RoundTripIterations.ToString(inv)
: $"{(r.SerializeIterations > 0 ? r.SerializeIterations.ToString(inv) : "-")} / {(r.DeserializeIterations > 0 ? r.DeserializeIterations.ToString(inv) : "-")}";
sb.AppendLine($"{r.TestDataName} | {r.Engine} | {r.IoMode} | {r.DispatchMode} | {r.OptionsPreset} | {r.SerializedSize} | {ser} | {des} | {rt} | {serAlloc} | {desAlloc} | {rtAlloc} | {setupAlloc} | {iterCol}");
}
}

View File

@ -5,10 +5,59 @@ using System.Runtime.CompilerServices;
namespace AyCode.Core.Tests.TestModels;
/// <summary>
/// Charset suffix presets for the per-property string augmentation in
/// <c>BenchmarkTestDataProvider.ToLongString</c>. The benchmark applies the configured suffix
/// to every short (≤ <c>FixStrMaxLength</c>) string property across the test data graph (via reflection
/// in <c>BenchmarkTestDataProvider.EnsureAllStringsBypassFixStr</c>), producing long-string
/// benchmark payloads with a controlled UTF-8 content profile.
///
/// Switch by assigning to <see cref="BenchmarkTestDataProvider.LongStringSuffix"/> from the interactive
/// Settings → Charset submenu (or programmatically). The active charset is recorded in the benchmark
/// output headers, so per-charset bench files are self-documenting.
///
/// Naming note: "Latin1" in the preset names is used loosely for accented Latin text. The Hungarian
/// letters ő and ű are outside ISO-8859-1 proper, but like the other accented letters used here they
/// encode as 2 bytes in UTF-8, which is the property the benchmark cares about.
/// </summary>
public static class CharsetSuffixes
{
/// <summary>Empty suffix — nothing is appended, so short baseline strings (e.g. "SharedTag") keep
/// their original length and are intended to hit the FixStr fast-path. Stress-test for FixStr /
/// short-string code paths. This preset does not modify baseline values in any way; it only
/// suppresses the suffix that would otherwise push properties past the FixStr boundary. The
/// "FixAscii" part of the name describes the resulting payload when the baselines themselves are
/// short ASCII, as the ones set in <c>BenchmarkTestDataProvider</c> are.</summary>
public const string Latin1FixAscii = "";
/// <summary>Short accented-Latin suffix (Hungarian, 23 chars / 32 UTF-8 bytes) — typical European
/// i18n payload with short interspersed 2-byte runs. By char count the suffix alone stays below the
/// 32-char FixStr boundary; it relies on the baseline value to get past it.</summary>
// NOTE(review): a 4-char baseline such as "dark" plus this suffix is 27 chars but 36 UTF-8 bytes,
// so whether every property clears the FixStr boundary depends on whether ToLongString measures
// chars or bytes (or pads further) — confirm against ToLongString.
public const string Latin1Short = " árvíztűrő tükörfúrógép";
/// <summary>Long accented-Latin suffix (48 chars) — exceeds the 32-char FixStr boundary on the suffix
/// alone, exercising the StringSmall+ tier path with mixed ASCII / 2-byte content (Hungarian accented
/// letters).</summary>
public const string Latin1Long = " árvíztűrő tükörfúrógép a magyar betűzés tesztje";
/// <summary>CJK BMP (Chinese / Japanese / Korean Basic Multilingual Plane; 17 chars, of which 14 are
/// 3-byte in UTF-8) — homogeneous 3-byte UTF-8 runs separated by single spaces. Primary win region
/// for V4N2 Phase 3 SIMD multi-byte transcoder work.</summary>
public const string CjkBmp = " 你好世界 こんにちは 안녕하세요";
/// <summary>Cyrillic (Russian text; 24 chars, of which 20 are 2-byte in UTF-8) — homogeneous 2-byte
/// runs per word, a different shape than the Hungarian presets (where 2-byte chars are short runs
/// interspersed with ASCII letters).</summary>
public const string Cyrillic = " Привет мир дорогой друг";
/// <summary>Mixed full-spectrum (Hungarian + CJK + Cyrillic + one emoji encoded as a surrogate pair) —
/// 1-, 2-, 3- and 4-byte UTF-8 sequences in one payload. Stresses surrogate-pair handling in the
/// UTF-8 transcoder.</summary>
public const string Mixed = " árvíz 你好 Привет 😀";
}
public static class BenchmarkTestDataProvider
{
private const int FixStrMaxLength = 31;
private const string LongStringSuffix = "__Benchmárk_Long_String_Söffix__";
/// <summary>
/// Active long-string suffix appended to short string properties during benchmark data construction.
/// Defaults to <see cref="CharsetSuffixes.Latin1Long"/> (~47-char Latin1 mixed) — backward-compatible
/// in spirit with the prior fixed default (Latin1 mixed family, ~32 char). Switch from
/// <see cref="CharsetSuffixes"/> to measure other UTF-8 content profiles.
/// </summary>
public static string LongStringSuffix = CharsetSuffixes.Latin1Long;
private sealed class ReferenceComparer : IEqualityComparer<object>
{
@ -32,8 +81,8 @@ public static class BenchmarkTestDataProvider
public static TestOrder CreateProfilerOrder()
{
TestDataFactory.ResetIdCounter();
var sharedTag = TestDataFactory.CreateTag("KözösCímke");
var sharedUser = TestDataFactory.CreateUser("közösfelhasználó");
var sharedTag = TestDataFactory.CreateTag("SharedTag");
var sharedUser = TestDataFactory.CreateUser("shareduser");
return TestDataFactory.CreateOrder(
itemCount: 3,
palletsPerItem: 3,
@ -47,8 +96,8 @@ public static class BenchmarkTestDataProvider
{
if (resetId) TestDataFactory.ResetIdCounter();
var sharedTag = TestDataFactory.CreateTag("KözösCímke");
var sharedUser = TestDataFactory.CreateUser("közösfelhasználó");
var sharedTag = TestDataFactory.CreateTag("SharedTag");
var sharedUser = TestDataFactory.CreateUser("shareduser");
var order = TestDataFactory.CreateOrder(
itemCount: 2,
@ -69,16 +118,16 @@ public static class BenchmarkTestDataProvider
{
if (resetId) TestDataFactory.ResetIdCounter();
var sharedTag = TestDataFactory.CreateTag("KözösCímke");
var sharedUser = TestDataFactory.CreateUser("közösfelhasználó");
var sharedMeta = TestDataFactory.CreateMetadata("közös", withChild: true);
var sharedTag = TestDataFactory.CreateTag("SharedTag");
var sharedUser = TestDataFactory.CreateUser("shareduser");
var sharedMeta = TestDataFactory.CreateMetadata("shared", withChild: true);
var sharedPreferences = new UserPreferences
{
Theme = "sötét",
Language = "magyar",
Theme = "dark",
Language = "hungarian",
NotificationsEnabled = true,
EmailDigestFrequency = "hetenkénti"
EmailDigestFrequency = "weekly"
};
sharedUser.Preferences = sharedPreferences;
@ -103,15 +152,15 @@ public static class BenchmarkTestDataProvider
{
if (resetId) TestDataFactory.ResetIdCounter();
var sharedTag = TestDataFactory.CreateTag("KözösCímke");
var sharedUser = TestDataFactory.CreateUser("közösfelhasználó");
var sharedTag = TestDataFactory.CreateTag("SharedTag");
var sharedUser = TestDataFactory.CreateUser("shareduser");
var sharedPreferences = new UserPreferences
{
Theme = "világos",
Language = "német",
Theme = "light",
Language = "german",
NotificationsEnabled = false,
EmailDigestFrequency = "naponkénti"
EmailDigestFrequency = "daily"
};
sharedUser.Preferences = sharedPreferences;
@ -135,15 +184,15 @@ public static class BenchmarkTestDataProvider
{
if (resetId) TestDataFactory.ResetIdCounter();
var sharedTag = TestDataFactory.CreateTag("IsmétlődőCímke");
var sharedUser = TestDataFactory.CreateUser("ismétlődőfelhasználó");
var sharedTag = TestDataFactory.CreateTag("RepeatedTag");
var sharedUser = TestDataFactory.CreateUser("repeateduser");
var sharedPreferences = new UserPreferences
{
Theme = "sötét",
Language = "magyar",
Theme = "dark",
Language = "hungarian",
NotificationsEnabled = true,
EmailDigestFrequency = "hetenkénti"
EmailDigestFrequency = "weekly"
};
sharedUser.Preferences = sharedPreferences;
@ -159,16 +208,17 @@ public static class BenchmarkTestDataProvider
// Repeated string fields — ProductName on items + PalletCode on pallets. Both are common
// across the hierarchy, exercising string-interning deduplication on the Default preset
// (which has UseStringInterning = All). Targeting ~20% repeated-string share overall.
// Strings contain non-ASCII characters (Hungarian accented letters → multi-byte UTF-8) so the
// benchmark reflects real-world i18n payloads, not just the ASCII FixStr fast-path.
// Baselines are short ASCII (≤ FixStrMaxLength) so EnsureAllStringsBypassFixStr appends the
// active CharsetSuffix — the resulting payload's UTF-8 content profile is governed entirely
// by the selected charset (not contaminated by hard-coded Hungarian baseline values).
foreach (var item in order.Items)
{
item.Status = TestStatus.Processing;
item.ProductName = "TermékNév_IsmétlődőTesztAdat_árvíztűrőtükörfúrógép";
item.ProductName = "ProductName";
foreach (var pallet in item.Pallets)
{
pallet.PalletCode = "RaklapKód_IsmétlődőTesztAdat_árvíztűrő";
pallet.PalletCode = "PalletCode";
}
}
@ -183,16 +233,16 @@ public static class BenchmarkTestDataProvider
{
if (resetId) TestDataFactory.ResetIdCounter();
var sharedTag = TestDataFactory.CreateTag("MélyCímke");
var sharedUser = TestDataFactory.CreateUser("mélyfelhasználó");
var sharedCategory = TestDataFactory.CreateCategory("MélyKategória");
var sharedTag = TestDataFactory.CreateTag("DeepTag");
var sharedUser = TestDataFactory.CreateUser("deepuser");
var sharedCategory = TestDataFactory.CreateCategory("DeepCategory");
var sharedPreferences = new UserPreferences
{
Theme = "világos",
Language = "francia",
Theme = "light",
Language = "french",
NotificationsEnabled = false,
EmailDigestFrequency = "havonkénti"
EmailDigestFrequency = "monthly"
};
sharedUser.Preferences = sharedPreferences;

View File

@ -534,9 +534,12 @@ public static partial class AcBinaryDeserializer
/// <see cref="Utf8Transcoder.CountUtf8Chars"/> Pass 1.
/// </summary>
/// <remarks>
/// Wire context: tier markers (StringSmall/Medium/Big, StringInternFirstSmall/Medium) carry the
/// char count alongside the byte count, so this method can <see cref="string.Create{TState}"/>
/// directly with the known target capacity and decode in a single pass through the bytes.
/// Single method (no dispatcher/core split): the V4N4 split attempt did not pay off — the AOT
/// compiler did NOT inline the dispatcher despite <c>[AggressiveInlining]</c> (disasm 15:12 confirmed
/// both dispatcher AND core body remained as call-targets), so the only effect was +1 call
/// instruction per decode (Small Deser regression +16.6 pp). Reverted to single method —
/// <c>string.Create</c> callback uses a cached static lambda (delegate caching confirmed by the
/// <c>test static; jne skip ctor</c> pattern in disasm).
///
/// <para>Compact mode only — FastWire mode never emits H2Q6 tier markers (its
/// <see cref="ReadStringUtf8"/> path handles UTF-16 raw memcpy).</para>

View File

@ -735,11 +735,17 @@ public static partial class AcBinarySerializer
/// <para>Caller MUST guarantee non-empty input (<c>value.Length &gt; 0</c>) — empty strings
/// are handled by the higher-level <c>WriteString</c> via the <c>StringEmpty</c> marker.</para>
/// </remarks>
// V4N4 method-split reverted (2026-05-07): the split (Writer dispatcher + SmallFast + DispatchLong
// + FastWire) was tested 2026-05-07 in two configurations (15:13:39 AggressiveInlining → regression;
// 15:29:21 NoInlining-on-SmallFast → marginal/inconsistent). Bench-to-bench variance on the available
// hardware exceeded the measured effect — the optimization-value signal is below the noise floor.
// Reverted to the single-method state (matches 09:39:09 baseline). The A-direction packed-header
// store optimization (Unsafe.WriteUnaligned ushort/uint/ulong) is preserved — it was already in the
// 09:39:09 baseline and is instruction-level, not affected by AOT inline-pressure variance.
public void WriteStringWithDispatch(string value)
{
var charLength = value.Length;
// Single overflow guard: catches charLength > MaxStringCharLength where charLength*4 would wrap.
// Predict-friendly (always false on realistic input). NoInlining throw helper keeps the hot path tight.
// Overflow guard (O7G2) — predict-friendly (always false on realistic input). NoInlining throw helper.
if ((uint)charLength > BinaryTypeCode.MaxStringCharLength) ThrowStringTooLong(charLength);
if (FastWire)
@ -759,20 +765,11 @@ public static partial class AcBinarySerializer
// Compact mode — H2Q6 post-encode tier dispatch (wire-optimal).
//
// Two-step tier logic:
// 1. reserveHeader (from charLength, worst-case 4 byte/char): bounds the buffer allocation
// AND the encode position. Tight reserve (3/5/9) avoids large memmove on the hot path.
// 1. reserveHeader (from charLength, worst-case 4 byte/char): bounds buffer allocation
// AND encode position. Tight reserve (3/5/9) avoids large memmove on the hot path.
// 2. actualHeader (from bytesWritten after encode): chooses the smallest fitting tier.
// A mostly-ASCII string in the 64-16383 char band gets Small (3 byte header) even though
// reserve was Medium (5 byte) — body is left-shifted by 2 bytes to compact.
//
// Why post-encode tier choice (vs. pre-chosen): mostly-ASCII content (English description fields,
// log/error messages, URL paths) at 64+ char would otherwise pay +2 byte/string for Medium
// header when Small fits. Production payloads include both Magyar/CJK multi-byte AND ASCII-
// dominated strings; wire-size narrative ("smallest") matters across the realistic mix.
//
// ASCII override (bytesWritten == charLength) emits FixStrAscii / StringAscii with their own
// compact headers (1 byte / 1+VarUInt) — body shifted left from the encode position.
// (charLength already validated at method entry — charLength * 4 cannot overflow here.)
var maxBytes = charLength * 4;
int reserveHeader;
@ -791,8 +788,9 @@ public static partial class AcBinarySerializer
// ASCII override — FixStrAscii (≤31) or StringAscii (>31) with compact header
if (bytesWritten <= BinaryTypeCode.FixStrAsciiMaxLength)
{
_buffer.AsSpan(encodeStart, bytesWritten).CopyTo(_buffer.AsSpan(savedPos + 1, bytesWritten));
var shift = reserveHeader - 1;
if (shift > 0)
_buffer.AsSpan(encodeStart, bytesWritten).CopyTo(_buffer.AsSpan(savedPos + 1, bytesWritten));
_buffer[savedPos] = BinaryTypeCode.EncodeFixStrAscii(bytesWritten);
_position = savedPos + 1 + bytesWritten;
}
@ -801,12 +799,10 @@ public static partial class AcBinarySerializer
var actualVarUIntSize = VarUIntSize((uint)bytesWritten);
var asciiHeader = 1 + actualVarUIntSize;
var shift = reserveHeader - asciiHeader;
if (shift > 0) _buffer.AsSpan(encodeStart, bytesWritten).CopyTo(_buffer.AsSpan(encodeStart - shift, bytesWritten));
if (shift > 0)
_buffer.AsSpan(encodeStart, bytesWritten).CopyTo(_buffer.AsSpan(encodeStart - shift, bytesWritten));
_buffer[savedPos] = BinaryTypeCode.StringAscii;
_position = savedPos + 1;
WriteVarUIntUnsafe((uint)bytesWritten);
_position += bytesWritten;
}
@ -832,29 +828,29 @@ public static partial class AcBinarySerializer
break;
}
var shift = reserveHeader - actualHeader;
if (shift > 0) _buffer.AsSpan(encodeStart, bytesWritten).CopyTo(_buffer.AsSpan(encodeStart - shift, bytesWritten));
var nonAsciiShift = reserveHeader - actualHeader;
if (nonAsciiShift > 0) _buffer.AsSpan(encodeStart, bytesWritten).CopyTo(_buffer.AsSpan(encodeStart - nonAsciiShift, bytesWritten));
_buffer[savedPos] = tierMarker;
switch (actualHeader)
{
case 3:
{
// Pack charLen:8 | utf8Len:8 → single ushort store (vs 2 byte-stores)
// A-direction: pack charLen:8 | utf8Len:8 → single ushort store
var packed = (ushort)(charLength | (bytesWritten << 8));
Unsafe.WriteUnaligned<ushort>(ref _buffer[savedPos + 1], packed);
break;
}
case 5:
{
// Pack charLen:16 | utf8Len:16 → single uint store, LE (vs 2 ushort-stores)
// A-direction: pack charLen:16 | utf8Len:16 → single uint store, LE
var packed = (uint)charLength | ((uint)bytesWritten << 16);
Unsafe.WriteUnaligned<uint>(ref _buffer[savedPos + 1], packed);
break;
}
default:
{
// Pack charLen:32 | utf8Len:32 → single ulong store, LE (vs 2 uint-stores)
// A-direction: pack charLen:32 | utf8Len:32 → single ulong store, LE
var packed = (ulong)(uint)charLength | ((ulong)(uint)bytesWritten << 32);
Unsafe.WriteUnaligned<ulong>(ref _buffer[savedPos + 1], packed);
break;

View File

@ -495,6 +495,10 @@ internal static class Utf8Transcoder
/// JIT compiles the switch into a jump table for predictable dispatch on mixed content.
/// Hungarian text typical pattern: ASCII run (Phase 1/2 widening) → 2-byte char (Phase 3
/// case &lt; 0xE0) → ASCII run → 2-byte char → ... — each phase optimal for its segment.
///
/// V4N2 Phase 2.5 (run-length scalar decoder) attempted 2026-05-07 — both full and hybrid
/// (3-byte do-while only) variants showed bench-instability and unmeasurable optimization
/// signal on the available hardware. Reverted to the switch-jumptable per-char baseline.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int DecodeUtf8SinglePass(ReadOnlySpan<byte> src, Span<char> dst)
@ -514,10 +518,6 @@ internal static class Utf8Transcoder
// Widen 32 bytes → 2 × Vector256<ushort> (32 chars total). Each Vector256<ushort>
// holds 16 ushort, so the upper half stores at dstIdx + 16 (= Vector256<ushort>.Count).
// Earlier latent bug used Vector128<ushort>.Count (= 8) here, causing overlap on
// indices 8-15 and uninitialized 24-31 — hidden by the Hungarian benchmark's early
// ASCII bail-out (no 32+ byte ASCII run). Validated by Utf8TranscoderTests
// LongAscii32Plus + AsciiExactly32Bytes round-trips.
var (lower, upper) = Vector256.Widen(v);
lower.StoreUnsafe(ref dstRef, (uint)dstIdx);
upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector256<ushort>.Count));
@ -526,10 +526,10 @@ internal static class Utf8Transcoder
}
}
// Phase 2/3 — scalar loop with DWORD ASCII batch
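// Phase 2/3 — DWORD ASCII batch + per-char switch decode (incl. 4-byte case). NOTE(review): the
// Phase 2.5 run-length scalar decoder was reverted per this method's <remarks> — confirm.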
while (srcIdx < src.Length)
{
// DWORD ASCII batch: 4 ASCII bytes → 4 chars per iter
// Phase 2 — DWORD ASCII batch: 4 ASCII bytes → 4 chars per iter (unchanged)
if (src.Length - srcIdx >= 4)
{
var dword = Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref srcRef, srcIdx));
@ -545,12 +545,17 @@ internal static class Utf8Transcoder
}
}
// Scalar multi-byte branch (jump-table compile via switch)
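// Scalar multi-byte branch — the lead byte selects the sequence length via a per-char switch
// (jump-table compile). NOTE(review): a Phase 2.5 run-length variant (inner do-while decoding a
// whole homogeneous run, so the switch dispatch fires once per run-start instead of once per
// char — aimed at CJK 3-byte chunks and Latin/Cyrillic/Greek 2-byte sequences) was attempted
// and, per this method's <remarks>, reverted to this per-char baseline. Confirm that no
// do-while run loop remains in the cases below.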
var b0 = Unsafe.Add(ref srcRef, srcIdx);
switch (b0)
{
case < 0x80:
// 1-byte ASCII (U+0000–U+007F)
// 1-byte ASCII single (single-byte tail of a run that the DWORD batch couldn't cover).
// No do-while loop here — the DWORD batch already handles long ASCII runs above;
// this case is the 1-3 byte tail before the next non-ASCII byte.
Unsafe.Add(ref dstRef, dstIdx++) = b0;
srcIdx += 1;
break;
@ -577,6 +582,7 @@ internal static class Utf8Transcoder
{
// 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx → U+10000–U+10FFFF
// Supplementary plane (emoji, rare CJK ext) → UTF-16 surrogate pair.
// No do-while: 4-byte sequences are typically isolated (single emoji in mixed text).
var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
var b3 = Unsafe.Add(ref srcRef, srcIdx + 3);

File diff suppressed because one or more lines are too long