diff --git a/AyCode.Benchmark/AcBinaryVsMemPackBenchmark.cs b/AyCode.Benchmark/AcBinaryVsMemPackBenchmark.cs new file mode 100644 index 0000000..2dbe082 --- /dev/null +++ b/AyCode.Benchmark/AcBinaryVsMemPackBenchmark.cs @@ -0,0 +1,79 @@ +using AyCode.Core.Benchmarks.Workloads.Scenarios; +using AyCode.Core.Serializers; +using AyCode.Core.Serializers.Binaries; +using AyCode.Core.Tests.TestModels; +using BenchmarkDotNet.Attributes; + +namespace AyCode.Core.Benchmarks; + +/// +/// BDN benchmark mirroring the Console app's "F" menu (SerializerSelectionMode.FastestByte) — +/// the focused 1:1 comparison between AcBinary FastMode Byte[] and MemoryPack Default Byte[] +/// across the 5 production-shaped test data cells (Small / Medium / Large / Repeated / Deep). +/// +/// Why this exists: the Console app's adaptive measurement engine gives fast turnaround but is +/// noise-prone; BDN's warmup + iteration + outlier-removal stack tightens the inter-engine delta to +/// the point where ~1-2% micro-optimizations become detectable. Both runners feed the SAME +/// -implementing workload ( / +/// ) — so the BDN numbers are directly comparable to Console's +/// Console.FullBenchmark_Release_*.LLM rows, only with tighter confidence intervals. +/// +/// Output: BDN writes its native artifacts to Test_Benchmark_Results/Benchmark/BDN/ (set +/// globally in Program.cs via WithArtifactsPath). then +/// translates the into +/// rows and emits the unified Bdn.FullBenchmark_*.{log,LLM,output} triplet next to Console's +/// counterparts in Test_Benchmark_Results/Benchmark/. +/// +[MemoryDiagnoser] +public class AcBinaryVsMemPackBenchmark +{ + /// + /// The 5 TestData cells matching Console's BenchmarkLayer.Core set — + /// Small (2x2x2x2) / Medium (3x3x3x4) / Large (5x5x5x10) / Repeated (10 items) / Deep (2x4x4x8). + /// Resolved at time via + /// (same provider Console uses) so the workload graphs are bit-for-bit identical. 
+ /// + public static IEnumerable TestDataNames => new[] { "Small", "Medium", "Large", "Repeated", "Deep" }; + + [ParamsSource(nameof(TestDataNames))] + public string TestData { get; set; } = ""; + + /// + /// Engine axis: AcBinary FastMode + Compact wire (UTF-8) vs MemoryPack Default (UTF-8). Compact-on-both-sides + /// keeps the string-encoding dimension constant so the comparison reflects engine differences only. + /// + [Params("AcBinary", "MemoryPack")] + public string Engine { get; set; } = ""; + + private ISerializerBenchmark _serializer = null!; + + [GlobalSetup] + public void Setup() + { + var allTestData = BenchmarkTestDataProvider_All_False.CreateTestDataSets(); + var testDataSet = (TestDataSet)allTestData.First(t => t.Name.StartsWith(TestData)); + + if (Engine == "AcBinary") + { + var options = AcBinarySerializerOptions.FastMode; + options.WireMode = WireMode.Compact; + _serializer = new AcBinaryBenchmark(testDataSet.Order, options, "FastMode"); + } + else + { + // MemoryPack's wire-mode-aligned ctor — Compact ↔ UTF-8 default for apples-to-apples vs AcBinary Compact. + _serializer = new MemoryPackBenchmark(testDataSet.Order, WireMode.Compact, "Default"); + } + + // Round-trip correctness check before the BDN harness starts measuring — same gate the Console + // runner enforces. Fails the run early if anything's broken (rather than producing meaningless numbers). 
+ if (!_serializer.VerifyRoundTrip()) + throw new InvalidOperationException($"Round-trip verification FAILED for {Engine} on {TestData}."); + } + + [Benchmark] + public void Serialize() => _serializer.Serialize(); + + [Benchmark] + public void Deserialize() => _serializer.Deserialize(); +} diff --git a/AyCode.Benchmark/BdnSummaryAdapter.cs b/AyCode.Benchmark/BdnSummaryAdapter.cs new file mode 100644 index 0000000..41ce16c --- /dev/null +++ b/AyCode.Benchmark/BdnSummaryAdapter.cs @@ -0,0 +1,202 @@ +using AyCode.Core.Benchmarks.Reporting; +using AyCode.Core.Benchmarks.Workloads.Scenarios; +using AyCode.Core.Serializers; +using AyCode.Core.Serializers.Binaries; +using AyCode.Core.Tests.TestModels; +using BenchmarkDotNet.Reports; +using System.Text; + +namespace AyCode.Core.Benchmarks; + +/// +/// Translates (BDN's post-run aggregate) into the unified +/// rows consumed by , then emits the +/// Bdn.FullBenchmark_*.{log,LLM,output} triplet alongside Console's counterparts in +/// 's Test_Benchmark_Results/Benchmark/. +/// +/// Why a separate adapter: BDN's Summary is per-method (Serialize / Deserialize as separate +/// s, parameterised by TestData + Engine). The unified format collapses these +/// into per-cell rows (one row per TestData × Engine with both Ser and Des stats inline). The adapter +/// groups, transposes, and converts ns → ms before handing off to the shared writer. +/// +/// Mean vs Median: maps BDN's into +/// the BenchmarkResult's time columns — same convention as Console (which captures sample-median). +/// Min/Max/StdDev populate the inter-sample range surfaced in +/// (incl. CV-warning ⚠️ marker when stddev/median exceeds ). +/// +/// Iteration count = 1: BDN reports per-operation time (ns) — already amortized across N +/// invocations. The unified BenchmarkResult expects total-batch time + iteration count (so µs/op = +/// timeMs / iterations * 1000). Storing the per-op median in ms with iterations = 1 makes the same formula yield +/// the per-op median in µs directly. 
The actual BDN N count is recorded in the BDN-native artifacts (.../BDN/...) +/// for anyone who wants the raw invocation count. +/// +public static class BdnSummaryAdapter +{ + /// + /// Post-run entry point — call once after BenchmarkRunner.Run<AcBinaryVsMemPackBenchmark>(...) + /// returns. Produces the BDN-side Bdn.* file triplet AND prints the grouped-results console table + /// (same view Console produces post-run) so the user sees the cell-level deltas immediately, without + /// having to open the .LLM file. + /// + public static void WriteResults(Summary summary) + { + var allTestData = BenchmarkTestDataProvider_All_False.CreateTestDataSets(); + var results = Translate(summary, allTestData); + var ctx = CreateContext(); + + BenchmarkReportWriter.PrintGroupedResults(results, allTestData); + BenchmarkReportWriter.SaveAll(ctx, results, allTestData); + } + + private static ReportingContext CreateContext() + { +#if DEBUG + const string buildConfig = "Debug"; +#else + const string buildConfig = "Release"; +#endif + return new ReportingContext( + SourceTag: "Bdn", + ResultsDirectory: ReportingContext.ResolveResultsDirectory(), + BuildConfiguration: buildConfig, + Utf8NoBom: new UTF8Encoding(encoderShouldEmitUTF8Identifier: false), + CharsetName: GetCharsetName(), + // Warmup / Samples / TargetSampleMs are BDN-managed (not Console's adaptive engine). Zeros here + // signal "BDN handled internally" in the header; the BDN-native artifacts under .../BDN/ have + // the exact BDN config (warmup count, iteration count, run strategy) for anyone who needs it. + WarmupIterations: 0, + BenchmarkSamples: 0, + TargetSampleMs: 0, + UnstableCVThreshold: 0.03); + } + + /// + /// Looks up the human-readable name for the currently-active + /// charset. Mirrors Console's Configuration.GetCurrentCharsetName — Console's Menu sets the + /// charset before invoking the bench; for BDN the default charset (Latin1Long) is in effect unless the user + /// overrides at runtime. 
+ /// + private static string GetCharsetName() + { + var s = BenchmarkTestDataProvider.LongStringSuffix; + return s switch + { + CharsetSuffixes.Latin1FixAscii => "Latin1FixAscii", + CharsetSuffixes.Latin1Short => "Latin1Short", + CharsetSuffixes.Latin1Long => "Latin1Long", + CharsetSuffixes.CjkBmp => "CjkBmp", + CharsetSuffixes.Cyrillic => "Cyrillic", + CharsetSuffixes.Mixed => "Mixed", + _ => "Custom" + }; + } + + private static List Translate(Summary summary, List allTestData) + { + var grouped = summary.Reports + .Where(r => r.Success && r.ResultStatistics != null) + .GroupBy(r => ( + TestData: GetParam(r, "TestData"), + Engine: GetParam(r, "Engine") + )) + .Where(g => !string.IsNullOrEmpty(g.Key.TestData) && !string.IsNullOrEmpty(g.Key.Engine)) + .ToList(); + + var results = new List(grouped.Count); + + foreach (var group in grouped) + { + var testDataSet = (TestDataSet)allTestData.First(t => t.Name.StartsWith(group.Key.TestData)); + var engineEnum = group.Key.Engine switch + { + "AcBinary" => BenchmarkEngine.AcBinary, + "MemoryPack" => BenchmarkEngine.MemoryPack, + _ => throw new InvalidOperationException($"Unknown engine in BDN params: {group.Key.Engine}") + }; + + // Construct the same workload instance AcBinaryVsMemPackBenchmark.Setup would build — same options, + // same wire mode. Reading SerializedSize + OptionsDescription from it keeps the BDN-side metadata + // in lockstep with what the workload actually serialised (no drift between hardcoded BDN strings + // and the workload's own OptionsDescription / SerializedSize). + var workload = CreateWorkload(testDataSet, group.Key.Engine); + + var result = new BenchmarkResult + { + TestDataName = testDataSet.DisplayName, + Engine = engineEnum, + IoMode = BenchmarkIoMode.ByteArray, + DispatchMode = BenchmarkDispatchMode.SGen, + OptionsPreset = group.Key.Engine == "AcBinary" ? 
"FastMode" : "Default", + OrderTypeName = nameof(TestOrder_All_False), + SerializedSize = workload.SerializedSize, + OptionsDescription = workload.OptionsDescription, + }; + + // ns → ms (BenchmarkResult expects ms per op with iter=1, so µs/op = ms * 1000 / 1 = ms*1000). + const double nsToMs = 1.0 / 1_000_000.0; + + foreach (var report in group) + { + var methodName = report.BenchmarkCase.Descriptor.WorkloadMethod.Name; + var stats = report.ResultStatistics!; + var allocBytes = report.GcStats.GetBytesAllocatedPerOperation(report.BenchmarkCase) ?? 0; + + if (methodName == "Serialize") + { + result.SerializeTimeMs = stats.Median * nsToMs; + result.SerializeTimeMinMs = stats.Min * nsToMs; + result.SerializeTimeMaxMs = stats.Max * nsToMs; + result.SerializeTimeStdDevMs = stats.StandardDeviation * nsToMs; + result.SerializeIterations = 1; // see class-doc "Iteration count = 1" note + result.SerializeAllocBytesPerOp = allocBytes; + } + else if (methodName == "Deserialize") + { + result.DeserializeTimeMs = stats.Median * nsToMs; + result.DeserializeTimeMinMs = stats.Min * nsToMs; + result.DeserializeTimeMaxMs = stats.Max * nsToMs; + result.DeserializeTimeStdDevMs = stats.StandardDeviation * nsToMs; + result.DeserializeIterations = 1; + result.DeserializeAllocBytesPerOp = allocBytes; + } + } + + // Compose RT from Ser + Des per-op µs (same logic as Console BenchmarkLoop's in-memory + // composition — since BDN measures Ser and Des independently, RT here is the analytic sum). 
+ var serPerOp = BenchmarkReportWriter.ToPerOpMicros(result.SerializeTimeMs, result.SerializeIterations); + var desPerOp = BenchmarkReportWriter.ToPerOpMicros(result.DeserializeTimeMs, result.DeserializeIterations); + var rtPerOp = serPerOp + desPerOp; + result.RoundTripIterations = Math.Max(result.SerializeIterations, result.DeserializeIterations); + result.RoundTripTimeMs = rtPerOp / 1000.0 * result.RoundTripIterations; + result.RoundTripAllocBytesPerOp = result.SerializeAllocBytesPerOp + result.DeserializeAllocBytesPerOp; + + results.Add(result); + } + + return results; + } + + private static string GetParam(BenchmarkReport report, string name) => + report.BenchmarkCase.Parameters.Items.FirstOrDefault(p => p.Name == name)?.Value?.ToString() ?? ""; + + /// + /// Constructs the same workload instance would build — + /// same options, same wire mode. The adapter reads and + /// from this instance so the BDN-side BenchmarkResult + /// rows carry the same workload-side metadata the Console rows have (no risk of drift between hardcoded + /// adapter strings and what the workload actually used). + /// + /// Cost: one Serialize call inside the ctor per (TestData × Engine) cell — runs once during summary + /// translation, NOT in BDN's measured hot path. Negligible vs BDN's per-run cost. 
+ /// + private static ISerializerBenchmark CreateWorkload(TestDataSet testDataSet, string engine) + { + if (engine == "AcBinary") + { + var options = AcBinarySerializerOptions.FastMode; + options.WireMode = WireMode.Compact; + return new AcBinaryBenchmark(testDataSet.Order, options, "FastMode"); + } + return new MemoryPackBenchmark(testDataSet.Order, WireMode.Compact, "Default"); + } +} diff --git a/AyCode.Benchmark/Program.cs b/AyCode.Benchmark/Program.cs index bc7987a..844dbac 100644 --- a/AyCode.Benchmark/Program.cs +++ b/AyCode.Benchmark/Program.cs @@ -70,9 +70,15 @@ namespace AyCode.Benchmark return; } - // Configure BenchmarkDotNet to write artifacts into the centralized benchmark directory + // BDN-native artifacts go under /Benchmark/BDN/ (per the unified output convention — + // see ReportingContext docs). The unified Bdn.FullBenchmark_*.{log,LLM,output} triplet (emitted + // by BdnSummaryAdapter after BDN finishes) lands one level up in /Benchmark/, next to + // the Console.*.* counterparts produced by the Console runner. 
+ var bdnArtifactsDir = Path.Combine(benchmarkDir, "BDN"); + Directory.CreateDirectory(bdnArtifactsDir); + var config = ManualConfig.Create(DefaultConfig.Instance) - .WithArtifactsPath(benchmarkDir); + .WithArtifactsPath(bdnArtifactsDir); if (args.Length > 0 && args[0] == "--quick") { @@ -94,33 +100,14 @@ namespace AyCode.Benchmark return; } - if (args.Length > 0 && args[0] == "--minimal") + if (args.Length > 0 && args[0] == "--serializers") { - RunBenchmark(config, benchmarkDir, memDiagDir, "MinimalBenchmark"); - return; - } - - if (args.Length > 0 && args[0] == "--simple") - { - RunBenchmark(config, benchmarkDir, memDiagDir, "SimpleBinaryBenchmark"); - return; - } - - if (args.Length > 0 && args[0] == "--complex") - { - RunBenchmark(config, benchmarkDir, memDiagDir, "ComplexBinaryBenchmark"); - return; - } - - if (args.Length > 0 && args[0] == "--msgpack") - { - RunBenchmark(config, benchmarkDir, memDiagDir, "MessagePackComparisonBenchmark"); - return; - } - - if (args.Length > 0 && args[0] == "--sizes") - { - RunSizeComparison(); + // Unified serializer benchmark mirroring Console's "F" menu (FastestByte) — AcBinary FastMode + // Byte[] vs MemoryPack Default Byte[] across 5 TestData cells. BdnSummaryAdapter translates + // the BDN Summary into BenchmarkResult rows and emits the Bdn.FullBenchmark_*.{log,LLM,output} + // triplet to /Benchmark/ (BDN-native artifacts go under .../BDN/ via the global config). 
+ var serializerSummary = BenchmarkRunner.Run(config); + BdnSummaryAdapter.WriteResults(serializerSummary); return; } @@ -134,25 +121,16 @@ namespace AyCode.Benchmark Console.WriteLine(" --quick Quick benchmark with tabular output (AcBinary vs MessagePack)"); Console.WriteLine(" --test Quick AcBinary test"); Console.WriteLine(" --testmsgpack Quick MessagePack test"); - Console.WriteLine(" --minimal Minimal benchmark"); - Console.WriteLine(" --simple Simple flat object benchmark"); - Console.WriteLine(" --complex Complex hierarchy (AcBinary vs JSON)"); - Console.WriteLine(" --msgpack MessagePack comparison"); - Console.WriteLine(" --sizes Size comparison only"); + Console.WriteLine(" --serializers AcBinary FastMode vs MemoryPack Default across 5 test data cells (mirrors Console F menu)"); Console.WriteLine(" --jitasm JIT disassembly analysis (shows actual x64 assembly for hot path)"); Console.WriteLine(" --save-coverage Save coverage file into Test_Benchmark_Results/CoverageReport"); - if (args.Length == 0) - { - BenchmarkSwitcher.FromAssembly(typeof(MinimalBenchmark).Assembly).Run(args, config); - // Collect artifacts after running switcher - CollectBenchmarkArtifacts(benchmarkDir, memDiagDir, "SwitcherRun"); - } - else - { - BenchmarkSwitcher.FromAssembly(typeof(MinimalBenchmark).Assembly).Run(args, config); - CollectBenchmarkArtifacts(benchmarkDir, memDiagDir, "SwitcherRun"); - } + // Default path: hand control to BDN's BenchmarkSwitcher (no args → interactive picker; with + // args → BDN parses them as benchmark filters / job options). Same code path either way — the + // known custom switches above (--serializers, --jitasm, --quick, --test, --testmsgpack, + // --save-coverage) return early before reaching this point. 
+ BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args, config); + CollectBenchmarkArtifacts(benchmarkDir, memDiagDir, "SwitcherRun"); } /// diff --git a/AyCode.Benchmark/README.md b/AyCode.Benchmark/README.md index 3e8fdfd..21fe4b2 100644 --- a/AyCode.Benchmark/README.md +++ b/AyCode.Benchmark/README.md @@ -1,23 +1,129 @@ # AyCode.Benchmark -BenchmarkDotNet-based performance benchmarking console app. Compares AcBinary serializer against MessagePack, BSON, and JSON across various scenarios. +BenchmarkDotNet performance suite **plus** the shared workload / reporting infrastructure used by both BDN and the Console runner. Targets .NET 9. -## Key Files +## Role: dual-purpose project -- **`Program.cs`** — CLI entry point with `--quick`, `--test`, `--minimal`, `--simple`, `--complex`, `--msgpack`, `--sizes`, `--jitasm` modes. Collects results to `Test_Benchmark_Results/` at solution root. -- **`SerializationBenchmarks.cs`** — Primary suite: MinimalBenchmark, SimpleBinaryBenchmark, ComplexBinaryBenchmark, MessagePackComparisonBenchmark, AcBinaryVsMessagePackFullBenchmark, SizeComparisonBenchmark, LargeScaleBenchmark (~25K objects), AcJsonVsSystemTextJsonBenchmark. -- **`SourceGeneratorBenchmarks.cs`** — Source-generated vs runtime reflection serializers. Includes PureContractlessBenchmark, SourceGeneratorVsRuntimeBenchmark, RepeatedStringBenchmark (string interning). -- **`SignalRCommunicationBenchmarks.cs`** — Full-stack SignalR message performance: client creation → MessagePack serialization → server deserialization → response → round-trip. -- **`SignalRRoundTripBenchmarks.cs`** — Real SignalR infrastructure benchmarks: primitives, complex objects, collections, mixed parameters. -- **`JitDisassemblyBenchmark.cs`** — JIT analysis: generates .asm files to verify inlining decisions on serialize/deserialize hot paths. 
-- **`TaskHelperBenchmarks.cs`** — Task/timing utilities: WaitToAsync, ThreadPool (custom vs Task.Run), timing methods (UtcNow.Ticks vs TickCount64). -- **`RefForeachBenchmark.cs`** — Collection iteration patterns: array vs list, foreach vs index, ref readonly vs by-value for large structs. -- **`ValueTypePassingBenchmark.cs`** — Copy-by-value vs `in` parameter for 16-byte types (Decimal, DateTimeOffset, Guid). +This project plays **two roles**: + +1. **BDN runner Exe** — standalone benchmark host (`Program.cs` + `[Benchmark]`-decorated classes). Invoke via `dotnet run -c Release --project AyCode.Benchmark -- `. +2. **Shared workload + reporting library** — exposes `public` types under [`Workloads/Scenarios/`](Workloads/Scenarios/) and [`Reporting/`](Reporting/) that [`AyCode.Core.Serializers.Console`](../AyCode.Core.Serializers.Console/README.md) consumes via ``. + +Both runners feed the SAME `ISerializerBenchmark` workload (same test data graphs, same wire options, same payload sizes) — so Console's adaptive-engine numbers and BDN's iteration-based numbers are **directly comparable**. 
+ +## Output convention + +Both runners emit a unified `.log` / `.LLM` / `.output` triplet to `Test_Benchmark_Results/Benchmark/` (resolved at runtime via walk-up to the nearest `AyCode.Core.sln` — worktree-aware): + +| File | Source | Content | +|---|---|---| +| `Console.FullBenchmark_{Build}_{ts}.log` | Console runner | Human-readable formatted view | +| `Console.FullBenchmark_{Build}_{ts}.LLM` | Console runner | Markdown table, LLM-paste-friendly | +| `Console.FullBenchmark_{Build}_{ts}.output` | Console runner | Hex dump of Large cell binary | +| `Bdn.FullBenchmark_{Build}_{ts}.log` | BDN runner | Same format as Console | +| `Bdn.FullBenchmark_{Build}_{ts}.LLM` | BDN runner | Same | +| `Bdn.FullBenchmark_{Build}_{ts}.output` | BDN runner | Same | + +BDN-native artifacts (BDN's own reports, raw measurements, run logs) go to `Test_Benchmark_Results/Benchmark/BDN/` — kept separate so the unified Console+BDN `.log/.LLM/.output` triplet stays uncluttered. + +## Architecture + +``` +┌────────────────────────────────────────────────────────────────┐ +│ AyCode.Benchmark (this project) │ +│ │ +│ Workloads/Scenarios/ public — shared workload types │ +│ ISerializerBenchmark, BenchmarkOptions, BenchmarkEnums, │ +│ AcBinaryBenchmark, MemoryPackBenchmark, │ +│ AcBinaryBufferWriterBenchmark, ... 
(12 concretes), │ +│ RoundTripValidator │ +│ │ +│ Reporting/ public — shared reporting types │ +│ BenchmarkResult, ReportingContext, BenchmarkReportWriter │ +│ │ +│ AcBinaryVsMemPackBenchmark.cs BDN [Benchmark] class │ +│ (mirrors Console "F" menu) │ +│ BdnSummaryAdapter.cs Summary → BenchmarkResult → │ +│ BenchmarkReportWriter.SaveAll │ +│ Program.cs BDN entry + CLI dispatch │ +│ │ +│ + KEEP: JitDisassemblyBenchmark, RefForeachBenchmark, │ +│ TaskHelperBenchmarks, ValueTypePassingBenchmark, │ +│ SourceGeneratorBenchmarks, │ +│ SignalRCommunicationBenchmarks, │ +│ SignalRRoundTripBenchmarks │ +└────────────────────────────────────────────────────────────────┘ + ▲ + │ ProjectReference (one-way) + │ +┌────────────────────────────────────────────────────────────────┐ +│ AyCode.Core.Serializers.Console │ +│ │ +│ BenchmarkLoop.cs custom adaptive measure engine │ +│ (CPU 0 pin, High priority, phase-isolated warmup, │ +│ 10-sample median + pilot, ~250ms/cell calibration) │ +│ Menu.cs / Configuration.cs / Program.cs Console UX │ +│ │ +│ Uses Benchmark's: │ +│ - Workloads/Scenarios/* (interface + concrete benchmarks) │ +│ - Reporting/BenchmarkReportWriter (SaveAll, Print...) 
│ +└────────────────────────────────────────────────────────────────┘ +``` + +## Two runners — same workload, different measurement engines + +| Aspect | Console (custom engine) | BDN | +|---|---|---| +| Use case | Fast iteration during micro-opt loops | Statistically confident before-commit validation | +| Measurement | Adaptive per-cell iter (target ~250ms), 10 samples + pilot, median | Warmup + N iterations, outlier removal, JIT-stabilized, process-spawn isolation | +| Time per full run | ~1-3 min | ~5-15 min | +| Noise floor | ~3-5% inter-engine delta visible | ~1-2% | +| Output format | Identical (same `BenchmarkReportWriter` writes both) | | + +The Console and BDN outputs use the SAME `BenchmarkResult` DTO and the SAME formatter, so cells are directly comparable: pick a cell in `Console.FullBenchmark_*.LLM`, find the same cell in `Bdn.FullBenchmark_*.LLM` — deltas should agree within BDN's tighter CI. + +## CLI + +``` +dotnet run -c Release --project AyCode.Benchmark -- +``` + +| Switch | Description | +|---|---| +| `--serializers` | AcBinary FastMode Byte[] vs MemoryPack Default Byte[] across 5 TestData cells (mirrors Console "F" menu / FastestByte). Emits `Bdn.FullBenchmark_*.{log,LLM,output}` + BDN-native artifacts under `BDN/`. | +| `--jitasm` | JIT disassembly analysis (x64 asm of serialize/deserialize hot path). | +| `--quick` | Quick inline benchmark (custom Stopwatch-based, not BDN). | +| `--test` / `--testmsgpack` | Quick smoke tests. | +| `--save-coverage ` | Save coverage file into `Test_Benchmark_Results/CoverageReport/`. | +| _(no args)_ | Interactive `BenchmarkSwitcher` — pick from all `[Benchmark]` classes in the assembly. | + +## Key files + +### Serializer benchmark stack (the refactor scope) +- [`AcBinaryVsMemPackBenchmark.cs`](AcBinaryVsMemPackBenchmark.cs) — BDN `[MemoryDiagnoser]` class. `[ParamsSource]`(TestData = Small/Medium/Large/Repeated/Deep) × `[Params]`(Engine = AcBinary/MemoryPack). 
`[GlobalSetup]` instantiates the matching Workloads scenario and runs the round-trip verification. +- [`BdnSummaryAdapter.cs`](BdnSummaryAdapter.cs) — `Summary → List<BenchmarkResult>` translator (groups per `(TestData × Engine)`, ns → ms conversion, GcStats → allocated-bytes-per-op). Calls `BenchmarkReportWriter.PrintGroupedResults` + `SaveAll(ctx with SourceTag="Bdn", ...)`. +- [`Program.cs`](Program.cs) — BDN entry. Sets global `WithArtifactsPath(.../Benchmark/BDN)`; `--serializers` switch wires `BenchmarkRunner.Run<AcBinaryVsMemPackBenchmark>` + adapter. +- [`Workloads/Scenarios/`](Workloads/Scenarios/) — shared workload types (see folder README). +- [`Reporting/`](Reporting/) — shared reporting types (see folder README). + +### KEEP benchmarks (independent — not in the serializer-refactor scope) +- [`JitDisassemblyBenchmark.cs`](JitDisassemblyBenchmark.cs) — JIT analysis: emits `.asm` files for serialize/deserialize hot paths. +- [`TaskHelperBenchmarks.cs`](TaskHelperBenchmarks.cs) — Task/timing utilities (WaitToAsync, custom ThreadPool, UtcNow.Ticks vs TickCount64). +- [`ValueTypePassingBenchmark.cs`](ValueTypePassingBenchmark.cs) — Copy-by-value vs `in` parameter for 16-byte types. +- [`RefForeachBenchmark.cs`](RefForeachBenchmark.cs) — Collection iteration patterns (array vs list, foreach vs index, ref readonly). +- [`SourceGeneratorBenchmarks.cs`](SourceGeneratorBenchmarks.cs) — Source-generated vs runtime reflection serializers (PureContractlessBenchmark, RepeatedStringBenchmark). +- [`SignalRCommunicationBenchmarks.cs`](SignalRCommunicationBenchmarks.cs) — Full-stack SignalR perf (client → server → response → round-trip). +- [`SignalRRoundTripBenchmarks.cs`](SignalRRoundTripBenchmarks.cs) — SignalR primitives/complex/collections benchmarks. 
## Dependencies | Dependency | Purpose | |---|---| -| `BenchmarkDotNet` | Benchmarking framework | -| `MessagePack` | Serialization comparison target | -| `MongoDB.Bson` | BSON comparison target | +| `BenchmarkDotNet` | BDN harness | +| `MemoryPack` | Comparison target (used by Workloads scenarios + BDN class) | +| `MessagePack` | Comparison target (KEEP benchmarks + Workloads MessagePackBenchmark scenario) | +| `MongoDB.Bson` | KEEP-side comparison target | +| `Microsoft.VisualStudio.DiagnosticsHub.BenchmarkDotNetDiagnosers` | VS Profiler integration | +| `AyCode.Core` (ProjectReference) | AcBinary serializer | +| `AyCode.Core.Tests` (ProjectReference) | Test data factory (`TestDataFactory`, `TestOrder_All_False/True`, `BenchmarkTestDataProvider*`) | +| `AyCode.Core.Serializers.SourceGenerator` (Analyzer-only) | SGen for `[AcBinarySerializable]`-tagged types | diff --git a/AyCode.Benchmark/Reporting/BenchmarkResult.cs b/AyCode.Benchmark/Reporting/BenchmarkResult.cs index 48ac97f..e201c2b 100644 --- a/AyCode.Benchmark/Reporting/BenchmarkResult.cs +++ b/AyCode.Benchmark/Reporting/BenchmarkResult.cs @@ -57,6 +57,13 @@ public sealed class BenchmarkResult // (`SerializeTimeMs / SerializeIterations * 1000`). For round-trip-only rows (NamedPipe etc.), // RoundTripIterations carries the calibrated iter count; SerializeIterations and DeserializeIterations // stay 0 (Ser and Des are not separately measurable on those rows). + // + // BDN-sourced rows (populated by BdnSummaryAdapter) follow a different convention: per-op time + // is stored directly in *TimeMs with Iterations = 1, so the same TimeMs / Iterations * 1000 + // formula yields per-op µs. The actual BDN N count is NOT preserved on these rows — consumers that + // read SerializeIterations as a nominal loop count (e.g. "bytes allocated over N iterations") + // will misinterpret BDN rows. For the raw N, read the BDN-native artifacts under + // Test_Benchmark_Results/Benchmark/BDN/. 
public int SerializeIterations { get; set; } public int DeserializeIterations { get; set; } public int RoundTripIterations { get; set; } diff --git a/AyCode.Benchmark/Reporting/README.md b/AyCode.Benchmark/Reporting/README.md new file mode 100644 index 0000000..c762fef --- /dev/null +++ b/AyCode.Benchmark/Reporting/README.md @@ -0,0 +1,30 @@ +# Reporting + +Shared reporting types — the `BenchmarkResult` DTO that captures one cell of a benchmark run + the `BenchmarkReportWriter` that turns a list of these into the unified `.log` / `.LLM` / `.output` triplet + the `ReportingContext` bundle that parameterises both runners. + +## Layout + +- [`BenchmarkResult.cs`](BenchmarkResult.cs) — per-cell result row. `(TestData × Engine × IoMode × OptionsPreset × DispatchMode)` tuple + Ser / Des / RT timings (median, min, max, stddev — all ms-batch units), iter counts (post-calibration), allocated bytes per op, setup-side one-time alloc, `IsRoundTripOnly` flag, derived `SerializerName`. Pure DTO — no behaviour. Populated by either: + - Console `BenchmarkLoop.RunBenchmarksForTestData` (after adaptive measurement) + - BDN `BdnSummaryAdapter.Translate` (after BDN Summary is in hand) + +- [`ReportingContext.cs`](ReportingContext.cs) — record bundle for the writer: + - `SourceTag` — `"Console"` / `"Bdn"`; drives the filename prefix + - `ResultsDirectory` — resolved at startup via `ResolveResultsDirectory()` walking up from `AppContext.BaseDirectory` to the nearest `AyCode.Core.sln`, then `Test_Benchmark_Results/Benchmark/`. Worktree-aware. 
+ - `BuildConfiguration` — `"Debug"` / `"Release"` / `"NativeAOT"`; rendered into both the filename AND the report header + - `Utf8NoBom` — shared `UTF8Encoding(false)` for all `File.WriteAllText` calls + - `CharsetName`, `WarmupIterations`, `BenchmarkSamples`, `TargetSampleMs`, `UnstableCVThreshold` — run-header info embedded in every emitted artifact + +- [`BenchmarkReportWriter.cs`](BenchmarkReportWriter.cs) — the writer itself: + - `SaveAll(ctx, results, testDataSets)` — orchestrator. Writes the `.log` (formatted text + CSV + per-cell tables + Overall aggregation), `.LLM` (markdown table + Overall aggregation), and `.output` (hex dump of the Large cell's AcBinary serialization). All three land in `ctx.ResultsDirectory` with the `{ctx.SourceTag}.FullBenchmark_{Build}_{ts}.` filename pattern. + - `PrintGroupedResults(results, testDataSets)` — colored per-cell tables to `System.Console`. Highlights MemoryPack (baseline) and AcBinary (SGen-Byte[]) rows with green/red win/lose colors, footer row shows pct deltas per metric. + - `PrintResult(result)` — single-line summary printed during the per-cell loop (real-time progress signal). + - `ComputeOverallStats(acResults, mpResults, valueSelector)` — paired-cell aggregation across `TestDataName` (arithmetic mean / geometric mean / median of per-cell ratios). Null-safe. + - `FormatMicrosWithRange(...)` — `26.86 (24.50..29.10)` style with ⚠️CV-warning suffix when stddev/median exceeds the `UnstableCVThreshold`. All formatting goes through `CultureInfo.InvariantCulture` so the CSV section in `.log` stays parseable regardless of the host locale. + - `ToPerOpMicros` / `SerPerOp` / `DesPerOp` / `RtPerOp` / `ToKilobytes` / `FormatPctSigned` / `FormatHexDump` / `AppendOverallLine` — helper utilities used inline by the report-rendering methods. + +## Conventions + +- **Time units in `BenchmarkResult`**: all `*TimeMs` fields are total-batch milliseconds. Per-op µs = `TimeMs / Iterations * 1000`. 
For BDN-sourced rows the adapter stores `Mean_ns / 1e6` with `Iterations = 1`, so the same formula yields per-op µs directly (`ms * 1000 = µs`). +- **InvariantCulture** is enforced everywhere a numeric value is rendered to file (`.log` CSV section, `.LLM` markdown cells). Console-output (the colored tables) uses default culture for human-friendliness. +- **`SourceTag` discriminator**: appears in three places — the filename prefix (`Console.` / `Bdn.`), the `.log` header (`║ Source: Console`), the `.LLM` H1 (`# AcBinary Benchmark [Console] Release ...`). Anyone diffing or grepping outputs can pin the source unambiguously. diff --git a/AyCode.Benchmark/SerializationBenchmarks.cs b/AyCode.Benchmark/SerializationBenchmarks.cs deleted file mode 100644 index 6c9921e..0000000 --- a/AyCode.Benchmark/SerializationBenchmarks.cs +++ /dev/null @@ -1,713 +0,0 @@ -using AyCode.Core.Extensions; -using AyCode.Core.Tests.TestModels; -using BenchmarkDotNet.Attributes; -using BenchmarkDotNet.Jobs; -using MessagePack; -using MessagePack.Resolvers; -using System.Text; -using System.Text.Json; -using System.Text.Json.Serialization; -using JsonSerializer = System.Text.Json.JsonSerializer; -using MongoDB.Bson; -using MongoDB.Bson.IO; -using MongoDB.Bson.Serialization; -using System.IO; -using AyCode.Core.Serializers.Binaries; -using AyCode.Core.Serializers.Jsons; -using AyCode.Core.Serializers; - -namespace AyCode.Core.Benchmarks; - -/// -/// Minimal benchmark to test if BenchmarkDotNet works without stack overflow. -/// -[ShortRunJob] -[MemoryDiagnoser] -public class MinimalBenchmark -{ - private byte[] _data = null!; - private string _json = null!; - - [GlobalSetup] - public void Setup() - { - // Use very simple data - no circular references - var simpleData = new { Id = 1, Name = "Test", Value = 42.5 }; - _json = System.Text.Json.JsonSerializer.Serialize(simpleData); - _data = Encoding.UTF8.GetBytes(_json); - Console.WriteLine($"Setup complete. 
Data size: {_data.Length} bytes"); - } - - [Benchmark] - public int GetLength() => _data.Length; - - [Benchmark] - public string GetJson() => _json; -} - -/// -/// Binary vs JSON benchmark with simple flat objects (no circular references). -/// -[ShortRunJob] -[MemoryDiagnoser] -public class SimpleBinaryBenchmark -{ - private PrimitiveTestClass _testData = null!; - private byte[] _binaryData = null!; - private string _jsonData = null!; - - [GlobalSetup] - public void Setup() - { - _testData = TestDataFactory.CreatePrimitiveTestData(); - _binaryData = AcBinarySerializer.Serialize(_testData, AcBinarySerializerOptions.WithoutReferenceHandling); - _jsonData = AcJsonSerializer.Serialize(_testData, AcJsonSerializerOptions.WithoutReferenceHandling); - - Console.WriteLine($"Binary: {_binaryData.Length} bytes, JSON: {_jsonData.Length} chars"); - } - - [Benchmark(Description = "Binary Serialize")] - public byte[] SerializeBinary() => AcBinarySerializer.Serialize(_testData, AcBinarySerializerOptions.WithoutReferenceHandling); - - [Benchmark(Description = "JSON Serialize", Baseline = true)] - public string SerializeJson() => AcJsonSerializer.Serialize(_testData, AcJsonSerializerOptions.WithoutReferenceHandling); - - [Benchmark(Description = "Binary Deserialize")] - public PrimitiveTestClass? DeserializeBinary() => AcBinaryDeserializer.Deserialize(_binaryData); - - [Benchmark(Description = "JSON Deserialize")] - public PrimitiveTestClass? DeserializeJson() => AcJsonDeserializer.Deserialize(_jsonData, AcJsonSerializerOptions.WithoutReferenceHandling); -} - -/// -/// Complex hierarchy benchmark - AcBinary vs JSON only (no MessagePack to isolate the issue). -/// Uses AcBinary without reference handling. 
-/// -[ShortRunJob] -[MemoryDiagnoser] -[RankColumn] -public class ComplexBinaryBenchmark -{ - private TestOrder_All_True _testOrder = null!; - private byte[] _acBinaryData = null!; - private string _jsonData = null!; - - private AcBinarySerializerOptions _binaryOptions = null!; - private AcJsonSerializerOptions _jsonOptions = null!; - - [GlobalSetup] - public void Setup() - { - Console.WriteLine("Creating test data..."); - _testOrder = TestDataFactory.CreateBenchmarkOrder( - itemCount: 2, - palletsPerItem: 2, - measurementsPerPallet: 2, - pointsPerMeasurement: 3); - Console.WriteLine($"Created order with {_testOrder.Items.Count} items"); - - _binaryOptions = AcBinarySerializerOptions.WithoutReferenceHandling; - _jsonOptions = AcJsonSerializerOptions.WithoutReferenceHandling; - - Console.WriteLine("Serializing AcBinary..."); - _acBinaryData = AcBinarySerializer.Serialize(_testOrder, _binaryOptions); - Console.WriteLine($"AcBinary size: {_acBinaryData.Length} bytes"); - - Console.WriteLine("Serializing JSON..."); - _jsonData = AcJsonSerializer.Serialize(_testOrder, _jsonOptions); - Console.WriteLine($"JSON size: {_jsonData.Length} chars"); - - var jsonBytes = Encoding.UTF8.GetByteCount(_jsonData); - Console.WriteLine($"\n=== SIZE COMPARISON ==="); - Console.WriteLine($"AcBinary: {_acBinaryData.Length,8:N0} bytes ({100.0 * _acBinaryData.Length / jsonBytes:F1}%)"); - Console.WriteLine($"JSON: {jsonBytes,8:N0} bytes (100.0%)"); - } - - [Benchmark(Description = "AcBinary Serialize")] - public byte[] Serialize_AcBinary() => AcBinarySerializer.Serialize(_testOrder, _binaryOptions); - - [Benchmark(Description = "JSON Serialize", Baseline = true)] - public string Serialize_Json() => AcJsonSerializer.Serialize(_testOrder, _jsonOptions); - - [Benchmark(Description = "AcBinary Deserialize")] - public TestOrder_All_True? Deserialize_AcBinary() => AcBinaryDeserializer.Deserialize(_acBinaryData); - - [Benchmark(Description = "JSON Deserialize")] - public TestOrder_All_True? 
Deserialize_Json() => AcJsonDeserializer.Deserialize(_jsonData, _jsonOptions); -} - -/// -/// Full comparison with MessagePack and BSON - AcBinary uses NO reference handling everywhere. -/// -[ShortRunJob] -[MemoryDiagnoser] -[RankColumn] -public class MessagePackComparisonBenchmark -{ - private TestOrder_All_True _testOrder = null!; - private byte[] _acBinaryData = null!; - private byte[] _msgPackData = null!; - private byte[] _bsonData = null!; - private string _jsonData = null!; - - private AcBinarySerializerOptions _binaryOptions = null!; - private MessagePackSerializerOptions _msgPackOptions = null!; - private AcJsonSerializerOptions _jsonOptions = null!; - - [GlobalSetup] - public void Setup() - { - Console.WriteLine("Creating test data..."); - _testOrder = TestDataFactory.CreateBenchmarkOrder( - itemCount: 2, - palletsPerItem: 2, - measurementsPerPallet: 2, - pointsPerMeasurement: 3); - - _binaryOptions = AcBinarySerializerOptions.WithoutReferenceHandling; - _msgPackOptions = ContractlessStandardResolver.Options.WithCompression(MessagePackCompression.None); - _jsonOptions = AcJsonSerializerOptions.WithoutReferenceHandling; - - _acBinaryData = AcBinarySerializer.Serialize(_testOrder, _binaryOptions); - _jsonData = AcJsonSerializer.Serialize(_testOrder, _jsonOptions); - - // MessagePack serialization in try-catch to see if it fails - try - { - Console.WriteLine("Serializing MessagePack..."); - _msgPackData = MessagePackSerializer.Serialize(_testOrder, _msgPackOptions); - Console.WriteLine($"MessagePack size: {_msgPackData.Length} bytes"); - } - catch (Exception ex) - { - Console.WriteLine($"MessagePack serialization failed: {ex.Message}"); - _msgPackData = Array.Empty(); - } - - // BSON serialization - try - { - Console.WriteLine("Serializing BSON..."); - var bsonDoc = _testOrder.ToBsonDocument(); - _bsonData = bsonDoc.ToBson(); - Console.WriteLine($"BSON size: {_bsonData.Length} bytes"); - } - catch (Exception ex) - { - Console.WriteLine($"BSON serialization 
failed: {ex.Message}"); - _bsonData = Array.Empty(); - } - - var jsonBytes = Encoding.UTF8.GetByteCount(_jsonData); - Console.WriteLine($"\n=== SIZE COMPARISON ==="); - Console.WriteLine($"AcBinary: {_acBinaryData.Length,8:N0} bytes ({100.0 * _acBinaryData.Length / jsonBytes:F1}%)"); - Console.WriteLine($"MessagePack: {_msgPackData.Length,8:N0} bytes ({100.0 * _msgPackData.Length / jsonBytes:F1}%)"); - Console.WriteLine($"BSON: {_bsonData.Length,8:N0} bytes ({100.0 * _bsonData.Length / jsonBytes:F1}%)"); - Console.WriteLine($"JSON: {jsonBytes,8:N0} bytes (100.0%)"); - } - - [Benchmark(Description = "AcBinary Serialize")] - public byte[] Serialize_AcBinary() => AcBinarySerializer.Serialize(_testOrder, _binaryOptions); - - [Benchmark(Description = "MessagePack Serialize", Baseline = true)] - public byte[] Serialize_MsgPack() => MessagePackSerializer.Serialize(_testOrder, _msgPackOptions); - - [Benchmark(Description = "BSON Serialize")] - public byte[] Serialize_Bson() => _testOrder.ToBsonDocument().ToBson(); - - [Benchmark(Description = "AcBinary Deserialize")] - public TestOrder_All_True? Deserialize_AcBinary() => AcBinaryDeserializer.Deserialize(_acBinaryData); - - [Benchmark(Description = "MessagePack Deserialize")] - public TestOrder_All_True? Deserialize_MsgPack() => MessagePackSerializer.Deserialize(_msgPackData, _msgPackOptions); - - [Benchmark(Description = "BSON Deserialize")] - public TestOrder_All_True? Deserialize_Bson() - { - if (_bsonData == null || _bsonData.Length == 0) return null; - using var ms = new MemoryStream(_bsonData); - using var reader = new BsonBinaryReader(ms); - return BsonSerializer.Deserialize(reader); - } -} - -/// -/// Comprehensive AcBinary vs MessagePack comparison benchmark. 
-/// Tests: NoRef (everywhere), Populate, Serialize, Deserialize, Size -/// -[ShortRunJob] -[MemoryDiagnoser] -[RankColumn] -public class AcBinaryVsMessagePackFullBenchmark -{ - // Test data - private TestOrder_All_True _testOrder = null!; - private TestOrder_All_True _populateTarget = null!; - - // Serialized data - AcBinary - private byte[] _acBinaryWithRef = null!; - private byte[] _acBinaryNoRef = null!; - - // Serialized data - MessagePack - private byte[] _msgPackData = null!; - private byte[] _bsonData = null!; - - // Options - private AcBinarySerializerOptions _withRefOptions = null!; - private AcBinarySerializerOptions _noRefOptions = null!; - private MessagePackSerializerOptions _msgPackOptions = null!; - - [GlobalSetup] - public void Setup() - { - // Create test data with shared references - TestDataFactory.ResetIdCounter(); - var sharedTag = TestDataFactory.CreateTag("SharedTag_All_True"); - var sharedUser = TestDataFactory.CreateUser("shareduser"); - var sharedMeta = TestDataFactory.CreateMetadata("shared", withChild: true); - - _testOrder = TestDataFactory.CreateOrder( - itemCount: 3, - palletsPerItem: 3, - measurementsPerPallet: 3, - pointsPerMeasurement: 4, - sharedTag: sharedTag, - sharedUser: sharedUser, - sharedMetadata: sharedMeta); - - // Setup options - WithRef uses Default (which has reference handling), NoRef explicitly disables it - _withRefOptions = AcBinarySerializerOptions.Default; - _noRefOptions = AcBinarySerializerOptions.WithoutReferenceHandling; - _msgPackOptions = ContractlessStandardResolver.Options.WithCompression(MessagePackCompression.None); - - // Serialize with different options - _acBinaryWithRef = AcBinarySerializer.Serialize(_testOrder, _withRefOptions); - _acBinaryNoRef = AcBinarySerializer.Serialize(_testOrder, _noRefOptions); - _msgPackData = MessagePackSerializer.Serialize(_testOrder, _msgPackOptions); - - // BSON - try - { - _bsonData = _testOrder.ToBsonDocument().ToBson(); - } - catch - { - _bsonData = Array.Empty(); 
- } - - // Create populate target - _populateTarget = new TestOrder_All_True { Id = _testOrder.Id }; - foreach (var item in _testOrder.Items) - { - _populateTarget.Items.Add(new TestOrderItem_All_True { Id = item.Id }); - } - - // Print size comparison - PrintSizeComparison(); - } - - private void PrintSizeComparison() - { - Console.WriteLine("\n" + new string('=', 60)); - Console.WriteLine("?? SIZE COMPARISON (AcBinary vs MessagePack vs BSON)"); - Console.WriteLine(new string('=', 60)); - Console.WriteLine($" AcBinary WithRef: {_acBinaryWithRef.Length,8:N0} bytes"); - Console.WriteLine($" AcBinary NoRef: {_acBinaryNoRef.Length,8:N0} bytes"); - Console.WriteLine($" MessagePack: {_msgPackData.Length,8:N0} bytes"); - Console.WriteLine($" BSON: {_bsonData.Length,8:N0} bytes"); - Console.WriteLine(new string('-', 60)); - Console.WriteLine($" AcBinary/MsgPack: {100.0 * _acBinaryWithRef.Length / Math.Max(1, _msgPackData.Length):F1}% (WithRef)"); - Console.WriteLine($" AcBinary/MsgPack: {100.0 * _acBinaryNoRef.Length / Math.Max(1, _msgPackData.Length):F1}% (NoRef)"); - Console.WriteLine(new string('=', 60) + "\n"); - } - - #region Serialize Benchmarks - - [Benchmark(Description = "AcBinary Serialize WithRef")] - public byte[] Serialize_AcBinary_WithRef() => AcBinarySerializer.Serialize(_testOrder, _withRefOptions); - - [Benchmark(Description = "AcBinary Serialize NoRef")] - public byte[] Serialize_AcBinary_NoRef() => AcBinarySerializer.Serialize(_testOrder, _noRefOptions); - - [Benchmark(Description = "MessagePack Serialize", Baseline = true)] - public byte[] Serialize_MsgPack() => MessagePackSerializer.Serialize(_testOrder, _msgPackOptions); - - [Benchmark(Description = "BSON Serialize")] - public byte[] Serialize_Bson() => _testOrder.ToBsonDocument().ToBson(); - - #endregion - - #region Deserialize Benchmarks - - [Benchmark(Description = "AcBinary Deserialize WithRef")] - public TestOrder_All_True? 
Deserialize_AcBinary_WithRef() => AcBinaryDeserializer.Deserialize(_acBinaryWithRef); - - [Benchmark(Description = "AcBinary Deserialize NoRef")] - public TestOrder_All_True? Deserialize_AcBinary_NoRef() => AcBinaryDeserializer.Deserialize(_acBinaryNoRef); - - [Benchmark(Description = "MessagePack Deserialize")] - public TestOrder_All_True? Deserialize_MsgPack() => MessagePackSerializer.Deserialize(_msgPackData, _msgPackOptions); - - [Benchmark(Description = "BSON Deserialize")] - public TestOrder_All_True? Deserialize_Bson() - { - if (_bsonData == null || _bsonData.Length == 0) return null; - using var ms = new MemoryStream(_bsonData); - using var reader = new BsonBinaryReader(ms); - return BsonSerializer.Deserialize(reader); - } - - #endregion - - #region Populate Benchmarks - - [Benchmark(Description = "AcBinary Populate WithRef")] - public void Populate_AcBinary_WithRef() - { - var target = CreatePopulateTarget(); - AcBinaryDeserializer.Populate(_acBinaryWithRef, target); - } - - [Benchmark(Description = "AcBinary Populate NoRef")] - public void Populate_AcBinary_NoRef() - { - // Create fresh target each time to avoid state accumulation - var target = CreatePopulateTarget(); - AcBinaryDeserializer.Populate(_acBinaryNoRef, target); - } - - [Benchmark(Description = "AcBinary PopulateMerge WithRef")] - public void PopulateMerge_AcBinary_WithRef() - { - var target = CreatePopulateTarget(); - AcBinaryDeserializer.PopulateMerge(_acBinaryWithRef, target); - } - - [Benchmark(Description = "AcBinary PopulateMerge NoRef")] - public void PopulateMerge_AcBinary_NoRef() - { - // Create fresh target each time to avoid state accumulation - var target = CreatePopulateTarget(); - AcBinaryDeserializer.PopulateMerge(_acBinaryNoRef, target); - } - - private TestOrder_All_True CreatePopulateTarget() - { - var target = new TestOrder_All_True { Id = _testOrder.Id }; - foreach (var item in _testOrder.Items) - { - target.Items.Add(new TestOrderItem_All_True { Id = item.Id }); - } - 
return target; - } - - #endregion -} - -/// -/// Detailed size comparison - not a performance benchmark, just size output. -/// Now includes BSON size output and uses AcBinary without reference handling. -/// -[ShortRunJob] -[MemoryDiagnoser] -public class SizeComparisonBenchmark -{ - private TestOrder_All_True _smallOrder = null!; - private TestOrder_All_True _mediumOrder = null!; - private TestOrder_All_True _largeOrder = null!; - - private MessagePackSerializerOptions _msgPackOptions = null!; - private AcBinarySerializerOptions _withRefOptions = null!; - private AcBinarySerializerOptions _noRefOptions = null!; - - [GlobalSetup] - public void Setup() - { - _msgPackOptions = ContractlessStandardResolver.Options.WithCompression(MessagePackCompression.None); - _withRefOptions = AcBinarySerializerOptions.WithoutReferenceHandling; - _noRefOptions = AcBinarySerializerOptions.WithoutReferenceHandling; - - // Small order - TestDataFactory.ResetIdCounter(); - _smallOrder = TestDataFactory.CreateOrder(itemCount: 1, palletsPerItem: 1, measurementsPerPallet: 1, pointsPerMeasurement: 2); - - // Medium order - TestDataFactory.ResetIdCounter(); - var sharedTag = TestDataFactory.CreateTag("Shared"); - var sharedUser = TestDataFactory.CreateUser("shared"); - _mediumOrder = TestDataFactory.CreateOrder( - itemCount: 3, palletsPerItem: 2, measurementsPerPallet: 2, pointsPerMeasurement: 3, - sharedTag: sharedTag, sharedUser: sharedUser); - - // Large order - TestDataFactory.ResetIdCounter(); - sharedTag = TestDataFactory.CreateTag("SharedLarge"); - sharedUser = TestDataFactory.CreateUser("sharedlarge"); - var sharedMeta = TestDataFactory.CreateMetadata("meta", withChild: true); - _largeOrder = TestDataFactory.CreateOrder( - itemCount: 5, palletsPerItem: 4, measurementsPerPallet: 3, pointsPerMeasurement: 5, - sharedTag: sharedTag, sharedUser: sharedUser, sharedMetadata: sharedMeta); - - PrintDetailedSizeComparison(); - } - - private void PrintDetailedSizeComparison() - { - 
Console.WriteLine("\n" + new string('=', 80)); - Console.WriteLine("?? DETAILED SIZE COMPARISON: AcBinary vs MessagePack vs BSON"); - Console.WriteLine(new string('=', 80)); - - PrintOrderSize("Small Order (1x1x1x2)", _smallOrder); - PrintOrderSize("Medium Order (3x2x2x3) + SharedRefs", _mediumOrder); - PrintOrderSize("Large Order (5x4x3x5) + SharedRefs", _largeOrder); - - Console.WriteLine(new string('=', 80) + "\n"); - } - - private void PrintOrderSize(string name, TestOrder_All_True order) - { - var acWithRef = AcBinarySerializer.Serialize(order, _withRefOptions); - var acNoRef = AcBinarySerializer.Serialize(order, _noRefOptions); - var msgPack = MessagePackSerializer.Serialize(order, _msgPackOptions); - byte[] bson; - try { bson = order.ToBsonDocument().ToBson(); } catch { bson = Array.Empty(); } - - Console.WriteLine($"\n {name}:"); - Console.WriteLine($" AcBinary WithRef: {acWithRef.Length,8:N0} bytes ({100.0 * acWithRef.Length / Math.Max(1, msgPack.Length),5:F1}% of MsgPack)"); - Console.WriteLine($" AcBinary NoRef: {acNoRef.Length,8:N0} bytes ({100.0 * acNoRef.Length / Math.Max(1, msgPack.Length),5:F1}% of MsgPack)"); - Console.WriteLine($" MessagePack: {msgPack.Length,8:N0} bytes (100.0%)"); - Console.WriteLine($" BSON: {bson.Length,8:N0} bytes (compared to MsgPack)"); - - var withRefSaving = msgPack.Length - acWithRef.Length; - var noRefSaving = msgPack.Length - acNoRef.Length; - if (withRefSaving > 0) - Console.WriteLine($" ?? AcBinary WithRef saves: {withRefSaving:N0} bytes ({100.0 * withRefSaving / msgPack.Length:F1}%)"); - else - Console.WriteLine($" ?? 
AcBinary WithRef larger by: {-withRefSaving:N0} bytes"); - } - - [Benchmark(Description = "Placeholder")] - public int Placeholder() => 1; // Just to make BenchmarkDotNet happy -} - -public enum BinaryBenchmarkMode -{ - Default, - NoReferenceHandling, - FastMode -} - -public abstract class AcBinaryOptionsBenchmarkBase -{ - protected TestOrder_All_True TestOrder = null!; - protected AcBinarySerializerOptions BinaryOptions = null!; - protected MessagePackSerializerOptions MsgPackOptions = null!; - protected byte[] AcBinaryData = null!; - protected byte[] MsgPackData = null!; - - [Params(BinaryBenchmarkMode.Default, BinaryBenchmarkMode.NoReferenceHandling, BinaryBenchmarkMode.FastMode)] - public BinaryBenchmarkMode Mode { get; set; } - - [GlobalSetup] - public void GlobalSetup() - { - TestDataFactory.ResetIdCounter(); - TestOrder = TestDataFactory.CreateBenchmarkOrder( - itemCount: 4, - palletsPerItem: 3, - measurementsPerPallet: 3, - pointsPerMeasurement: 6); - - BinaryOptions = CreateBinaryOptions(Mode); - MsgPackOptions = ContractlessStandardResolver.Options.WithCompression(MessagePackCompression.None); - - AcBinaryData = AcBinarySerializer.Serialize(TestOrder, BinaryOptions); - MsgPackData = MessagePackSerializer.Serialize(TestOrder, MsgPackOptions); - - var ratio = MsgPackData.Length == 0 ? 
0 : 100.0 * AcBinaryData.Length / MsgPackData.Length; - Console.WriteLine($"[BenchmarkSetup] Mode={Mode} | AcBinary={AcBinaryData.Length} bytes | MessagePack={MsgPackData.Length} bytes | Ratio={ratio:F1}%"); - } - - private static AcBinarySerializerOptions CreateBinaryOptions(BinaryBenchmarkMode mode) => mode switch - { - BinaryBenchmarkMode.Default => new AcBinarySerializerOptions(), - BinaryBenchmarkMode.NoReferenceHandling => AcBinarySerializerOptions.WithoutReferenceHandling, - BinaryBenchmarkMode.FastMode => new AcBinarySerializerOptions - { - UseMetadata = false, - UseStringInterning = StringInterningMode.None, - ReferenceHandling = ReferenceHandlingMode.None, - }, - _ => new AcBinarySerializerOptions() - }; -} - -[ShortRunJob] -[MemoryDiagnoser] -[RankColumn] -public class AcBinaryOptionsSerializeBenchmark : AcBinaryOptionsBenchmarkBase -{ - [Benchmark(Description = "MessagePack Serialize", Baseline = true)] - public byte[] Serialize_MessagePack() => MessagePackSerializer.Serialize(TestOrder, MsgPackOptions); - - [Benchmark(Description = "AcBinary Serialize")] - public byte[] Serialize_AcBinary() => AcBinarySerializer.Serialize(TestOrder, BinaryOptions); -} - -[ShortRunJob] -[MemoryDiagnoser] -[RankColumn] -public class AcBinaryOptionsDeserializeBenchmark : AcBinaryOptionsBenchmarkBase -{ - [Benchmark(Description = "MessagePack Deserialize", Baseline = true)] - public TestOrder_All_True? Deserialize_MessagePack() => MessagePackSerializer.Deserialize(MsgPackData, MsgPackOptions); - - [Benchmark(Description = "AcBinary Deserialize")] - public TestOrder_All_True? Deserialize_AcBinary() => AcBinaryDeserializer.Deserialize(AcBinaryData); -} - -/// -/// Large-scale benchmark simulating production workloads. -/// Tests with ~50,000+ IId objects with deep hierarchy and shared references. -/// This is closer to real-world scenarios with 2200 root items and 4-5MB binary data. 
-/// -[ShortRunJob] -[MemoryDiagnoser] -[RankColumn] -public class LargeScaleBinaryBenchmark -{ - // Test data - smaller scale for benchmark (500 items ? 25K objects) - // Production would be 2200 items ? 100K+ objects - private TestOrder_All_True _testOrder = null!; - private TestOrder_All_True _populateTarget = null!; - - // Serialized data - private byte[] _acBinaryData = null!; - private byte[] _msgPackData = null!; - - // Options - private AcBinarySerializerOptions _binaryOptions = null!; - private MessagePackSerializerOptions _msgPackOptions = null!; - - private int _objectCount; - - [GlobalSetup] - public void Setup() - { - Console.WriteLine("Creating large-scale test data..."); - - // Use 500 root items for benchmark (?25K objects) - // Production would use 2200 (?100K+ objects) - const int rootItems = 500; - const int pallets = 3; - const int measurements = 3; - const int points = 4; - - _objectCount = TestDataFactory.CalculateObjectCount(rootItems, pallets, measurements, points); - Console.WriteLine($"Creating ~{_objectCount:N0} IId objects..."); - - _testOrder = TestDataFactory.CreateLargeScaleBenchmarkOrder(rootItems, pallets, measurements, points); - Console.WriteLine($"Created order with {_testOrder.Items.Count} root items"); - - _binaryOptions = AcBinarySerializerOptions.WithoutReferenceHandling; - _msgPackOptions = ContractlessStandardResolver.Options.WithCompression(MessagePackCompression.None); - - Console.WriteLine("Serializing AcBinary..."); - _acBinaryData = AcBinarySerializer.Serialize(_testOrder, _binaryOptions); - - Console.WriteLine("Serializing MessagePack..."); - _msgPackData = MessagePackSerializer.Serialize(_testOrder, _msgPackOptions); - - // Create populate target - _populateTarget = new TestOrder_All_True { Id = _testOrder.Id }; - foreach (var item in _testOrder.Items.Take(10)) // Only first 10 for populate target - { - _populateTarget.Items.Add(new TestOrderItem_All_True { Id = item.Id }); - } - - PrintStats(); - } - - private void 
PrintStats() - { - Console.WriteLine("\n" + new string('=', 70)); - Console.WriteLine("?? LARGE-SCALE BENCHMARK STATS"); - Console.WriteLine(new string('=', 70)); - Console.WriteLine($" Root Items: {_testOrder.Items.Count:N0}"); - Console.WriteLine($" Total Objects: ~{_objectCount:N0} IId objects"); - Console.WriteLine($" AcBinary Size: {_acBinaryData.Length:N0} bytes ({_acBinaryData.Length / 1024.0 / 1024.0:F2} MB)"); - Console.WriteLine($" MsgPack Size: {_msgPackData.Length:N0} bytes ({_msgPackData.Length / 1024.0 / 1024.0:F2} MB)"); - Console.WriteLine($" Size Ratio: {100.0 * _acBinaryData.Length / _msgPackData.Length:F1}% of MsgPack"); - Console.WriteLine(new string('=', 70) + "\n"); - } - - [Benchmark(Description = "LargeScale AcBinary Deserialize")] - public TestOrder_All_True? Deserialize_AcBinary() => AcBinaryDeserializer.Deserialize(_acBinaryData); - - [Benchmark(Description = "LargeScale MsgPack Deserialize", Baseline = true)] - public TestOrder_All_True? Deserialize_MsgPack() => MessagePackSerializer.Deserialize(_msgPackData, _msgPackOptions); - - [Benchmark(Description = "LargeScale AcBinary Serialize")] - public byte[] Serialize_AcBinary() => AcBinarySerializer.Serialize(_testOrder, _binaryOptions); - - [Benchmark(Description = "LargeScale MsgPack Serialize")] - public byte[] Serialize_MsgPack() => MessagePackSerializer.Serialize(_testOrder, _msgPackOptions); -} - -/// -/// AcJson vs System.Text.Json comparison - measures Newtonsoft.Json based AcJson against modern STJ. -/// Uses simple flat object (PrimitiveTestClass) to avoid circular reference issues. 
-/// -[ShortRunJob] -[MemoryDiagnoser] -[RankColumn] -public class AcJsonVsSystemTextJsonBenchmark -{ - private PrimitiveTestClass _testData = null!; - private string _acJsonData = null!; - private string _stjData = null!; - private AcJsonSerializerOptions _acJsonOptions = null!; - private JsonSerializerOptions _stjOptions = null!; - - [GlobalSetup] - public void Setup() - { - Console.WriteLine("Creating test data for AcJson vs System.Text.Json..."); - - // Use simple flat object to avoid circular reference issues - _testData = TestDataFactory.CreatePrimitiveTestData(); - - // Setup options - _acJsonOptions = AcJsonSerializerOptions.WithoutReferenceHandling; - _stjOptions = new JsonSerializerOptions - { - WriteIndented = false, - DefaultIgnoreCondition = JsonIgnoreCondition.Never, - ReferenceHandler = null // No reference handling - }; - - // Pre-serialize - _acJsonData = AcJsonSerializer.Serialize(_testData, _acJsonOptions); - _stjData = JsonSerializer.Serialize(_testData, _stjOptions); - - Console.WriteLine($"AcJson size: {_acJsonData.Length:N0} chars"); - Console.WriteLine($"STJ size: {_stjData.Length:N0} chars"); - Console.WriteLine($"Size ratio: {100.0 * _acJsonData.Length / _stjData.Length:F1}%"); - } - - [Benchmark(Description = "AcJson Serialize", Baseline = true)] - public string Serialize_AcJson() => - AcJsonSerializer.Serialize(_testData, _acJsonOptions); - - [Benchmark(Description = "System.Text.Json Serialize")] - public string Serialize_STJ() => - JsonSerializer.Serialize(_testData, _stjOptions); - - [Benchmark(Description = "AcJson Deserialize")] - public PrimitiveTestClass? Deserialize_AcJson() => - AcJsonDeserializer.Deserialize(_acJsonData, _acJsonOptions); - - [Benchmark(Description = "System.Text.Json Deserialize")] - public PrimitiveTestClass? 
Deserialize_STJ() => - JsonSerializer.Deserialize(_stjData, _stjOptions); -} \ No newline at end of file diff --git a/AyCode.Benchmark/Workloads/Scenarios/README.md b/AyCode.Benchmark/Workloads/Scenarios/README.md new file mode 100644 index 0000000..04909b5 --- /dev/null +++ b/AyCode.Benchmark/Workloads/Scenarios/README.md @@ -0,0 +1,43 @@ +# Workloads / Scenarios + +Shared workload + scenario types used by **both** the Console runner (custom adaptive measure engine) and the BDN runner (`AcBinaryVsMemPackBenchmark` in the parent folder). Same wire payloads, same options, same round-trip-verify gate → Console and BDN cells are directly comparable. + +## Layout + +### Contract types + +- [`ISerializerBenchmark.cs`](ISerializerBenchmark.cs) — common contract for every (Engine × IoMode × OptionsPreset) row. `Serialize()` / `Deserialize()` hot-path + warmup hooks + `VerifyRoundTrip()` for the pre-warmup correctness gate. Round-trip-only benchmarks (NamedPipe / in-memory Pipe) set `IsRoundTripOnly = true` and let the bench loop skip the Des-phase. +- [`BenchmarkEnums.cs`](BenchmarkEnums.cs) — `BenchmarkEngine` / `BenchmarkIoMode` / `BenchmarkDispatchMode` / `BenchmarkLayer` / `BenchmarkOpMode` / `SerializerSelectionMode` + `ToDisplay()` extensions for the column-friendly rendering used by every output formatter. +- [`BenchmarkOptions.cs`](BenchmarkOptions.cs) — per-engine options-formatting helpers + the cached `AttrFlags` aggregation (assembly-scan of `[AcBinarySerializable]` feature flags) + `GetMemPack(WireMode)` for the wire-mode-aligned MemoryPack-options selection. +- [`RoundTripValidator.cs`](RoundTripValidator.cs) — universal deep-equality oracle via canonical System.Text.Json. Called by every benchmark's `VerifyRoundTrip()` before warmup. AOT-skipped (STJ reflection path incompatible). + +### Concrete benchmarks (12 implementations) + +**AcBinary** (7 variants — different I/O modes): +- [`AcBinaryBenchmark.cs`](AcBinaryBenchmark.cs) — `Byte[]` API. 
Headline AcBinary row. +- [`AcBinaryBufferWriterBenchmark.cs`](AcBinaryBufferWriterBenchmark.cs) — pre-allocated, reused `ArrayBufferWriter`. +- [`AcBinaryFreshBufferWriterBenchmark.cs`](AcBinaryFreshBufferWriterBenchmark.cs) — fresh `ArrayBufferWriter` per call (one-shot scenario, 4 KB chunk). +- [`AcBinaryNamedPipeBenchmark.cs`](AcBinaryNamedPipeBenchmark.cs) — chunked-framed `AsyncPipe` over kernel NamedPipe (long-lived, multi-message, 2-task pipeline). +- [`AcBinaryNamedPipeRawByteArrayBenchmark.cs`](AcBinaryNamedPipeRawByteArrayBenchmark.cs) — raw `byte[]` over kernel NamedPipe (no chunk-framing, Read+Des sequential after Read completes). +- [`AcBinaryInMemoryPipeBenchmark.cs`](AcBinaryInMemoryPipeBenchmark.cs) — chunked-framed `AsyncPipe` over in-memory `System.IO.Pipelines.Pipe` (zero kernel involvement, isolates streaming-framework CPU cost from kernel-pipe transport overhead). +- [`AcBinaryInMemoryRawByteArrayBenchmark.cs`](AcBinaryInMemoryRawByteArrayBenchmark.cs) — raw `byte[]` over in-memory cross-thread handoff (no transport at all, completes the 2×2 [chunked|raw] × [kernel|memory] matrix). + +**MemoryPack** (3 variants — apples-to-apples with the AcBinary I/O modes): +- [`MemoryPackBenchmark.cs`](MemoryPackBenchmark.cs) — `Byte[]` API. SOTA baseline. +- [`MemoryPackBufferWriterBenchmark.cs`](MemoryPackBufferWriterBenchmark.cs) — reused `ArrayBufferWriter`. +- [`MemoryPackFreshBufferWriterBenchmark.cs`](MemoryPackFreshBufferWriterBenchmark.cs) — fresh `ArrayBufferWriter` per call. + +**Other** (reference comparison, typically disabled in active suite): +- [`MessagePackBenchmark.cs`](MessagePackBenchmark.cs) — JIT-only (AOT-incompatible — v3 StandardResolver falls back to `Activator.CreateInstance` on trimmed closed-generic types). +- [`SystemTextJsonBenchmark.cs`](SystemTextJsonBenchmark.cs) — String I/O mode, reflection-based metadata. Far behind binary serializers on µs/op; useful as a JSON baseline when activated. 
+ +## Convention + +Every concrete benchmark: + +1. Stores the test data graph + serializer options in its ctor and pre-computes a `_serialized` byte array for `SerializedSize` reporting. +2. Implements `Serialize()` / `Deserialize()` as `[MethodImpl(NoInlining)]` hot-paths — the bench loop drives these directly through warmup + adaptive-iter calibration + measurement. +3. Implements `VerifyRoundTrip()` by calling `RoundTripValidator.DeepEqualsViaJson(original, roundTripped)` on the result of a single Ser+Des pass. +4. Round-trip-only variants (NamedPipe / in-memory Pipe) override `IsRoundTripOnly => true`, route the full Ser+wire+Des roundtrip through `Serialize()`, and leave `Deserialize()` as a no-op. + +The runner (Console `BenchmarkLoop` or BDN `AcBinaryVsMemPackBenchmark`) creates the appropriate concrete via factory helpers and drives the contract — no scenario-specific knowledge in the runner. diff --git a/AyCode.Core.Serializers.Console/BenchmarkLoop.cs b/AyCode.Core.Serializers.Console/BenchmarkLoop.cs index 12a331a..0ae7329 100644 --- a/AyCode.Core.Serializers.Console/BenchmarkLoop.cs +++ b/AyCode.Core.Serializers.Console/BenchmarkLoop.cs @@ -413,8 +413,9 @@ internal static class BenchmarkLoop { MakeAcBinary(testData, fastestByteOptions, "FastMode"), //MakeAcBinary(testData, fastWireOptions, "FastMode (FastWire)"), - // MemPack canonically on _All_True (no AcBinary opt-in/opt-out axis applies; the MemoryPackable - // contract serialises identical bytes either way, but _All_True is the established baseline). + // MemPack uses _All_False (the AcBinary opt-in/opt-out axis doesn't apply — MemoryPackable + // serialises identical bytes either way; _All_False matches the orderFalse variant the test + // data factory already built, no extra graph allocation needed). 
new MemoryPackBenchmark(orderFalse, Configuration.SelectedWireMode, "Default"), }; } @@ -552,7 +553,7 @@ internal static class BenchmarkLoop // ============================================================ // MemoryPack — three I/O modes for apples-to-apples comparison // ============================================================ - // MemPack canonically on _All_True (see FastestByte-mode comment above for rationale). + // MemPack uses _All_False (see FastestByte-mode comment above for rationale). new MemoryPackBenchmark(orderFalse, Configuration.SelectedWireMode, "Default"), new MemoryPackBufferWriterBenchmark(orderFalse, Configuration.SelectedWireMode, "Default"), new MemoryPackFreshBufferWriterBenchmark(orderFalse, Configuration.SelectedWireMode, "Default"), diff --git a/AyCode.Core.Serializers.Console/Configuration.cs b/AyCode.Core.Serializers.Console/Configuration.cs index b06f00b..29a3da7 100644 --- a/AyCode.Core.Serializers.Console/Configuration.cs +++ b/AyCode.Core.Serializers.Console/Configuration.cs @@ -15,8 +15,6 @@ namespace AyCode.Core.Serializers.Console; /// internal static class Configuration { - internal const string ResultsDirectory = @"H:\Applications\Aycode\Source\AyCode.Core\Test_Benchmark_Results\Benchmark"; - #if AYCODE_NATIVEAOT internal const string BuildConfiguration = "NativeAOT"; #elif DEBUG diff --git a/AyCode.Core.Serializers.Console/README.md b/AyCode.Core.Serializers.Console/README.md index 4d58e7d..fd4484a 100644 --- a/AyCode.Core.Serializers.Console/README.md +++ b/AyCode.Core.Serializers.Console/README.md @@ -1,30 +1,82 @@ # AyCode.Core.Serializers.Console -Standalone benchmark console application for comparing serializer performance. Targets .NET 9. Measures serialize/deserialize speed, output size, and compression across multiple serializers and data shapes. +Interactive console runner for the serializer benchmark suite. Targets .NET 9. 
-## Compared Serializers +> **Companion**: shares its workload + reporting infrastructure with the BDN runner in [`AyCode.Benchmark/`](../AyCode.Benchmark/README.md) via ``. See that project's README for the full dual-runner architecture. -- **AcBinary** — Multiple configurations: Default, NoRef, FastMode, NoIntern, with/without source generation -- **MessagePack** -- **MemoryPack** +## Role -(System.Text.Json and Newtonsoft.Json comparisons exist but are currently commented out.) +This is the **fast-iteration** half of the benchmark stack — a custom adaptive measure engine optimized for short turnaround (~1-3 min full run) during micro-optimization loops. The BDN half lives in `AyCode.Benchmark` and produces statistically tighter numbers (~5-15 min full run) for before-commit validation. Both runners emit the **same** `.log` / `.LLM` / `.output` triplet to `Test_Benchmark_Results/Benchmark/` — Console prefixes with `Console.`, BDN with `Bdn.`. -## Key Files +## Compared serializers -- **`Program.cs`** — Benchmark runner. Modes: `all` (default), `quick` (fewer iterations), `serialize`, `deserialize`. Outputs results to `Test_Benchmark_Results/Benchmark/`. Iterations: 5000 warmup + 1000 test (Release), 0+1 (Debug). -- **`BenchmarkTestDataProvider.cs`** — Test data factory producing 5 data shapes: - - Small (2x2x2x2), Medium (3x3x3x4), Large (5x5x5x10) - - Repeated Strings (10 items, string deduplication testing) - - Deep Nested (2x4x4x8, depth stress test) - - Uses `TestOrder` model from `AyCode.Core.Tests` with configurable IId reference percentages. +- **AcBinary** — multiple options presets: `FastMode` (Compact wire, no ref handling, no interning), `Default` (with ref handling + interning), plus SGen / Runtime dispatch variants and Compact / Fast wire modes. +- **MemoryPack** — SOTA baseline, wire-mode-aligned with AcBinary for apples-to-apples encoding comparison (UTF-8 ↔ Compact, UTF-16 ↔ Fast). 
+- **MessagePack** — JIT-only (AOT incompatible due to dynamic resolver). +- **System.Text.Json** — reference comparison (commented out in `CreateSerializers` by default). + +## Key files + +- [`Program.cs`](Program.cs) — entry point. Parses CLI args (`Core` / `Comprehensive` / `Edge` / per-cell / op-mode / serializer-set) or falls into interactive `Menu`. +- [`Menu.cs`](Menu.cs) — interactive layer/serializer-set selection + nested settings (iteration counts, wire mode, charset). +- [`BenchmarkLoop.cs`](BenchmarkLoop.cs) — custom adaptive measure engine. CPU 0 affinity pin + High priority for stabilization, JIT pre-warmup, phase-isolated Ser/Des warmup→measure with `GC.Collect` at every boundary, 10-sample median + pilot discard, adaptive iter calibration to ~250ms/cell wall-clock, dedicated allocation-only sample. +- [`Configuration.cs`](Configuration.cs) — Console-side state (`SelectedWireMode`, `WarmupIterations`, `BenchmarkSamples`, `TargetSampleMs`, charset selection, `BuildConfiguration` const from `#if DEBUG/RELEASE/AYCODE_NATIVEAOT`). + +Workload + reporting types — `ISerializerBenchmark`, `BenchmarkResult`, `BenchmarkOptions`, `BenchmarkEnums`, `BenchmarkReportWriter`, `ReportingContext`, the 12 concrete `*Benchmark` classes (`AcBinaryBenchmark`, `MemoryPackBenchmark`, `AcBinaryBufferWriterBenchmark`, ...), `RoundTripValidator` — live in [`AyCode.Benchmark/Workloads/Scenarios/`](../AyCode.Benchmark/Workloads/Scenarios/) and [`AyCode.Benchmark/Reporting/`](../AyCode.Benchmark/Reporting/). + +## Test data + +5 cells, provided by `AyCode.Core.Tests.TestModels.BenchmarkTestDataProvider*`: + +- **Small** (2×2×2×2) +- **Medium** (3×3×3×4) +- **Large** (5×5×5×10) +- **Repeated Strings** (10 items, string-deduplication stress) +- **Deep Nested** (2×4×4×8, depth stress) + +20% IId reference rate by default. 
Two graph variants (`TestOrder_All_False` / `_All_True`) are built per cell — AcBinary's option preset picks which variant gets fed to it (`UsesAllFalseVariant` rule in `BenchmarkLoop`). + +## Charset profiles (Menu → Settings → Charset) + +Controls the `BenchmarkTestDataProvider.LongStringSuffix` — the string-tail appended to property values. Influences string-marker selection on the wire (FixStr vs StringSmall / Medium / Big), interning hit rates, and UTF-8 encode cost. + +| Profile | Content | +|---|---| +| `Latin1FixAscii` | Empty suffix (short FixStr fast-path stress) | +| `Latin1Short` | "árvíztűrő tükörfúrógép" (~24 char Hungarian mixed) | +| `Latin1Long` | ~47-char Latin1 mixed (default) | +| `CjkBmp` | CJK BMP (3-byte UTF-8 runs) | +| `Cyrillic` | Russian Cyrillic (2-byte UTF-8 runs) | +| `Mixed` | Hungarian + CJK + Cyrillic + emoji (full-spectrum + surrogate pairs) | + +## CLI + +``` +dotnet run -c Release --project AyCode.Core.Serializers.Console -- [arg] +``` + +| Arg | Result | +|---|---| +| _(no args)_ | Interactive menu — pick layer (Core / Comprehensive / Edge / Small / Medium / Large / Repeated / Deep / All) × serializer-set (Standard / FastestByte ["F"] / AsyncPipe ["P"]). | +| `Core` / `Comprehensive` / `Edge` / `Small` / `Medium` / `Large` / `Repeated` / `Deep` / `All` | Run that layer at `Standard` serializer-set, `All` op-mode. | +| `FastestByte` / `AsyncPipe` / `Standard` | Run that serializer-set, `All` layer, `All` op-mode. | +| `Serialize` / `Deserialize` / `All` | Run that op-mode, `All` layer, `Standard` serializer-set. | +| `quick` | Single-sample fast mode (Debug-equivalent — very loose numbers, smoke-test only). | + +Output: `Test_Benchmark_Results/Benchmark/Console.FullBenchmark__.{log,LLM,output}`. ## Dependencies | Dependency | Purpose | |---|---| -| `AyCode.Core` | Core library with AcBinary serializer | -| `AyCode.Core.Tests` | Test models (`TestOrder`, `TestDataFactory`, etc.) 
| -| `MemoryPack` | Competitor benchmark | -| `MessagePack` | Competitor benchmark | -| `Newtonsoft.Json` | Competitor benchmark | +| `AyCode.Core` (ProjectReference) | AcBinary serializer | +| `AyCode.Core.Tests` (ProjectReference) | Test data factory + test models | +| `AyCode.Benchmark` (ProjectReference) | Shared workload + reporting (`ISerializerBenchmark`, `BenchmarkResult`, `BenchmarkReportWriter`, `ReportingContext`, the 12 concrete benchmark classes) | +| `MemoryPack` | Comparison target (also via Workloads) | +| `MessagePack` | Comparison target | +| `Newtonsoft.Json` | Comparison target (currently disabled) | + +## Build & publish notes + +- `AyCode.Core.Serializers.Console.Program` in the csproj explicitly disambiguates the entry point — necessary because this Exe references another Exe (`AyCode.Benchmark`), and the build would otherwise complain about multiple `Main` methods. +- AOT publish (`dotnet publish -c Release`) is configured via `'$(_IsPublishing)' == 'true'` PropertyGroup. The Benchmark project's BDN-stack (BenchmarkDotNet, Iced disassembler, MongoDB.Bson) is pulled in transitively — accepted tradeoff for the unified workload sharing. diff --git a/AyCode.Core/docs/BINARY/BINARY_SGEN_OPTIMIZATION.md b/AyCode.Core/docs/BINARY/BINARY_SGEN_OPTIMIZATION.md new file mode 100644 index 0000000..e045e39 --- /dev/null +++ b/AyCode.Core/docs/BINARY/BINARY_SGEN_OPTIMIZATION.md @@ -0,0 +1,214 @@ +# AcBinary SGen — Per-Property Emit Optimization Notes + +Working notes for post-baseline SGen-emit micro-optimization. Brainstorming output (2026-05-15). + +> **Not a TODO entry.** Open TODO ID: `BINARY_TODO.md#accore-bin-t-k9m3` covers wire-codec hoist + feature-conditional emit (Phase A/B/C). This doc is the **per-property emit-condensation companion** — narrower scope, mostly emit-text shape changes (no new runtime API). 
+ +## Status + +- **BDN baseline:** in progress (separate session) +- **Implementation:** blocked on baseline +- **This doc:** brainstorming + methodology snapshot + +## Motivation + +Inspection of `TestOrderItem_All_True_GeneratedReader.g.cs` (247 lines, 9 properties) vs MemPack equivalent (~9 lines of per-property emit) shows the AcBinary SGen-emit body is ~22× larger per property. Most of the gap is **legitimate** (AcBinary supports ref-tracking, interning, polymorph dispatch, FixObj slots — MemPack does not), but a measurable fraction is **mechanical duplication** of the same 30+ line dispatch pattern across siblings of the same kind (Complex, Collection-element, String). + +| Property kind | AcBinary lines/prop | MemPack lines/prop | +|---|---:|---:| +| Complex (Tag, Assignee, ItemMetadata) | ~37 | 1 | +| Collection (Pallets) | ~45 | 1 | +| String (ProductName) | ~30 | 1 | +| Primitive (Id, Quantity, Status, UnitPrice) | 1-3 | 1 (batched) | + +Cumulative reduction if Ötlet A + F.4 + G land: **~247 → ~90 lines** on this representative DTO (~36%). + +## Methodology — measurement order + +| Step | Tool | Purpose | +|---|---|---| +| 1 | `DOTNET_JitDisasm=*GeneratedReader*ReadProperties*` ad-hoc on `Console.FullBenchmark` | Sanity-check: does the JIT already merge the switch to cmp-chain? If yes, Ötlet F.4 nyereség = source-only (no perf win). | +| 2 | BDN baseline (separate session, in progress) | Cell-level wall-clock truth. ~1-3% noise floor. | +| 3 | Implement Ötlet A in isolation | Single-method MemPack-validated pattern, measurable independently on string-rich cell (Repeated Strings). | +| 4 | If A wins → BDN re-run → implement F.4 | Switch→if-else chain branch-predictor swap MUST be BDN-verified before extending. | +| 5 | If F.4 wins → implement G | Mechanical follow-up (same pattern on collection element). 
| +| 6 | Disasm-baseline (dedicated minimal project, **only if** BDN deltas are noise-floor-near 1-3%) | `AyCode.Core.Benchmarks.Disasm` proj, `TestOrderItem_All_True` chain, JIT Tier-1 + AOT. ENV: `DOTNET_TC_QuickJit=0`. Output `.disasm-baseline/{jit|aot}/{before|after}/`. | + +**Anti-pattern to avoid:** asm-driven optimization (looking at asm first, "finding" something to fix). Disasm is **confirm-tool**, not explore-tool. + +## Ideas — ranked + +### Ötlet A — `TryReadStringProperty()` hoist ⭐⭐⭐ + +Single context method replacing the inline 30-line switch at every string-property emit site. Hot+cold split — FixStrAscii + Null/StringEmpty/StringSmall/StringInterned inline, rare markers (Medium, Big, Ascii long, InternFirst*) in a separate `[NoInlining]` cold-path method. + +```csharp +[MethodImpl(MethodImplOptions.AggressiveInlining)] +internal bool TryReadStringProperty(out string? value) +{ + if (FastWire) { value = ReadStringUtf16Markerless(); return true; } + var tc = ReadByte(); + if (tc == BinaryTypeCode.PropertySkip) { value = null; return false; } + if (BinaryTypeCode.IsFixStrAscii(tc)) + { + var len = BinaryTypeCode.DecodeFixStrAsciiLength(tc); + value = len == 0 ? string.Empty : ReadAsciiBytesAsString(len); + return true; + } + switch (tc) + { + case BinaryTypeCode.Null: value = null; return true; + case BinaryTypeCode.StringEmpty: value = string.Empty; return true; + case BinaryTypeCode.StringSmall: value = ReadStringSmall(); return true; + case BinaryTypeCode.StringInterned: value = GetInternedString((int)ReadVarUInt()); return true; + } + value = ReadStringPropertyColdPath(tc); + return true; +} + +[MethodImpl(MethodImplOptions.NoInlining)] +private string? 
ReadStringPropertyColdPath(byte tc) => tc switch +{ + BinaryTypeCode.StringMedium => ReadStringMedium(), + BinaryTypeCode.StringBig => ReadStringBig(), + BinaryTypeCode.StringAscii => ReadPlainStringAscii(), + BinaryTypeCode.StringInternFirstSmall => ReadAndRegisterInternedStringSmall(), + BinaryTypeCode.StringInternFirstMedium => ReadAndRegisterInternedStringMedium(), + _ => null +}; +``` + +**Caller emit:** 2 lines per string property. `if (TryReadStringProperty(out var v)) obj.Name = v;`. + +**Risk:** caller-side `bool` branch (the `PropertySkip` signal). Trivial vs the 30 lines it replaces. + +**Feature-flag interaction:** when `EnableInternStringFeature=false`, the `StringInterned` + cold-path `InternFirst*` cases are dead branches. The JIT cannot eliminate them per-type (the method is shared across all call sites). Option: **two methods** — `TryReadStringProperty` (full) + `TryReadStringPropertyNoIntern` (intern cases omitted), and SGen-emit picks based on the flag. Defer until A is validated. + +**Expected gain:** 5-15% Deser on string-rich DTOs (per MemPack-pattern parallel). Verify with `Repeated Strings` cell. + +### Ötlet F.4 — Complex emit condensed-inline ⭐⭐⭐ + +Pure SGen-emit text change. No new runtime API. 37 lines → 12 per Complex property by collapsing the 5-case switch to an if-else chain with hot path (Object/Null) front-loaded. + +```csharp +// 12 lines, current ~37: +var tc_Assignee = context.ReadByte(); +if (tc_Assignee == BinaryTypeCode.PropertySkip) { /* skip */ } +else if (tc_Assignee == BinaryTypeCode.Null) { /* leave null */ } +else if (tc_Assignee == BinaryTypeCode.ObjectRef) + obj.Assignee = (SharedUser_All_True)context.GetInternedObject((int)context.ReadVarUInt())!; +else +{ + int ci = (tc_Assignee == BinaryTypeCode.ObjectRefFirst) ? 
(int)context.ReadVarUInt() : -1; + if (tc_Assignee < BinaryTypeCode.Object) { context.GetWrapper(typeof(SharedUser_All_True), tc_Assignee); if (tc_Assignee >= context._nextRuntimeSlot) context._nextRuntimeSlot = tc_Assignee + 1; } + var rc = new SharedUser_All_True(); + if (ci >= 0) context.RegisterInternedValueAt(ci, rc); + SharedUser_All_True_GeneratedReader.Instance.ReadProperties(rc, context); + obj.Assignee = rc; +} +``` + +**Branch-predictor risk:** the current switch on non-contiguous markers (0x40, 0x41, 0x46, 0x4C, 0x4D + FixObj-range) likely already lowers to cmp-chain (no jump-table) on x64 — disasm sanity-check (step 1) decides whether F.4 perf-impact is real or source-only. Either way, **direct call preserved** (no virtual dispatch). + +### Ötlet G — Collection-element emit condensed ⭐⭐⭐ + +Same condensed pattern applied to the per-iteration switch inside the collection-element loop (~45 lines → ~15). Mechanical follow-up to F.4. Don't ship before F.4 is validated. + +### Ötlet B — Multi-out unmanaged batch (MemPack `ReadUnmanaged`-style) ⭐⭐ + +MemPack emits `reader.ReadUnmanaged(out __TrayCount, out __Status, out __Weight)` — single `MemoryMarshal.Read`. AcBinary equivalent: limited applicability. + +- Compact mode: markers between properties → batch impossible. +- FastMode markerless: viable, but only for fixed-size primitives (`Int32`, `Double`, `Guid`). VarInt/VarLong/Decimal not fixed-size → batch-incompatible. +- Caller-side `ref obj.X` blocked for property setters (only fields). `out` locals + assign-after pattern = no net win. + +**Low priority.** Defer; revisit only after A/F.4/G land. + +### Ötlet H — FastMode primitive batch ⭐⭐ + +In FastMode the wire is markerless byte-stream; consecutive unmanaged properties can be read as one `Unsafe.Read` via a synthesized struct. SGen detects consecutive unmanaged-property runs, emits batch-read. Noop on Compact mode. + +**Scope:** FastMode-only deployments. Out of first batch. 
+ +### Ötlet F.1 / F.2 / F.3 — Per-type Complex helpers ⭐⭐ + +Three variants for moving the Complex dispatch into a per-type helper method: + +| Variant | Mechanism | Blocker | +|---|---|---| +| F.1 | `static void TryReadTag(ctx, ref Tag target)` | `ref` to property setter not legal C#. Only works on fields. | +| F.2 | `static Tag? ReadTagProperty(ctx)` return-value + sentinel for `PropertySkip` | Sentinel allocation, `ReferenceEquals` overhead. | +| F.3 | Generic context method with struct-handler constraint (`where THandler : struct, IReader`) | Bonyolult template emit; per-type struct-handler extra generated code; pays JIT generic-specialization cost. | + +All preserve direct call (no virtual dispatch). All produce ~5 lines per Complex property at the emit site. + +**Trade-off vs F.4:** F.1-3 yield smaller emit (~5 lines) but require new runtime surface OR new emit machinery. F.4 yields ~12 lines but pure emit-text change. **F.4 first** — lower risk, no API churn. Revisit F.2/F.3 only if F.4 hot-path verified-equivalent and the further line-count win is worth the surface cost. + +### Ötlet C — Cold-path `[NoInlining]` audit ⭐ + +Mark `ReadStringMedium` / `ReadStringBig` / `ReadPlainStringAscii` with `[MethodImpl(NoInlining)]` if they don't already inline (body size). Marginal — verify with disasm only if BDN flags string-cell. + +### Ötlet D — Extended FixStr marker range ⭐ + +`FixStrAscii` covers 1-31 byte ASCII (markers 135-166). Could be extended with `FixStrAsciiMedium` (32-63 byte) or `FixStrSmallNonAscii` (1-19 byte non-ASCII). **Wire-format breaking change** — separate project-level decision, not now. + +### Ötlet E — Property-count object header (MemPack-style) ⭐ + +MemPack `TryReadObjectHeader(out count)` → fix-schema, no per-property `PropertySkip` marker. AcBinary chose property-marker pattern for schema-evolution flexibility. **Design-philosophy conflict** — not a candidate. 
+ +## Overlap analysis — string-marker switch + +| Marker pair | Wire-payload overlap | Mergeable? | +|---|---|---| +| `StringSmall` ↔ `StringInternFirstSmall` | Both `[charLen:8][utf8Len:8][bytes]` body | ❌ InternFirst has cacheIdx **before** payload + cache-register **after** — structurally different wire position. | +| `StringMedium` ↔ `StringInternFirstMedium` | Both `[charLen:16][utf8Len:16][bytes]` body | ❌ Same as above. | +| `StringInterned` ↔ `InternFirst*` | Both VarUInt cacheIdx first | ❌ Different behavior (lookup vs register+decode). | +| `FixStrAscii` ↔ `StringAscii` | Both ASCII payload | ✅ Already share `ReadAsciiBytesAsString(len)`. Properly factored. | + +**Conclusion:** wire-format is definitionally fragmented; no structural merge opportunity beyond the existing `ReadAsciiBytesAsString` helper. The win is at the **emit-site hoist** (Ötlet A), not at the wire-codec level. + +## Minor observation — enum cast + +Current emit: +```csharp +{ var ev = context.ReadVarInt(); obj.Status = Unsafe.As(ref ev); } +``` + +C# native cast `(TestStatus)context.ReadVarInt()` is bit-identical at the JIT level. Cleaner emit, marginal. Can be folded into a future emit-tidy pass. + +## Cumulative impact estimate + +`TestOrderItem_All_True_GeneratedReader.g.cs` (247 lines, 9 props) after A + F.4 + G: + +| Component | Current | Projected | Δ | +|---|---:|---:|---:| +| String (1 prop) | ~30 | 2 | -28 | +| Complex (4 props) | 4 × 37 = 148 | 4 × 12 = 48 | -100 | +| Collection (1 prop) | ~45 | ~15 | -30 | +| Primitives (4 props) | ~6 | ~6 | 0 | +| Boilerplate (header, enum, decimal) | ~18 | ~18 | 0 | +| **Total** | **~247** | **~89** | **-158 (~64%)** | + +Smaller IL → faster cold-start JIT, smaller assembly, smaller i-cache footprint per hot type. Native AOT publish image shrinks proportionally. + +## Implementation order — final + +1. 
**Now (ad hoc):** `DOTNET_JitDisasm=*GeneratedReader*ReadProperties*` on `Console.FullBenchmark` — verify what the JIT actually emits for the current switch. F.4 prioritás függ ettől. +2. **BDN baseline** (separate session, in progress). +3. **Ötlet A** — isolated, measured on Repeated Strings cell. +4. **Ötlet F.4** — only if A wins AND disasm shows switch is cmp-chain (i.e. condensed if-else is wire-equivalent, not regressing branch prediction). +5. **Ötlet G** — only if F.4 wins. +6. **Ötlet H** — separate FastMode-only batch, not first-batch scope. + +## Open questions + +- Should `TryReadStringProperty` have a `NoIntern` twin for `EnableInternStringFeature=false` types? (Defer to post-A validation.) +- F.4 if-else order: hot path is `Object` (no ref-handling, fresh instance) — should be the first branch checked, not `PropertySkip`. Current emit checks `PropertySkip` first (early exit), correct for sparse-property streams (schema evolution), suboptimal for dense streams. Profile both orderings. +- Disasm-baseline project (`AyCode.Core.Benchmarks.Disasm`): structure-only or also a perf companion to BDN? Decide post-baseline. + +## Cross-references + +- `BINARY_TODO.md#accore-bin-t-k9m3` — wire-codec hoist + Phase C feature-conditional emit (sister work). +- `BINARY_SGEN.md` — SGen architecture, hybrid execution, bridge methods, wrapper slots. +- `BINARY_FORMAT.md` — wire-format markers referenced above. +- `Test_Benchmark_Results/Benchmark/Console.FullBenchmark_Release_2026-05-15_19-40-53.LLM` — most recent post-refactor baseline (pre-optimization). 
diff --git a/AyCode.Core/docs/BINARY/BINARY_TODO.md b/AyCode.Core/docs/BINARY/BINARY_TODO.md index 6408c79..bb70916 100644 --- a/AyCode.Core/docs/BINARY/BINARY_TODO.md +++ b/AyCode.Core/docs/BINARY/BINARY_TODO.md @@ -5,6 +5,9 @@ This page covers planned work for the **binary serializer core** (format, SGen, ## Priority legend - **P0** blocker · **P1** important · **P2** nice-to-have · **P3** idea +> **Archived entries**: see `BINARY_TODO_2026_04.md` and `BINARY_TODO_2026_05.md` (year-month bucket archives per LLMP-DEC retention policy). +> Archive files are not auto-loaded — read on demand if relevant context is suspected (regression hint, supersession reference, ID lookup for archived entry). + ## ACCORE-BIN-T-P6M4: Universal hotpath optimization guardrails + follow-up backlog **Priority:** P1 · **Type:** Performance @@ -22,7 +25,7 @@ For each performance TODO, validate on representative workload mixes (ASCII-heav - Add early scan-pass short-circuit when options guarantee no ref/intern benefit. ## ACCORE-BIN-T-K9M3: Hoist wire codec primitives to context instance methods (ser + deser, feature-aware SGen emit) -**Priority:** P2 · **Type:** Refactor + Performance · **Related:** `ACCORE-BIN-T-P6M4` (hotpath guardrails), `BINARY_ISSUES.md#accore-bin-i-t7k3` (polymorph compile-time guard) +**Priority:** P2 · **Type:** Refactor + Performance · **Related:** `ACCORE-BIN-T-P6M4` (hotpath guardrails), `BINARY_ISSUES.md#accore-bin-i-t7k3` (polymorph compile-time guard), `BINARY_SGEN_OPTIMIZATION.md` (per-property emit-condensation companion brainstorming) ### Motivation @@ -101,20 +104,6 @@ Decision per primitive: can it be expressed as a context method that takes only - Phase C: SGen-emit reader honours `Enable*Feature` flags. Verified by spot-checking generated `*.g.cs` files: an `EnableInternStringFeature=false` type's reader does NOT contain `StringInternFirstSmall` / `Medium` / `StringInterned` cases. 
- Per-phase benchmark run (`Console.FullBenchmark`) confirms no hot-path regression (within noise floor). -## ACCORE-BIN-T-S8P4: Replace JSON-in-Binary request parameters -**Priority:** P1 · **Type:** Refactor · **Status:** Closed (2026-04-26, landed in commits `cdd54d3` 2026-04-05 + `3b70070` 2026-04-06) · **Related:** `../XCUT/XCUT_ISSUES.md#accore-xcut-i-x8q1` (canonical), `AyCode.Services/docs/SIGNALR/SIGNALR_TODO.md` - -Migrate client→server request parameters from JSON-in-Binary envelope to direct Binary serialization (matching response path). Coordinated change across client, server, and all consuming projects. Do NOT attempt as side-effect of unrelated work. - -**Acceptance:** `SignalPostJsonDataMessage` replaced by a `SignalPostBinaryDataMessage` (or equivalent); no JSON round-trip on the wire for request params; benchmarks confirm no regression. - -### Resolution -- **What:** Length-prefixed, per-parameter binary format introduced via `SignalRSerializationHelper.SerializeParametersToBinary` / `DeserializeParametersFromBinary`; further unified into `SignalParams` (single `byte[]` carrying packed method parameters with `SetParameterValues` / `GetParameterValues`). -- **Where:** `AyCode.Services/SignalRs/AcSignalRClientBase.cs`, `AcWebSignalRHubBase.cs`, `ISignalParams.cs` (server + client dispatch); `IAcSignalRHubClient.cs` (legacy wrappers). -- **Equivalent (not literal `SignalPostBinaryDataMessage`):** `SignalParams` was chosen over a 1:1 binary wrapper class — fewer indirections on the hot path, type-safe pack/unpack, and `DataSerializerType` field on `SignalReceiveParams` for response format indication. -- **Wire impact:** No JSON round-trip on the wire for request params; this is a **breaking change** vs. previous JSON-in-Binary clients/servers (see commit message). 
-- **Legacy types:** `SignalPostJsonMessage`, `SignalPostJsonDataMessage`, `SignalPostMessage`, `ISignalPostMessage` all marked `[Obsolete]` in `IAcSignalRHubClient.cs`; deletion tracked separately in `AyCode.Services/docs/SIGNALR/SIGNALR_TODO.md#accore-sig-t-s3n8` (gated on consumer migration). - ## ACCORE-BIN-T-Q2N7: Re-evaluate DiscountProductMapping SGen exclusion **Priority:** P3 · **Type:** Investigation · **Related:** `BINARY_ISSUES.md#accore-bin-i-f1w8` @@ -454,44 +443,6 @@ Symmetric to T8K3's analysis: - API doc-string contains a "When to use which mode?" decision matrix; cross-references T8K3's symmetric write-side guidance. - `leaveOpen` parameter behaves per the System.Text.Json / MessagePack convention across all three modes. -## ACCORE-BIN-T-N9G6: Add non-generic `Type`-based `Serialize(object, Type, ...)` overloads -**Priority:** P2 · **Type:** Feature · **Status:** Closed (2026-05-04) · **Related:** `ACCORE-BIN-T-T8K3` - -### Resolution - -Added in `AcBinarySerializer.cs`: -- `Serialize(object?, Type, opts)` → `byte[]` -- `Serialize(object?, Type, IBufferWriter, opts)` → `int` -- `SerializeChunked(object?, Type, PipeWriter, opts)` → `int` -- `SerializeChunkedFramed(object?, Type, PipeWriter, opts)` → `int` - -`AcBinaryDeserializer.cs` already had `Deserialize(byte[], Type, opts)` / `Deserialize(ReadOnlySequence, Type, opts)` / `Deserialize(AsyncPipeReaderInput, Type, opts)` overloads — no new entries needed. - -**Layering note**: `PipeReader → AsyncPipeReaderInput` drain-loop is the consumer's responsibility, not the binary serializer's. The serializer surface ends at `AsyncPipeReaderInput`; transport-specific draining (PipeReader, NamedPipe, SignalR `state.Buffer.Write`, etc.) lives in the consumer layer (e.g. `AcBinaryInputFormatter`, `AcBinaryHubProtocol.TryParseChunkData`). - -Consumed by ASP.NET Core MVC formatter package (`AyCode.Services/Mvc/`) — `AcBinaryInputFormatter`, `AcBinaryOutputFormatter`, `AddAcBinaryFormatters` extension. 
Media type: `application/vnd.acbinary`. Drain-loop inlined in `AcBinaryInputFormatter.ReadRequestBodyAsync`. - -Plugin frameworks, ASP.NET ModelBinding, DI middleware, and DataContractSerializer-style "generic-API container" use-cases need to serialize an `object` whose type is known only at runtime. Current AcBinary surface forces a reflection trampoline through the generic `Serialize`: - -```csharp -// Today's workaround (slow + noisy): -typeof(AcBinarySerializer).GetMethod("Serialize", new[] { type, typeof(AcBinarySerializerOptions) }) - .MakeGenericMethod(type).Invoke(null, new[] { value, options }); -``` - -**Implementation outline:** -- `public static byte[] Serialize(object? value, Type type, AcBinarySerializerOptions? options = null)` -- `public static int Serialize(object? value, Type type, IBufferWriter writer, AcBinarySerializerOptions? options = null)` -- `public static int SerializeChunked(object? value, Type type, PipeWriter writer, AcBinarySerializerOptions? options = null)` and `Pipe` overload -- `public static int SerializeChunkedFramed(object? value, Type type, PipeWriter writer, AcBinarySerializerOptions? options = null)` and `Pipe` overload -- `public static ValueTask SerializeAsync(object? value, Type type, Stream stream, ...)` — coordinated with `ACCORE-BIN-T-T8K3` -- Internal dispatch: `value.GetType()` is the runtime type; the `Type type` parameter constrains the **declared** type for polymorphism handling (`ObjectWithTypeName` write decision). - -**Acceptance:** -- All non-generic overloads round-trip via the generic deserializer's `Deserialize(byte[], Type)` overload. -- Plugin-style scenario: serialize `IList` of mixed-type elements → all elements correctly typed in the wire output. -- API doc-strings call out the performance characteristics (slightly slower than generic due to runtime `Type` lookup but **without** the reflection trampoline cost). 
- ## ACCORE-BIN-T-R4P2: Expose low-level `ref Writer`-style API for custom formatters **Priority:** P3 · **Type:** Feature @@ -934,128 +885,6 @@ while (s < src.Length) - Local `dotnet test` covers correctness; per-tier benchmarks measure the multi-byte speedup - Phase 1+2 (AVX-512BW + Vector128 in `CountUtf8Chars` + `EncodeUtf8SinglePass` Phase 1) **landed 2026-05-05** — covered by existing round-trip tests, no regression on non-AVX-512 hosts (validated on AVX2-host bench) -## ACCORE-BIN-T-H2Q6: Fixed-width dual-length string header (Small/Medium/Big) for 1-pass decode -**Priority:** P1 · **Type:** Wire-format + Performance · **Status:** Closed (2026-05-06) · **Related:** `DecodeUtf8SinglePass`, `CountUtf8Chars`, `WriteStringWithDispatch`, `ReadStringUtf8` - -Current Compact string decode uses two-pass flow for non-ASCII payloads (`CountUtf8Chars` + `DecodeUtf8SinglePass`). -Planned direction: remove VarUInt-based string-length path for the new string wire variant, and carry both lengths in a fixed-width header so deserialize can allocate target `string` immediately and decode in a single pass. - -### Planned format tiers - -- **Small**: packed `uint16` (`charLen:8 | utf8Len:8`) -- **Medium**: packed `uint32` (`charLen:16 | utf8Len:16`) -- **Big**: `uint32 charLen + uint32 utf8Len` - -Writer picks the smallest fitting tier; reader dispatches by marker and reads fixed-width lengths (no VarUInt loop for string length metadata). 
- -### Why - -- Removes `CountUtf8Chars` pass on the new markers (1-pass decode path) -- Keeps decode branch profile stable (fixed-size header reads) -- Maintains range safety with explicit Big overflow path - -### Constraints captured from current benchmark context - -- Performance evaluation target is non-ASCII-heavy data (ASCII-shortcuts intentionally not primary) -- Wire-format backward compatibility is not required for this development phase - -### Marker layout decision (2026-05-06) - -After analysis on the new "all UTF-8 Magyar" benchmark baseline (`2026-05-06_13-10-30.LLM` — Compact +5-25% slower than MemPack on every cell): - -**Confirmed**: the previous benchmark's Compact-vs-MemPack advantage was an artifact of ASCII property names hitting the `FixStrAscii` / Latin1-widen fast path; once string property values are also UTF-8 Magyar, the actual hot path (`EncodeUtf8SinglePass` + two-pass `CountUtf8Chars` + `DecodeUtf8SinglePass`) becomes the bottleneck. - -**Marker scope decision** — clean split between ASCII fast path and non-ASCII tier dispatch: - -**MEGMARAD (changeless)**: -- `FixStrAscii` (≤31 byte ASCII) — kompakt 1-byte header + Latin1 widen, zero UTF-8 decode pipeline -- `StringAscii` (>31 byte ASCII) — long ASCII fast path, Latin1 widen -- `StringInternRef` — 2nd+ occurrence of interned string (no body, just cache index — not affected by 2-pass problem) -- `StringEmpty`, `Null` — sentinel markers - -**MEGSZŰNIK (replaced by H2Q6 tiers)**: -- `FixStr` (32 marker values 103-134 — non-ASCII short) → replaced by `StringSmall` -- `String` (1 marker value 91 — non-ASCII long with VarUInt utf8Len) → replaced by `StringSmall` / `StringMedium` / `StringBig` -- `StringInternFirst` (1 marker value 94 — VarUInt utf8Len interning) → replaced by `StringInternFirstSmall` / `StringInternFirstMedium` - -**ÚJ markers** (5 total): -- `StringSmall` — non-ASCII, `[marker:1][charLen:8][utf8Len:8][bytes]`, utf8Len ≤ 255 -- `StringMedium` — non-ASCII, 
`[marker:1][charLen:16][utf8Len:16][bytes]`, utf8Len ≤ 65535 -- `StringBig` — non-ASCII, `[marker:1][charLen:32][utf8Len:32][bytes]`, utf8Len > 65535 -- `StringInternFirstSmall` — `[marker:1][cacheIdx:VarUInt][charLen:8][utf8Len:8][bytes]` -- `StringInternFirstMedium` — `[marker:1][cacheIdx:VarUInt][charLen:16][utf8Len:16][bytes]` - -**Trade-off justification**: -- Wire cost on short non-ASCII strings: +2 byte/string header (3 vs 1) → ~0.07-0.36% wire growth on Repeated cell (10 short Magyar string × 2 byte / 28 KB) -- CPU saving: `CountUtf8Chars` Pass 1 eliminated on every non-ASCII string decode → directly attacks the +25% Deser baseline gap -- The 2-byte hybrid `FixStr` (non-ASCII) variant (1 byte marker + 1 byte charLen) was considered but **rejected**: marginal wire saving (-1 byte vs StringSmall) does not justify the +1 marker complexity given the tiny absolute wire impact on the Repeated cell. Cleaner to have ASCII-vs-non-ASCII at the marker level (FixStrAscii vs StringSmall/Medium/Big). - -**Interning tier sizing rationale**: -- `MaxStringInternLength` is `byte`-typed (`AcBinarySerializerOptions.cs:125`, default 64, abszolút max 255 char) -- Worst-case: 255 char × 4 byte/char (emoji-only) = 1020 byte → fits in Medium tier (utf8Len ≤ 65535) -- Realistic Magyar/CJK: 64 char × 2-3 byte = 128-192 byte → Small tier -- **Big tier never engages on the interning path** — only Small + Medium needed (+2 markers, not +3) - -### Marker address space reservation (post-H2Q6) - -The marker reorg frees **34 marker values** (32 `FixStr` non-ASCII + `String` + `StringInternFirst`). After allocating 5 for H2Q6, **29 values remain free**. 
Strategic reservation plan to prevent ad-hoc consumption and minimize future wire-format breaks: - -| Reserved range | Count | Future feature | Status | -|---|---|---|---| -| `StringSmall` / `StringMedium` / `StringBig` | 3 | H2Q6 Compact tiers | **active (this entry)** | -| `StringInternFirstSmall` / `StringInternFirstMedium` | 2 | H2Q6 interning tiers | **active (this entry)** | -| `FixArrayBase..FixArrayMax` | 16 | `ACCORE-BIN-T-L9Y3` (FixArray short-list count in marker) | reserved, future | -| Sentinel-length string tier markers | ~5 | `ACCORE-BIN-T-S5L8` (sentinel-length encoding) | reserved, future | -| Markerless schema lane | ~4 | `ACCORE-BIN-T-S2X9` (markerless schema lane opt-in) | reserved, future | -| `StringFastWire` | 1 | `ACCORE-BIN-T-F3W6` (dedicated FastWire string marker) | reserved, future | -| General reserve | 3 | unallocated | tartalék | - -**Wire-format version bump**: v2 → v3 at H2Q6 landing. The reserved-but-unimplemented marker values are documented but not yet decoded — readers throw `unknown marker` if wire contains them. Future activation of `FixArray` / sentinel-length / markerless schema lane within the **same v3 wire format** is non-breaking for already-deployed v3 consumers (they reject unknown markers cleanly; producers opt in to emit them). 
- -### Acceptance - -- New string markers implemented for Small/Medium/Big tiers + InternFirstSmall/InternFirstMedium tiers -- Deserialize path for these markers performs single-pass decode without `CountUtf8Chars` -- 29 freed marker values strategically reserved per the address-space reservation table; documented in `BinaryTypeCode.cs` with `// Reserved for ACCORE-BIN-T-XXXX (future)` comments -- Wire-format version bump v2 → v3 documented in `BINARY_FORMAT.md` -- Existing round-trip tests pass, plus new boundary tests for tier transitions (utf8Len = 254/255/256/65534/65535/65536) and interning tier transitions -- Benchmark report includes before/after for Compact mode on non-ASCII dataset (Ser/Deser/RT + Size) vs the `2026-05-06_13-10-30.LLM` baseline - -### Resolution - -Landed 2026-05-06. End-to-end implementation: marker reorg + writer tier-dispatch + reader tier-readers + SGen template + skip path + interning path. Five new markers (`StringSmall`/`Medium`/`Big`/`InternFirstSmall`/`InternFirstMedium`) replacing the old `String`/`StringInternFirst`/`FixStrBase..Max` (32 + 1 + 1 = 34 marker values freed, 5 used; 29 reserved for future features per the address-space plan). Wire format version bumped v2 → v3. - -Follow-up A-direction header pack-write/read optimization landed in the same window: `Unsafe.WriteUnaligned<ushort>` (Small) / `<uint>` (Medium) / `<ulong>` (Big) replace 2× byte / 2× ushort / 2× uint stores; reader uses single `uint`/`ulong` loads with bit-extract. Direct `ref byte` writes (no Span-shape overhead). - -**Tests:** 222 pass / 13 pre-existing GuidIId failures (unchanged). 55/55 Utf8TranscoderTests pass. 
- -**Benchmark vs `2026-05-06_13-10-30.LLM` baseline (`2026-05-07_08-55-49.LLM`, immediately post-H2Q6):** -- Compact-vs-MemPack Deser ratio improvement on baseline gap: **-14 to -28 percentage points** across cells -- Deser: **4/5 cells now faster than MemPack** (Small -6%, Medium -3%, Large -9%, Deep -7%); Repeated cell remaining +5% gap (V4N2 Phase 3 SIMD multi-byte transcoder targets this) -- Wire size: **5/5 cells smaller than MemPack** (-8% to -11%) -- Ser: 1/5 win (Large -9%), 1/5 tie (Medium 0%), 3/5 minor lag (+2-7% Small/Repeated/Deep) — host-noise band - -**Bench evolution post-H2Q6** (subsequent micro-opts on the same H2Q6 base): -- `2026-05-07_09-39-09.LLM` — A irány header pack-write/read (`Unsafe.WriteUnaligned` ushort/uint/ulong): zaj-szintű mozgás, strukturális javulás -- `2026-05-07_15-13-39.LLM` — V4N4 Step 1+2 method-split (`AggressiveInlining`): **regresszió** (Small Ser +29.6 pp, Repeated Ser +8.9 pp) → `WriteStringSmallFast` túl-aggresszív inline-olás code-bloat / i-cache pressure -- `2026-05-07_15-29-21.LLM` — V4N4 finomított (NoInlining a SmallFast-ra, dispatcher hint nélkül, Reader split visszavonva): **konszolidált state**: - - **Ser**: 5/5 cell paritás-vagy-jobb (Small **-8.5%**, Medium ≈, Large **-8.5%**, Repeated ≈, Deep ≈) - - **Deser**: **4/5 cell faster than MemPack** (Medium -4.7%, Large **-10.6%**, Repeated **-3.8%**, Deep **-10.1%**); Small +10% remaining gap - - **Wire**: 5/5 cell -8% to -11% smaller (unchanged) - - **Net**: Compact mostantól 8/10 cellán nyer Compact vs MemPack; csak Small Deser-en marad +10% gap (kis abszolút érték, ~1 µs) - -**Critical algorithmic correctness lesson** (from V4N3 follow-up `GetUtf8ByteCount`): the initial 4-popcount formula assumed `lowSur == highSur` per chunk. Fix: 5-popcount closed-form. Caught by surrogate-pair-split-across-chunk regression tests. Documented in Utf8Transcoder. 
- -**Marker address space (post-H2Q6, v3 wire):** -- 91 → StringSmall (was String) -- 94 → StringMedium (was StringInternFirst) -- 103 → StringBig -- 104 → StringInternFirstSmall -- 105 → StringInternFirstMedium -- 106..134 reserved (29 values: 16 for `L9Y3` FixArray, 5 for `S5L8` sentinel-length, 4 for `S2X9` markerless schema lane, 1 for `F3W6` FastWire dedicated marker, 3 reserve) - -**Related follow-up TODO entries (now Open):** `O7G2` (overflow guard), `S6F2` (shift-mentes Small fast path), `W2C8` (WASM string-cache H2Q6 maximalizálás). - ## ACCORE-BIN-T-S5L8: Sentinel-length encoding for strings (wire-size optimization, both modes) **Priority:** P3 · **Type:** Wire-format optimization · **Related:** `AcBinarySerializer.WriteString`, `AcBinaryDeserializer.ReadValue` string dispatch @@ -1117,104 +946,6 @@ Compact gain: **only on long strings** (>31 byte UTF-8). Estimated −1 byte per - Cross-version compat: documented format version bump + clean fail on old reader / new wire mismatch - Polymorphic + interned property test cases pass unchanged (use existing marker-based encoding) -## ACCORE-BIN-T-M3R7: ASCII marker-dispatch — writer detect + reader dedicated path -**Priority:** P2 · **Type:** Performance + wire optimization · **Related:** `BinaryTypeCode.FixStrAsciiBase..StringAscii` markers, `WriteStringWithDispatch`, `ReadAsciiBytesAsString` -**Status:** Closed (2026-05-04) - -> **Sorrendi megjegyzés:** ezt **AZ ENCODER OPTIMALIZÁCIÓ UTÁN** csináljuk (lásd `ACCORE-BIN-T-E2F9`). Indok: a custom encoder/decoder Vector256 ASCII narrow/widen path-jai már magukban gyorsan kezelik az ASCII byte-ot. A marker-dispatch ezen FELÜL csak a per-call dispatch-overhead spórolást hozza (no `Ascii.IsValid` scan, no decoder layer). Garantált win, de additív — méréstechnikailag tisztább a decoder/encoder utánra hagyni. 
- -The `FixStrAscii*` (135-166) and `StringAscii` (167) markers are defined in `BinaryTypeCode.cs` with helper methods (`IsAsciiString`, `IsFixStrAscii`, `EncodeFixStrAscii`, `DecodeFixStrAsciiLength`). Encoding/decoding logic NOT yet implemented — currently both writer and reader use the universal `String` / `FixStr` markers. - -### Implementation -- **Writer**: in `WriteStringUtf8` / `WriteFixStrDirect`, after UTF-8 encoding (D-2 path), check `bytesWritten == charLength` (= ASCII iff equal). If ASCII, emit `FixStrAscii` (≤31 byte) or `StringAscii` (>31 byte). Else emit existing `FixStr` / `String`. Free detect — both numbers already computed by D-2. -- **Reader**: in `ReadStringUtf8` (or upstream marker dispatch), branch on marker. ASCII markers → dedicated byte→char widening path (no UTF-8 decode, no `Ascii.IsValid` scan, no decoder dispatch). Non-ASCII markers → existing custom UTF-8 decoder. -- **SGen**: regenerate readers/writers to dispatch on the new markers. -- **Re-enable ASCII fast paths**: uncomment writer FixStr dispatch in `AcBinarySerializer.cs` and reader `Ascii.IsValid` block in `ReadStringUtf8` — these temporarily disabled blocks become the marker-aware paths (no IsValid scan needed since the marker is the contract). - -### Wire format change -- Format version bump (1 → 2). Old readers fail clean on new wire (version mismatch). New readers must reject old wire OR support backward read. - -### Acceptance -- Repeated Strings (Hungarian content) Deser: AcBinary closes the ~10% gap vs MemoryPack -- Pure ASCII tests (Small/Medium/Large/Deep): AcBinary Ser AND Deser ≥ MemoryPack -- Wire size: minimum -25% vs MemoryPack across all test cells -- SGen-generated code compiles and round-trips on all `[AcBinarySerializable]` types -- Decision documented: backward-compat policy for v2 vs v1 wire - -### Resolution -End-to-end implementation landed (writer + reader + SGen + skip + populate). 
Key components: -- **Writer (`AcBinarySerializer.BinarySerializationContext.WriteStringWithDispatch`)** — single-pass UTF-8 encode + ASCII detect via `bytesWritten == charLength`; emits one of 4 markers (FixStrAscii / FixStr / StringAscii / String). Split layout for hot path: `charLength ≤ 31` encodes optimistically at `savedPos+1` (FixStr position) → 0 shift on FixStr hit; `charLength > 31` uses D-2 layout with backfill. The split avoids the post-encode left-shift that the unified layout introduced (regression seen in 12-42-32 bench). -- **Reader (`AcBinaryDeserializer.BinaryDeserializationContext.ReadAsciiBytesAsString`)** — `Encoding.Latin1.GetString` (BCL SIMD-accelerated byte→char widen). Avoids the `string.Create` callback + scalar widen overhead — measurably better on Small Deser cell (closed the +20% MemPack-relative anomaly). -- **TypeReaderTable**: `StringAscii` (167) + 32 × `FixStrAscii` (135-166) readers registered. `IsFixStrAscii` / `StringAscii` fast paths in `PopulatePropertyWithMarker`, `ReadValue`, `SkipValue`. -- **SGen (`AcBinarySourceGenerator.EmitReadString`)** — regenerated readers branch on `IsFixStr` / `IsFixStrAscii` / `case StringAscii` per property. - -**Wire format version not bumped** — the new markers occupy previously-unused codepoints (135-167); old wire (without ASCII markers) is forward-compatible (readers handle both `String` and `StringAscii`). v1 stays. 
- -**Acceptance (AOT bench 13-40-29, MemPack-relative ratios — JIT noise eliminated):** -- ✅ AcBinary Ser AND Deser GYORSABB MemPack-nél MINDEN cellán (5/5) - - Small: Ser -8%, Deser -23% - - Medium: Ser -17%, Deser -30% - - Large: Ser -28%, Deser -32% - - Repeated: Ser -4%, Deser -9% - - Deep: Ser -24%, Deser -22% -- ✅ Wire size advantage: 2043-50419 byte (vs MemPack 3070-64986) = **-22% to -33%** across cells -- ✅ Round-trip tests: 167 pass (13 pre-existing failures are IId-tracking, unrelated to M3R7) - -**JIT vs AOT note**: earlier JIT-mode benchmarks (12-50-43 → 13-27-20 series) showed elevated ratios on Small/Repeated cells (1.0-1.2 range) that disappeared under AOT publish. The JIT-mode numbers reflect tier-up artifacts (inconsistent inlining of SGen-generated reader hot paths during the 1000-iteration measurement window), not a structural M3R7 property. AOT (NativeAOT / ILC) compiles deterministically with fixed inline decisions — the steady-state numbers above reflect the actual production performance. - -## ACCORE-BIN-T-E2F9: Custom UTF-8 encoder (writer-side, symmetric with custom decoder) -**Priority:** P1 · **Type:** Performance · **Related:** decoder optimization (`AcBinaryDeserializer.BinaryDeserializationContext.Read.cs::DecodeUtf8SinglePass`) -**Status:** Closed (2026-05-04) - -> **Sorrendi megjegyzés:** ezt **A MARKER-DISPATCH ELŐTT** csináljuk (lásd `ACCORE-BIN-T-M3R7`). Indok: a custom encoder/decoder optimalizáció a "nehezebb, kevésbé biztos" win — a non-ASCII / mixed content workload-okat (Repeated Strings Hungarian) hozza be. A marker-dispatch utána már csak additív tisztítás a pure ASCII path dispatch-overhead-jén. - -Replace `Encoding.UTF8.GetBytes` calls in `WriteStringUtf8` / `WriteStringUtf8Internal` / `WriteFixStrDirect` (collectively the writer's UTF-8 encode path, post-D-2) with a hand-rolled SIMD encoder. Symmetric to the decoder optimization (V4N2 / Read.cs::DecodeUtf8SinglePass). 
- -### Layered structure (mirrors decoder) -- **Phase 1 — Vector256 ASCII narrow**: 16 chars (Vector256) → 16 bytes (Vector128) via `Vector256.Narrow`. ASCII detect via `(v & 0xFF80).ExtractMostSignificantBits() == 0` (any high bit on UTF-16 char). Break on first non-ASCII char. -- **Phase 2 — DWORD ASCII batch**: 4 chars at a time, OR-mask test, 4 bytes per iter when ASCII. -- **Phase 3 — Scalar multi-byte encode**: 1-byte (ASCII) / 2-byte (Latin extended) / 3-byte (BMP) / 4-byte (surrogate pair → supplementary plane) UTF-8 encoding via direct bit-extract. No fallback dispatch — input is trusted UTF-16 (string). -- Use `System.Text.Unicode.Utf8.FromUtf16` as fallback target for scalar correctness — or skip BCL entirely with manual bit-pack. - -### Why -`Encoding.UTF8.GetBytes` carries virtual-dispatch + encoder-fallback overhead even with SIMD ASCII fast path internally. Custom encoder skips this. ~15-30% Ser improvement on ASCII content, ~5-10% on non-ASCII (multi-byte path stays scalar). - -### Trigger -- **NEXT** — implementation order P1 before marker-dispatch (M3R7) -- Re-evaluate if .NET 11 BCL UTF-8 GetBytes becomes faster (PR #120628 follow-up) - -### Acceptance -- Writer-side benchmark: ≥15% Ser speedup on ASCII content (Small/Medium/Large/Deep), ≥5% on non-ASCII (Repeated) -- Wire format unchanged (custom encoder produces same bytes as `Encoding.UTF8`) -- Round-trip tests pass - -### Resolution -Implemented as `EncodeUtf8SinglePass` in `AcBinarySerializer.BinarySerializationContext.cs` — three-phase layered encoder (Vector256 ASCII narrow + DWORD ASCII batch + scalar 1/2/3-byte BMP & 4-byte surrogate-pair). Bypasses `Encoding.UTF8.GetBytes` virtual-dispatch + encoder-fallback overhead. Trusted-input path — no validation pass on writer side (the input is a .NET `string` with valid UTF-16 surrogate pairs by construction). - -Used by `WriteStringUtf8` (D-2 single-pass with VarUInt backfill) and `WriteStringWithDispatch` (M3R7 marker-dispatch path). 
Wire format unchanged — the encoder produces the same bytes as `Encoding.UTF8.GetBytes`. - -Acceptance (per bench 12-50-43 → 13-27-20, MemPack-relative ratios on AcBinary Compact FastMode SGen): -- ✅ ASCII Ser ≥ MemPack on 4/5 cells (Small 0.94, Medium 0.80, Large 0.79, Deep 0.81) -- ⚠️ Repeated Ser ~1.04 (Hungarian, multi-byte path scalar) — see follow-up `ACCORE-BIN-T-H7K3` -- ✅ Round-trip tests pass (167 of 180; 13 pre-existing failures unrelated to encoder) - -## ACCORE-BIN-T-W7N5: Default-value omission policy — doc + optional opt-out -**Priority:** P2 · **Type:** Refactor + Documentation · **Related:** `BINARY_ISSUES.md#accore-bin-i-d9y2` (canonical issue) - -The serializer's `PropertySkip` (102) optimization saves 1 byte per default-valued property by omitting the full value from the wire — relying on the consumer-side type definition to have the same `default(T)`. This is a **latent correctness risk** documented in `ACCORE-BIN-I-D9Y2`. This entry tracks the mitigation plan; full failure-mode analysis lives in the issue. - -### Decision tree (TBD when implementing) - -1. **Doc-only**: position as a deliberate protobuf-style feature; consumer keeps type defaults stable across versions. Lowest cost, maximum benchmark wire-size advantage retained. -2. **Option flag**: `AcBinarySerializerOptions.OmitDefaults` boolean. Default `true` (preserves current behavior + benchmark numbers). `false` writes every property in full — opt-out for fragile-class-evolution scenarios. -3. **Both**: ship doc + flag. Default behavior unchanged; consumers who hit silent-corruption have an explicit opt-out. 
- -### Acceptance (when implementing) - -- `BINARY_FEATURES.md` adds a "Default-Value Omission" section documenting the semantic and the tradeoff (with cross-ref to `ACCORE-BIN-I-D9Y2`) -- If flag added: round-trip tests covering both `true` and `false`; benchmark comparison table showing wire-size delta on ASCII / Hungarian / DTO-heavy workloads -- Decision rationale recorded in `LLM_PROTOCOL_DECISIONS.md` (or a `### Resolution` block on the issue) once implemented - ## ACCORE-BIN-T-H7K3: Hungarian / multi-byte content Ser optimization (Repeated Strings cell) **Priority:** P3 · **Type:** Performance · **Related:** `EncodeUtf8SinglePass` Phase 3 (scalar multi-byte encode), `ACCORE-BIN-T-E2F9` resolution **Status:** Closed (2026-05-04) — Won't Fix (JIT-only artifact) @@ -1298,31 +1029,6 @@ Reader-side: SGen-generated code drops the per-property `ReadByte()` + `IsTinyIn - Schema-evolution fragility documented in `BINARY_FEATURES.md` (alongside the existing `PropertySkip` / default-omission caveat from `ACCORE-BIN-I-D9Y2`) - Opt-in flag with default `false` (preserves marker-driven default; consumers explicitly opt in for frozen-schema scenarios) -## ACCORE-BIN-T-V4N3: Symmetric `GetUtf8ByteCount` API + writer-side BCL kihagyás (cold path) -**Priority:** P3 · **Type:** Performance · **Status:** Superseded (2026-05-08, by `ACCORE-BIN-T-K7M3`) — landed Closed 2026-05-06; subsequent A/B against modern `Utf8.FromUtf16` / `Utf8.ToUtf16` showed the BCL modern API outperforms the custom transcoder on every benchmark cell, leading to full hot-path switch in K7M3 · **Related:** `EncodeUtf8SinglePass`, `WriteStringUtf8Internal`, `PropertyMetadataBase.NameUtf8`, `ACCORE-BIN-T-K7M3` (hot-path BCL switch) - -Symmetric byte-count helper for `EncodeUtf8SinglePass`, paired with writer-side BCL `Encoding.UTF8.GetBytes` / `GetByteCount` removal across all cold-path call sites. 
`Utf8Transcoder.GetUtf8ByteCount(ReadOnlySpan<char>)` SIMD impl (Vector512 / Vector256 / Vector128 / scalar tier hierarchy, 5-popcount closed-form aggregation handling chunk-split surrogate pairs correctly). - -**Implementation summary**: -- `Utf8Transcoder.GetUtf8ByteCount` SIMD impl with closed-form `bytes = 3*N - ascii - c_lt_0x800 + highSur - 3*lowSur` aggregation -- `Utf8TranscoderTests` extended (29 new tests covering ASCII / Hungarian / CJK / emoji / boundary 0-64, plus surrogate-pair-split-across-SIMD-chunks regression coverage) -- `WriteStringUtf8Internal` (`BinarySerializationContext.cs:875`) refactored from BCL two-pass to single-pass D-2 layout (worst-case `length*4` allocate + `EncodeUtf8SinglePass` + VarUInt backfill); the `4×` worst-case capacity is amortized by the buffer growth doubling strategy (`Math.Max(buffer.Length*2, position+needed)` + ArrayPool bucket-rounding to next power-of-2) -- Cold path cleanup: `AcBinarySerializer.AnalyzeStringInternCandidates` (analysis log) and `PropertyMetadataBase.NameUtf8` ctor-once init both migrated to `Utf8Transcoder` - -### Resolution - -Landed 2026-05-06. All `Utf8TranscoderTests` pass (55/55). Binary test suite unchanged (222 pass / 13 pre-existing GuidIId failures, untouched). - -**Critical observation surfaced during the audit**: `WriteStringUtf8Internal` has only one caller (`WriteFixStrDirect`), and `WriteFixStrDirect` itself is **not called anywhere in the codebase** — no core call site, no SourceGenerator template hit (verified against `AcBinarySourceGenerator.cs` lines 706/724/1492/1514 — generator emits `WriteStringGenerated` and `context.WriteStringUtf8` (the public method at line 659, not `WriteStringUtf8Internal`)), no test, no reflection path. The V4N3 implementation therefore landed cleanly but its hot-path benchmark impact is limited to the two cold-path init sites. Dead-code disposition tracked as `ACCORE-BIN-T-V4N5`. 
- -**Algorithmic correctness lesson** — the initial 4-popcount formula (`3*N - c_lt_0x80 - c_lt_0x800 - 2*highSur`) was wrong on chunks where a surrogate pair straddles the SIMD chunk boundary (it implicitly assumed `lowSur == highSur` per chunk, which is true over the whole well-formed string but NOT per chunk). Fix: 5-popcount closed-form (`3*N - ascii - c_lt_0x800 + highSur - 3*lowSur`), with the scalar tail using the same per-char accounting model (`i += 1` per char regardless of role; high → 4, low → 0, BMP → 3, two-byte → 2, ASCII → 1). Caught by `GetUtf8ByteCount_MultipleEmojiBoundary_MatchesBcl` and `GetUtf8ByteCount_BoundaryAsciiToEmoji_MatchesBcl` regression tests — exactly the `prefixLen` 1, 7 boundaries that exercise chunk-split surrogate pairs. - -### Superseded by `ACCORE-BIN-T-K7M3` (2026-05-08) - -The V4N3 audit measured the custom transcoder against the **legacy `Encoding.UTF8.GetBytes`** API and won. **Did NOT measure against the modern `System.Text.Unicode.Utf8.FromUtf16` / `Utf8.ToUtf16` static API** (.NET 7+, used by MemoryPack source-gen). Once `D9X3` stabilized the bench, a direct A/B revealed the BCL modern API outperforms the custom transcoder on **every** cell (Ser deficit -14 to -22pp, Deser flips from behind to ahead). All 8 hot-path call sites switched to BCL in `K7M3`. The `Utf8Transcoder.cs` file is fully commented out — preserved as historical reference. - -The V4N3 algorithmic correctness work (5-popcount surrogate-pair-split-across-chunks closed-form) remains a **valid algorithmic contribution**, but no longer load-bearing on the hot path. 
- ## ACCORE-BIN-T-V4N4: NativeAOT-specific inlining / codegen audit on hot UTF-8 path **Priority:** P2 · **Type:** Performance · **Status:** Reverted (2026-05-07) — bench instability made the optimization signal unmeasurable · **Related:** `EncodeUtf8SinglePass`, `DecodeUtf8SinglePass`, `WriteStringWithDispatch`, `Utf8Transcoder` SIMD path @@ -1433,52 +1139,6 @@ A V4N4 audit **konklúziója** változatlan érvényes (constant-fold OK, reader **Obsoleted (2026-05-08) by `ACCORE-BIN-T-K7M3`** — the writer hot path no longer calls the custom `EncodeUtf8SinglePass` at all (`WriteStringWithDispatch` was switched to `Utf8.FromUtf16` BCL). The "AOT method-split / inlining audit" target (`Utf8Transcoder` body method-size in NativeAOT inline budget) is moot — the BCL `Utf8.FromUtf16` is a single static method with its own AOT-friendly inline footprint, and the audit's hypothesis space (Vector256 `IsSupported` constant-fold, lambda delegate cache) was correct for the prior code but no longer applies. The V4N4 disasm methodology remains a **valid technique** for future investigations of generic specialization / inline failures, but the specific hot-path target it analyzed is gone. -## ACCORE-BIN-T-J5L9: Remove dead `WriteFixStrDirect` / `WriteStringUtf8Internal` (audit-surfaced uncalled methods) -**Priority:** P3 · **Type:** Refactor / hygiene · **Status:** Closed (2026-05-06) · **Related:** `BinarySerializationContext.cs` - -V4N3 audit surfaced two methods with no callers in the entire workspace: - -- `WriteFixStrDirect(string)` — public method, no call site (no core, no SourceGenerator template, no test, no reflection / Expression-compile) -- `WriteStringUtf8Internal(string)` — private method called only from `WriteFixStrDirect`'s non-ASCII fallback branch - -The pair forms a closed dead loop (`WriteFixStrDirect` → `WriteStringUtf8Internal`), but no entry point reaches `WriteFixStrDirect`. 
The public-API `WriteStringUtf8` (line 659) is the live equivalent and is called from the SourceGenerator template (polymorphism path: assembly-qualified type-name write). The hot-path string-write goes through `WriteStringWithDispatch` (line 734) which uses the M3R7 marker-dispatch — NOT through this dead pair. - -### Disposition options (decide pre-NuGet release) - -1. **Delete both methods** — pure dead-code cleanup; reduces public surface, removes maintenance burden, simplifies onboarding. Functionality is fully covered by `WriteStringWithDispatch` (M3R7 marker-dispatch — emits `FixStr` / `FixStrAscii` directly with proper ASCII detection via `bytesWritten == charLength` after `EncodeUtf8SinglePass`). -2. **Activate `WriteFixStrDirect` for property-name writes** — SGen could emit `WriteFixStrDirect(propName)` instead of `WriteStringWithDispatch(propName)` for known-short, often-ASCII property names — saving the marker-dispatch overhead. Requires SGen template change + benchmark validation that the saving is real (likely marginal — property names are typically <31 char ASCII, so M3R7 already takes the FixStrAscii fast path with one byte-write to `_buffer`). The pre-encoded `NameUtf8` byte[] on `PropertyMetadataBase` already provides a faster path (`WriteFixStrBytes` at line 853) which the SGen / runtime writer could use directly. -3. **Defer** — leave as-is, document as dead code, revisit when the codebase has another reason to touch this area. 
- -### Why P3 - -- No correctness or perf impact in either direction (dead code is dead — no consumer affected) -- Cleanup vs activation is a low-stakes choice; benchmark would decide if option 2 has real saving -- Surfaced during V4N3 work, not blocking the NuGet release - -### Acceptance - -- Decision recorded (delete / activate / defer) with rationale -- If "delete": grep across workspace confirms zero callers post-removal; binary test suite unchanged (still 235 pass / 13 pre-existing failures) -- If "activate": SGen template change + benchmark validation showing ≥ 2% Ser improvement on a representative cell (otherwise revert to "delete") -- Documentation in `BINARY_IMPLEMENTATION.md` updated (or remove the old reference if both methods deleted) - -### Trigger - -- Pre-NuGet release housekeeping pass -- Or: any future refactor that touches `BinarySerializationContext` string-write methods (then decide rather than leave the dead pair behind) - -### Resolution - -Disposition: **Delete (Option 1)**. Landed 2026-05-06 together with the H2Q6 marker reorg commit. Five dead methods removed in a single cleanup pass: - -- `WriteFixStrDirect(string)` — uncalled public method -- `WriteStringUtf8Internal(string)` — uncalled private method (only called from `WriteFixStrDirect`) -- `WriteFixStr(string)` — uncalled public method (audit surfaced; was originally listed as live) -- `WriteFixStrBytes(ReadOnlySpan)` — uncalled public method (audit surfaced) -- `WritePreencodedPropertyName(ReadOnlySpan)` — uncalled public method (audit surfaced) - -All five had zero call sites across core, SourceGenerator template, tests, and reflection. The hot-path string write continues through `WriteStringWithDispatch` (M3R7 + H2Q6 marker dispatch) and `WriteStringInternFirstWithDispatch` (interning tier dispatch). Public surface reduced; binary test suite unchanged (222 pass / 13 pre-existing GuidIId failures). 
- ## ACCORE-BIN-T-L9Y3: FixArray marker tier — short-list count encoded in marker **Priority:** P3 · **Type:** Wire-format optimization · **Status:** Open · **Related:** `Array` (66) marker, `VarUInt itemCount`, `ACCORE-BIN-T-H2Q6` marker reservation @@ -1542,56 +1202,6 @@ Activation steps when implementing: - **`FixDict` analog** — same pattern for `Dictionary` marker (67) with `kvCount` 0-15. Worth considering only if a benchmark workload demonstrates dictionary-heavy structures; the current bench data (Order DTOs) does not. **Defer until evidence.** - **`FixArray 0-31`** — wider count range (32 markers). Marginal additional saving (16-31 elem list-ek ritkák); would consume nearly all freed marker space, leaving no slack for `S5L8`/`S2X9`. **Reject unless evidence warrants.** -## ACCORE-BIN-T-O7G2: Overflow guard on `charLength * 4` writer arithmetic + corrupted-wire `ReadStringBig` -**Priority:** P3 · **Type:** Defensive / safety · **Status:** Closed (2026-05-06) · **Related:** `WriteStringWithDispatch`, `WriteStringInternFirstWithDispatch`, `ReadStringBig`, `BinaryTypeCode.MaxStringCharLength` - -Defensive guards covering two latent failure modes in the H2Q6 string serialization paths: - -**Writer overflow (silent zero corruption)** — `charLength * 4` overflows `int` when `charLength > 0x1FFFFFFF` (~537M). At exactly `0x40000000` chars the multiplication wraps to **0**, causing: -- `EnsureCapacity(reserveHeader + 0)` to silently succeed (no buffer growth) -- `EncodeUtf8SinglePass(value, emptySpan)` to write 0 bytes, returning `bytesWritten = 0` -- The H2Q6 tier choice picks Small (`bytesWritten ≤ 255`), writing `[StringSmall][0][0]` to the wire -- **The string content is lost silently — no exception, wire claims an empty string** - -Other overflow values (e.g. 
`charLength = 600M` → `maxBytes` becomes negative) eventually surface as `ArgumentOutOfRangeException` from `Span.AsSpan(start, length)`, but the message ("length cannot be negative") is misleading and arrives after the buffer has already been partially mutated. - -**Reader corrupted wire (negative cast from oversized uint)** — in `ReadStringBig`, the wire-side `charLen:32` and `utf8Len:32` are read as `uint`, then cast to `int`. Corrupted or maliciously-crafted payloads with values > `Int32.MaxValue` produce negative ints, leading to `string.Create(negative, ...)` exceptions or position-state desync — at best a misleading message, at worst a partial decode with wire-position shifted incorrectly. - -### Resolution - -Landed 2026-05-06 (this commit window). - -**Writer side** — `WriteStringWithDispatch` and `WriteStringInternFirstWithDispatch` each gain one method-entry guard: - -```csharp -var charLength = value.Length; -if ((uint)charLength > BinaryTypeCode.MaxStringCharLength) ThrowStringTooLong(charLength); -``` - -A single unsigned compare catches the overflow band; predict-friendly (always false on realistic input). The throw helper is `[MethodImpl(MethodImplOptions.NoInlining)]` so the JIT/AOT keeps the throw site out of the inlined hot path. The same `charLength` value is reused across the FastWire and Compact branches — no duplicate guard. - -**Reader side** — `ReadStringBig` gains a single bitwise-OR + sign-test: - -```csharp -var packed = context.ReadUInt64Unsafe(); -var charLength = (int)(uint)packed; -var byteLength = (int)(uint)(packed >> 32); -if ((charLength | byteLength) < 0) ThrowCorruptedBigWire(charLength, byteLength); -``` - -The OR + sign-test catches negative casts (any wire-side uint > `Int32.MaxValue` produces a negative int after cast; OR of two positives is positive, sign-test cheap). One instruction effective; predict-friendly. 
- -**New constant**: `BinaryTypeCode.MaxStringCharLength = 0x1FFFFFFF` (536_870_911 — largest charLength where `charLength * 4` fits in int). - -**Hot-path cost**: ~0% on realistic input — single unsigned compare on the writer, single OR + sign-test on the reader Big tier (Small/Medium readers untouched since their wire values are bounded by `byte` / `ushort` types and cannot overflow). Throw helpers `NoInlining` keep the inlined caller body compact. Tests 222 pass / 13 pre-existing failures unchanged. - -### Why P3 - -- No correctness impact for realistic inputs (the overflow band is far outside any real DTO scenario) -- Defensive value: prevents silent data loss in the `charLength = 1.07G` zero-overflow edge case + provides clear error messages on out-of-range inputs -- Security value: corrupted/malicious wire payloads on the reader Big tier path are now caught early instead of producing inconsistent position state -- NuGet release professional-quality signal — explicit, defensive guards over silent-corruption paths - ## ACCORE-BIN-T-S6F2: Shift-mentes Small fast path in `WriteStringWithDispatch` **Priority:** P3 · **Type:** Performance · **Status:** Reverted (2026-05-07, with V4N4 method-split) · **Related:** `WriteStringWithDispatch`, `BinaryTypeCode.StringSmall`, `ACCORE-BIN-T-V4N4` @@ -1829,332 +1439,6 @@ A parallel `BenchmarkDotNet`-based project would close that gap: The custom `Console` bench **is not replaced** — it remains the dev-iteration tool (fast feedback loop, 30-90s runs, hand-tuned markdown for chat-paste). BDN is the **release-grade** bench (3-10 min runs, statistical rigor, NuGet release output). Different tools for different audiences. 
-## ACCORE-BIN-T-C5R8: Charset-parameterized benchmark workload (ASCII / Hungarian / CJK / Cyrillic / Mixed) -**Priority:** P2 · **Type:** Tooling / release-narrative · **Status:** Closed (2026-05-07) · **Related:** `BenchmarkTestDataProvider`, `AyCode.Core.Serializers.Console.Program.cs` (Settings → Charset submenu), `ACCORE-BIN-T-V4N2` (charset-specific optimization measurement target), `ACCORE-BIN-T-D9X3` (bench stabilization preceding this work) - -The current `BenchmarkTestDataProvider` hard-codes Hungarian (Latin extended 2-byte) content into the test DTOs. This produces a single workload-shape: **Hungarian mixed text with short 1-2 char 2-byte runs**. While Hungarian is a fine general-purpose i18n stress, it is **only one production-content profile** — and the optimization decisions ride on it implicitly (e.g. V4N2 Phase 2.5's 3-byte run do-while was deferred-on-2-byte-side because the Hungarian bench measured regression there, but its CJK-side value cannot be measured on the current data). - -A **charset-parameterized** benchmark workload — selectable from the interactive menu — would: - -- **Measure optimization value across realistic content profiles** — what wins on CJK content may not win on Hungarian, and vice versa. Without explicit per-charset measurement, optimization decisions become Hungarian-biased. -- **Surface release-narrative numbers credibly** — instead of "Compact beats MemPack on i18n payload" (single workload), claim "Compact vs MemPack: ASCII X%, Hungarian Y%, CJK Z%, Cyrillic W%, Mixed V%" — concrete numbers per content profile, NuGet-grade. -- **Enable workload-specific optimization audits** — V4N2 Phase 3 SIMD multi-byte transcoder targets CJK 3-byte content; without a CJK workload measurement, Phase 3 acceptance criteria cannot be validated. - -### Implementation outline - -#### 1. `BenchmarkTestDataProvider` refactor - -Hard-coded Hungarian strings (`KözösCímke`, `sötét`, `magyar`, `hetenkénti`, etc.) 
→ **ASCII baseline values** (English equivalents: `SharedTag`, `dark`, `hungarian`, `weekly`). - -New static `LongStringSuffix` field — charset-aware suffix appended to a subset of property values: - -```csharp -public static class CharsetSuffixes -{ - public const string AsciiOnly = ""; // baseline — pure-English ASCII content - public const string Hungarian = " árvíztűrő tükörfúrógép"; - public const string CjkBmp = " 你好世界 こんにちは 안녕하세요"; - public const string Cyrillic = " Привет мир дорогой друг"; - public const string Mixed = " árvíz 你好 Привет 😀"; -} - -public static string LongStringSuffix { get; set; } = CharsetSuffixes.Hungarian; // default -``` - -Property values use the suffix dynamically: -```csharp -var description = "Product description" + LongStringSuffix; -``` - -The 5 charsets cover the realistic UTF-8 workload spectrum: -1. **Pure ASCII** — baseline; Phase 1 SIMD prefix widen + DWORD batch dominate; no multi-byte path engagement -2. **Hungarian** (Latin extended) — short 1-2 char 2-byte runs in mixed text; current default workload -3. **CJK BMP** — long homogeneous 3-byte runs; primary V4N2 Phase 2.5/3 win region -4. **Cyrillic** (Russian / etc.) — long 2-byte runs (different shape than Hungarian mixed); V4N2 Phase 2.5 may yet pay off here -5. **Mixed** (Hungarian + CJK + emoji) — full multi-tier coverage in one payload; surrogate-pair handling stress - -#### 2. `Program.cs` interactive submenu - -Before starting a benchmark run, prompt the user for charset choice: - -``` -Choose benchmark charset: - 1 — Pure ASCII (baseline) - 2 — Hungarian (Latin extended) [DEFAULT] - 3 — CJK BMP (Chinese / Japanese / Korean) - 4 — Cyrillic (Russian / etc.) - 5 — Mixed (Hungarian + CJK + emoji) -``` - -The choice → `BenchmarkTestDataProvider.LongStringSuffix = ...` before constructing test data. - -#### 3. 
Benchmark output header - -The markdown output header should reflect the selected charset: -``` -# AcBinary Benchmark Release 2026-05-07 16:00:00 -Charset: CJK BMP | Iterations: 1000 | Warmup: 10000 | ... -``` - -This makes per-charset bench files self-documenting — file names + content both encode the workload profile. - -#### 4. Round-trip tests unaffected - -`Utf8TranscoderTests` and other content-class unit tests (with their fixed Hungarian / CJK / emoji boundary inputs) are **untouched** — they remain fixed-content for regression coverage. Only the benchmark workload is charset-parameterized. - -### Why P2 - -- **Release-narrative**: NuGet release credibility depends on measurable performance claims across realistic content profiles, not a single Hungarian-mixed workload -- **Optimization decision quality**: V4N2 Phase 2.5 / Phase 3 / future SIMD multi-byte work cannot be objectively validated without a CJK workload — current decisions have implicit Hungarian-bias -- **Consumer reproducibility**: external consumers can reproduce benchmark numbers on their own content profile (or contribute a new charset profile) - -### Acceptance - -- `BenchmarkTestDataProvider` refactored: ASCII baseline + `LongStringSuffix` static field with 5 predefined charset constants -- Interactive menu in `Program.cs` lets the user choose charset 1-5 before benchmark run; the chosen charset is recorded in the markdown output header -- Round-trip correctness verification still runs once-per-cell before warmup (existing `Verified: round-trip ...` line) — works on the active charset -- All 5 charsets produce valid round-trip on all benchmark cells (Small / Medium / Large / Repeated / Deep) -- Existing benchmark numbers (Hungarian-default) reproducible — choosing charset 2 from the menu yields the current 15:29:21-style results -- New CJK charset (option 3) produces measurable numbers (one bench run per charset documented in `Test_Benchmark_Results/`) - -### Trigger - -- Pre-NuGet release: 
per-charset numbers needed for the public performance-claim table -- Or: when V4N2 Phase 3 SIMD multi-byte transcoder work needs CJK-workload validation - -### Resolution - -Landed 2026-05-07 (after `ACCORE-BIN-T-D9X3` bench stabilization made sub-3% deltas measurable, which raised the value of charset-specific measurement). Implementation refined the original 5-charset proposal into a 6-charset list per user request (Latin1FixAscii + Latin1 short/long split for finer-grained Latin1 coverage): - -**1. `BenchmarkTestDataProvider` refactor** ✅ - -- New `CharsetSuffixes` static class with **6** const suffixes (one more than originally proposed): - - `Latin1FixAscii = ""` — empty suffix; baseline values stay short → FixStr fast-path stress (renamed from `AsciiOnly` per user request) - - `Latin1Short = " árvíztűrő tükörfúrógép"` (~24 char) — Hungarian short Latin1 mixed - - `Latin1Long = " árvíztűrő tükörfúrógép a magyar betűzés tesztje"` (~47 char) — **NEW**, exceeds the 32-char FixStr boundary on the suffix alone (user request) - - `CjkBmp`, `Cyrillic`, `Mixed` — as originally specified -- `LongStringSuffix` default = `CharsetSuffixes.Latin1Long` (backward-compatible in spirit with the prior fixed Latin1 default) -- All hard-coded Hungarian baseline values replaced with ASCII English equivalents: - - `KözösCímke` / `IsmétlődőCímke` / `MélyCímke` → `SharedTag` / `RepeatedTag` / `DeepTag` - - `közösfelhasználó` → `shareduser` (and variants); `közös` → `shared`; `MélyKategória` → `DeepCategory` - - `sötét` / `világos` → `dark` / `light`; `magyar` / `német` / `francia` → `hungarian` / `german` / `french` - - `hetenkénti` / `naponkénti` / `havonkénti` → `weekly` / `daily` / `monthly` - - Repeated cell long Hungarian baselines (`TermékNév_IsmétlődőTesztAdat_árvíztűrőtükörfúrógép`, `RaklapKód_IsmétlődőTesztAdat_árvíztűrő`) shortened to ASCII `ProductName` / `PalletCode` so the `EnsureAllStringsBypassFixStr` suffix-append actually applies (the prior >31-char baselines 
bypassed the suffix, leaving Repeated cell content fixed-Hungarian regardless of charset selection) -- The only Latin1/non-ASCII characters remaining in the file are inside the `CharsetSuffixes` const definitions themselves (intentional — those define the per-charset content profiles) - -**2. `Program.cs` interactive submenu** ✅ - -- New `[3] Charset` entry in the existing `Settings` submenu (next to `[1] Iteration` and `[2] WireMode`) — chose nested submenu over a top-level prompt to keep the main menu uncluttered -- `ShowCharsetSettingsMenu` lists the 6 charset constants with brief descriptions; selection sets `BenchmarkTestDataProvider.LongStringSuffix` and returns -- `GetCurrentCharsetName()` helper resolves the active suffix back to its constant name (returns `"Custom"` when programmatically set to a non-const value) - -**3. Benchmark output header** ✅ - -- `Charset:` field added to **3 output locations**: - - Console run header (interactive run line — `Layer: ... | Charset: CjkBmp | Iterations: ...`) - - `.LLM` markdown header (file-self-documenting) - - `.log` boxed banner (║ Charset: CjkBmp ║) - -**4. Round-trip tests unaffected** ✅ — `Utf8TranscoderTests` and other content-class unit tests use their own fixed boundary inputs; not touched by this change. Round-trip verification in the bench harness continues to run once-per-cell pre-warmup (`VerifyRoundTrip`) on the active charset. - -### Acceptance status - -- ✅ `BenchmarkTestDataProvider` refactored with ASCII baselines + `LongStringSuffix` field + 6 charset constants -- ✅ Interactive submenu lets the user choose charset 1-6; recorded in markdown output header (3 locations) -- ✅ Round-trip verification runs on the active charset (existing per-cell verify, charset-agnostic by design) -- ⚠️ "All 6 charsets produce valid round-trip on all benchmark cells" — design correctness implies this; not yet exercised on every (cell × charset) combination explicitly. 
Recommend running each charset once before declaring full validation. -- ❌ "Existing benchmark numbers (Hungarian-default) reproducible — choosing charset 2 yields the current 15:29:21-style results" — **NOT met**: the ASCII baseline refactor changes the numbers regardless of charset choice (shorter baselines + suffix-driven content vs. prior fixed Hungarian baselines). New `Latin1Short` ≠ prior fixed Hungarian default. This is intentional: the user explicitly chose a clean ASCII-baseline + charset-suffix design over preserving historical numerical comparability. -- ❌ "Choosing CJK produces measurable numbers documented in `Test_Benchmark_Results/`" — **NOT done in this commit window**; user has the menu and will run per-charset benches in a follow-up sprint. - -### Note on numerical incompatibility with prior runs - -Existing bench files generated before this commit (e.g. `Console.FullBenchmark_Release_2026-05-07_17-42-22.LLM` and earlier) used the prior fixed Latin1 baseline values + 32-char Hungarian suffix. The new default (`Latin1Long`) uses ASCII baselines + 47-char Latin1Long suffix; the Repeated cell sees a more dramatic shift (its 52-char fixed Hungarian baseline → 11-char ASCII `ProductName` + 47-char suffix). **Numerical comparison across the boundary is not meaningful**; the `Charset:` header field documents the source charset for each new bench file. - -### Future extensions - -- **Sentinel "real-world" charsets** — synthetic mixes representing typical production payloads (e.g. `EnglishWithEmoji` for chat-app DTOs, `ArabicHebrew` for RTL-script regions). Add as new `CharsetSuffixes` constants when consumer demand surfaces. -- **Charset auto-rotate mode** — single benchmark run cycles through all 6 charsets, producing a 6-section markdown output. Useful for full release-narrative table generation in one pass. 
-- **BDN integration** (per `ACCORE-BIN-T-B1D5`): charset becomes a `[Params]` axis in BenchmarkDotNet, producing a 5×5×N matrix (cells × charsets × engines) in the BDN output. - -## ACCORE-BIN-T-D9X3: Console benchmark stabilization (per-serializer warmup + GC isolate + pilot discard + min/max range + CPU pin + mode-aware JIT sleep) -**Priority:** P1 · **Type:** Tooling / measurement · **Status:** Closed (2026-05-07) · **Related:** `AyCode.Core.Serializers.Console.Program.cs`, `ACCORE-BIN-T-V4N4`, `ACCORE-BIN-T-V4N2`, `ACCORE-BIN-T-S6F2`, `ACCORE-BIN-T-B1D5` (BDN release-grade variant) - -The custom `Console` benchmark harness showed strong run-to-run variance — user-reported `±20pp / -10pp` summa-spread between runs on identical code. 1-3% perf-claims became unmeasurable on this noise-floor; the V4N4 method-split and V4N2 Phase 2.5 attempts both fell into this band, leaving the question "does the regressed bench number reflect a code regression or measurement noise?" undecidable (see `V4N4` Reverted section). - -**Diagnosis** (sprint takeaway prior to this entry): - -1. **Warmup cache pollution** — `RunBenchmarksForTestData` ran one warmup-all loop (every serializer × WarmupIterations) followed by one bench-all loop. By the time a given serializer was measured, its hot code and data lines had been evicted by the intervening serializers' warmup passes. MemPack and AcBinary hot paths share neither code nor data working sets — they actively evict each other. -2. **GC pause leakage between samples** — the Stopwatch-recorded sample loop had no explicit `GC.Collect`. A minor GC triggered inside sample N could promote into a Gen-2 pause inside sample N+1's timed window (1-5 ms spike). -3. **Pilot sample contamination** — the first sample after warmup absorbed residual JIT bookkeeping and cold-cache misses; on a 10-sample median this contributed 1-2 outliers that visibly stretched the min/max. -4. 
**CPU migration / preemption** — the Windows scheduler migrated the bench thread between cores between samples (L1/L2 cache evict on each migration); background work (Defender index, OS service threads) injected random preemption spikes. -5. **JIT sleep not mode-aware** — `Thread.Sleep(JitSleep = 3000)` waited 3 seconds before each cell for tiered-JIT drain. On AOT publish (`PublishAot=true`) there IS NO dynamic compilation — the 3 seconds were pure idle. Worse, the drain happened only globally (once before all cells), not per-serializer, so a tier-promotion mid-bench could still bleed in. -6. **Range invisible** — the `.LLM` markdown output showed only the median; the user could not tell whether a 5%-median-delta was inside or outside the inter-sample range for that row. - -### Resolution - -Landed 2026-05-07 (16:00 — 17:00). Six stabilization steps in one commit window: - -**1. Per-serializer warmup separation** (`RunBenchmarksForTestData`) — the warmup-loop and bench-loop merged into one per-serializer cycle: each serializer's warmup runs IMMEDIATELY before its own bench. The serializer's hot code/data is freshest in cache when the first sample times. - -**2. `GC.Collect` before every sample** (`RunTimed`) — `GC.Collect() + WaitForPendingFinalizers() + GC.Collect()` triple-tap before each sample, OUTSIDE the Stopwatch window. Every sample starts from the same heap state; an ad-hoc Gen-2 pause from sample N can no longer bleed into sample N+1. - -**3. Pilot sample discard** (`RunTimed`) — the loop runs `samples + 1` times; the first (index 0) is discarded. The first sample post-warmup absorbs residual JIT/GC bookkeeping and cold cache; the recorded `samples` count remains 10 (median is the same data the user saw before, just sourced from "typical" sample-set, not from the post-warmup-first noisy point). - -**4. 
Min/max range in markdown output** (`SaveLlmResults`, new `FormatMicrosWithRange` helper, new `BenchmarkResult` fields: `SerializeTimeMinMs/MaxMs`, `DeserializeTimeMinMs/MaxMs`, `RoundTripTimeMinMs/MaxMs`) — the `.LLM` output's Ser and Deser columns now render as `26.86 (24.50..29.10)`: median (min..max) µs/op. The reader sees at a glance whether a delta is above the row's noise floor. - -**5. CPU affinity + process priority** (`RunBenchmark`) — `ProcessorAffinity = 0x1` (CPU 0 pin) + `PriorityClass = High` for the benchmark phase, `try/finally` restores the original values. Eliminates inter-sample thread migration (L1/L2 cache evicts) and reduces background-task preemption. Platform-guarded: Windows / Linux only (`CA1416` — `ProcessorAffinity` throws on macOS); locked-down hosts (group policy, container without `CAP_SYS_NICE`, etc.) catch + warning + bench continues with default scheduling. - -**6. Mode-aware `JitSleep`** (property) — `RuntimeFeature.IsDynamicCodeCompiled ? 250 : 0`. JIT mode 250 ms (the .NET 9 tiered-JIT compile queue typically drains in <100 ms for the bench's hot path); AOT publish 0 ms. The 3000 ms blind wait is gone. The drain now happens per-serializer (Step 1) instead of once globally. - -### Bench result (3 consecutive runs, 2026-05-07 17:00:32 / 17:01:03 / 17:01:32, FastestByte mode, FastMode preset) - -| Cell | AcBinary Ser median (3 runs) | Inter-run spread | Intra-cell range | -|---|---|---|---| -| Small | 7.09 / 6.83 / 6.55 | 7.6% | ~8% (noise floor: 1000×6ns measured) | -| Medium | 18.74 / 18.90 / 19.22 | 2.6% | ~10% | -| Large | 140.20 / 141.67 / 141.02 | 1.0% | ~3% | -| Repeated | 26.52 / 26.25 / 26.28 | 0.3% | ~6% | -| Deep Nested | 23.44 / 23.17 / 22.70 | 3.2% | ~7% | - -The previous `±20pp / -10pp` summa-spread shrank to **1-3pp** on the medium/large cells. 
The Small cell remains noisy (~8% relative) but this is a physical floor: 1000 iter × ~6 µs/op ≈ 6 ms total batch — at a sample window this short, Stopwatch resolution and OS spikes account for a disproportionately large share of each sample. - -The `(min..max)` range is consistently 3-10% relative — a **measurable** signal floor: 1-3% perf-deltas no longer disappear into noise. - -### Lessons - -- **Bench stabilization is a precondition for perf optimization, not a consequence.** Optimization decisions (e.g. V4N4 method-split, V4N2 Phase 2.5) can only be derived from bench numbers if the noise floor < expected signal. Without that, the bench numbers mean nothing. -- Cache pollution (warmup-all → bench-all flow) was the **single largest** noise source: per-serializer warmup separation alone removed ~10pp of variance. -- Platform stabilization (CPU pin + high priority) combined with heap stabilization (GC.Collect + pilot discard) further tightened the range. -- AOT and JIT have different stabilization needs: the 3000 ms blind sleep was idle time on AOT; mode-aware sleep pays the cost only when needed. - -### Re-evaluation list (entries currently Reverted or unmeasurable) - -The stabilization opens a follow-up sprint: the `Reverted (2026-05-07)` entries are re-evaluable now that the noise floor < the expected 1-3% signal: -- **`ACCORE-BIN-T-V4N4`** — method-split (writer + reader hot path) is re-testable -- **`ACCORE-BIN-T-V4N2` (Phase 2.5)** — UTF-8 do-while runs (2-byte / 3-byte) per charset -- **`ACCORE-BIN-T-S6F2`** — Small fast path (was integrated into V4N4) - -Per-entry re-evaluation is the next sprint's task, NOT part of this Closed entry. 
- -### Why P1 - -- Blocked all sub-3% perf optimization work (every recent attempt fell into the noise band) -- One-line user complaint ("+20 és -10 között ingadozott a summa") summarized weeks of unproductive bench-driven investigation -- One-time fixed cost; every future bench run benefits - -### Follow-up: adaptive iteration + CV reporting + per-cell A/B mode (2026-05-07, second commit window) - -After the initial 6-step landing, three additional refinements were added in a second commit window the same day. The trigger was a Copilot-suggested noise-reduction list against the now-stable bench output: - -**1. Per-cell adaptive iteration** — fixed `TestIterations = 1000` produced sample windows from 6 ms (Small cell @ 6 ns/op) to 140 ms (Large cell @ 140 µs/op). The Small cell at 6 ms remained the dominant residual noise source (7.6% inter-run spread vs ≤3.2% on the other cells) because OS-level spikes (preempt + IRQ + scheduler tick) are absolute-time events; on a 6 ms sample window their relative contribution is huge. - -Implementation: -- New constant `TargetSampleMs = 250` (per-sample wall-clock target) -- New helper `CalibrateIterations(Action, int targetMs)` — runs a 100-iter probe post-warmup, computes `iterPerMs`, and rounds up to the nearest 1000. Floor 1000, ceiling 200_000. -- `RunBenchmarksForTestData` calibrates Ser and Des INDEPENDENTLY per serializer (different per-op cost). RT-only rows (NamedPipe) get a single RT calibration. -- New `BenchmarkResult` fields: `SerializeIterations`, `DeserializeIterations`, `RoundTripIterations` (per-row). -- New helpers: `ToPerOpMicros(double, int)` (replaces 1-arg variant), `SerPerOp(r)` / `DesPerOp(r)` / `RtPerOp(r)` for per-op µs from the result. -- All `Average(r => r.*TimeMs)` and `OrderBy(r => r.RoundTripTimeMs)` call-sites refactored to use per-op µs (iter-independent) — mixing batch-time across rows with different iter counts would be meaningless. ~20 call-sites total. 
-- RT for in-mem rows synthesized so `RtPerOp(r) == SerPerOp(r) + DesPerOp(r)` regardless of `serIter != desIter`: `RoundTripIterations = max(serIter, desIter)`, `RoundTripTimeMs = rtPerOpMicros / 1000 * RoundTripIterations`. - -**Expected impact**: Small cell sample window 6 ms → ~240 ms; inter-run spread 7.6% → ~1-2% (matching the other cells). Total suite duration ~50 s → ~110-130 s. - -**2. CV (coefficient of variation) reporting + unstable-row marker** — the median + (min..max) range surfaces shape but not a single-number stability metric. The CV (= stddev/mean) is the standard statistical measure; rows with CV > threshold are flagged with a ⚠️ suffix in the markdown output so a small inter-engine delta on a high-CV row is immediately obvious as noise-suspect. - -Implementation: -- New constant `UnstableCVThreshold = 0.03` (3% — reasonable for stabilized in-memory benchmarks) -- `RunTimed` return tuple extended: `(median, min, max, stddev)`. Stddev computed over the (samples − pilot) population using `Math.Sqrt(Math.Max(0, E[X²] - E[X]²))`. -- New `BenchmarkResult` fields: `SerializeTimeStdDevMs`, `DeserializeTimeStdDevMs`, `RoundTripTimeStdDevMs`. -- `FormatMicrosWithRange` extended: `26.86 (24.50..29.10)` stays the default; `26.86 (24.50..29.10) ⚠️5.2%` appears when CV exceeds the threshold. - -**3. Per-cell A/B mini-suite filter** — optimization-iteration loops often need only one specific cell (e.g. "tuning the Repeated cell for Hungarian charset"). The full 5-cell × 2-engine × 4-measurement suite is overkill for that. 
- -Implementation: -- `FilterByLayer` extended: new `small` / `medium` / `large` / `repeated` / `deep` modes — case-insensitive prefix match on `TestDataSet.Name` -- `TryParseCliArgs` recognizes the new tokens: `dotnet run -- repeated` runs only the Repeated Strings cell -- `fastestbyte` mode (existing — only AcBinary FastMode + MemoryPack head-to-head) is orthogonal and stacks: `dotnet run -- repeated fastestbyte` - -### Markdown output schema change - -The `## Results` table gains an `Iter Ser/Des` column at the right edge — visible verification that each row's batch landed near the `TargetSampleMs` window. RT-only rows show a single `Iter` value (the RT calibration count); in-mem rows show `serIter / desIter`. - -Header line updated: -- Before: `Iterations: 1000 | Warmup: 10000 | Samples: 10 (median) | ...` -- After: `Iterations: per-cell adaptive (target ~250 ms/sample) | Warmup: 10000 | Samples: 10 (median) + 1 pilot discarded | ... | UnstableCV threshold: 3%` - -## ACCORE-BIN-T-K7M3: Hot-path UTF-8 transcoder switch — `Utf8Transcoder` → BCL `Utf8.FromUtf16` / `Utf8.ToUtf16` -**Priority:** P1 · **Type:** Performance · **Status:** Closed (2026-05-08) · **Related:** `ACCORE-BIN-T-V4N3` (custom transcoder origin), `ACCORE-BIN-T-V4N2` (Phase 3 SIMD multi-byte), `ACCORE-BIN-T-V4N4` (Reverted method-split), `ACCORE-BIN-T-D9X3` (bench stabilization that made the comparison measurable) - -The custom `Utf8Transcoder` (V4N3) was originally implemented to bypass `System.Text.Encoding.UTF8.GetBytes` virtual-dispatch + EncoderFallback overhead. The V4N3 audit measured wins vs. the **legacy `Encoding.UTF8`** API. **What it did NOT measure**: the modern `System.Text.Unicode.Utf8.FromUtf16` / `Utf8.ToUtf16` API (.NET 7+, tier-1 optimized, used by MemoryPack `WriteUtf8` / `ReadUtf8` paths internally). Once the bench stabilized (D9X3), a direct A/B comparison surfaced that the BCL modern API consistently outperforms the custom transcoder on the binary serializer's hot path. 
- -### Bench A/B (Latin1Long charset, FastMode SGen Compact) - -| Cell | Ser delta vs MemPack — custom (`EncodeUtf8SinglePass`) | Ser delta vs MemPack — BCL (`Utf8.FromUtf16`) | Improvement | -|------|--------------------------------------------------------|------------------------------------------------|-------------| -| Small | +28.5% | +7.3% | **-21pp** | -| Medium | +23.8% | +3.1% | -21pp | -| Large | +19.6% | +5.1% | -14pp | -| Repeated | +28.8% | +10.9% | -18pp | -| Deep | +23.1% | +0.6% | -22pp | - -| Cell | Deser delta vs MemPack — custom (`DecodeUtf8SinglePass`) | Deser delta vs MemPack — BCL (`Utf8.ToUtf16`) | Improvement | -|------|---------------------------------------------------------|------------------------------------------------|-------------| -| Small | +17.6% | -1.2% (parity) | -19pp | -| Medium | +12.8% | -4.7% (AcBinary wins) | -17pp | -| Large | +4.9% | -10.3% (AcBinary wins) | -15pp | -| Repeated | +16.9% | -1.6% (parity) | -18pp | -| Deep | +7.0% | -9.0% (AcBinary wins) | -16pp | - -The Deser side flipped from "consistently behind" to "wins on 3 of 5 cells, parity on 2". The Ser side closed the deficit from +20-29% to 0-11%. **Both sides** show measurable improvement on **every** cell. - -### Why the custom transcoder lost - -The V4N3 implementation included a 4-tier SIMD ASCII prefix path (Vector512BW / Vector256 / Vector128 / scalar) plus a DWORD ASCII batch + scalar 4-branch multi-byte fallback. **All correct, all SIMD-tuned**. But: - -1. **`Utf8.FromUtf16` is also SIMD-tuned in .NET 9** — the .NET team rewrote it on top of `System.Text.Unicode.Utf8` primitives that share infrastructure with `Ascii.IsValid` / `Latin1.GetString`. AOT-publish-friendly, branch-friendly, no virtual dispatch (the `Utf8` API is static, not via an `Encoding` instance with virtual-method-table). -2. 
**The custom transcoder's ASCII prefix path bails out on first non-ASCII byte** — on multi-byte content (Latin extended / Cyrillic / CJK) the SIMD path runs only for the leading ASCII span, then the entire remainder falls into per-char scalar 4-branch dispatch. The BCL `Utf8.FromUtf16` SIMD-batches multi-byte content too (different algorithm — the BCL doesn't bail on first non-ASCII). -3. **AOT inline budget**: the custom transcoder's body grew with the V4N3 / V4N4 / V4N5 additions; in NativeAOT publish the call sites in `WriteStringWithDispatch` / `ReadString*` did NOT inline (V4N4 disasm audit confirmed). The BCL `Utf8.FromUtf16` is a single static method with a tighter call-site footprint. - -### Resolution - -Landed 2026-05-08. The 8 production hot-path call sites of `Utf8Transcoder.*` switched to BCL: - -| File / line | Before | After | -|---|---|---| -| `AcBinarySerializer.cs:120` | `Utf8Transcoder.GetUtf8ByteCount` | `Encoding.UTF8.GetByteCount` | -| `AcBinarySerializer.BinarySerializationContext.cs:694` | `Utf8Transcoder.EncodeUtf8SinglePass` | `Utf8.FromUtf16(...)` | -| `AcBinarySerializer.BinarySerializationContext.cs:784` | `Utf8Transcoder.EncodeUtf8SinglePass` | `Utf8.FromUtf16(...)` | -| `AcBinarySerializer.BinarySerializationContext.cs:901` | `Utf8Transcoder.EncodeUtf8SinglePass` | `Utf8.FromUtf16(...)` | -| `AcBinaryDeserializer.BinaryDeserializationContext.Read.cs:523` | `Utf8Transcoder.CountUtf8Chars` | `Encoding.UTF8.GetCharCount` | -| `AcBinaryDeserializer.BinaryDeserializationContext.Read.cs:527` | `Utf8Transcoder.DecodeUtf8SinglePass` | `Utf8.ToUtf16(...)` | -| `AcBinaryDeserializer.BinaryDeserializationContext.Read.cs:565` | `Utf8Transcoder.DecodeUtf8SinglePass` | `Utf8.ToUtf16(...)` | -| `PropertyMetadataBase.cs:104-109` (ctor-once) | `Utf8Transcoder.GetUtf8ByteCount` + `EncodeUtf8SinglePass` (two-pass) | `Encoding.UTF8.GetBytes(string)` (single-pass with exact-size byte[] return) | - -The count-only call sites (`GetByteCount` / 
`GetCharCount`) stay on the **legacy** `Encoding.UTF8` API — `System.Text.Unicode.Utf8` has no count-only equivalent (only `FromUtf16` / `ToUtf16` which encode + count combined). For pure count, the legacy API is the optimal tool (single SIMD-tuned scan, no encode/decode work). - -The `Utf8Transcoder.cs` file remains in the repo but **fully commented out** — the class definition is preserved as historical reference / future reactivation if a workload ever surfaces where it could win again. `Utf8TranscoderTests.cs` is not currently exercising live code. - -### Lesson — the V4N3 audit's blind spot - -The V4N3 (custom transcoder) audit compared against **legacy `Encoding.UTF8.GetBytes`** and won. **The audit did NOT compare against `Utf8.FromUtf16`** (the modern API, .NET 7+). On modern runtime the BCL has two UTF-8 transcoders: a legacy one (instance-method on `Encoding`, virtual dispatch) and a modern one (static `Utf8.FromUtf16` / `Utf8.ToUtf16`). MemoryPack uses the modern one — that's what we should have been comparing against from the start. - -**Generalizable lesson**: when measuring a custom implementation against a "BCL baseline", verify which BCL API is used by the actual competition (here: MemoryPack source-gen). The `Encoding.UTF8.*` instance API and `System.Text.Unicode.Utf8` static API are different generations of the same logical operation; treating them as interchangeable hides the comparison's scope. 
- -### Why P1 - -- Closed the FastMode Compact mode Ser deficit from +20-29% to ≤11% on every cell (Latin1Long benchmark) -- Flipped the Deser side from -1 to -10% deficit to **AcBinary winning on 3 of 5 cells**, parity on 2 (Latin1Long benchmark) -- One-time fixed cost (8 production call-site cseréje) — every future bench profits -- Removed a load-bearing ~600-line custom SIMD module from the maintained surface area; future maintainers don't need to reason about Vector512BW / cross-lane shuffle / 5-popcount surrogate-pair correctness — the BCL handles it - -### Follow-up — `Utf8Transcoder.cs` cleanup - -The file is fully commented out. Either: -- **Delete** entirely (preferred for repo cleanliness) — `Utf8TranscoderTests.cs` then needs deletion or revival as a regression-only guard -- **Keep** the comment-block as historical reference, with a header comment pointing to this entry - -Decision deferred — the comment-block does no harm to build / runtime. Address when the next docs-archive sweep runs. - ## ACCORE-BIN-T-P3X7: Profile-driven Compact-mode Ser optimalizációs roadmap (post-K7M3 hot-path analysis) **Priority:** P2 · **Type:** Performance roadmap · **Status:** Open · **Related:** `ACCORE-BIN-T-K7M3` (BCL UTF-8 transcoder switch — előfeltétele), `ACCORE-BIN-T-D9X3` (bench stabilization), `ACCORE-BIN-T-S2X9` (markerless schema lane — primitív property-marker már kivezetve a SGen-ben), `ACCORE-BIN-T-V4N4` (audit methodológia hivatkozás) diff --git a/AyCode.Core/docs/BINARY/BINARY_TODO_2026_04.md b/AyCode.Core/docs/BINARY/BINARY_TODO_2026_04.md new file mode 100644 index 0000000..f9db8f3 --- /dev/null +++ b/AyCode.Core/docs/BINARY/BINARY_TODO_2026_04.md @@ -0,0 +1,17 @@ +# BINARY — TODO archive (2026-04) + +Archived entries from `BINARY_TODO.md` per LLMP-DEC retention policy. IDs preserved (never reassigned). Format identical to active file. 
+ +## ACCORE-BIN-T-S8P4: Replace JSON-in-Binary request parameters +**Priority:** P1 · **Type:** Refactor · **Status:** Closed (2026-04-26, landed in commits `cdd54d3` 2026-04-05 + `3b70070` 2026-04-06) · **Related:** `../XCUT/XCUT_ISSUES.md#accore-xcut-i-x8q1` (canonical), `AyCode.Services/docs/SIGNALR/SIGNALR_TODO.md` + +Migrate client→server request parameters from JSON-in-Binary envelope to direct Binary serialization (matching response path). Coordinated change across client, server, and all consuming projects. Do NOT attempt as side-effect of unrelated work. + +**Acceptance:** `SignalPostJsonDataMessage` replaced by a `SignalPostBinaryDataMessage` (or equivalent); no JSON round-trip on the wire for request params; benchmarks confirm no regression. + +### Resolution +- **What:** Length-prefixed, per-parameter binary format introduced via `SignalRSerializationHelper.SerializeParametersToBinary` / `DeserializeParametersFromBinary`; further unified into `SignalParams` (single `byte[]` carrying packed method parameters with `SetParameterValues` / `GetParameterValues`). +- **Where:** `AyCode.Services/SignalRs/AcSignalRClientBase.cs`, `AcWebSignalRHubBase.cs`, `ISignalParams.cs` (server + client dispatch); `IAcSignalRHubClient.cs` (legacy wrappers). +- **Equivalent (not literal `SignalPostBinaryDataMessage`):** `SignalParams` was chosen over a 1:1 binary wrapper class — fewer indirections on the hot path, type-safe pack/unpack, and `DataSerializerType` field on `SignalReceiveParams` for response format indication. +- **Wire impact:** No JSON round-trip on the wire for request params; this is a **breaking change** vs. previous JSON-in-Binary clients/servers (see commit message). 
+- **Legacy types:** `SignalPostJsonMessage`, `SignalPostJsonDataMessage`, `SignalPostMessage`, `ISignalPostMessage` all marked `[Obsolete]` in `IAcSignalRHubClient.cs`; deletion tracked separately in `AyCode.Services/docs/SIGNALR/SIGNALR_TODO.md#accore-sig-t-s3n8` (gated on consumer migration). diff --git a/AyCode.Core/docs/BINARY/BINARY_TODO_2026_05.md b/AyCode.Core/docs/BINARY/BINARY_TODO_2026_05.md new file mode 100644 index 0000000..de355f7 --- /dev/null +++ b/AyCode.Core/docs/BINARY/BINARY_TODO_2026_05.md @@ -0,0 +1,708 @@ +# BINARY — TODO archive (2026-05) + +Archived entries from `BINARY_TODO.md` per LLMP-DEC retention policy. IDs preserved (never reassigned). Format identical to active file. + +## ACCORE-BIN-T-N9G6: Add non-generic `Type`-based `Serialize(object, Type, ...)` overloads +**Priority:** P2 · **Type:** Feature · **Status:** Closed (2026-05-04) · **Related:** `ACCORE-BIN-T-T8K3` + +### Resolution + +Added in `AcBinarySerializer.cs`: +- `Serialize(object?, Type, opts)` → `byte[]` +- `Serialize(object?, Type, IBufferWriter, opts)` → `int` +- `SerializeChunked(object?, Type, PipeWriter, opts)` → `int` +- `SerializeChunkedFramed(object?, Type, PipeWriter, opts)` → `int` + +`AcBinaryDeserializer.cs` already had `Deserialize(byte[], Type, opts)` / `Deserialize(ReadOnlySequence, Type, opts)` / `Deserialize(AsyncPipeReaderInput, Type, opts)` overloads — no new entries needed. + +**Layering note**: `PipeReader → AsyncPipeReaderInput` drain-loop is the consumer's responsibility, not the binary serializer's. The serializer surface ends at `AsyncPipeReaderInput`; transport-specific draining (PipeReader, NamedPipe, SignalR `state.Buffer.Write`, etc.) lives in the consumer layer (e.g. `AcBinaryInputFormatter`, `AcBinaryHubProtocol.TryParseChunkData`). + +Consumed by ASP.NET Core MVC formatter package (`AyCode.Services/Mvc/`) — `AcBinaryInputFormatter`, `AcBinaryOutputFormatter`, `AddAcBinaryFormatters` extension. Media type: `application/vnd.acbinary`. 
Drain-loop inlined in `AcBinaryInputFormatter.ReadRequestBodyAsync`. + +Plugin frameworks, ASP.NET ModelBinding, DI middleware, and DataContractSerializer-style "generic-API container" use-cases need to serialize an `object` whose type is known only at runtime. Current AcBinary surface forces a reflection trampoline through the generic `Serialize<T>`: + +```csharp +// Today's workaround (slow + noisy): +typeof(AcBinarySerializer).GetMethod("Serialize", new[] { type, typeof(AcBinarySerializerOptions) }) + .MakeGenericMethod(type).Invoke(null, new[] { value, options }); +``` + +**Implementation outline:** +- `public static byte[] Serialize(object? value, Type type, AcBinarySerializerOptions? options = null)` +- `public static int Serialize(object? value, Type type, IBufferWriter<byte> writer, AcBinarySerializerOptions? options = null)` +- `public static int SerializeChunked(object? value, Type type, PipeWriter writer, AcBinarySerializerOptions? options = null)` and `Pipe` overload +- `public static int SerializeChunkedFramed(object? value, Type type, PipeWriter writer, AcBinarySerializerOptions? options = null)` and `Pipe` overload +- `public static ValueTask SerializeAsync(object? value, Type type, Stream stream, ...)` — coordinated with `ACCORE-BIN-T-T8K3` +- Internal dispatch: `value.GetType()` is the runtime type; the `Type type` parameter constrains the **declared** type for polymorphism handling (`ObjectWithTypeName` write decision). + +**Acceptance:** +- All non-generic overloads round-trip via the generic deserializer's `Deserialize(byte[], Type)` overload. +- Plugin-style scenario: serialize `IList` of mixed-type elements → all elements correctly typed in the wire output. +- API doc-strings call out the performance characteristics (slightly slower than generic due to runtime `Type` lookup but **without** the reflection trampoline cost). 
+ +## ACCORE-BIN-T-H2Q6: Fixed-width dual-length string header (Small/Medium/Big) for 1-pass decode +**Priority:** P1 · **Type:** Wire-format + Performance · **Status:** Closed (2026-05-06) · **Related:** `DecodeUtf8SinglePass`, `CountUtf8Chars`, `WriteStringWithDispatch`, `ReadStringUtf8` + +Current Compact string decode uses two-pass flow for non-ASCII payloads (`CountUtf8Chars` + `DecodeUtf8SinglePass`). +Planned direction: remove VarUInt-based string-length path for the new string wire variant, and carry both lengths in a fixed-width header so deserialize can allocate target `string` immediately and decode in a single pass. + +### Planned format tiers + +- **Small**: packed `uint16` (`charLen:8 | utf8Len:8`) +- **Medium**: packed `uint32` (`charLen:16 | utf8Len:16`) +- **Big**: `uint32 charLen + uint32 utf8Len` + +Writer picks the smallest fitting tier; reader dispatches by marker and reads fixed-width lengths (no VarUInt loop for string length metadata). + +### Why + +- Removes `CountUtf8Chars` pass on the new markers (1-pass decode path) +- Keeps decode branch profile stable (fixed-size header reads) +- Maintains range safety with explicit Big overflow path + +### Constraints captured from current benchmark context + +- Performance evaluation target is non-ASCII-heavy data (ASCII-shortcuts intentionally not primary) +- Wire-format backward compatibility is not required for this development phase + +### Marker layout decision (2026-05-06) + +After analysis on the new "all UTF-8 Magyar" benchmark baseline (`2026-05-06_13-10-30.LLM` — Compact +5-25% slower than MemPack on every cell): + +**Confirmed**: the previous benchmark's Compact-vs-MemPack advantage was an artifact of ASCII property names hitting the `FixStrAscii` / Latin1-widen fast path; once string property values are also UTF-8 Magyar, the actual hot path (`EncodeUtf8SinglePass` + two-pass `CountUtf8Chars` + `DecodeUtf8SinglePass`) becomes the bottleneck. 
+ +**Marker scope decision** — clean split between ASCII fast path and non-ASCII tier dispatch: + +**MEGMARAD (changeless)**: +- `FixStrAscii` (≤31 byte ASCII) — kompakt 1-byte header + Latin1 widen, zero UTF-8 decode pipeline +- `StringAscii` (>31 byte ASCII) — long ASCII fast path, Latin1 widen +- `StringInternRef` — 2nd+ occurrence of interned string (no body, just cache index — not affected by 2-pass problem) +- `StringEmpty`, `Null` — sentinel markers + +**MEGSZŰNIK (replaced by H2Q6 tiers)**: +- `FixStr` (32 marker values 103-134 — non-ASCII short) → replaced by `StringSmall` +- `String` (1 marker value 91 — non-ASCII long with VarUInt utf8Len) → replaced by `StringSmall` / `StringMedium` / `StringBig` +- `StringInternFirst` (1 marker value 94 — VarUInt utf8Len interning) → replaced by `StringInternFirstSmall` / `StringInternFirstMedium` + +**ÚJ markers** (5 total): +- `StringSmall` — non-ASCII, `[marker:1][charLen:8][utf8Len:8][bytes]`, utf8Len ≤ 255 +- `StringMedium` — non-ASCII, `[marker:1][charLen:16][utf8Len:16][bytes]`, utf8Len ≤ 65535 +- `StringBig` — non-ASCII, `[marker:1][charLen:32][utf8Len:32][bytes]`, utf8Len > 65535 +- `StringInternFirstSmall` — `[marker:1][cacheIdx:VarUInt][charLen:8][utf8Len:8][bytes]` +- `StringInternFirstMedium` — `[marker:1][cacheIdx:VarUInt][charLen:16][utf8Len:16][bytes]` + +**Trade-off justification**: +- Wire cost on short non-ASCII strings: +2 byte/string header (3 vs 1) → ~0.07-0.36% wire growth on Repeated cell (10 short Magyar string × 2 byte / 28 KB) +- CPU saving: `CountUtf8Chars` Pass 1 eliminated on every non-ASCII string decode → directly attacks the +25% Deser baseline gap +- The 2-byte hybrid `FixStr` (non-ASCII) variant (1 byte marker + 1 byte charLen) was considered but **rejected**: marginal wire saving (-1 byte vs StringSmall) does not justify the +1 marker complexity given the tiny absolute wire impact on the Repeated cell. 
Cleaner to have ASCII-vs-non-ASCII at the marker level (FixStrAscii vs StringSmall/Medium/Big). + +**Interning tier sizing rationale**: +- `MaxStringInternLength` is `byte`-typed (`AcBinarySerializerOptions.cs:125`, default 64, abszolút max 255 char) +- Worst-case: 255 char × 4 byte/char (emoji-only) = 1020 byte → fits in Medium tier (utf8Len ≤ 65535) +- Realistic Magyar/CJK: 64 char × 2-3 byte = 128-192 byte → Small tier +- **Big tier never engages on the interning path** — only Small + Medium needed (+2 markers, not +3) + +### Marker address space reservation (post-H2Q6) + +The marker reorg frees **34 marker values** (32 `FixStr` non-ASCII + `String` + `StringInternFirst`). After allocating 5 for H2Q6, **29 values remain free**. Strategic reservation plan to prevent ad-hoc consumption and minimize future wire-format breaks: + +| Reserved range | Count | Future feature | Status | +|---|---|---|---| +| `StringSmall` / `StringMedium` / `StringBig` | 3 | H2Q6 Compact tiers | **active (this entry)** | +| `StringInternFirstSmall` / `StringInternFirstMedium` | 2 | H2Q6 interning tiers | **active (this entry)** | +| `FixArrayBase..FixArrayMax` | 16 | `ACCORE-BIN-T-L9Y3` (FixArray short-list count in marker) | reserved, future | +| Sentinel-length string tier markers | ~5 | `ACCORE-BIN-T-S5L8` (sentinel-length encoding) | reserved, future | +| Markerless schema lane | ~4 | `ACCORE-BIN-T-S2X9` (markerless schema lane opt-in) | reserved, future | +| `StringFastWire` | 1 | `ACCORE-BIN-T-F3W6` (dedicated FastWire string marker) | reserved, future | +| General reserve | 3 | unallocated | tartalék | + +**Wire-format version bump**: v2 → v3 at H2Q6 landing. The reserved-but-unimplemented marker values are documented but not yet decoded — readers throw `unknown marker` if wire contains them. 
Future activation of `FixArray` / sentinel-length / markerless schema lane within the **same v3 wire format** is non-breaking for already-deployed v3 consumers (they reject unknown markers cleanly; producers opt in to emit them). + +### Acceptance + +- New string markers implemented for Small/Medium/Big tiers + InternFirstSmall/InternFirstMedium tiers +- Deserialize path for these markers performs single-pass decode without `CountUtf8Chars` +- 29 freed marker values strategically reserved per the address-space reservation table; documented in `BinaryTypeCode.cs` with `// Reserved for ACCORE-BIN-T-XXXX (future)` comments +- Wire-format version bump v2 → v3 documented in `BINARY_FORMAT.md` +- Existing round-trip tests pass, plus new boundary tests for tier transitions (utf8Len = 254/255/256/65534/65535/65536) and interning tier transitions +- Benchmark report includes before/after for Compact mode on non-ASCII dataset (Ser/Deser/RT + Size) vs the `2026-05-06_13-10-30.LLM` baseline + +### Resolution + +Landed 2026-05-06. End-to-end implementation: marker reorg + writer tier-dispatch + reader tier-readers + SGen template + skip path + interning path. Five new markers (`StringSmall`/`Medium`/`Big`/`InternFirstSmall`/`InternFirstMedium`) replacing the old `String`/`StringInternFirst`/`FixStrBase..Max` (32 + 1 + 1 = 34 marker values freed, 5 used; 29 reserved for future features per the address-space plan). Wire format version bumped v2 → v3. + +Follow-up A-direction header pack-write/read optimization landed in the same window: `Unsafe.WriteUnaligned<ushort>` (Small) / `<uint>` (Medium) / `<ulong>` (Big) replace 2× byte / 2× ushort / 2× uint stores; reader uses single `uint`/`ulong` loads with bit-extract. Direct `ref byte` writes (no Span-shape overhead). + +**Tests:** 222 pass / 13 pre-existing GuidIId failures (unchanged). 55/55 Utf8TranscoderTests pass. 
+ +**Benchmark vs `2026-05-06_13-10-30.LLM` baseline (`2026-05-07_08-55-49.LLM`, immediately post-H2Q6):** +- Compact-vs-MemPack Deser ratio improvement on baseline gap: **-14 to -28 percentage points** across cells +- Deser: **4/5 cells now faster than MemPack** (Small -6%, Medium -3%, Large -9%, Deep -7%); Repeated cell remaining +5% gap (V4N2 Phase 3 SIMD multi-byte transcoder targets this) +- Wire size: **5/5 cells smaller than MemPack** (-8% to -11%) +- Ser: 1/5 win (Large -9%), 1/5 tie (Medium 0%), 3/5 minor lag (+2-7% Small/Repeated/Deep) — host-noise band + +**Bench evolution post-H2Q6** (subsequent micro-opts on the same H2Q6 base): +- `2026-05-07_09-39-09.LLM` — A irány header pack-write/read (`Unsafe.WriteUnaligned` ushort/uint/ulong): zaj-szintű mozgás, strukturális javulás +- `2026-05-07_15-13-39.LLM` — V4N4 Step 1+2 method-split (`AggressiveInlining`): **regresszió** (Small Ser +29.6 pp, Repeated Ser +8.9 pp) → `WriteStringSmallFast` túl-aggresszív inline-olás code-bloat / i-cache pressure +- `2026-05-07_15-29-21.LLM` — V4N4 finomított (NoInlining a SmallFast-ra, dispatcher hint nélkül, Reader split visszavonva): **konszolidált state**: + - **Ser**: 5/5 cell paritás-vagy-jobb (Small **-8.5%**, Medium ≈, Large **-8.5%**, Repeated ≈, Deep ≈) + - **Deser**: **4/5 cell faster than MemPack** (Medium -4.7%, Large **-10.6%**, Repeated **-3.8%**, Deep **-10.1%**); Small +10% remaining gap + - **Wire**: 5/5 cell -8% to -11% smaller (unchanged) + - **Net**: Compact mostantól 8/10 cellán nyer Compact vs MemPack; csak Small Deser-en marad +10% gap (kis abszolút érték, ~1 µs) + +**Critical algorithmic correctness lesson** (from V4N3 follow-up `GetUtf8ByteCount`): the initial 4-popcount formula assumed `lowSur == highSur` per chunk. Fix: 5-popcount closed-form. Caught by surrogate-pair-split-across-chunk regression tests. Documented in Utf8Transcoder. 
+ +**Marker address space (post-H2Q6, v3 wire):** +- 91 → StringSmall (was String) +- 94 → StringMedium (was StringInternFirst) +- 103 → StringBig +- 104 → StringInternFirstSmall +- 105 → StringInternFirstMedium +- 106..134 reserved (29 values: 16 for `L9Y3` FixArray, 5 for `S5L8` sentinel-length, 4 for `S2X9` markerless schema lane, 1 for `F3W6` FastWire dedicated marker, 3 reserve) + +**Related follow-up TODO entries (now Open):** `O7G2` (overflow guard), `S6F2` (shift-mentes Small fast path), `W2C8` (WASM string-cache H2Q6 maximalizálás). + +## ACCORE-BIN-T-M3R7: ASCII marker-dispatch — writer detect + reader dedicated path +**Priority:** P2 · **Type:** Performance + wire optimization · **Related:** `BinaryTypeCode.FixStrAsciiBase..StringAscii` markers, `WriteStringWithDispatch`, `ReadAsciiBytesAsString` +**Status:** Closed (2026-05-04) + +> **Sorrendi megjegyzés:** ezt **AZ ENCODER OPTIMALIZÁCIÓ UTÁN** csináljuk (lásd `ACCORE-BIN-T-E2F9`). Indok: a custom encoder/decoder Vector256 ASCII narrow/widen path-jai már magukban gyorsan kezelik az ASCII byte-ot. A marker-dispatch ezen FELÜL csak a per-call dispatch-overhead spórolást hozza (no `Ascii.IsValid` scan, no decoder layer). Garantált win, de additív — méréstechnikailag tisztább a decoder/encoder utánra hagyni. + +The `FixStrAscii*` (135-166) and `StringAscii` (167) markers are defined in `BinaryTypeCode.cs` with helper methods (`IsAsciiString`, `IsFixStrAscii`, `EncodeFixStrAscii`, `DecodeFixStrAsciiLength`). Encoding/decoding logic NOT yet implemented — currently both writer and reader use the universal `String` / `FixStr` markers. + +### Implementation +- **Writer**: in `WriteStringUtf8` / `WriteFixStrDirect`, after UTF-8 encoding (D-2 path), check `bytesWritten == charLength` (= ASCII iff equal). If ASCII, emit `FixStrAscii` (≤31 byte) or `StringAscii` (>31 byte). Else emit existing `FixStr` / `String`. Free detect — both numbers already computed by D-2. 
+- **Reader**: in `ReadStringUtf8` (or upstream marker dispatch), branch on marker. ASCII markers → dedicated byte→char widening path (no UTF-8 decode, no `Ascii.IsValid` scan, no decoder dispatch). Non-ASCII markers → existing custom UTF-8 decoder. +- **SGen**: regenerate readers/writers to dispatch on the new markers. +- **Re-enable ASCII fast paths**: uncomment writer FixStr dispatch in `AcBinarySerializer.cs` and reader `Ascii.IsValid` block in `ReadStringUtf8` — these temporarily disabled blocks become the marker-aware paths (no IsValid scan needed since the marker is the contract). + +### Wire format change +- Format version bump (1 → 2). Old readers fail clean on new wire (version mismatch). New readers must reject old wire OR support backward read. + +### Acceptance +- Repeated Strings (Hungarian content) Deser: AcBinary closes the ~10% gap vs MemoryPack +- Pure ASCII tests (Small/Medium/Large/Deep): AcBinary Ser AND Deser ≥ MemoryPack +- Wire size: minimum -25% vs MemoryPack across all test cells +- SGen-generated code compiles and round-trips on all `[AcBinarySerializable]` types +- Decision documented: backward-compat policy for v2 vs v1 wire + +### Resolution +End-to-end implementation landed (writer + reader + SGen + skip + populate). Key components: +- **Writer (`AcBinarySerializer.BinarySerializationContext.WriteStringWithDispatch`)** — single-pass UTF-8 encode + ASCII detect via `bytesWritten == charLength`; emits one of 4 markers (FixStrAscii / FixStr / StringAscii / String). Split layout for hot path: `charLength ≤ 31` encodes optimistically at `savedPos+1` (FixStr position) → 0 shift on FixStr hit; `charLength > 31` uses D-2 layout with backfill. The split avoids the post-encode left-shift that the unified layout introduced (regression seen in 12-42-32 bench). +- **Reader (`AcBinaryDeserializer.BinaryDeserializationContext.ReadAsciiBytesAsString`)** — `Encoding.Latin1.GetString` (BCL SIMD-accelerated byte→char widen). 
Avoids the `string.Create` callback + scalar widen overhead — measurably better on Small Deser cell (closed the +20% MemPack-relative anomaly). +- **TypeReaderTable**: `StringAscii` (167) + 32 × `FixStrAscii` (135-166) readers registered. `IsFixStrAscii` / `StringAscii` fast paths in `PopulatePropertyWithMarker`, `ReadValue`, `SkipValue`. +- **SGen (`AcBinarySourceGenerator.EmitReadString`)** — regenerated readers branch on `IsFixStr` / `IsFixStrAscii` / `case StringAscii` per property. + +**Wire format version not bumped** — the new markers occupy previously-unused codepoints (135-167); old wire (without ASCII markers) is forward-compatible (readers handle both `String` and `StringAscii`). v1 stays. + +**Acceptance (AOT bench 13-40-29, MemPack-relative ratios — JIT noise eliminated):** +- ✅ AcBinary Ser AND Deser GYORSABB MemPack-nél MINDEN cellán (5/5) + - Small: Ser -8%, Deser -23% + - Medium: Ser -17%, Deser -30% + - Large: Ser -28%, Deser -32% + - Repeated: Ser -4%, Deser -9% + - Deep: Ser -24%, Deser -22% +- ✅ Wire size advantage: 2043-50419 byte (vs MemPack 3070-64986) = **-22% to -33%** across cells +- ✅ Round-trip tests: 167 pass (13 pre-existing failures are IId-tracking, unrelated to M3R7) + +**JIT vs AOT note**: earlier JIT-mode benchmarks (12-50-43 → 13-27-20 series) showed elevated ratios on Small/Repeated cells (1.0-1.2 range) that disappeared under AOT publish. The JIT-mode numbers reflect tier-up artifacts (inconsistent inlining of SGen-generated reader hot paths during the 1000-iteration measurement window), not a structural M3R7 property. AOT (NativeAOT / ILC) compiles deterministically with fixed inline decisions — the steady-state numbers above reflect the actual production performance. 
+ +## ACCORE-BIN-T-E2F9: Custom UTF-8 encoder (writer-side, symmetric with custom decoder) +**Priority:** P1 · **Type:** Performance · **Related:** decoder optimization (`AcBinaryDeserializer.BinaryDeserializationContext.Read.cs::DecodeUtf8SinglePass`) +**Status:** Closed (2026-05-04) + +> **Sorrendi megjegyzés:** ezt **A MARKER-DISPATCH ELŐTT** csináljuk (lásd `ACCORE-BIN-T-M3R7`). Indok: a custom encoder/decoder optimalizáció a "nehezebb, kevésbé biztos" win — a non-ASCII / mixed content workload-okat (Repeated Strings Hungarian) hozza be. A marker-dispatch utána már csak additív tisztítás a pure ASCII path dispatch-overhead-jén. + +Replace `Encoding.UTF8.GetBytes` calls in `WriteStringUtf8` / `WriteStringUtf8Internal` / `WriteFixStrDirect` (collectively the writer's UTF-8 encode path, post-D-2) with a hand-rolled SIMD encoder. Symmetric to the decoder optimization (V4N2 / Read.cs::DecodeUtf8SinglePass). + +### Layered structure (mirrors decoder) +- **Phase 1 — Vector256 ASCII narrow**: 16 chars (Vector256) → 16 bytes (Vector128) via `Vector256.Narrow`. ASCII detect via `(v & 0xFF80).ExtractMostSignificantBits() == 0` (any high bit on UTF-16 char). Break on first non-ASCII char. +- **Phase 2 — DWORD ASCII batch**: 4 chars at a time, OR-mask test, 4 bytes per iter when ASCII. +- **Phase 3 — Scalar multi-byte encode**: 1-byte (ASCII) / 2-byte (Latin extended) / 3-byte (BMP) / 4-byte (surrogate pair → supplementary plane) UTF-8 encoding via direct bit-extract. No fallback dispatch — input is trusted UTF-16 (string). +- Use `System.Text.Unicode.Utf8.FromUtf16` as fallback target for scalar correctness — or skip BCL entirely with manual bit-pack. + +### Why +`Encoding.UTF8.GetBytes` carries virtual-dispatch + encoder-fallback overhead even with SIMD ASCII fast path internally. Custom encoder skips this. ~15-30% Ser improvement on ASCII content, ~5-10% on non-ASCII (multi-byte path stays scalar). 
+ +### Trigger +- **NEXT** — implementation order P1 before marker-dispatch (M3R7) +- Re-evaluate if .NET 11 BCL UTF-8 GetBytes becomes faster (PR #120628 follow-up) + +### Acceptance +- Writer-side benchmark: ≥15% Ser speedup on ASCII content (Small/Medium/Large/Deep), ≥5% on non-ASCII (Repeated) +- Wire format unchanged (custom encoder produces same bytes as `Encoding.UTF8`) +- Round-trip tests pass + +### Resolution +Implemented as `EncodeUtf8SinglePass` in `AcBinarySerializer.BinarySerializationContext.cs` — three-phase layered encoder (Vector256 ASCII narrow + DWORD ASCII batch + scalar 1/2/3-byte BMP & 4-byte surrogate-pair). Bypasses `Encoding.UTF8.GetBytes` virtual-dispatch + encoder-fallback overhead. Trusted-input path — no validation pass on writer side (the input is a .NET `string` with valid UTF-16 surrogate pairs by construction). + +Used by `WriteStringUtf8` (D-2 single-pass with VarUInt backfill) and `WriteStringWithDispatch` (M3R7 marker-dispatch path). Wire format unchanged — the encoder produces the same bytes as `Encoding.UTF8.GetBytes`. + +Acceptance (per bench 12-50-43 → 13-27-20, MemPack-relative ratios on AcBinary Compact FastMode SGen): +- ✅ ASCII Ser ≥ MemPack on 4/5 cells (Small 0.94, Medium 0.80, Large 0.79, Deep 0.81) +- ⚠️ Repeated Ser ~1.04 (Hungarian, multi-byte path scalar) — see follow-up `ACCORE-BIN-T-H7K3` +- ✅ Round-trip tests pass (167 of 180; 13 pre-existing failures unrelated to encoder) + +## ACCORE-BIN-T-W7N5: Default-value omission policy — doc + optional opt-out +**Priority:** P2 · **Type:** Refactor + Documentation · **Status:** Closed (2026-05-04) — Won't Fix (JIT-only artifact) · **Related:** `BINARY_ISSUES.md#accore-bin-i-d9y2` (canonical issue) + +The serializer's `PropertySkip` (102) optimization saves 1 byte per default-valued property by omitting the full value from the wire — relying on the consumer-side type definition to have the same `default(T)`. 
This is a **latent correctness risk** documented in `ACCORE-BIN-I-D9Y2`. This entry tracks the mitigation plan; full failure-mode analysis lives in the issue. + +### Decision tree (TBD when implementing) + +1. **Doc-only**: position as a deliberate protobuf-style feature; consumer keeps type defaults stable across versions. Lowest cost, maximum benchmark wire-size advantage retained. +2. **Option flag**: `AcBinarySerializerOptions.OmitDefaults` boolean. Default `true` (preserves current behavior + benchmark numbers). `false` writes every property in full — opt-out for fragile-class-evolution scenarios. +3. **Both**: ship doc + flag. Default behavior unchanged; consumers who hit silent-corruption have an explicit opt-out. + +### Acceptance (when implementing) + +- `BINARY_FEATURES.md` adds a "Default-Value Omission" section documenting the semantic and the tradeoff (with cross-ref to `ACCORE-BIN-I-D9Y2`) +- If flag added: round-trip tests covering both `true` and `false`; benchmark comparison table showing wire-size delta on ASCII / Hungarian / DTO-heavy workloads +- Decision rationale recorded in `LLM_PROTOCOL_DECISIONS.md` (or a `### Resolution` block on the issue) once implemented + +## ACCORE-BIN-T-V4N3: Symmetric `GetUtf8ByteCount` API + writer-side BCL kihagyás (cold path) +**Priority:** P3 · **Type:** Performance · **Status:** Superseded (2026-05-08, by `ACCORE-BIN-T-K7M3`) — landed Closed 2026-05-06; subsequent A/B against modern `Utf8.FromUtf16` / `Utf8.ToUtf16` showed the BCL modern API outperforms the custom transcoder on every benchmark cell, leading to full hot-path switch in K7M3 · **Related:** `EncodeUtf8SinglePass`, `WriteStringUtf8Internal`, `PropertyMetadataBase.NameUtf8`, `ACCORE-BIN-T-K7M3` (hot-path BCL switch) + +Symmetric byte-count helper for `EncodeUtf8SinglePass`, paired with writer-side BCL `Encoding.UTF8.GetBytes` / `GetByteCount` removal across all cold-path call sites. 
`Utf8Transcoder.GetUtf8ByteCount(ReadOnlySpan<char>)` SIMD impl (Vector512 / Vector256 / Vector128 / scalar tier hierarchy, 5-popcount closed-form aggregation handling chunk-split surrogate pairs correctly). + +**Implementation summary**: +- `Utf8Transcoder.GetUtf8ByteCount` SIMD impl with closed-form `bytes = 3*N - ascii - c_lt_0x800 + highSur - 3*lowSur` aggregation +- `Utf8TranscoderTests` extended (29 new tests covering ASCII / Hungarian / CJK / emoji / boundary 0-64, plus surrogate-pair-split-across-SIMD-chunks regression coverage) +- `WriteStringUtf8Internal` (`BinarySerializationContext.cs:875`) refactored from BCL two-pass to single-pass D-2 layout (worst-case `length*4` allocate + `EncodeUtf8SinglePass` + VarUInt backfill); the `4×` worst-case capacity is amortized by the buffer growth doubling strategy (`Math.Max(buffer.Length*2, position+needed)` + ArrayPool bucket-rounding to next power-of-2) +- Cold path cleanup: `AcBinarySerializer.AnalyzeStringInternCandidates` (analysis log) and `PropertyMetadataBase.NameUtf8` ctor-once init both migrated to `Utf8Transcoder` + +### Resolution + +Landed 2026-05-06. All `Utf8TranscoderTests` pass (55/55). Binary test suite unchanged (222 pass / 13 pre-existing GuidIId failures, untouched). + +**Critical observation surfaced during the audit**: `WriteStringUtf8Internal` has only one caller (`WriteFixStrDirect`), and `WriteFixStrDirect` itself is **uncalled anywhere in the codebase** — no core call site, no SourceGenerator template hit (verified against `AcBinarySourceGenerator.cs` lines 706/724/1492/1514 — generator emits `WriteStringGenerated` and `context.WriteStringUtf8` (the public method at line 659, not `WriteStringUtf8Internal`)), no test, no reflection path. The V4N3 implementation therefore landed cleanly but its hot-path benchmark impact is limited to the two cold-path init sites. Dead-code disposition tracked as `ACCORE-BIN-T-V4N5`. 
+ +**Algorithmic correctness lesson** — the initial 4-popcount formula (`3*N - c_lt_0x80 - c_lt_0x800 - 2*highSur`) was wrong on chunks where a surrogate pair straddles the SIMD chunk boundary (it implicitly assumed `lowSur == highSur` per chunk, which is true over the whole well-formed string but NOT per chunk). Fix: 5-popcount closed-form (`3*N - ascii - c_lt_0x800 + highSur - 3*lowSur`), with the scalar tail using the same per-char accounting model (`i += 1` per char regardless of role; high → 4, low → 0, BMP → 3, two-byte → 2, ASCII → 1). Caught by `GetUtf8ByteCount_MultipleEmojiBoundary_MatchesBcl` and `GetUtf8ByteCount_BoundaryAsciiToEmoji_MatchesBcl` regression tests — exactly the `prefixLen` 1, 7 boundaries that exercise chunk-split surrogate pairs. + +### Superseded by `ACCORE-BIN-T-K7M3` (2026-05-08) + +The V4N3 audit measured the custom transcoder against the **legacy `Encoding.UTF8.GetBytes`** API and won. **Did NOT measure against the modern `System.Text.Unicode.Utf8.FromUtf16` / `Utf8.ToUtf16` static API** (.NET 7+, used by MemoryPack source-gen). Once `D9X3` stabilized the bench, a direct A/B revealed the BCL modern API outperforms the custom transcoder on **every** cell (Ser deficit -14 to -22pp, Deser flips from behind to ahead). All 8 hot-path call sites switched to BCL in `K7M3`. The `Utf8Transcoder.cs` file is fully commented out — preserved as historical reference. + +The V4N3 algorithmic correctness work (5-popcount surrogate-pair-split-across-chunks closed-form) remains a **valid algorithmic contribution**, but no longer load-bearing on the hot path. 
+ +## ACCORE-BIN-T-J5L9: Remove dead `WriteFixStrDirect` / `WriteStringUtf8Internal` (audit-surfaced uncalled methods) +**Priority:** P3 · **Type:** Refactor / hygiene · **Status:** Closed (2026-05-06) · **Related:** `BinarySerializationContext.cs` + +V4N3 audit surfaced two methods with no callers in the entire workspace: + +- `WriteFixStrDirect(string)` — public method, no call site (no core, no SourceGenerator template, no test, no reflection / Expression-compile) +- `WriteStringUtf8Internal(string)` — private method called only from `WriteFixStrDirect`'s non-ASCII fallback branch + +The pair forms a closed dead loop (`WriteFixStrDirect` → `WriteStringUtf8Internal`), but no entry point reaches `WriteFixStrDirect`. The public-API `WriteStringUtf8` (line 659) is the live equivalent and is called from the SourceGenerator template (polymorphism path: assembly-qualified type-name write). The hot-path string-write goes through `WriteStringWithDispatch` (line 734) which uses the M3R7 marker-dispatch — NOT through this dead pair. + +### Disposition options (decide pre-NuGet release) + +1. **Delete both methods** — pure dead-code cleanup; reduces public surface, removes maintenance burden, simplifies onboarding. Functionality is fully covered by `WriteStringWithDispatch` (M3R7 marker-dispatch — emits `FixStr` / `FixStrAscii` directly with proper ASCII detection via `bytesWritten == charLength` after `EncodeUtf8SinglePass`). +2. **Activate `WriteFixStrDirect` for property-name writes** — SGen could emit `WriteFixStrDirect(propName)` instead of `WriteStringWithDispatch(propName)` for known-short, often-ASCII property names — saving the marker-dispatch overhead. Requires SGen template change + benchmark validation that the saving is real (likely marginal — property names are typically <31 char ASCII, so M3R7 already takes the FixStrAscii fast path with one byte-write to `_buffer`). 
The pre-encoded `NameUtf8` byte[] on `PropertyMetadataBase` already provides a faster path (`WriteFixStrBytes` at line 853) which the SGen / runtime writer could use directly. +3. **Defer** — leave as-is, document as dead code, revisit when the codebase has another reason to touch this area. + +### Why P3 + +- No correctness or perf impact in either direction (dead code is dead — no consumer affected) +- Cleanup vs activation is a low-stakes choice; benchmark would decide if option 2 has real saving +- Surfaced during V4N3 work, not blocking the NuGet release + +### Acceptance + +- Decision recorded (delete / activate / defer) with rationale +- If "delete": grep across workspace confirms zero callers post-removal; binary test suite unchanged (still 235 pass / 13 pre-existing failures) +- If "activate": SGen template change + benchmark validation showing ≥ 2% Ser improvement on a representative cell (otherwise revert to "delete") +- Documentation in `BINARY_IMPLEMENTATION.md` updated (or remove the old reference if both methods deleted) + +### Trigger + +- Pre-NuGet release housekeeping pass +- Or: any future refactor that touches `BinarySerializationContext` string-write methods (then decide rather than leave the dead pair behind) + +### Resolution + +Disposition: **Delete (Option 1)**. Landed 2026-05-06 together with the H2Q6 marker reorg commit. Five dead methods removed in a single cleanup pass: + +- `WriteFixStrDirect(string)` — uncalled public method +- `WriteStringUtf8Internal(string)` — uncalled private method (only called from `WriteFixStrDirect`) +- `WriteFixStr(string)` — uncalled public method (audit surfaced; was originally listed as live) +- `WriteFixStrBytes(ReadOnlySpan)` — uncalled public method (audit surfaced) +- `WritePreencodedPropertyName(ReadOnlySpan)` — uncalled public method (audit surfaced) + +All five had zero call sites across core, SourceGenerator template, tests, and reflection. 
The hot-path string write continues through `WriteStringWithDispatch` (M3R7 + H2Q6 marker dispatch) and `WriteStringInternFirstWithDispatch` (interning tier dispatch). Public surface reduced; binary test suite unchanged (222 pass / 13 pre-existing GuidIId failures). + +## ACCORE-BIN-T-O7G2: Overflow guard on `charLength * 4` writer arithmetic + corrupted-wire `ReadStringBig` +**Priority:** P3 · **Type:** Defensive / safety · **Status:** Closed (2026-05-06) · **Related:** `WriteStringWithDispatch`, `WriteStringInternFirstWithDispatch`, `ReadStringBig`, `BinaryTypeCode.MaxStringCharLength` + +Defensive guards covering two latent failure modes in the H2Q6 string serialization paths: + +**Writer overflow (silent zero corruption)** — `charLength * 4` overflows `int` when `charLength > 0x1FFFFFFF` (~537M). At exactly `0x40000000` chars the multiplication wraps to **0**, causing: +- `EnsureCapacity(reserveHeader + 0)` to silently succeed (no buffer growth) +- `EncodeUtf8SinglePass(value, emptySpan)` to write 0 bytes, returning `bytesWritten = 0` +- The H2Q6 tier choice picks Small (`bytesWritten ≤ 255`), writing `[StringSmall][0][0]` to the wire +- **The string content is lost silently — no exception, wire claims an empty string** + +Other overflow values (e.g. `charLength = 600M` → `maxBytes` becomes negative) eventually surface as `ArgumentOutOfRangeException` from `Span.AsSpan(start, length)`, but the message ("length cannot be negative") is misleading and arrives after the buffer has already been partially mutated. + +**Reader corrupted wire (negative cast from oversized uint)** — in `ReadStringBig`, the wire-side `charLen:32` and `utf8Len:32` are read as `uint`, then cast to `int`. Corrupted or maliciously-crafted payloads with values > `Int32.MaxValue` produce negative ints, leading to `string.Create(negative, ...)` exceptions or position-state desync — at best a misleading message, at worst a partial decode with wire-position shifted incorrectly. 
+ +### Resolution + +Landed 2026-05-06 (this commit window). + +**Writer side** — `WriteStringWithDispatch` and `WriteStringInternFirstWithDispatch` each gain one method-entry guard: + +```csharp +var charLength = value.Length; +if ((uint)charLength > BinaryTypeCode.MaxStringCharLength) ThrowStringTooLong(charLength); +``` + +A single unsigned compare catches the overflow band; predict-friendly (always false on realistic input). The throw helper is `[MethodImpl(MethodImplOptions.NoInlining)]` so the JIT/AOT keeps the throw site out of the inlined hot path. The same `charLength` value is reused across the FastWire and Compact branches — no duplicate guard. + +**Reader side** — `ReadStringBig` gains a single bitwise-OR + sign-test: + +```csharp +var packed = context.ReadUInt64Unsafe(); +var charLength = (int)(uint)packed; +var byteLength = (int)(uint)(packed >> 32); +if ((charLength | byteLength) < 0) ThrowCorruptedBigWire(charLength, byteLength); +``` + +The OR + sign-test catches negative casts (any wire-side uint > `Int32.MaxValue` produces a negative int after cast; OR of two positives is positive, sign-test cheap). One instruction effective; predict-friendly. + +**New constant**: `BinaryTypeCode.MaxStringCharLength = 0x1FFFFFFF` (536_870_911 — largest charLength where `charLength * 4` fits in int). + +**Hot-path cost**: ~0% on realistic input — single unsigned compare on the writer, single OR + sign-test on the reader Big tier (Small/Medium readers untouched since their wire values are bounded by `byte` / `ushort` types and cannot overflow). Throw helpers `NoInlining` keep the inlined caller body compact. Tests 222 pass / 13 pre-existing failures unchanged. 
+ +### Why P3 + +- No correctness impact for realistic inputs (the overflow band is far outside any real DTO scenario) +- Defensive value: prevents silent data loss in the `charLength = 1.07G` zero-overflow edge case + provides clear error messages on out-of-range inputs +- Security value: corrupted/malicious wire payloads on the reader Big tier path are now caught early instead of producing inconsistent position state +- NuGet release professional-quality signal — explicit, defensive guards over silent-corruption paths + +## ACCORE-BIN-T-C5R8: Charset-parameterized benchmark workload (ASCII / Hungarian / CJK / Cyrillic / Mixed) +**Priority:** P2 · **Type:** Tooling / release-narrative · **Status:** Closed (2026-05-07) · **Related:** `BenchmarkTestDataProvider`, `AyCode.Core.Serializers.Console.Program.cs` (Settings → Charset submenu), `ACCORE-BIN-T-V4N2` (charset-specific optimization measurement target), `ACCORE-BIN-T-D9X3` (bench stabilization preceding this work) + +The current `BenchmarkTestDataProvider` hard-codes Hungarian (Latin extended 2-byte) content into the test DTOs. This produces a single workload-shape: **Hungarian mixed text with short 1-2 char 2-byte runs**. While Hungarian is a fine general-purpose i18n stress, it is **only one production-content profile** — and the optimization decisions ride on it implicitly (e.g. V4N2 Phase 2.5's 3-byte run do-while was deferred-on-2-byte-side because the Hungarian bench measured regression there, but its CJK-side value cannot be measured on the current data). + +A **charset-parameterized** benchmark workload — selectable from the interactive menu — would: + +- **Measure optimization value across realistic content profiles** — what wins on CJK content may not win on Hungarian, and vice versa. Without explicit per-charset measurement, optimization decisions become Hungarian-biased. 
+- **Surface release-narrative numbers credibly** — instead of "Compact beats MemPack on i18n payload" (single workload), claim "Compact vs MemPack: ASCII X%, Hungarian Y%, CJK Z%, Cyrillic W%, Mixed V%" — concrete numbers per content profile, NuGet-grade. +- **Enable workload-specific optimization audits** — V4N2 Phase 3 SIMD multi-byte transcoder targets CJK 3-byte content; without a CJK workload measurement, Phase 3 acceptance criteria cannot be validated. + +### Implementation outline + +#### 1. `BenchmarkTestDataProvider` refactor + +Hard-coded Hungarian strings (`KözösCímke`, `sötét`, `magyar`, `hetenkénti`, etc.) → **ASCII baseline values** (English equivalents: `SharedTag`, `dark`, `hungarian`, `weekly`). + +New static `LongStringSuffix` field — charset-aware suffix appended to a subset of property values: + +```csharp +public static class CharsetSuffixes +{ + public const string AsciiOnly = ""; // baseline — pure-English ASCII content + public const string Hungarian = " árvíztűrő tükörfúrógép"; + public const string CjkBmp = " 你好世界 こんにちは 안녕하세요"; + public const string Cyrillic = " Привет мир дорогой друг"; + public const string Mixed = " árvíz 你好 Привет 😀"; +} + +public static string LongStringSuffix { get; set; } = CharsetSuffixes.Hungarian; // default +``` + +Property values use the suffix dynamically: +```csharp +var description = "Product description" + LongStringSuffix; +``` + +The 5 charsets cover the realistic UTF-8 workload spectrum: +1. **Pure ASCII** — baseline; Phase 1 SIMD prefix widen + DWORD batch dominate; no multi-byte path engagement +2. **Hungarian** (Latin extended) — short 1-2 char 2-byte runs in mixed text; current default workload +3. **CJK BMP** — long homogeneous 3-byte runs; primary V4N2 Phase 2.5/3 win region +4. **Cyrillic** (Russian / etc.) — long 2-byte runs (different shape than Hungarian mixed); V4N2 Phase 2.5 may yet pay off here +5. 
**Mixed** (Hungarian + CJK + emoji) — full multi-tier coverage in one payload; surrogate-pair handling stress + +#### 2. `Program.cs` interactive submenu + +Before starting a benchmark run, prompt the user for charset choice: + +``` +Choose benchmark charset: + 1 — Pure ASCII (baseline) + 2 — Hungarian (Latin extended) [DEFAULT] + 3 — CJK BMP (Chinese / Japanese / Korean) + 4 — Cyrillic (Russian / etc.) + 5 — Mixed (Hungarian + CJK + emoji) +``` + +The choice → `BenchmarkTestDataProvider.LongStringSuffix = ...` before constructing test data. + +#### 3. Benchmark output header + +The markdown output header should reflect the selected charset: +``` +# AcBinary Benchmark Release 2026-05-07 16:00:00 +Charset: CJK BMP | Iterations: 1000 | Warmup: 10000 | ... +``` + +This makes per-charset bench files self-documenting — file names + content both encode the workload profile. + +#### 4. Round-trip tests unaffected + +`Utf8TranscoderTests` and other content-class unit tests (with their fixed Hungarian / CJK / emoji boundary inputs) are **untouched** — they remain fixed-content for regression coverage. Only the benchmark workload is charset-parameterized. 
+ +### Why P2 + +- **Release-narrative**: NuGet release credibility depends on measurable performance claims across realistic content profiles, not a single Hungarian-mixed workload +- **Optimization decision quality**: V4N2 Phase 2.5 / Phase 3 / future SIMD multi-byte work cannot be objectively validated without a CJK workload — current decisions have implicit Hungarian-bias +- **Consumer reproducibility**: external consumers can reproduce benchmark numbers on their own content profile (or contribute a new charset profile) + +### Acceptance + +- `BenchmarkTestDataProvider` refactored: ASCII baseline + `LongStringSuffix` static field with 5 predefined charset constants +- Interactive menu in `Program.cs` lets the user choose charset 1-5 before benchmark run; the chosen charset is recorded in the markdown output header +- Round-trip correctness verification still runs once-per-cell before warmup (existing `Verified: round-trip ...` line) — works on the active charset +- All 5 charsets produce valid round-trip on all benchmark cells (Small / Medium / Large / Repeated / Deep) +- Existing benchmark numbers (Hungarian-default) reproducible — choosing charset 2 from the menu yields the current 15:29:21-style results +- New CJK charset (option 3) produces measurable numbers (one bench run per charset documented in `Test_Benchmark_Results/`) + +### Trigger + +- Pre-NuGet release: per-charset numbers needed for the public performance-claim table +- Or: when V4N2 Phase 3 SIMD multi-byte transcoder work needs CJK-workload validation + +### Resolution + +Landed 2026-05-07 (after `ACCORE-BIN-T-D9X3` bench stabilization made sub-3% deltas measurable, which raised the value of charset-specific measurement). Implementation refined the original 5-charset proposal into a 6-charset list per user request (Latin1FixAscii + Latin1 short/long split for finer-grained Latin1 coverage): + +**1. 
`BenchmarkTestDataProvider` refactor** ✅ + +- New `CharsetSuffixes` static class with **6** const suffixes (one more than originally proposed): + - `Latin1FixAscii = ""` — empty suffix; baseline values stay short → FixStr fast-path stress (renamed from `AsciiOnly` per user request) + - `Latin1Short = " árvíztűrő tükörfúrógép"` (~24 char) — Hungarian short Latin1 mixed + - `Latin1Long = " árvíztűrő tükörfúrógép a magyar betűzés tesztje"` (~47 char) — **NEW**, exceeds the 32-char FixStr boundary on the suffix alone (user request) + - `CjkBmp`, `Cyrillic`, `Mixed` — as originally specified +- `LongStringSuffix` default = `CharsetSuffixes.Latin1Long` (backward-compatible in spirit with the prior fixed Latin1 default) +- All hard-coded Hungarian baseline values replaced with ASCII English equivalents: + - `KözösCímke` / `IsmétlődőCímke` / `MélyCímke` → `SharedTag` / `RepeatedTag` / `DeepTag` + - `közösfelhasználó` → `shareduser` (and variants); `közös` → `shared`; `MélyKategória` → `DeepCategory` + - `sötét` / `világos` → `dark` / `light`; `magyar` / `német` / `francia` → `hungarian` / `german` / `french` + - `hetenkénti` / `naponkénti` / `havonkénti` → `weekly` / `daily` / `monthly` + - Repeated cell long Hungarian baselines (`TermékNév_IsmétlődőTesztAdat_árvíztűrőtükörfúrógép`, `RaklapKód_IsmétlődőTesztAdat_árvíztűrő`) shortened to ASCII `ProductName` / `PalletCode` so the `EnsureAllStringsBypassFixStr` suffix-append actually applies (the prior >31-char baselines bypassed the suffix, leaving Repeated cell content fixed-Hungarian regardless of charset selection) +- The only Latin1/non-ASCII characters remaining in the file are inside the `CharsetSuffixes` const definitions themselves (intentional — those define the per-charset content profiles) + +**2. 
`Program.cs` interactive submenu** ✅ + +- New `[3] Charset` entry in the existing `Settings` submenu (next to `[1] Iteration` and `[2] WireMode`) — chose nested submenu over a top-level prompt to keep the main menu uncluttered +- `ShowCharsetSettingsMenu` lists the 6 charset constants with brief descriptions; selection sets `BenchmarkTestDataProvider.LongStringSuffix` and returns +- `GetCurrentCharsetName()` helper resolves the active suffix back to its constant name (returns `"Custom"` when programmatically set to a non-const value) + +**3. Benchmark output header** ✅ + +- `Charset:` field added to **3 output locations**: + - Console run header (interactive run line — `Layer: ... | Charset: CjkBmp | Iterations: ...`) + - `.LLM` markdown header (file-self-documenting) + - `.log` boxed banner (║ Charset: CjkBmp ║) + +**4. Round-trip tests unaffected** ✅ — `Utf8TranscoderTests` and other content-class unit tests use their own fixed boundary inputs; not touched by this change. Round-trip verification in the bench harness continues to run once-per-cell pre-warmup (`VerifyRoundTrip`) on the active charset. + +### Acceptance status + +- ✅ `BenchmarkTestDataProvider` refactored with ASCII baselines + `LongStringSuffix` field + 6 charset constants +- ✅ Interactive submenu lets the user choose charset 1-6; recorded in markdown output header (3 locations) +- ✅ Round-trip verification runs on the active charset (existing per-cell verify, charset-agnostic by design) +- ⚠️ "All 6 charsets produce valid round-trip on all benchmark cells" — design correctness implies this; not yet exercised on every (cell × charset) combination explicitly. Recommend running each charset once before declaring full validation. +- ❌ "Existing benchmark numbers (Hungarian-default) reproducible — choosing charset 2 yields the current 15:29:21-style results" — **NOT met**: the ASCII baseline refactor changes the numbers regardless of charset choice (shorter baselines + suffix-driven content vs. 
prior fixed Hungarian baselines). New `Latin1Short` ≠ prior fixed Hungarian default. This is intentional: the user explicitly chose a clean ASCII-baseline + charset-suffix design over preserving historical numerical comparability. +- ❌ "Choosing CJK produces measurable numbers documented in `Test_Benchmark_Results/`" — **NOT done in this commit window**; user has the menu and will run per-charset benches in a follow-up sprint. + +### Note on numerical incompatibility with prior runs + +Existing bench files generated before this commit (e.g. `Console.FullBenchmark_Release_2026-05-07_17-42-22.LLM` and earlier) used the prior fixed Latin1 baseline values + 32-char Hungarian suffix. The new default (`Latin1Long`) uses ASCII baselines + 47-char Latin1Long suffix; the Repeated cell sees a more dramatic shift (its 52-char fixed Hungarian baseline → 11-char ASCII `ProductName` + 47-char suffix). **Numerical comparison across the boundary is not meaningful**; the `Charset:` header field documents the source charset for each new bench file. + +### Future extensions + +- **Sentinel "real-world" charsets** — synthetic mixes representing typical production payloads (e.g. `EnglishWithEmoji` for chat-app DTOs, `ArabicHebrew` for RTL-script regions). Add as new `CharsetSuffixes` constants when consumer demand surfaces. +- **Charset auto-rotate mode** — single benchmark run cycles through all 6 charsets, producing a 6-section markdown output. Useful for full release-narrative table generation in one pass. +- **BDN integration** (per `ACCORE-BIN-T-B1D5`): charset becomes a `[Params]` axis in BenchmarkDotNet, producing a 5×6×N matrix (cells × charsets × engines) in the BDN output. 
+ +## ACCORE-BIN-T-D9X3: Console benchmark stabilization (per-serializer warmup + GC isolate + pilot discard + min/max range + CPU pin + mode-aware JIT sleep) +**Priority:** P1 · **Type:** Tooling / measurement · **Status:** Closed (2026-05-07) · **Related:** `AyCode.Core.Serializers.Console.Program.cs`, `ACCORE-BIN-T-V4N4`, `ACCORE-BIN-T-V4N2`, `ACCORE-BIN-T-S6F2`, `ACCORE-BIN-T-B1D5` (BDN release-grade variant) + +The custom `Console` benchmark harness showed strong run-to-run variance — a user-reported `+20pp / -10pp` swing of the summary total ("summa") between runs on identical code. 1-3% perf-claims became unmeasurable on this noise-floor; the V4N4 method-split and V4N2 Phase 2.5 attempts both fell into this band, leaving the question "does the regressed bench number reflect a code regression or measurement noise?" undecidable (see `V4N4` Reverted section). + +**Diagnosis** (sprint takeaway prior to this entry): + +1. **Warmup cache pollution** — `RunBenchmarksForTestData` ran one warmup-all loop (every serializer × WarmupIterations) followed by one bench-all loop. By the time a given serializer was measured, its hot code and data lines had been evicted by the intervening serializers' warmup passes. MemPack and AcBinary hot paths share neither code nor data working sets — they actively evict each other. +2. **GC pause leakage between samples** — the Stopwatch-recorded sample loop had no explicit `GC.Collect`. A minor GC triggered inside sample N could promote into a Gen-2 pause inside sample N+1's timed window (1-5 ms spike). +3. **Pilot sample contamination** — the first sample after warmup absorbed residual JIT bookkeeping and cold-cache misses; on a 10-sample median this contributed 1-2 outliers that visibly stretched the min/max. +4. **CPU migration / preemption** — the Windows scheduler migrated the bench thread between cores between samples (L1/L2 cache evict on each migration); background work (Defender index, OS service threads) injected random preemption spikes. +5. 
**JIT sleep not mode-aware** — `Thread.Sleep(JitSleep = 3000)` waited 3 seconds before each cell for tiered-JIT drain. On AOT publish (`PublishAot=true`) there IS NO dynamic compilation — the 3 seconds were pure idle. Worse, the drain happened only globally (once before all cells), not per-serializer, so a tier-promotion mid-bench could still bleed in. +6. **Range invisible** — the `.LLM` markdown output showed only the median; the user could not tell whether a 5%-median-delta was inside or outside the inter-sample range for that row. + +### Resolution + +Landed 2026-05-07 (16:00 — 17:00). Six stabilization steps in one commit window: + +**1. Per-serializer warmup separation** (`RunBenchmarksForTestData`) — the warmup-loop and bench-loop merged into one per-serializer cycle: each serializer's warmup runs IMMEDIATELY before its own bench. The serializer's hot code/data is freshest in cache when the first sample times. + +**2. `GC.Collect` before every sample** (`RunTimed`) — `GC.Collect() + WaitForPendingFinalizers() + GC.Collect()` triple-tap before each sample, OUTSIDE the Stopwatch window. Every sample starts from the same heap state; an ad-hoc Gen-2 pause from sample N can no longer bleed into sample N+1. + +**3. Pilot sample discard** (`RunTimed`) — the loop runs `samples + 1` times; the first (index 0) is discarded. The first sample post-warmup absorbs residual JIT/GC bookkeeping and cold cache; the recorded `samples` count remains 10 (median is the same data the user saw before, just sourced from "typical" sample-set, not from the post-warmup-first noisy point). + +**4. Min/max range in markdown output** (`SaveLlmResults`, new `FormatMicrosWithRange` helper, new `BenchmarkResult` fields: `SerializeTimeMinMs/MaxMs`, `DeserializeTimeMinMs/MaxMs`, `RoundTripTimeMinMs/MaxMs`) — the `.LLM` output's Ser and Deser columns now render as `26.86 (24.50..29.10)`: median (min..max) µs/op. The reader sees at a glance whether a delta is above the row's noise floor. + +**5. 
CPU affinity + process priority** (`RunBenchmark`) — `ProcessorAffinity = 0x1` (CPU 0 pin) + `PriorityClass = High` for the benchmark phase, `try/finally` restores the original values. Eliminates inter-sample thread migration (L1/L2 cache evicts) and reduces background-task preemption. Platform-guarded: Windows / Linux only (`CA1416` — `ProcessorAffinity` throws on macOS); locked-down hosts (group policy, container without `CAP_SYS_NICE`, etc.) catch + warning + bench continues with default scheduling. + +**6. Mode-aware `JitSleep`** (property) — `RuntimeFeature.IsDynamicCodeCompiled ? 250 : 0`. JIT mode 250 ms (the .NET 9 tiered-JIT compile queue typically drains in <100 ms for the bench's hot path); AOT publish 0 ms. The 3000 ms blind wait is gone. The drain now happens per-serializer (Step 1) instead of once globally. + +### Bench result (3 consecutive runs, 2026-05-07 17:00:32 / 17:01:03 / 17:01:32, FastestByte mode, FastMode preset) + +| Cell | AcBinary Ser median (3 runs) | Inter-run spread | Intra-cell range | +|---|---|---|---| +| Small | 7.09 / 6.83 / 6.55 | 7.6% | ~8% (noise floor: 1000 × ~6 µs measured) | +| Medium | 18.74 / 18.90 / 19.22 | 2.6% | ~10% | +| Large | 140.20 / 141.67 / 141.02 | 1.0% | ~3% | +| Repeated | 26.52 / 26.25 / 26.28 | 1.0% | ~6% | +| Deep Nested | 23.44 / 23.17 / 22.70 | 3.2% | ~7% | + +The previous `+20pp / -10pp` swing of the summary total ("summa") shrank to **1-3pp** on the medium/large cells. The Small cell remains noisy (~8% relative) but this is a physical floor: 1000 iter × ~6 µs/op ≈ 6 ms total batch — below this, Stopwatch resolution and OS spikes dominate relatively. + +The `(min..max)` range is consistently 3-10% relative — a **measurable** signal floor: 1-3% perf-deltas no longer disappear into noise. + +### Lessons + +- **Bench stabilization is a precondition for perf optimization, not a consequence.** Optimization decisions (e.g. V4N4 method-split, V4N2 Phase 2.5) can only be derived from bench numbers if the noise floor < expected signal. 
Without that, the bench numbers mean nothing. +- Cache pollution (warmup-all → bench-all flow) was the **single largest** noise source: per-serializer warmup separation alone removed ~10pp of variance. +- Platform stabilization (CPU pin + high priority) combined with heap stabilization (GC.Collect + pilot discard) further tightened the range. +- AOT and JIT have different stabilization needs: the 3000 ms blind sleep was idle time on AOT; mode-aware sleep pays the cost only when needed. + +### Re-evaluation list (entries currently Reverted or unmeasurable) + +The stabilization opens a follow-up sprint: the `Reverted (2026-05-07)` entries are re-evaluable now that the noise floor < the expected 1-3% signal: +- **`ACCORE-BIN-T-V4N4`** — method-split (writer + reader hot path) is re-testable +- **`ACCORE-BIN-T-V4N2` (Phase 2.5)** — UTF-8 do-while runs (2-byte / 3-byte) per charset +- **`ACCORE-BIN-T-S6F2`** — Small fast path (was integrated into V4N4) + +Per-entry re-evaluation is the next sprint's task, NOT part of this Closed entry. + +### Why P1 + +- Blocked all sub-3% perf optimization work (every recent attempt fell into the noise band) +- One-line user complaint ("+20 és -10 között ingadozott a summa" — Hungarian for "the total swung between +20 and -10") summarized weeks of unproductive bench-driven investigation +- One-time fixed cost; every future bench run benefits + +### Follow-up: adaptive iteration + CV reporting + per-cell A/B mode (2026-05-07, second commit window) + +After the initial 6-step landing, three additional refinements were added in a second commit window the same day. The trigger was a Copilot-suggested noise-reduction list against the now-stable bench output: + +**1. Per-cell adaptive iteration** — fixed `TestIterations = 1000` produced sample windows from 6 ms (Small cell @ ~6 µs/op) to 140 ms (Large cell @ 140 µs/op). 
The Small cell at 6 ms remained the dominant residual noise source (7.6% inter-run spread vs ≤3.2% on the other cells) because OS-level spikes (preempt + IRQ + scheduler tick) are absolute-time events; on a 6 ms sample window their relative contribution is huge. + +Implementation: +- New constant `TargetSampleMs = 250` (per-sample wall-clock target) +- New helper `CalibrateIterations(Action, int targetMs)` — runs a 100-iter probe post-warmup, computes `iterPerMs`, and rounds up to the nearest 1000. Floor 1000, ceiling 200_000. +- `RunBenchmarksForTestData` calibrates Ser and Des INDEPENDENTLY per serializer (different per-op cost). RT-only rows (NamedPipe) get a single RT calibration. +- New `BenchmarkResult` fields: `SerializeIterations`, `DeserializeIterations`, `RoundTripIterations` (per-row). +- New helpers: `ToPerOpMicros(double, int)` (replaces 1-arg variant), `SerPerOp(r)` / `DesPerOp(r)` / `RtPerOp(r)` for per-op µs from the result. +- All `Average(r => r.*TimeMs)` and `OrderBy(r => r.RoundTripTimeMs)` call-sites refactored to use per-op µs (iter-independent) — mixing batch-time across rows with different iter counts would be meaningless. ~20 call-sites total. +- RT for in-mem rows synthesized so `RtPerOp(r) == SerPerOp(r) + DesPerOp(r)` regardless of `serIter != desIter`: `RoundTripIterations = max(serIter, desIter)`, `RoundTripTimeMs = rtPerOpMicros / 1000 * RoundTripIterations`. + +**Expected impact**: Small cell sample window 6 ms → ~240 ms; inter-run spread 7.6% → ~1-2% (matching the other cells). Total suite duration ~50 s → ~110-130 s. + +**2. CV (coefficient of variation) reporting + unstable-row marker** — the median + (min..max) range surfaces shape but not a single-number stability metric. The CV (= stddev/mean) is the standard statistical measure; rows with CV > threshold are flagged with a ⚠️ suffix in the markdown output so a small inter-engine delta on a high-CV row is immediately obvious as noise-suspect. 
+ +Implementation: +- New constant `UnstableCVThreshold = 0.03` (3% — reasonable for stabilized in-memory benchmarks) +- `RunTimed` return tuple extended: `(median, min, max, stddev)`. Stddev computed over the (samples − pilot) population using `Math.Sqrt(Math.Max(0, E[X²] - E[X]²))`. +- New `BenchmarkResult` fields: `SerializeTimeStdDevMs`, `DeserializeTimeStdDevMs`, `RoundTripTimeStdDevMs`. +- `FormatMicrosWithRange` extended: `26.86 (24.50..29.10)` stays the default; `26.86 (24.50..29.10) ⚠️5.2%` appears when CV exceeds the threshold. + +**3. Per-cell A/B mini-suite filter** — optimization-iteration loops often need only one specific cell (e.g. "tuning the Repeated cell for Hungarian charset"). The full 5-cell × 2-engine × 4-measurement suite is overkill for that. + +Implementation: +- `FilterByLayer` extended: new `small` / `medium` / `large` / `repeated` / `deep` modes — case-insensitive prefix match on `TestDataSet.Name` +- `TryParseCliArgs` recognizes the new tokens: `dotnet run -- repeated` runs only the Repeated Strings cell +- `fastestbyte` mode (existing — only AcBinary FastMode + MemoryPack head-to-head) is orthogonal and stacks: `dotnet run -- repeated fastestbyte` + +### Markdown output schema change + +The `## Results` table gains an `Iter Ser/Des` column at the right edge — visible verification that each row's batch landed near the `TargetSampleMs` window. RT-only rows show a single `Iter` value (the RT calibration count); in-mem rows show `serIter / desIter`. + +Header line updated: +- Before: `Iterations: 1000 | Warmup: 10000 | Samples: 10 (median) | ...` +- After: `Iterations: per-cell adaptive (target ~250 ms/sample) | Warmup: 10000 | Samples: 10 (median) + 1 pilot discarded | ... 
| UnstableCV threshold: 3%` + +## ACCORE-BIN-T-K7M3: Hot-path UTF-8 transcoder switch — `Utf8Transcoder` → BCL `Utf8.FromUtf16` / `Utf8.ToUtf16` +**Priority:** P1 · **Type:** Performance · **Status:** Closed (2026-05-08) · **Related:** `ACCORE-BIN-T-V4N3` (custom transcoder origin), `ACCORE-BIN-T-V4N2` (Phase 3 SIMD multi-byte), `ACCORE-BIN-T-V4N4` (Reverted method-split), `ACCORE-BIN-T-D9X3` (bench stabilization that made the comparison measurable) + +The custom `Utf8Transcoder` (V4N3) was originally implemented to bypass `System.Text.Encoding.UTF8.GetBytes` virtual-dispatch + EncoderFallback overhead. The V4N3 audit measured wins vs. the **legacy `Encoding.UTF8`** API. **What it did NOT measure**: the modern `System.Text.Unicode.Utf8.FromUtf16` / `Utf8.ToUtf16` API (.NET 7+, tier-1 optimized, used by MemoryPack `WriteUtf8` / `ReadUtf8` paths internally). Once the bench stabilized (D9X3), a direct A/B comparison surfaced that the BCL modern API consistently outperforms the custom transcoder on the binary serializer's hot path. 
+ +### Bench A/B (Latin1Long charset, FastMode SGen Compact) + +| Cell | Ser delta vs MemPack — custom (`EncodeUtf8SinglePass`) | Ser delta vs MemPack — BCL (`Utf8.FromUtf16`) | Improvement | +|------|--------------------------------------------------------|------------------------------------------------|-------------| +| Small | +28.5% | +7.3% | **-21pp** | +| Medium | +23.8% | +3.1% | -21pp | +| Large | +19.6% | +5.1% | -14pp | +| Repeated | +28.8% | +10.9% | -18pp | +| Deep | +23.1% | +0.6% | -22pp | + +| Cell | Deser delta vs MemPack — custom (`DecodeUtf8SinglePass`) | Deser delta vs MemPack — BCL (`Utf8.ToUtf16`) | Improvement | +|------|---------------------------------------------------------|------------------------------------------------|-------------| +| Small | +17.6% | -1.2% (parity) | -19pp | +| Medium | +12.8% | -4.7% (AcBinary wins) | -17pp | +| Large | +4.9% | -10.3% (AcBinary wins) | -15pp | +| Repeated | +16.9% | -1.6% (parity) | -18pp | +| Deep | +7.0% | -9.0% (AcBinary wins) | -16pp | + +The Deser side flipped from "consistently behind" to "wins on 3 of 5 cells, parity on 2". The Ser side closed the deficit from +20-29% to 0-11%. **Both sides** show measurable improvement on **every** cell. + +### Why the custom transcoder lost + +The V4N3 implementation included a 4-tier SIMD ASCII prefix path (Vector512BW / Vector256 / Vector128 / scalar) plus a DWORD ASCII batch + scalar 4-branch multi-byte fallback. **All correct, all SIMD-tuned**. But: + +1. **`Utf8.FromUtf16` is also SIMD-tuned in .NET 9** — the .NET team rewrote it on top of `System.Text.Unicode.Utf8` primitives that share infrastructure with `Ascii.IsValid` / `Latin1.GetString`. AOT-publish-friendly, branch-friendly, no virtual dispatch (the `Utf8` API is static, not via an `Encoding` instance with virtual-method-table). +2. 
**The custom transcoder's ASCII prefix path bails out on first non-ASCII byte** — on multi-byte content (Latin extended / Cyrillic / CJK) the SIMD path runs only for the leading ASCII span, then the entire remainder falls into per-char scalar 4-branch dispatch. The BCL `Utf8.FromUtf16` SIMD-batches multi-byte content too (different algorithm — the BCL doesn't bail on first non-ASCII). +3. **AOT inline budget**: the custom transcoder's body grew with the V4N3 / V4N4 / V4N5 additions; in NativeAOT publish the call sites in `WriteStringWithDispatch` / `ReadString*` did NOT inline (V4N4 disasm audit confirmed). The BCL `Utf8.FromUtf16` is a single static method with a tighter call-site footprint. + +### Resolution + +Landed 2026-05-08. The 8 production hot-path call sites of `Utf8Transcoder.*` switched to BCL: + +| File / line | Before | After | +|---|---|---| +| `AcBinarySerializer.cs:120` | `Utf8Transcoder.GetUtf8ByteCount` | `Encoding.UTF8.GetByteCount` | +| `AcBinarySerializer.BinarySerializationContext.cs:694` | `Utf8Transcoder.EncodeUtf8SinglePass` | `Utf8.FromUtf16(...)` | +| `AcBinarySerializer.BinarySerializationContext.cs:784` | `Utf8Transcoder.EncodeUtf8SinglePass` | `Utf8.FromUtf16(...)` | +| `AcBinarySerializer.BinarySerializationContext.cs:901` | `Utf8Transcoder.EncodeUtf8SinglePass` | `Utf8.FromUtf16(...)` | +| `AcBinaryDeserializer.BinaryDeserializationContext.Read.cs:523` | `Utf8Transcoder.CountUtf8Chars` | `Encoding.UTF8.GetCharCount` | +| `AcBinaryDeserializer.BinaryDeserializationContext.Read.cs:527` | `Utf8Transcoder.DecodeUtf8SinglePass` | `Utf8.ToUtf16(...)` | +| `AcBinaryDeserializer.BinaryDeserializationContext.Read.cs:565` | `Utf8Transcoder.DecodeUtf8SinglePass` | `Utf8.ToUtf16(...)` | +| `PropertyMetadataBase.cs:104-109` (ctor-once) | `Utf8Transcoder.GetUtf8ByteCount` + `EncodeUtf8SinglePass` (two-pass) | `Encoding.UTF8.GetBytes(string)` (single-pass with exact-size byte[] return) | + +The count-only call sites (`GetByteCount` / 
`GetCharCount`) stay on the **legacy** `Encoding.UTF8` API — `System.Text.Unicode.Utf8` has no count-only equivalent (only `FromUtf16` / `ToUtf16` which encode + count combined). For pure count, the legacy API is the optimal tool (single SIMD-tuned scan, no encode/decode work). + +The `Utf8Transcoder.cs` file remains in the repo but **fully commented out** — the class definition is preserved as a historical reference and for possible future reactivation if a workload ever surfaces where it could win again. `Utf8TranscoderTests.cs` is not currently exercising live code. + +### Lesson — the V4N3 audit's blind spot + +The V4N3 (custom transcoder) audit compared against **legacy `Encoding.UTF8.GetBytes`** and won. **The audit did NOT compare against `Utf8.FromUtf16`** (the modern API, .NET 7+). On modern runtimes, the BCL has two UTF-8 transcoders: a legacy one (instance-method on `Encoding`, virtual dispatch) and a modern one (static `Utf8.FromUtf16` / `Utf8.ToUtf16`). MemoryPack uses the modern one — that's what we should have been comparing against from the start. + +**Generalizable lesson**: when measuring a custom implementation against a "BCL baseline", verify which BCL API is used by the actual competition (here: MemoryPack source-gen). The `Encoding.UTF8.*` instance API and `System.Text.Unicode.Utf8` static API are different generations of the same logical operation; treating them as interchangeable hides the comparison's scope. 
+ +### Why P1 + +- Closed the FastMode Compact mode Ser deficit from +20-29% to ≤11% on every cell (Latin1Long benchmark) +- Flipped the Deser side from a +5-18% deficit to **AcBinary winning on 3 of 5 cells**, parity on 2 (Latin1Long benchmark) +- One-time fixed cost (8 production call-site replacements) — every future bench profits +- Removed a load-bearing ~600-line custom SIMD module from the maintained surface area; future maintainers don't need to reason about Vector512BW / cross-lane shuffle / 5-popcount surrogate-pair correctness — the BCL handles it + +### Follow-up — `Utf8Transcoder.cs` cleanup + +The file is fully commented out. Either: +- **Delete** entirely (preferred for repo cleanliness) — `Utf8TranscoderTests.cs` then needs deletion or revival as a regression-only guard +- **Keep** the comment-block as historical reference, with a header comment pointing to this entry + +Decision deferred — the comment-block does no harm to build / runtime. Address when the next docs-archive sweep runs. diff --git a/AyCode.Core/docs/BINARY/README.md b/AyCode.Core/docs/BINARY/README.md index f97678c..c1c060a 100644 --- a/AyCode.Core/docs/BINARY/README.md +++ b/AyCode.Core/docs/BINARY/README.md @@ -11,6 +11,7 @@ AcBinary serialization system. 
Primary goal: **speed** (two-phase scan+serialize - [`BINARY_IMPLEMENTATION.md`](BINARY_IMPLEMENTATION.md) — Internal implementation details - [`BINARY_WRITERS.md`](BINARY_WRITERS.md) — Writer internals (streaming, buffering) - [`BINARY_SGEN.md`](BINARY_SGEN.md) — Source generator (`AyCode.Core.Serializers.SourceGenerator`) +- [`BINARY_SGEN_OPTIMIZATION.md`](BINARY_SGEN_OPTIMIZATION.md) — SGen per-property emit micro-optimization brainstorming / methodology notes (working doc, not a TODO) - [`BINARY_ISSUES.md`](BINARY_ISSUES.md) — Known issues and limitations (binary serializer core) - [`BINARY_TODO.md`](BINARY_TODO.md) — Planned work / open tickets (binary serializer core) - [`BINARY_ASYNCPIPE_ISSUES.md`](BINARY_ASYNCPIPE_ISSUES.md) — Known issues and limitations (streaming I/O layer: `AsyncPipeReaderInput` + `AsyncPipeWriterOutput`)