From abee22b31a72d734625cfe711c05bdf2f2a5a482 Mon Sep 17 00:00:00 2001 From: Loretta Date: Wed, 6 May 2026 13:52:35 +0200 Subject: [PATCH] [LOADED_DOCS: 3 files, no new loads] SIMD Utf8Transcoder.GetUtf8ByteCount + test suite Introduced SIMD-accelerated Utf8Transcoder.GetUtf8ByteCount for efficient UTF-8 byte counting, replacing all writer-side Encoding.UTF8.GetByteCount usages. Added 29 unit tests for correctness across ASCII, Hungarian, CJK, emoji, and boundary cases. Updated benchmarks to ensure FixStr is bypassed and wire mode is selectable. Documented implementation and dead-code review in BINARY_TODO.md. No public API changes. --- .github/copilot-instructions.md | 1 - AyCode.Core.Serializers.Console/Program.cs | 112 +++++++-- .../Serialization/Utf8TranscoderTests.cs | 221 ++++++++++++++++++ .../TestModels/BenchmarkTestDataProvider.cs | 82 +++++++ .../Binaries/AcBinarySerializer.cs | 2 +- .../Serializers/Binaries/Utf8Transcoder.cs | 175 ++++++++++++++ .../Serializers/PropertyMetadataBase.cs | 10 +- AyCode.Core/docs/BINARY/BINARY_TODO.md | 192 +++++++++++++++ 8 files changed, 777 insertions(+), 18 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index e7d72af..032f158 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -166,5 +166,4 @@ Full doctrine: `../docs/ARCHITECTURE.md#framework-vs-consumer-boundary` 19. **Documentation layering** — write `.md` documentation at the **defining layer** (where the code lives). Higher-layer `.md` files reference the base docs (e.g. `see AyCode.Services/docs/SIGNALR/README.md`) and document only project-specific overrides or extensions. Never duplicate base-layer descriptions in consumer-level docs. 20. **Do not re-read .md files** already in your context window. They only change if you modify them yourself (new content is already in context) or if the developer tells you they changed — in that case re-read them once. 21. 
**Folder navigation** — start from the root `README.md` for solution-level navigation. When you need to understand a folder's contents or find a type/class, read the `README.md` in that folder first — it indexes the local files and sub-folders. Follow this before grepping or reading source files. - 22. **Language Preference**: Communicate in Hungarian as requested by the user. diff --git a/AyCode.Core.Serializers.Console/Program.cs b/AyCode.Core.Serializers.Console/Program.cs index ed4ca4a..2ddba06 100644 --- a/AyCode.Core.Serializers.Console/Program.cs +++ b/AyCode.Core.Serializers.Console/Program.cs @@ -47,9 +47,13 @@ public static class Program #else private static int WarmupIterations = 10000; //5000 private static int TestIterations = 1000; //1000 - private static int BenchmarkSamples = 3; + private static int BenchmarkSamples = 5; #endif + // Interactive settings: selected AcBinary wire mode for benchmark runs. + // 1 = Compact, 2 = Fast + private static WireMode SelectedWireMode = WireMode.Compact; + // Serializer name constants // Engine identifiers (used in Engine column + comparison logic) private const string EngineAcBinary = "AcBinary"; @@ -480,21 +484,22 @@ public static class Program private static List CreateSerializers(TestDataSet testData, string serializerMode) { // FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path. - // THREE benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + AcBinary FastMode Byte[] - // (WireMode.Fast = UTF-16 raw memcpy) + MemoryPack Byte[]. Shows BOTH sides of AcBinary's - // positioning vs MemPack: - // - Compact: smallest wire, UTF-8 encode/decode CPU cost - // - Fast (UTF-16 raw): comparable wire to MemPack, no encoding cost + // TWO benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + MemoryPack Byte[]. + // - Compact: smallest wire, UTF-8 encode/decode CPU cost vs MemPack head-to-head. // Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min. 
+ // + // FastWire row (UTF-16 raw memcpy) commented out for the current optimization sprint — + // we are tuning Compact mode against MemPack directly; FastWire was used as a noise-floor + // reference earlier. Re-enable when revisiting Fast wire-mode performance. if (serializerMode == "fastestbyte") { - var fastWireOptions = AcBinarySerializerOptions.FastMode; - fastWireOptions.WireMode = WireMode.Fast; + var fastestByteOptions = AcBinarySerializerOptions.FastMode; + fastestByteOptions.WireMode = SelectedWireMode; return new List { - new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.FastMode, "FastMode"), - new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"), + new AcBinaryBenchmark(testData.Order, fastestByteOptions, "FastMode"), + //new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"), new MemoryPackBenchmark(testData.Order, "Default"), }; } @@ -513,6 +518,7 @@ public static class Program // wire chunk AND kernel transfer unit; change ONLY this line when tuning. 
var binaryFastModePipeChunkOnly = AcBinarySerializerOptions.FastMode; binaryFastModePipeChunkOnly.BufferWriterChunkSize = PipeChunkSize; + binaryFastModePipeChunkOnly.WireMode = SelectedWireMode; return new List { @@ -547,12 +553,18 @@ public static class Program var binaryNoInternOption = AcBinarySerializerOptions.Default; binaryNoInternOption.UseStringInterning = StringInterningMode.None; + binaryNoInternOption.WireMode = SelectedWireMode; var binaryDefaultNoSgenOption = AcBinarySerializerOptions.Default; binaryDefaultNoSgenOption.UseGeneratedCode = false; + binaryDefaultNoSgenOption.WireMode = SelectedWireMode; var binaryFastModeNoSgenOption = AcBinarySerializerOptions.FastMode; binaryFastModeNoSgenOption.UseGeneratedCode = false; + binaryFastModeNoSgenOption.WireMode = SelectedWireMode; + + var binaryFastModeOption = AcBinarySerializerOptions.FastMode; + binaryFastModeOption.WireMode = SelectedWireMode; // BufWr new — 4 KB chunk size for the FRESH ArrayBufferWriter scenario. The chunkSize here drives // the serializer's GetSpan(N) request → the ArrayBufferWriter's internal allocation per call. @@ -561,16 +573,19 @@ public static class Program // vs syscall count). var binaryFastModeBufWrChunk = AcBinarySerializerOptions.FastMode; binaryFastModeBufWrChunk.BufferWriterChunkSize = PipeChunkSize; + binaryFastModeBufWrChunk.WireMode = SelectedWireMode; // In-memory Pipe variant — same 4 KB chunkSize as the AsyncPipe mode, no kernel-pipe alignment // concern (managed slabs are not page-aligned anyway). Drives SerializeChunkedFramed via the in-memory // System.IO.Pipelines.Pipe (zero-copy slab handoff between producer and drain task). 
var binaryFastModePipeChunkInMem = AcBinarySerializerOptions.FastMode; binaryFastModePipeChunkInMem.BufferWriterChunkSize = PipeChunkSize; + binaryFastModePipeChunkInMem.WireMode = SelectedWireMode; var defaultOptions = AcBinarySerializerOptions.Default; defaultOptions.UseStringInterning = StringInterningMode.None; defaultOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId; + defaultOptions.WireMode = SelectedWireMode; return new List { @@ -578,7 +593,7 @@ public static class Program // AcBinary — Byte[] API (uncomment to compare option presets side-by-side) // ============================================================ // Fastest Byte[] — SGen path (UseGeneratedCode=true, default). - new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.FastMode, "FastMode"), + new AcBinaryBenchmark(testData.Order, binaryFastModeOption, "FastMode"), // Fastest Byte[] — Runtime path (UseGeneratedCode=false). Same wire/options, no source-generated dispatch. // Always paired with the SGen variant so every layer can compare the SGen speed-up apples-to-apples. // NativeAOT-safe: AcSerializerCommon.Create*Getter/Setter falls back to reflection-based delegates @@ -594,7 +609,7 @@ public static class Program //new AcBinaryBenchmark(testData.Order, binaryNoInternOption, "NoIntern"), // AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario) - new AcBinaryBufferWriterBenchmark(testData.Order, AcBinarySerializerOptions.FastMode, "FastMode"), + new AcBinaryBufferWriterBenchmark(testData.Order, binaryFastModeOption, "FastMode"), // AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario). 
// 4 KB chunk size from binaryFastModeBufWrChunk — minimises the per-call ArrayBufferWriter @@ -859,7 +874,7 @@ public static class Program System.Console.WriteLine(" [A] All layers"); System.Console.WriteLine(" [F] FastestByte — AcBinary FastMode Byte[] vs MemoryPack Byte[] only (tight optimization loop)"); System.Console.WriteLine(" [P] AsyncPipe — streaming I/O isolation (only AsyncPipe, all test data)"); - System.Console.WriteLine($" [S] Settings — modify Warmup ({WarmupIterations}) / Iterations ({TestIterations}) / Samples ({BenchmarkSamples})"); + System.Console.WriteLine($" [S] Settings — Iteration / WireMode (current: {SelectedWireMode})"); System.Console.WriteLine(" [Q] Quit"); System.Console.Write("\nSelection: "); @@ -889,10 +904,42 @@ public static class Program /// Returns to the caller (which re-displays the main menu). /// private static void ShowSettingsMenu() + { + while (true) + { + System.Console.WriteLine(); + System.Console.WriteLine("─────────────────────────────────────────────"); + System.Console.WriteLine("Settings"); + System.Console.WriteLine("─────────────────────────────────────────────"); + System.Console.WriteLine(" [1] Iteration — Warmup / Iterations / Samples"); + System.Console.WriteLine($" [2] WireMode — current: {SelectedWireMode}"); + System.Console.WriteLine(" [B] Back"); + System.Console.Write("\nSelection: "); + + var key = System.Console.ReadKey(intercept: false).KeyChar; + System.Console.WriteLine(); + + switch (char.ToLower(key)) + { + case '1': + ShowIterationSettingsMenu(); + break; + case '2': + ShowWireModeSettingsMenu(); + break; + case 'b': + return; + default: + continue; + } + } + } + + private static void ShowIterationSettingsMenu() { System.Console.WriteLine(); System.Console.WriteLine("─────────────────────────────────────────────"); - System.Console.WriteLine("Settings — press Enter to keep current value"); + System.Console.WriteLine("Iteration settings — press Enter to keep current value"); 
System.Console.WriteLine("─────────────────────────────────────────────"); System.Console.WriteLine(); @@ -901,7 +948,42 @@ public static class Program BenchmarkSamples = PromptInt("BenchmarkSamples", BenchmarkSamples, min: 1); System.Console.WriteLine(); - System.Console.WriteLine($"✓ Settings updated: Warmup={WarmupIterations} | Iterations={TestIterations} | Samples={BenchmarkSamples}"); + System.Console.WriteLine($"✓ Iteration settings updated: Warmup={WarmupIterations} | Iterations={TestIterations} | Samples={BenchmarkSamples}"); + } + + private static void ShowWireModeSettingsMenu() + { + while (true) + { + System.Console.WriteLine(); + System.Console.WriteLine("─────────────────────────────────────────────"); + System.Console.WriteLine("WireMode settings"); + System.Console.WriteLine("─────────────────────────────────────────────"); + System.Console.WriteLine($"Current: {SelectedWireMode}"); + System.Console.WriteLine(" [1] Compact"); + System.Console.WriteLine(" [2] Fast"); + System.Console.WriteLine(" [B] Back"); + System.Console.Write("\nSelection: "); + + var key = System.Console.ReadKey(intercept: false).KeyChar; + System.Console.WriteLine(); + + switch (char.ToLower(key)) + { + case '1': + SelectedWireMode = WireMode.Compact; + System.Console.WriteLine("✓ WireMode set to Compact"); + return; + case '2': + SelectedWireMode = WireMode.Fast; + System.Console.WriteLine("✓ WireMode set to Fast"); + return; + case 'b': + return; + default: + continue; + } + } } /// diff --git a/AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs b/AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs index 0318c29..d9d8722 100644 --- a/AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs +++ b/AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs @@ -226,6 +226,214 @@ public class Utf8TranscoderTests AssertRoundTrip("😀"); } + // ────────────────────────────────────────────────────────────────────── + // GetUtf8ByteCount — content classes + // 
────────────────────────────────────────────────────────────────────── + + [TestMethod] + public void GetUtf8ByteCount_AsciiOnly_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl("Hello, World! Plain ASCII text."); + } + + [TestMethod] + public void GetUtf8ByteCount_AsciiExactly7Bytes_MatchesBcl() + { + // Boundary: just below Vector128.Count (8) — scalar tail only + AssertGetUtf8ByteCountMatchesBcl(new string('a', 7)); + } + + [TestMethod] + public void GetUtf8ByteCount_AsciiExactly8Bytes_MatchesBcl() + { + // Boundary: exactly Vector128.Count — Vector128 path triggers + AssertGetUtf8ByteCountMatchesBcl(new string('a', 8)); + } + + [TestMethod] + public void GetUtf8ByteCount_AsciiExactly16Bytes_MatchesBcl() + { + // Boundary: exactly Vector256.Count — Vector256 path triggers + AssertGetUtf8ByteCountMatchesBcl(new string('a', 16)); + } + + [TestMethod] + public void GetUtf8ByteCount_AsciiExactly32Bytes_MatchesBcl() + { + // Boundary: exactly Vector512.Count — Vector512 path triggers on AVX-512BW + AssertGetUtf8ByteCountMatchesBcl(new string('a', 32)); + } + + [TestMethod] + public void GetUtf8ByteCount_AsciiVeryLong_500Chars_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl(new string('z', 500)); + } + + [TestMethod] + public void GetUtf8ByteCount_HungarianShort_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl("Termék"); + } + + [TestMethod] + public void GetUtf8ByteCount_HungarianMedium_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl("árvíztűrő tükörfúrógép"); + } + + [TestMethod] + public void GetUtf8ByteCount_HungarianLong_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl(string.Concat(Enumerable.Repeat("árvíztűrő tükörfúrógép ", 20))); + } + + [TestMethod] + public void GetUtf8ByteCount_CjkBmp_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl("你好世界 こんにちは 안녕하세요"); + } + + [TestMethod] + public void GetUtf8ByteCount_CjkBmpLong_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl(string.Concat(Enumerable.Repeat("你好世界 ", 30))); + } + + [TestMethod] + 
public void GetUtf8ByteCount_SupplementaryPlane_MatchesBcl() + { + // Each emoji is 2 UTF-16 chars (surrogate pair) → 4 UTF-8 bytes total + AssertGetUtf8ByteCountMatchesBcl("😀😁😂🎉🌟"); + } + + [TestMethod] + public void GetUtf8ByteCount_MixedAllClasses_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl("ASCII Magyar:árvíz CJK:你好 Emoji:😀"); + } + + [TestMethod] + public void GetUtf8ByteCount_LongMixed_MatchesBcl() + { + var sb = new StringBuilder(); + for (var i = 0; i < 50; i++) + { + sb.Append("ASCII run-").Append(i).Append(" Magyar:árvíz CJK:你好 "); + } + AssertGetUtf8ByteCountMatchesBcl(sb.ToString()); + } + + [TestMethod] + public void GetUtf8ByteCount_Empty_ReturnsZero() + { + Assert.AreEqual(0, Utf8Transcoder.GetUtf8ByteCount(ReadOnlySpan.Empty)); + } + + [TestMethod] + public void GetUtf8ByteCount_SingleAsciiChar_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl("X"); + } + + [TestMethod] + public void GetUtf8ByteCount_SingleHungarianChar_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl("é"); + } + + [TestMethod] + public void GetUtf8ByteCount_SingleCjkChar_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl("好"); + } + + [TestMethod] + public void GetUtf8ByteCount_SingleEmoji_MatchesBcl() + { + // Single emoji = surrogate pair, exact 4 bytes + AssertGetUtf8ByteCountMatchesBcl("😀"); + } + + [TestMethod] + public void GetUtf8ByteCount_BoundaryAsciiToHungarian_MatchesBcl() + { + // Exercises split between SIMD ASCII region and 2-byte tail + for (var asciiLen = 0; asciiLen <= 64; asciiLen++) + { + var s = new string('a', asciiLen) + "árvíz"; + var expected = Utf8.GetByteCount(s); + var actual = Utf8Transcoder.GetUtf8ByteCount(s.AsSpan()); + Assert.AreEqual(expected, actual, $"asciiLen={asciiLen}"); + } + } + + [TestMethod] + public void GetUtf8ByteCount_BoundaryAsciiToCjk_MatchesBcl() + { + // 3-byte sequence boundary stress + for (var asciiLen = 0; asciiLen <= 64; asciiLen++) + { + var s = new string('a', asciiLen) + "你好世界"; + var expected = 
Utf8.GetByteCount(s); + var actual = Utf8Transcoder.GetUtf8ByteCount(s.AsSpan()); + Assert.AreEqual(expected, actual, $"asciiLen={asciiLen}"); + } + } + + [TestMethod] + public void GetUtf8ByteCount_BoundaryAsciiToEmoji_MatchesBcl() + { + // CRITICAL: tests that surrogate pairs split across SIMD chunks still produce correct count. + // High surrogate may land in chunk N, low surrogate in chunk N+1; total must remain 4 bytes. + for (var asciiLen = 0; asciiLen <= 64; asciiLen++) + { + var s = new string('a', asciiLen) + "😀"; + var expected = Utf8.GetByteCount(s); + var actual = Utf8Transcoder.GetUtf8ByteCount(s.AsSpan()); + Assert.AreEqual(expected, actual, $"asciiLen={asciiLen}"); + } + } + + [TestMethod] + public void GetUtf8ByteCount_MultipleEmojiBoundary_MatchesBcl() + { + // Surrogate pair split-stress: many emojis at varying offsets + for (var prefixLen = 0; prefixLen <= 32; prefixLen++) + { + var s = new string('a', prefixLen) + "😀😁😂🎉🌟😀😁😂🎉🌟"; + var expected = Utf8.GetByteCount(s); + var actual = Utf8Transcoder.GetUtf8ByteCount(s.AsSpan()); + Assert.AreEqual(expected, actual, $"prefixLen={prefixLen}"); + } + } + + [TestMethod] + public void GetUtf8ByteCount_AgreesWithEncodeUtf8SinglePass_AllContentClasses() + { + // Round-trip contract: the byte count returned must equal the bytesWritten by EncodeUtf8SinglePass. + // This is the load-bearing invariant for two-pass [VarUInt][bytes] writes in cold-fallback paths. 
+ var samples = new[] + { + "Hello", + "árvíztűrő tükörfúrógép", + "你好世界", + "😀🎉🌟", + "ASCII Magyar:árvíz CJK:你好 Emoji:😀", + new string('z', 500), + string.Concat(Enumerable.Repeat("árvíztűrő tükörfúrógép ", 20)) + }; + + foreach (var s in samples) + { + var byteCountFromCounter = Utf8Transcoder.GetUtf8ByteCount(s.AsSpan()); + var dst = new byte[s.Length * 4]; + var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(s.AsSpan(), dst.AsSpan()); + Assert.AreEqual(bytesWritten, byteCountFromCounter, + $"GetUtf8ByteCount disagrees with EncodeUtf8SinglePass for [{s.Substring(0, Math.Min(20, s.Length))}...]"); + } + } + // ────────────────────────────────────────────────────────────────────── // Decoder-side cross-check: BCL Encoding.UTF8.GetString reference // ────────────────────────────────────────────────────────────────────── @@ -291,6 +499,19 @@ public class Utf8TranscoderTests Assert.AreEqual(original, decoded, $"Decoder output mismatch{ctx}"); } + /// + /// Verifies that matches + /// for the same input. This is the BCL parity + /// invariant — any divergence means the SIMD byte counter is producing wrong values that + /// would corrupt VarUInt length prefixes in WriteStringUtf8Internal. + /// + private static void AssertGetUtf8ByteCountMatchesBcl(string original) + { + var expected = Utf8.GetByteCount(original); + var actual = Utf8Transcoder.GetUtf8ByteCount(original.AsSpan()); + Assert.AreEqual(expected, actual, $"GetUtf8ByteCount mismatch for input length {original.Length}"); + } + /// /// Verifies that DecodeUtf8SinglePass produces output identical to /// for the same byte input. 
Catches silent decoder bugs that pass the round-trip test diff --git a/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs b/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs index 72bcf1e..34d1ff2 100644 --- a/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs +++ b/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs @@ -1,9 +1,22 @@ using AyCode.Core.Serializers.Binaries; +using System.Collections; +using System.Reflection; +using System.Runtime.CompilerServices; namespace AyCode.Core.Tests.TestModels; public static class BenchmarkTestDataProvider { + private const int FixStrMaxLength = 31; + private const string LongStringSuffix = "__Benchmárk_Long_String_Söffix__"; + + private sealed class ReferenceComparer : IEqualityComparer + { + public static readonly ReferenceComparer Instance = new(); + public new bool Equals(object? x, object? y) => ReferenceEquals(x, y); + public int GetHashCode(object obj) => RuntimeHelpers.GetHashCode(obj); + } + public static List CreateTestDataSets(bool resetId = true) { return new List @@ -45,6 +58,8 @@ public static class BenchmarkTestDataProvider sharedTag: sharedTag, sharedUser: sharedUser); + EnsureAllStringsBypassFixStr(order); + ClearDeepLevelRefs(order); return new TestDataSet("Small (2x2x2x2)", order, iidRefPercent: 20); @@ -77,6 +92,8 @@ public static class BenchmarkTestDataProvider sharedMetadata: sharedMeta, sharedPreferences: sharedPreferences); + EnsureAllStringsBypassFixStr(order); + ClearDeepLevelRefs(order); return new TestDataSet("Medium (3x3x3x4)", order, iidRefPercent: 20); @@ -107,6 +124,8 @@ public static class BenchmarkTestDataProvider sharedUser: sharedUser, sharedPreferences: sharedPreferences); + EnsureAllStringsBypassFixStr(order); + ClearDeepLevelRefs(order); return new TestDataSet("Large (5x5x5x10)", order, iidRefPercent: 20); @@ -153,6 +172,8 @@ public static class BenchmarkTestDataProvider } } + EnsureAllStringsBypassFixStr(order); + ClearDeepLevelRefs(order); return new 
TestDataSet("Repeated Strings (10 items)", order, iidRefPercent: 20); @@ -185,6 +206,8 @@ public static class BenchmarkTestDataProvider sharedPreferences: sharedPreferences, sharedCategory: sharedCategory); + EnsureAllStringsBypassFixStr(order); + ClearDeepLevelRefs(order); return new TestDataSet("Deep Nested (2x4x4x8)", order, iidRefPercent: 20); @@ -218,6 +241,65 @@ public static class BenchmarkTestDataProvider } } } + + private static void EnsureAllStringsBypassFixStr(object? root) + { + if (root == null) return; + + var visited = new HashSet(ReferenceComparer.Instance); + var stack = new Stack(); + stack.Push(root); + + while (stack.Count > 0) + { + var current = stack.Pop(); + if (!visited.Add(current)) continue; + + if (current is IEnumerable enumerable && current is not string) + { + foreach (var item in enumerable) + { + if (item != null) + stack.Push(item); + } + continue; + } + + var type = current.GetType(); + foreach (var property in type.GetProperties(BindingFlags.Instance | BindingFlags.Public)) + { + if (!property.CanRead) continue; + + if (property.PropertyType == typeof(string)) + { + if (!property.CanWrite) continue; + + var value = (string?)property.GetValue(current); + property.SetValue(current, ToLongString(value)); + continue; + } + + if (property.PropertyType.IsValueType || property.PropertyType.IsEnum) + continue; + + var child = property.GetValue(current); + if (child != null) + stack.Push(child); + } + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static string ToLongString(string? 
value) + { + if (string.IsNullOrEmpty(value)) + return "Benchmark_String_Value" + LongStringSuffix; + + if (value.Length > FixStrMaxLength) + return value; + + return value + LongStringSuffix; + } } public sealed class TestDataSet diff --git a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.cs b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.cs index ef3c6d0..4f54e76 100644 --- a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.cs +++ b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.cs @@ -117,7 +117,7 @@ public static partial class AcBinarySerializer foreach (var (stringValue, properties) in analysis) { - var byteLength = Encoding.UTF8.GetByteCount(stringValue); + var byteLength = Utf8Transcoder.GetUtf8ByteCount(stringValue.AsSpan()); foreach (var (propPath, count) in properties) { if (!propertyStats.TryGetValue(propPath, out var list)) diff --git a/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs b/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs index da833e8..22ad29c 100644 --- a/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs +++ b/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs @@ -188,6 +188,181 @@ internal static class Utf8Transcoder return dstIdx; } + /// + /// Counts the UTF-8 byte length produced by encoding the given UTF-16 char span. + /// Symmetric encode-side helper to ; the value returned equals + /// the bytesWritten that would produce. + /// + /// + /// Trusted-input — assumes well-formed UTF-16 (every high surrogate paired with a low surrogate), + /// matching 's contract. Bypasses + /// .GetByteCount virtual-dispatch + encoder-fallback overhead. + /// + /// Layered SIMD: Vector512 (32 chars/iter) on AVX-512BW hosts → Vector256 (16 chars/iter) + /// on AVX2 hosts → Vector128 (8 chars/iter) on Apple Silicon NEON / WASM SIMD / SSE2 → scalar tail. + /// JIT/AOT path-selection via Avx512BW.IsSupported / Vector{N}.IsHardwareAccelerated + /// [Intrinsic] booleans (constant-folded dead branches per host). 
+ /// + /// Per-char UTF-8 byte contribution: + /// + /// c < 0x80 → 1 byte (ASCII) + /// 0x80 ≤ c < 0x800 → 2 bytes (Latin extended, Cyrillic, Greek, Hebrew, Arabic) + /// 0x800 ≤ c < 0xD800 or c ≥ 0xE000 → 3 bytes (CJK BMP, other BMP) + /// 0xD800 ≤ c < 0xDC00 (high surrogate) → 4 bytes (whole pair encoded here) + /// 0xDC00 ≤ c < 0xE000 (low surrogate) → 0 bytes (absorbed by paired high surrogate) + /// + /// + /// SIMD per-block: 5 popcount-on-threshold-mask operations + /// (< 0x80, < 0x800, < 0xD800, < 0xDC00, < 0xE000). Closed-form aggregation: + /// bytes = 3*N - ascii - c_lt_0x800 + highSur - 3*lowSur + /// where N = chars per SIMD block, ascii = popcount(c < 0x80), c_lt_0x800 = popcount(c < 0x800), + /// highSur = popcount(c < 0xDC00) - popcount(c < 0xD800), + /// lowSur = popcount(c < 0xE000) - popcount(c < 0xDC00). + /// + /// Both highSur and lowSur must be counted independently — feature-equivalent + /// to the per-char model (high → 4 bytes, low → 0 bytes). A natural-looking shortcut + /// (lowSur == highSur for well-formed UTF-16) is FALSE within a single SIMD chunk when + /// a surrogate pair straddles the chunk boundary; over the whole string the counts equalize + /// but per-block they don't. Across-the-boundary correctness: a high surrogate counted in + /// chunk N contributes 4 bytes there; its low surrogate (in chunk N+1) contributes 0 bytes — + /// total 4 bytes per pair regardless of where the boundary falls. + /// + /// Pairs with <see cref="EncodeUtf8SinglePass"/> for two-pass [VarUInt][bytes] writes in + /// cold-fallback paths (e.g. WriteFixStrDirect's non-ASCII fallback in + /// BinarySerializationContext). 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int GetUtf8ByteCount(ReadOnlySpan src) + { + var byteCount = 0; + var i = 0; + var n = src.Length; + ref ushort srcRef = ref Unsafe.As(ref MemoryMarshal.GetReference(src)); + + // SIMD path 1: Vector512 (32 chars/iter) on AVX-512BW hosts + if (Avx512BW.IsSupported && n >= Vector512.Count) + { + var v_0x80 = Vector512.Create((ushort)0x80); + var v_0x800 = Vector512.Create((ushort)0x800); + var v_0xD800 = Vector512.Create((ushort)0xD800); + var v_0xDC00 = Vector512.Create((ushort)0xDC00); + var v_0xE000 = Vector512.Create((ushort)0xE000); + + do + { + var v = Vector512.LoadUnsafe(ref srcRef, (uint)i); + + var c_lt_0x80 = BitOperations.PopCount(Vector512.LessThan(v, v_0x80).ExtractMostSignificantBits()); + var c_lt_0x800 = BitOperations.PopCount(Vector512.LessThan(v, v_0x800).ExtractMostSignificantBits()); + var c_lt_0xD800 = BitOperations.PopCount(Vector512.LessThan(v, v_0xD800).ExtractMostSignificantBits()); + var c_lt_0xDC00 = BitOperations.PopCount(Vector512.LessThan(v, v_0xDC00).ExtractMostSignificantBits()); + var c_lt_0xE000 = BitOperations.PopCount(Vector512.LessThan(v, v_0xE000).ExtractMostSignificantBits()); + + var highSur = c_lt_0xDC00 - c_lt_0xD800; + var lowSur = c_lt_0xE000 - c_lt_0xDC00; + byteCount += 3 * Vector512.Count - c_lt_0x80 - c_lt_0x800 + highSur - 3 * lowSur; + + i += Vector512.Count; + } while (n - i >= Vector512.Count); + } + + // SIMD path 2: Vector256 (16 chars/iter) on AVX2 hosts; also handles AVX-512 tail < 32 chars + if (Vector256.IsHardwareAccelerated && n - i >= Vector256.Count) + { + var v_0x80 = Vector256.Create((ushort)0x80); + var v_0x800 = Vector256.Create((ushort)0x800); + var v_0xD800 = Vector256.Create((ushort)0xD800); + var v_0xDC00 = Vector256.Create((ushort)0xDC00); + var v_0xE000 = Vector256.Create((ushort)0xE000); + + do + { + var v = Vector256.LoadUnsafe(ref srcRef, (uint)i); + + var c_lt_0x80 = BitOperations.PopCount(Vector256.LessThan(v, 
v_0x80).ExtractMostSignificantBits()); + var c_lt_0x800 = BitOperations.PopCount(Vector256.LessThan(v, v_0x800).ExtractMostSignificantBits()); + var c_lt_0xD800 = BitOperations.PopCount(Vector256.LessThan(v, v_0xD800).ExtractMostSignificantBits()); + var c_lt_0xDC00 = BitOperations.PopCount(Vector256.LessThan(v, v_0xDC00).ExtractMostSignificantBits()); + var c_lt_0xE000 = BitOperations.PopCount(Vector256.LessThan(v, v_0xE000).ExtractMostSignificantBits()); + + var highSur = c_lt_0xDC00 - c_lt_0xD800; + var lowSur = c_lt_0xE000 - c_lt_0xDC00; + byteCount += 3 * Vector256.Count - c_lt_0x80 - c_lt_0x800 + highSur - 3 * lowSur; + + i += Vector256.Count; + } while (n - i >= Vector256.Count); + } + + // SIMD path 3: Vector128 (8 chars/iter) on Apple Silicon NEON, WASM SIMD, legacy SSE2; + // also handles tail < 16 from higher tiers. Cross-platform via Vector128.IsHardwareAccelerated. + if (Vector128.IsHardwareAccelerated && n - i >= Vector128.Count) + { + var v_0x80 = Vector128.Create((ushort)0x80); + var v_0x800 = Vector128.Create((ushort)0x800); + var v_0xD800 = Vector128.Create((ushort)0xD800); + var v_0xDC00 = Vector128.Create((ushort)0xDC00); + var v_0xE000 = Vector128.Create((ushort)0xE000); + + do + { + var v = Vector128.LoadUnsafe(ref srcRef, (uint)i); + + var c_lt_0x80 = BitOperations.PopCount(Vector128.LessThan(v, v_0x80).ExtractMostSignificantBits()); + var c_lt_0x800 = BitOperations.PopCount(Vector128.LessThan(v, v_0x800).ExtractMostSignificantBits()); + var c_lt_0xD800 = BitOperations.PopCount(Vector128.LessThan(v, v_0xD800).ExtractMostSignificantBits()); + var c_lt_0xDC00 = BitOperations.PopCount(Vector128.LessThan(v, v_0xDC00).ExtractMostSignificantBits()); + var c_lt_0xE000 = BitOperations.PopCount(Vector128.LessThan(v, v_0xE000).ExtractMostSignificantBits()); + + var highSur = c_lt_0xDC00 - c_lt_0xD800; + var lowSur = c_lt_0xE000 - c_lt_0xDC00; + byteCount += 3 * Vector128.Count - c_lt_0x80 - c_lt_0x800 + highSur - 3 * lowSur; + + i += Vector128.Count; + 
} while (n - i >= Vector128.Count); + } + + // Scalar tail (and fallback for non-SIMD hardware). + // CRITICAL: must use the SAME per-char accounting model as the SIMD path so that surrogate + // pairs split across a SIMD/scalar boundary count correctly. The SIMD path counts each char + // independently — high surrogate → 4 bytes, low surrogate → 0 bytes. The scalar tail must + // do the same (i += 1 per char, NOT i += 2 on high surrogate). If the scalar tail + // double-consumed surrogate pairs (i += 2 on high), a high surrogate landing in the last + // SIMD chunk would be counted there as 4 bytes, then its low surrogate in the scalar tail + // would re-trigger the surrogate branch and add 4 more bytes (with i += 2 advancing past + // an unrelated next char). Net: +4 byte miscount per split-pair. + while (i < n) + { + var c = Unsafe.Add(ref srcRef, i); + if (c < 0x80) + { + byteCount += 1; + } + else if (c < 0x800) + { + byteCount += 2; + } + else if (c < 0xD800) + { + byteCount += 3; // BMP below surrogate range + } + else if (c < 0xDC00) + { + byteCount += 4; // high surrogate → owns the 4-byte encoding for the pair + } + else if (c < 0xE000) + { + // low surrogate → 0 bytes (the paired high surrogate already accounted for the 4) + } + else + { + byteCount += 3; // BMP at or above 0xE000 + } + i += 1; + } + + return byteCount; + } + /// /// Counts UTF-16 chars produced by decoding the given UTF-8 byte span. 
/// diff --git a/AyCode.Core/Serializers/PropertyMetadataBase.cs b/AyCode.Core/Serializers/PropertyMetadataBase.cs index a95e624..dd8370b 100644 --- a/AyCode.Core/Serializers/PropertyMetadataBase.cs +++ b/AyCode.Core/Serializers/PropertyMetadataBase.cs @@ -99,7 +99,15 @@ public abstract class PropertyMetadataBase [DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.PublicProperties)] Type declaringType) { Name = prop.Name; - NameUtf8 = Encoding.UTF8.GetBytes(prop.Name); + // Ctor-once init: SIMD path via Utf8Transcoder (GetUtf8ByteCount + EncodeUtf8SinglePass) + // bypasses Encoding.UTF8 virtual-dispatch + encoder-fallback overhead. Ascii.FromUtf16 + // would be slightly faster for the (overwhelmingly common) ASCII property name case, but + // the symmetric Utf8Transcoder API keeps this consistent with the binary serializer's + // writer-side BCL-free policy and handles non-ASCII property names without a fallback. + var nameByteCount = Utf8Transcoder.GetUtf8ByteCount(prop.Name.AsSpan()); + var nameBytes = new byte[nameByteCount]; + Utf8Transcoder.EncodeUtf8SinglePass(prop.Name.AsSpan(), nameBytes); + NameUtf8 = nameBytes; DeclaringType = declaringType; PropertyType = prop.PropertyType; diff --git a/AyCode.Core/docs/BINARY/BINARY_TODO.md b/AyCode.Core/docs/BINARY/BINARY_TODO.md index 81aa3f3..f32a8f2 100644 --- a/AyCode.Core/docs/BINARY/BINARY_TODO.md +++ b/AyCode.Core/docs/BINARY/BINARY_TODO.md @@ -692,6 +692,7 @@ JIT/AOT path-selection via `[Intrinsic]` `IsSupported` static booleans — non-s |-------|--------|-----------|-----------|-----------|--------| | 1 | `CountUtf8Chars` (decode 1st pass) | ✅ done | ✅ existing | ✅ done | ✅ existing | | 2 | `EncodeUtf8SinglePass` Phase 1 (ASCII narrow) | ✅ done | ✅ existing | ✅ done | ✅ existing | +| 2.5 | `DecodeUtf8SinglePass` scalar run-length decoder (multi-byte baseline) | — | — | — | ⏳ TODO | | 3a | `DecodeUtf8SinglePass` multi-byte transcoder (Vector512) | ⏳ TODO | bail-out only | bail-out only | ✅ existing | | 3b 
| `DecodeUtf8SinglePass` multi-byte transcoder (Vector256) | — | 🔍 **deferred — see note** | bail-out only | ✅ existing | | 3c | `DecodeUtf8SinglePass` multi-byte transcoder (Vector128) | — | — | ⏳ TODO | ✅ existing | @@ -731,6 +732,54 @@ The cascading tail-handler hierarchy (existing in Phase 1+2) carries over: AVX-5 The Vector128 path is the **WASM and Apple Silicon target** — without it both platforms fell back to scalar (1 byte/iter). With Phase 1+2 landed, WASM and Apple Silicon now run the UTF-8 hot path at 16 byte/iter (16× scalar speedup on the count + ASCII narrow operations). +### Phase 2.5 — scalar run-length decoder (multi-byte baseline, pre-Phase 3 prototype) + +Targets the `DecodeUtf8SinglePass` switch-jumptable per-char dispatch on multi-byte content. Current scalar Phase (jumptable) re-dispatches every char; a run-length-aware scalar decoder runs a tight branchless inner loop on homogeneous runs (long ASCII run, long 2-byte Latin/Cyrillic run, long 3-byte CJK BMP run), with the existing single-codepoint scalar branch as mixed-edge fallback. 
+ +**Algorithm sketch**: +``` +while (s < src.Length) +{ + // 1) ASCII run (0xxxxxxx) — already handled by Phase 1 SIMD prefix; this is tail + int asciiStart = s; + while (s < src.Length && src[s] < 0x80) s++; + if (s > asciiStart) { WriteAsciiRun(src.Slice(asciiStart, s-asciiStart), dst, ref d); continue; } + + // 2) 2-byte run (110xxxxx 10xxxxxx) — Hungarian / Cyrillic / Greek / Hebrew / Arabic + int twoStart = s; + while (s + 1 < src.Length && Is2ByteLead(src[s]) && IsCont(src[s+1])) s += 2; + if (s > twoStart) { Decode2ByteRun(src.Slice(twoStart, s-twoStart), dst, ref d); continue; } + + // 3) 3-byte run (1110xxxx 10xxxxxx 10xxxxxx) — CJK BMP, other 3-byte BMP scripts + int threeStart = s; + while (s + 2 < src.Length && Is3ByteLead(src[s]) && IsCont(src[s+1]) && IsCont(src[s+2])) s += 3; + if (s > threeStart) { Decode3ByteRun(src.Slice(threeStart, s-threeStart), dst, ref d); continue; } + + // 4) Mixed-edge fallback (typically 4-byte surrogate pair or single transition char) + DecodeSingleCodePoint(src, ref s, dst, ref d); +} +``` + +**Why P2.5 — scalar baseline before SIMD multi-byte (Phase 3a-3c)**: +- 1-2h prototyping cost vs 6-10h Phase 3 SIMD work +- A/B benchmark on Repeated cell decides whether the run-length structure already wins on Magyar mixed (`KözösCímke` pattern) — if it does, Phase 3 lifts further; if not, Phase 3 SIMD is the only win path +- Documents the "switch-jumptable bottleneck on Hungarian benchmark" hypothesis without committing to the larger SIMD effort +- The `Decode2ByteRun` / `Decode3ByteRun` scalar-batch implementations also serve as algorithm references for the Phase 3 SIMD versions (clear semantics first, optimize after) + +**Expected payoff** (per content class, ratio vs current switch-jumptable): +- Long CJK BMP (3-byte run, e.g. 
`你好世界` ×30): ~20-40% Deser improvement (long homogeneous run, biggest jumptable savings) +- Long 2-byte run (`árvíztűrő` ×10+): ~5-15% improvement +- Magyar mixed (`KözösCímke`, `sötét` — short alternating runs): 0-5% (run-detection overhead may eat the savings on short runs) +- Long ASCII (≥32 byte): 0% (Phase 1 SIMD prefix already handles) +- Emoji (4-byte): 0% (mixed-edge fallback unchanged) + +**Risk** — the existing switch-jumptable JIT optimization is strong; Magyar mixed text (1-2 char runs) may not show net gain. Implementation must be **isolated prototype first** (alongside the live `DecodeUtf8SinglePass`, not replacing it), with A/B benchmark comparing the two before any switch. + +**Acceptance (Phase 2.5)**: +- Repeated cell Compact Deser ratio ≤ 1.0 vs MemPack on AVX2 hosts (parity with current measurement, no regression) +- Round-trip tests pass on all UTF-8 content classes (ASCII / 2-byte / 3-byte BMP / 4-byte surrogate-pair) +- A/B benchmark shows ≥ 5% Deser improvement on Repeated OR ≥ 10% on Large cell — else Phase 2.5 stays in TODO as documented dead-end (negative result is also valuable: confirms the jumptable is fast enough, focus moves entirely to Phase 3) + ### Phase 3 implementation outline - Insert SIMD multi-byte branches at `DecodeUtf8SinglePass` entry, **before** the existing ASCII-prefix bail-out loops: @@ -777,6 +826,38 @@ The Vector128 path is the **WASM and Apple Silicon target** — without it both - Local `dotnet test` covers correctness; per-tier benchmarks measure the multi-byte speedup - Phase 1+2 (AVX-512BW + Vector128 in `CountUtf8Chars` + `EncodeUtf8SinglePass` Phase 1) **landed 2026-05-05** — covered by existing round-trip tests, no regression on non-AVX-512 hosts (validated on AVX2-host bench) +## ACCORE-BIN-T-H2Q6: Fixed-width dual-length string header (Small/Medium/Big) for 1-pass decode +**Priority:** P1 · **Type:** Wire-format + Performance · **Related:** `DecodeUtf8SinglePass`, `CountUtf8Chars`, `WriteStringWithDispatch`, 
`ReadStringUtf8` + +Current Compact string decode uses two-pass flow for non-ASCII payloads (`CountUtf8Chars` + `DecodeUtf8SinglePass`). +Planned direction: remove VarUInt-based string-length path for the new string wire variant, and carry both lengths in a fixed-width header so deserialize can allocate target `string` immediately and decode in a single pass. + +### Planned format tiers + +- **Small**: packed `uint16` (`charLen:8 | utf8Len:8`) +- **Medium**: packed `uint32` (`charLen:16 | utf8Len:16`) +- **Big**: `uint32 charLen + uint32 utf8Len` + +Writer picks the smallest fitting tier; reader dispatches by marker and reads fixed-width lengths (no VarUInt loop for string length metadata). + +### Why + +- Removes `CountUtf8Chars` pass on the new markers (1-pass decode path) +- Keeps decode branch profile stable (fixed-size header reads) +- Maintains range safety with explicit Big overflow path + +### Constraints captured from current benchmark context + +- Performance evaluation target is non-ASCII-heavy data (ASCII-shortcuts intentionally not primary) +- Wire-format backward compatibility is not required for this development phase + +### Acceptance + +- New string markers implemented for Small/Medium/Big tiers +- Deserialize path for these markers performs single-pass decode without `CountUtf8Chars` +- Existing round-trip tests pass, plus new boundary tests for tier transitions +- Benchmark report includes before/after for Compact mode on non-ASCII dataset (Ser/Deser/RT + Size) + ## ACCORE-BIN-T-S5L8: Sentinel-length encoding for strings (wire-size optimization, both modes) **Priority:** P3 · **Type:** Wire-format optimization · **Related:** `AcBinarySerializer.WriteString`, `AcBinaryDeserializer.ReadValue` string dispatch @@ -1019,3 +1100,114 @@ Reader-side: SGen-generated code drops the per-property `ReadByte()` + `IsTinyIn - Schema-evolution fragility documented in `BINARY_FEATURES.md` (alongside the existing `PropertySkip` / default-omission caveat from 
`ACCORE-BIN-I-D9Y2`) - Opt-in flag with default `false` (preserves marker-driven default; consumers explicitly opt in for frozen-schema scenarios) +## ACCORE-BIN-T-V4N3: Symmetric `GetUtf8ByteCount` API + writer-side BCL bypass (cold path) +**Priority:** P3 · **Type:** Performance · **Status:** Closed (2026-05-06) · **Related:** `EncodeUtf8SinglePass`, `WriteStringUtf8Internal`, `PropertyMetadataBase.NameUtf8` + +Symmetric byte-count helper for `EncodeUtf8SinglePass`, paired with writer-side BCL `Encoding.UTF8.GetBytes` / `GetByteCount` removal across all cold-path call sites. `Utf8Transcoder.GetUtf8ByteCount(ReadOnlySpan<char>)` SIMD impl (Vector512 / Vector256 / Vector128 / scalar tier hierarchy, 5-popcount closed-form aggregation handling chunk-split surrogate pairs correctly). + +**Implementation summary**: +- `Utf8Transcoder.GetUtf8ByteCount` SIMD impl with closed-form `bytes = 3*N - ascii - c_lt_0x800 + highSur - 3*lowSur` aggregation +- `Utf8TranscoderTests` extended (29 new tests covering ASCII / Hungarian / CJK / emoji / boundary 0-64, plus surrogate-pair-split-across-SIMD-chunks regression coverage) +- `WriteStringUtf8Internal` (`BinarySerializationContext.cs:875`) refactored from BCL two-pass to single-pass D-2 layout (worst-case `length*4` allocate + `EncodeUtf8SinglePass` + VarUInt backfill); the `4×` worst-case capacity is amortized by the buffer growth doubling strategy (`Math.Max(buffer.Length*2, position+needed)` + ArrayPool bucket-rounding to next power-of-2) +- Cold path cleanup: `AcBinarySerializer.AnalyzeStringInternCandidates` (analysis log) and `PropertyMetadataBase.NameUtf8` ctor-once init both migrated to `Utf8Transcoder` + +### Resolution + +Landed 2026-05-06. All `Utf8TranscoderTests` pass (55/55). Binary test suite unchanged (222 pass / 13 pre-existing GuidIId failures, untouched).
+ +**Critical observation surfaced during the audit**: `WriteStringUtf8Internal` has only one caller (`WriteFixStrDirect`), and `WriteFixStrDirect` itself is **not called anywhere in the codebase** — no core call site, no SourceGenerator template hit (verified against `AcBinarySourceGenerator.cs` lines 706/724/1492/1514 — generator emits `WriteStringGenerated` and `context.WriteStringUtf8` (the public method at line 659, not `WriteStringUtf8Internal`)), no test, no reflection path. The V4N3 implementation therefore landed cleanly but its benchmark impact is limited to the two cold-path init sites. Dead-code disposition tracked as `ACCORE-BIN-T-V4N5`. + +**Algorithmic correctness lesson** — the initial 4-popcount formula (`3*N - c_lt_0x80 - c_lt_0x800 - 2*highSur`) was wrong on chunks where a surrogate pair straddles the SIMD chunk boundary (it implicitly assumed `lowSur == highSur` per chunk, which is true over the whole well-formed string but NOT per chunk). Fix: 5-popcount closed-form (`3*N - ascii - c_lt_0x800 + highSur - 3*lowSur`), with the scalar tail using the same per-char accounting model (`i += 1` per char regardless of role; high → 4, low → 0, BMP → 3, two-byte → 2, ASCII → 1). Caught by `GetUtf8ByteCount_MultipleEmojiBoundary_MatchesBcl` and `GetUtf8ByteCount_BoundaryAsciiToEmoji_MatchesBcl` regression tests — exactly the `prefixLen` 1 and 7 boundaries that exercise chunk-split surrogate pairs. + +## ACCORE-BIN-T-V4N4: NativeAOT-specific inlining / codegen audit on hot UTF-8 path +**Priority:** P2 · **Type:** Performance · **Related:** `EncodeUtf8SinglePass`, `DecodeUtf8SinglePass`, `WriteStringWithDispatch`, `Utf8Transcoder` SIMD path + +Hypothesis: NativeAOT (the benchmark target environment) does not match Tier 1 JIT optimization quality on the UTF-8 hot path, despite `[MethodImpl(AggressiveInlining)]` hints.
Symptoms in 2026-05-05 / 2026-05-06 benchmarks: + +- Repeated cell: persistent 8-11% Compact deficit vs MemPack (Hungarian content + repeated string pattern) +- Compact Ser/Deser cells: patchy results run-to-run (4-7/10 cell wins, 3-6 noise/loss bands) +- Per-method Compact speedups on the Medium/Large/Deep cells are **consistent** (-22% to -28% vs MemPack), which points to a JIT/AOT inlining difference on the Repeated cell — there the `WriteStringWithDispatch` short-lane is invoked many times on the 10× repeated strings + +**Suspect mechanisms (ranked by likelihood)**: + +1. **AOT inline budget**. NativeAOT is more conservative than the Tier 1 JIT in respecting `AggressiveInlining` for large method bodies. `EncodeUtf8SinglePass` (~190 lines, 4 SIMD paths + scalar), `DecodeUtf8SinglePass` (~120 lines), `GetUtf8ByteCount` (~120 lines) may exceed the AOT inline budget at hot call sites (`WriteStringWithDispatch` short-lane, `ReadString` decode callback). If the AOT compiler emits `call <method>` instead of inlining, every iteration of the Repeated 10-string loop pays the call overhead. + +2. **`[Intrinsic]` `IsSupported` constant folding**. `Avx512BW.IsSupported`, `Vector512.IsHardwareAccelerated`, `Vector256.IsHardwareAccelerated`, `Vector128.IsHardwareAccelerated` should constant-fold per host on AOT. Verify via disasm — if any remain runtime checks, every iteration pays the branch cost (3 nested `if`-s in each Utf8Transcoder method). + +3. **`Vector256.LessThan` unsigned compare emulation**. No native `pcmpltw_unsigned` on AVX2; JIT/AOT lowers to `pminuw` + `pcmpeqw`. Cost amortized over many chars in long content but can dominate on short Hungarian runs (`KözösCímke` ~6 runs of 2-3 chars). Less likely if (1) holds — the inlining hit dwarfs the per-instruction emulation cost. + +4. **Method size cascade**. The Utf8Transcoder method bodies grew with the V4N3 `GetUtf8ByteCount` addition.
Adjacent methods in the same source file may have lost inlining at SGen-generated callers due to AOT compilation-unit heuristics (file-locality affects inline cost models on some AOT codegen). + +**Investigation steps (no code changes — diagnostic phase first)**: + +1. NativeAOT publish dump: + ``` + dotnet publish AyCode.Core.Serializers.Console -c Release -r win-x64 -p:PublishAot=true + dumpbin /disasm <published-exe> > disasm.txt + ``` +2. Locate `EncodeUtf8SinglePass`, `DecodeUtf8SinglePass`, `GetUtf8ByteCount`, `CountUtf8Chars` symbols in the disasm +3. Verify constant folding on `IsSupported` checks — no run-time CMP/JMP at the path-selector branches; the dead branches eliminated +4. Verify inlining at `WriteStringWithDispatch` / `ReadString` callers — if `call <method>` instructions remain, inlining failed +5. Method size inspection — large method bodies hint at inline-eligibility issues; large prologue/epilogue at hot call sites is a tell +6. Cross-compare with Tier 1 JIT disasm (run with `DOTNET_TieredCompilation=0` + `DOTNET_TC_QuickJit=0` to force Tier 1, dump the JIT-tier disasm via WinDbg or `BenchmarkDotNet`'s `[DisassemblyDiagnoser]`) to confirm the gap is AOT-specific rather than algorithmic + +**Possible fixes (Open until disasm confirms which apply)**: + +- **A. Method split** — `EncodeUtf8SinglePass` → small dispatcher + per-tier inner methods (each Vector512 / Vector256 / Vector128 / scalar in its own AOT-inline-friendly small method). Same for `DecodeUtf8SinglePass`. The dispatcher stays small enough to inline at the hot call site; the dead-branch tier methods are never called on a given host. +- **B. `[MethodImpl(NoInlining)]` on cold tiers** — paradox tactic that can REDUCE the hot-path code emitted at the call site by preventing the AOT from speculatively considering the dead branches as inlining candidates. +- **C. Per-target ISA build** — if the benchmark environment has a fixed ISA (e.g.
AVX2 baseline), use `<IlcInstructionSet>` in `csproj` to constant-fold the `IsSupported` checks at AOT compile time. Alternative: separate per-ISA AOT publish artifacts. +- **D. Manual hot-path inlining** — for the Repeated cell, hand-inline `EncodeUtf8SinglePass` short-string lane into `WriteStringWithDispatch` FixStr path (≤31 byte case). Trades code-size for hot-path speed. +- **E. Algorithm change** — if the AOT can't inline the SIMD bodies efficiently, a smaller scalar-only fast path for short strings (≤31 byte) bypassing the SIMD setup might be faster on AOT than on JIT (where Tier 1 is fine with the SIMD path inlined). + +### Why P2 + +- Repeated benchmark cell is the canonical witness for the **i18n production deploy** narrative — public NuGet release narrative depends on parity-or-better against MemPack across all cells (cloud / desktop / mobile / Blazor WASM) +- AOT-specific tuning is high-leverage on the hot path — JIT-only optimizations will not match +- Disasm validation is the prerequisite for any of the fix directions; without it, any change is speculative and risks reintroducing 2c-style regression + +### Acceptance + +- Disasm report confirms (or refutes) inlining + constant-fold hypotheses on the hot UTF-8 path +- If hypotheses confirmed: the chosen fix delivers Repeated Compact Ser+Deser ratio ≤ 1.0 vs MemPack on the AOT benchmark target +- No regression on Small / Medium / Large / Deep cells (or net positive) +- Fix maintains cross-tier SIMD correctness (round-trip tests pass on all UTF-8 content classes); both `Utf8TranscoderTests` and the binary test suite stay green + +### Trigger + +- Pre-NuGet release: i18n claim cannot ship with an 8-11% gap on a representative cell +- Disasm + bench correlation step before any code change (no speculative refactoring) + +## ACCORE-BIN-T-V4N5: Dead-code review — `WriteFixStrDirect` + `WriteStringUtf8Internal` +**Priority:** P3 · **Type:** Refactor / hygiene · **Related:** `BinarySerializationContext.WriteFixStrDirect` (line
832), `WriteStringUtf8Internal` (line 875) + +V4N3 audit surfaced two methods with no callers in the entire workspace: + +- `WriteFixStrDirect(string)` — public method, no call site (no core, no SourceGenerator template, no test, no reflection / Expression-compile) +- `WriteStringUtf8Internal(string)` — private method called only from `WriteFixStrDirect`'s non-ASCII fallback branch + +The pair forms a closed dead loop (`WriteFixStrDirect` → `WriteStringUtf8Internal`), but no entry point reaches `WriteFixStrDirect`. The public-API `WriteStringUtf8` (line 659) is the live equivalent and is called from the SourceGenerator template (polymorphism path: assembly-qualified type-name write). The hot-path string-write goes through `WriteStringWithDispatch` (line 734) which uses the M3R7 marker-dispatch — NOT through this dead pair. + +### Disposition options (decide pre-NuGet release) + +1. **Delete both methods** — pure dead-code cleanup; reduces public surface, removes maintenance burden, simplifies onboarding. Functionality is fully covered by `WriteStringWithDispatch` (M3R7 marker-dispatch — emits `FixStr` / `FixStrAscii` directly with proper ASCII detection via `bytesWritten == charLength` after `EncodeUtf8SinglePass`). +2. **Activate `WriteFixStrDirect` for property-name writes** — SGen could emit `WriteFixStrDirect(propName)` instead of `WriteStringWithDispatch(propName)` for known-short, often-ASCII property names — saving the marker-dispatch overhead. Requires SGen template change + benchmark validation that the saving is real (likely marginal — property names are typically <31 char ASCII, so M3R7 already takes the FixStrAscii fast path with one byte-write to `_buffer`). The pre-encoded `NameUtf8` byte[] on `PropertyMetadataBase` already provides a faster path (`WriteFixStrBytes` at line 853) which the SGen / runtime writer could use directly. +3. **Defer** — leave as-is, document as dead code, revisit when the codebase has another reason to touch this area. 
+ +### Why P3 + +- No correctness or perf impact in either direction (dead code is dead — no consumer affected) +- Cleanup vs activation is a low-stakes choice; benchmark would decide if option 2 yields a real saving +- Surfaced during V4N3 work, not blocking the NuGet release + +### Acceptance + +- Decision recorded (delete / activate / defer) with rationale +- If "delete": grep across workspace confirms zero callers post-removal; binary test suite unchanged (still 222 pass / 13 pre-existing failures) +- If "activate": SGen template change + benchmark validation showing ≥ 2% Ser improvement on a representative cell (otherwise revert to "delete") +- Documentation in `BINARY_IMPLEMENTATION.md` updated (or the old reference removed if both methods are deleted) + +### Trigger + +- Pre-NuGet release housekeeping pass +- Or: any future refactor that touches `BinarySerializationContext` string-write methods (then decide rather than leave the dead pair behind) +