diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index e7d72af..032f158 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -166,5 +166,4 @@ Full doctrine: `../docs/ARCHITECTURE.md#framework-vs-consumer-boundary` 19. **Documentation layering** — write `.md` documentation at the **defining layer** (where the code lives). Higher-layer `.md` files reference the base docs (e.g. `see AyCode.Services/docs/SIGNALR/README.md`) and document only project-specific overrides or extensions. Never duplicate base-layer descriptions in consumer-level docs. 20. **Do not re-read .md files** already in your context window. They only change if you modify them yourself (new content is already in context) or if the developer tells you they changed — in that case re-read them once. 21. **Folder navigation** — start from the root `README.md` for solution-level navigation. When you need to understand a folder's contents or find a type/class, read the `README.md` in that folder first — it indexes the local files and sub-folders. Follow this before grepping or reading source files. - 22. **Language Preference**: Communicate in Hungarian as requested by the user. diff --git a/AyCode.Core.Serializers.Console/Program.cs b/AyCode.Core.Serializers.Console/Program.cs index ed4ca4a..2ddba06 100644 --- a/AyCode.Core.Serializers.Console/Program.cs +++ b/AyCode.Core.Serializers.Console/Program.cs @@ -47,9 +47,13 @@ public static class Program #else private static int WarmupIterations = 10000; //5000 private static int TestIterations = 1000; //1000 - private static int BenchmarkSamples = 3; + private static int BenchmarkSamples = 5; #endif + // Interactive settings: selected AcBinary wire mode for benchmark runs. 
+ // 1 = Compact, 2 = Fast + private static WireMode SelectedWireMode = WireMode.Compact; + // Serializer name constants // Engine identifiers (used in Engine column + comparison logic) private const string EngineAcBinary = "AcBinary"; @@ -480,21 +484,22 @@ public static class Program private static List CreateSerializers(TestDataSet testData, string serializerMode) { // FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path. - // THREE benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + AcBinary FastMode Byte[] - // (WireMode.Fast = UTF-16 raw memcpy) + MemoryPack Byte[]. Shows BOTH sides of AcBinary's - // positioning vs MemPack: - // - Compact: smallest wire, UTF-8 encode/decode CPU cost - // - Fast (UTF-16 raw): comparable wire to MemPack, no encoding cost + // TWO benchmarks: AcBinary FastMode Byte[] (wire mode taken from SelectedWireMode; Compact UTF-8 by default) + MemoryPack Byte[]. + // - Compact: smallest wire, UTF-8 encode/decode CPU cost vs MemPack head-to-head. // Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min. + // + // FastWire row (UTF-16 raw memcpy) commented out for the current optimization sprint — + // we are tuning Compact mode against MemPack directly; FastWire was used as a noise-floor + // reference earlier. Re-enable with a dedicated WireMode.Fast options instance when revisiting Fast wire-mode performance. 
if (serializerMode == "fastestbyte") { - var fastWireOptions = AcBinarySerializerOptions.FastMode; - fastWireOptions.WireMode = WireMode.Fast; + var fastestByteOptions = AcBinarySerializerOptions.FastMode; + fastestByteOptions.WireMode = SelectedWireMode; return new List { - new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.FastMode, "FastMode"), - new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"), + new AcBinaryBenchmark(testData.Order, fastestByteOptions, "FastMode"), + //new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"), new MemoryPackBenchmark(testData.Order, "Default"), }; } @@ -513,6 +518,7 @@ public static class Program // wire chunk AND kernel transfer unit; change ONLY this line when tuning. var binaryFastModePipeChunkOnly = AcBinarySerializerOptions.FastMode; binaryFastModePipeChunkOnly.BufferWriterChunkSize = PipeChunkSize; + binaryFastModePipeChunkOnly.WireMode = SelectedWireMode; return new List { @@ -547,12 +553,18 @@ public static class Program var binaryNoInternOption = AcBinarySerializerOptions.Default; binaryNoInternOption.UseStringInterning = StringInterningMode.None; + binaryNoInternOption.WireMode = SelectedWireMode; var binaryDefaultNoSgenOption = AcBinarySerializerOptions.Default; binaryDefaultNoSgenOption.UseGeneratedCode = false; + binaryDefaultNoSgenOption.WireMode = SelectedWireMode; var binaryFastModeNoSgenOption = AcBinarySerializerOptions.FastMode; binaryFastModeNoSgenOption.UseGeneratedCode = false; + binaryFastModeNoSgenOption.WireMode = SelectedWireMode; + + var binaryFastModeOption = AcBinarySerializerOptions.FastMode; + binaryFastModeOption.WireMode = SelectedWireMode; // BufWr new — 4 KB chunk size for the FRESH ArrayBufferWriter scenario. The chunkSize here drives // the serializer's GetSpan(N) request → the ArrayBufferWriter's internal allocation per call. @@ -561,16 +573,19 @@ public static class Program // vs syscall count). 
var binaryFastModeBufWrChunk = AcBinarySerializerOptions.FastMode; binaryFastModeBufWrChunk.BufferWriterChunkSize = PipeChunkSize; + binaryFastModeBufWrChunk.WireMode = SelectedWireMode; // In-memory Pipe variant — same 4 KB chunkSize as the AsyncPipe mode, no kernel-pipe alignment // concern (managed slabs are not page-aligned anyway). Drives SerializeChunkedFramed via the in-memory // System.IO.Pipelines.Pipe (zero-copy slab handoff between producer and drain task). var binaryFastModePipeChunkInMem = AcBinarySerializerOptions.FastMode; binaryFastModePipeChunkInMem.BufferWriterChunkSize = PipeChunkSize; + binaryFastModePipeChunkInMem.WireMode = SelectedWireMode; var defaultOptions = AcBinarySerializerOptions.Default; defaultOptions.UseStringInterning = StringInterningMode.None; defaultOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId; + defaultOptions.WireMode = SelectedWireMode; return new List { @@ -578,7 +593,7 @@ public static class Program // AcBinary — Byte[] API (uncomment to compare option presets side-by-side) // ============================================================ // Fastest Byte[] — SGen path (UseGeneratedCode=true, default). - new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.FastMode, "FastMode"), + new AcBinaryBenchmark(testData.Order, binaryFastModeOption, "FastMode"), // Fastest Byte[] — Runtime path (UseGeneratedCode=false). Same wire/options, no source-generated dispatch. // Always paired with the SGen variant so every layer can compare the SGen speed-up apples-to-apples. 
// NativeAOT-safe: AcSerializerCommon.Create*Getter/Setter falls back to reflection-based delegates @@ -594,7 +609,7 @@ public static class Program //new AcBinaryBenchmark(testData.Order, binaryNoInternOption, "NoIntern"), // AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario) - new AcBinaryBufferWriterBenchmark(testData.Order, AcBinarySerializerOptions.FastMode, "FastMode"), + new AcBinaryBufferWriterBenchmark(testData.Order, binaryFastModeOption, "FastMode"), // AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario). // 4 KB chunk size from binaryFastModeBufWrChunk — minimises the per-call ArrayBufferWriter @@ -859,7 +874,7 @@ public static class Program System.Console.WriteLine(" [A] All layers"); System.Console.WriteLine(" [F] FastestByte — AcBinary FastMode Byte[] vs MemoryPack Byte[] only (tight optimization loop)"); System.Console.WriteLine(" [P] AsyncPipe — streaming I/O isolation (only AsyncPipe, all test data)"); - System.Console.WriteLine($" [S] Settings — modify Warmup ({WarmupIterations}) / Iterations ({TestIterations}) / Samples ({BenchmarkSamples})"); + System.Console.WriteLine($" [S] Settings — Iteration / WireMode (current: {SelectedWireMode})"); System.Console.WriteLine(" [Q] Quit"); System.Console.Write("\nSelection: "); @@ -889,10 +904,42 @@ public static class Program /// Returns to the caller (which re-displays the main menu). 
/// private static void ShowSettingsMenu() + { + while (true) + { + System.Console.WriteLine(); + System.Console.WriteLine("─────────────────────────────────────────────"); + System.Console.WriteLine("Settings"); + System.Console.WriteLine("─────────────────────────────────────────────"); + System.Console.WriteLine(" [1] Iteration — Warmup / Iterations / Samples"); + System.Console.WriteLine($" [2] WireMode — current: {SelectedWireMode}"); + System.Console.WriteLine(" [B] Back"); + System.Console.Write("\nSelection: "); + + var key = System.Console.ReadKey(intercept: false).KeyChar; + System.Console.WriteLine(); + + switch (char.ToLower(key)) + { + case '1': + ShowIterationSettingsMenu(); + break; + case '2': + ShowWireModeSettingsMenu(); + break; + case 'b': + return; + default: + continue; + } + } + } + + private static void ShowIterationSettingsMenu() { System.Console.WriteLine(); System.Console.WriteLine("─────────────────────────────────────────────"); - System.Console.WriteLine("Settings — press Enter to keep current value"); + System.Console.WriteLine("Iteration settings — press Enter to keep current value"); System.Console.WriteLine("─────────────────────────────────────────────"); System.Console.WriteLine(); @@ -901,7 +948,42 @@ public static class Program BenchmarkSamples = PromptInt("BenchmarkSamples", BenchmarkSamples, min: 1); System.Console.WriteLine(); - System.Console.WriteLine($"✓ Settings updated: Warmup={WarmupIterations} | Iterations={TestIterations} | Samples={BenchmarkSamples}"); + System.Console.WriteLine($"✓ Iteration settings updated: Warmup={WarmupIterations} | Iterations={TestIterations} | Samples={BenchmarkSamples}"); + } + + private static void ShowWireModeSettingsMenu() + { + while (true) + { + System.Console.WriteLine(); + System.Console.WriteLine("─────────────────────────────────────────────"); + System.Console.WriteLine("WireMode settings"); + System.Console.WriteLine("─────────────────────────────────────────────"); + 
System.Console.WriteLine($"Current: {SelectedWireMode}"); + System.Console.WriteLine(" [1] Compact"); + System.Console.WriteLine(" [2] Fast"); + System.Console.WriteLine(" [B] Back"); + System.Console.Write("\nSelection: "); + + var key = System.Console.ReadKey(intercept: false).KeyChar; + System.Console.WriteLine(); + + switch (char.ToLower(key)) + { + case '1': + SelectedWireMode = WireMode.Compact; + System.Console.WriteLine("✓ WireMode set to Compact"); + return; + case '2': + SelectedWireMode = WireMode.Fast; + System.Console.WriteLine("✓ WireMode set to Fast"); + return; + case 'b': + return; + default: + continue; + } + } } /// diff --git a/AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs b/AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs index 0318c29..d9d8722 100644 --- a/AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs +++ b/AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs @@ -226,6 +226,214 @@ public class Utf8TranscoderTests AssertRoundTrip("😀"); } + // ────────────────────────────────────────────────────────────────────── + // GetUtf8ByteCount — content classes + // ────────────────────────────────────────────────────────────────────── + + [TestMethod] + public void GetUtf8ByteCount_AsciiOnly_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl("Hello, World! 
Plain ASCII text."); + } + + [TestMethod] + public void GetUtf8ByteCount_AsciiExactly7Bytes_MatchesBcl() + { + // Boundary: just below Vector128.Count (8) — scalar tail only + AssertGetUtf8ByteCountMatchesBcl(new string('a', 7)); + } + + [TestMethod] + public void GetUtf8ByteCount_AsciiExactly8Bytes_MatchesBcl() + { + // Boundary: exactly Vector128.Count — Vector128 path triggers + AssertGetUtf8ByteCountMatchesBcl(new string('a', 8)); + } + + [TestMethod] + public void GetUtf8ByteCount_AsciiExactly16Bytes_MatchesBcl() + { + // Boundary: exactly Vector256.Count — Vector256 path triggers + AssertGetUtf8ByteCountMatchesBcl(new string('a', 16)); + } + + [TestMethod] + public void GetUtf8ByteCount_AsciiExactly32Bytes_MatchesBcl() + { + // Boundary: exactly Vector512.Count — Vector512 path triggers on AVX-512BW + AssertGetUtf8ByteCountMatchesBcl(new string('a', 32)); + } + + [TestMethod] + public void GetUtf8ByteCount_AsciiVeryLong_500Chars_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl(new string('z', 500)); + } + + [TestMethod] + public void GetUtf8ByteCount_HungarianShort_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl("Termék"); + } + + [TestMethod] + public void GetUtf8ByteCount_HungarianMedium_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl("árvíztűrő tükörfúrógép"); + } + + [TestMethod] + public void GetUtf8ByteCount_HungarianLong_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl(string.Concat(Enumerable.Repeat("árvíztűrő tükörfúrógép ", 20))); + } + + [TestMethod] + public void GetUtf8ByteCount_CjkBmp_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl("你好世界 こんにちは 안녕하세요"); + } + + [TestMethod] + public void GetUtf8ByteCount_CjkBmpLong_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl(string.Concat(Enumerable.Repeat("你好世界 ", 30))); + } + + [TestMethod] + public void GetUtf8ByteCount_SupplementaryPlane_MatchesBcl() + { + // Each emoji is 2 UTF-16 chars (surrogate pair) → 4 UTF-8 bytes total + AssertGetUtf8ByteCountMatchesBcl("😀😁😂🎉🌟"); + } + + 
[TestMethod] + public void GetUtf8ByteCount_MixedAllClasses_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl("ASCII Magyar:árvíz CJK:你好 Emoji:😀"); + } + + [TestMethod] + public void GetUtf8ByteCount_LongMixed_MatchesBcl() + { + var sb = new StringBuilder(); + for (var i = 0; i < 50; i++) + { + sb.Append("ASCII run-").Append(i).Append(" Magyar:árvíz CJK:你好 "); + } + AssertGetUtf8ByteCountMatchesBcl(sb.ToString()); + } + + [TestMethod] + public void GetUtf8ByteCount_Empty_ReturnsZero() + { + Assert.AreEqual(0, Utf8Transcoder.GetUtf8ByteCount(ReadOnlySpan.Empty)); + } + + [TestMethod] + public void GetUtf8ByteCount_SingleAsciiChar_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl("X"); + } + + [TestMethod] + public void GetUtf8ByteCount_SingleHungarianChar_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl("é"); + } + + [TestMethod] + public void GetUtf8ByteCount_SingleCjkChar_MatchesBcl() + { + AssertGetUtf8ByteCountMatchesBcl("好"); + } + + [TestMethod] + public void GetUtf8ByteCount_SingleEmoji_MatchesBcl() + { + // Single emoji = surrogate pair, exact 4 bytes + AssertGetUtf8ByteCountMatchesBcl("😀"); + } + + [TestMethod] + public void GetUtf8ByteCount_BoundaryAsciiToHungarian_MatchesBcl() + { + // Exercises split between SIMD ASCII region and 2-byte tail + for (var asciiLen = 0; asciiLen <= 64; asciiLen++) + { + var s = new string('a', asciiLen) + "árvíz"; + var expected = Utf8.GetByteCount(s); + var actual = Utf8Transcoder.GetUtf8ByteCount(s.AsSpan()); + Assert.AreEqual(expected, actual, $"asciiLen={asciiLen}"); + } + } + + [TestMethod] + public void GetUtf8ByteCount_BoundaryAsciiToCjk_MatchesBcl() + { + // 3-byte sequence boundary stress + for (var asciiLen = 0; asciiLen <= 64; asciiLen++) + { + var s = new string('a', asciiLen) + "你好世界"; + var expected = Utf8.GetByteCount(s); + var actual = Utf8Transcoder.GetUtf8ByteCount(s.AsSpan()); + Assert.AreEqual(expected, actual, $"asciiLen={asciiLen}"); + } + } + + [TestMethod] + public void 
GetUtf8ByteCount_BoundaryAsciiToEmoji_MatchesBcl() + { + // CRITICAL: tests that surrogate pairs split across SIMD chunks still produce correct count. + // High surrogate may land in chunk N, low surrogate in chunk N+1; total must remain 4 bytes. + for (var asciiLen = 0; asciiLen <= 64; asciiLen++) + { + var s = new string('a', asciiLen) + "😀"; + var expected = Utf8.GetByteCount(s); + var actual = Utf8Transcoder.GetUtf8ByteCount(s.AsSpan()); + Assert.AreEqual(expected, actual, $"asciiLen={asciiLen}"); + } + } + + [TestMethod] + public void GetUtf8ByteCount_MultipleEmojiBoundary_MatchesBcl() + { + // Surrogate pair split-stress: many emojis at varying offsets + for (var prefixLen = 0; prefixLen <= 32; prefixLen++) + { + var s = new string('a', prefixLen) + "😀😁😂🎉🌟😀😁😂🎉🌟"; + var expected = Utf8.GetByteCount(s); + var actual = Utf8Transcoder.GetUtf8ByteCount(s.AsSpan()); + Assert.AreEqual(expected, actual, $"prefixLen={prefixLen}"); + } + } + + [TestMethod] + public void GetUtf8ByteCount_AgreesWithEncodeUtf8SinglePass_AllContentClasses() + { + // Round-trip contract: the byte count returned must equal the bytesWritten by EncodeUtf8SinglePass. + // This is the load-bearing invariant for two-pass [VarUInt][bytes] writes in cold-fallback paths. 
+ var samples = new[] + { + "Hello", + "árvíztűrő tükörfúrógép", + "你好世界", + "😀🎉🌟", + "ASCII Magyar:árvíz CJK:你好 Emoji:😀", + new string('z', 500), + string.Concat(Enumerable.Repeat("árvíztűrő tükörfúrógép ", 20)) + }; + + foreach (var s in samples) + { + var byteCountFromCounter = Utf8Transcoder.GetUtf8ByteCount(s.AsSpan()); + var dst = new byte[s.Length * 4]; + var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(s.AsSpan(), dst.AsSpan()); + Assert.AreEqual(bytesWritten, byteCountFromCounter, + $"GetUtf8ByteCount disagrees with EncodeUtf8SinglePass for [{s.Substring(0, Math.Min(20, s.Length))}...]"); + } + } + // ────────────────────────────────────────────────────────────────────── // Decoder-side cross-check: BCL Encoding.UTF8.GetString reference // ────────────────────────────────────────────────────────────────────── @@ -291,6 +499,19 @@ public class Utf8TranscoderTests Assert.AreEqual(original, decoded, $"Decoder output mismatch{ctx}"); } + /// + /// Verifies that matches + /// for the same input. This is the BCL parity + /// invariant — any divergence means the SIMD byte counter is producing wrong values that + /// would corrupt VarUInt length prefixes in WriteStringUtf8Internal. + /// + private static void AssertGetUtf8ByteCountMatchesBcl(string original) + { + var expected = Utf8.GetByteCount(original); + var actual = Utf8Transcoder.GetUtf8ByteCount(original.AsSpan()); + Assert.AreEqual(expected, actual, $"GetUtf8ByteCount mismatch for input length {original.Length}"); + } + /// /// Verifies that DecodeUtf8SinglePass produces output identical to /// for the same byte input. 
Catches silent decoder bugs that pass the round-trip test diff --git a/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs b/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs index 72bcf1e..34d1ff2 100644 --- a/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs +++ b/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs @@ -1,9 +1,22 @@ using AyCode.Core.Serializers.Binaries; +using System.Collections; +using System.Reflection; +using System.Runtime.CompilerServices; namespace AyCode.Core.Tests.TestModels; public static class BenchmarkTestDataProvider { + private const int FixStrMaxLength = 31; + private const string LongStringSuffix = "__Benchmárk_Long_String_Söffix__"; + + private sealed class ReferenceComparer : IEqualityComparer + { + public static readonly ReferenceComparer Instance = new(); + public new bool Equals(object? x, object? y) => ReferenceEquals(x, y); + public int GetHashCode(object obj) => RuntimeHelpers.GetHashCode(obj); + } + public static List CreateTestDataSets(bool resetId = true) { return new List @@ -45,6 +58,8 @@ public static class BenchmarkTestDataProvider sharedTag: sharedTag, sharedUser: sharedUser); + EnsureAllStringsBypassFixStr(order); + ClearDeepLevelRefs(order); return new TestDataSet("Small (2x2x2x2)", order, iidRefPercent: 20); @@ -77,6 +92,8 @@ public static class BenchmarkTestDataProvider sharedMetadata: sharedMeta, sharedPreferences: sharedPreferences); + EnsureAllStringsBypassFixStr(order); + ClearDeepLevelRefs(order); return new TestDataSet("Medium (3x3x3x4)", order, iidRefPercent: 20); @@ -107,6 +124,8 @@ public static class BenchmarkTestDataProvider sharedUser: sharedUser, sharedPreferences: sharedPreferences); + EnsureAllStringsBypassFixStr(order); + ClearDeepLevelRefs(order); return new TestDataSet("Large (5x5x5x10)", order, iidRefPercent: 20); @@ -153,6 +172,8 @@ public static class BenchmarkTestDataProvider } } + EnsureAllStringsBypassFixStr(order); + ClearDeepLevelRefs(order); return new 
TestDataSet("Repeated Strings (10 items)", order, iidRefPercent: 20); @@ -185,6 +206,8 @@ public static class BenchmarkTestDataProvider sharedPreferences: sharedPreferences, sharedCategory: sharedCategory); + EnsureAllStringsBypassFixStr(order); + ClearDeepLevelRefs(order); return new TestDataSet("Deep Nested (2x4x4x8)", order, iidRefPercent: 20); @@ -218,6 +241,65 @@ public static class BenchmarkTestDataProvider } } } + + private static void EnsureAllStringsBypassFixStr(object? root) + { + if (root == null) return; + + var visited = new HashSet(ReferenceComparer.Instance); + var stack = new Stack(); + stack.Push(root); + + while (stack.Count > 0) + { + var current = stack.Pop(); + if (!visited.Add(current)) continue; + + if (current is IEnumerable enumerable && current is not string) + { + foreach (var item in enumerable) + { + if (item != null) + stack.Push(item); + } + continue; + } + + var type = current.GetType(); + foreach (var property in type.GetProperties(BindingFlags.Instance | BindingFlags.Public)) + { + if (!property.CanRead) continue; + + if (property.PropertyType == typeof(string)) + { + if (!property.CanWrite) continue; + + var value = (string?)property.GetValue(current); + property.SetValue(current, ToLongString(value)); + continue; + } + + if (property.PropertyType.IsValueType || property.PropertyType.IsEnum) + continue; + + var child = property.GetValue(current); + if (child != null) + stack.Push(child); + } + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static string ToLongString(string? 
value) + { + if (string.IsNullOrEmpty(value)) + return "Benchmark_String_Value" + LongStringSuffix; + + if (value.Length > FixStrMaxLength) + return value; + + return value + LongStringSuffix; + } } public sealed class TestDataSet diff --git a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.cs b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.cs index ef3c6d0..4f54e76 100644 --- a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.cs +++ b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.cs @@ -117,7 +117,7 @@ public static partial class AcBinarySerializer foreach (var (stringValue, properties) in analysis) { - var byteLength = Encoding.UTF8.GetByteCount(stringValue); + var byteLength = Utf8Transcoder.GetUtf8ByteCount(stringValue.AsSpan()); foreach (var (propPath, count) in properties) { if (!propertyStats.TryGetValue(propPath, out var list)) diff --git a/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs b/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs index da833e8..22ad29c 100644 --- a/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs +++ b/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs @@ -188,6 +188,181 @@ internal static class Utf8Transcoder return dstIdx; } + /// + /// Counts the UTF-8 byte length produced by encoding the given UTF-16 char span. + /// Symmetric encode-side helper to CountUtf8Chars; the value returned equals + /// the bytesWritten that EncodeUtf8SinglePass would produce. + /// + /// + /// Trusted-input — assumes well-formed UTF-16 (every high surrogate paired with a low surrogate), + /// matching EncodeUtf8SinglePass's contract. Bypasses + /// Encoding.UTF8.GetByteCount virtual-dispatch + encoder-fallback overhead. + /// + /// Layered SIMD: Vector512 (32 chars/iter) on AVX-512BW hosts → Vector256 (16 chars/iter) + /// on AVX2 hosts → Vector128 (8 chars/iter) on Apple Silicon NEON / WASM SIMD / SSE2 → scalar tail. + /// JIT/AOT path-selection via Avx512BW.IsSupported / Vector{N}.IsHardwareAccelerated + /// [Intrinsic] booleans (constant-folded dead branches per host). 
+ /// + /// Per-char UTF-8 byte contribution: + /// + /// c < 0x80 → 1 byte (ASCII) + /// 0x80 ≤ c < 0x800 → 2 bytes (Latin extended, Cyrillic, Greek, Hebrew, Arabic) + /// 0x800 ≤ c < 0xD800 or c ≥ 0xE000 → 3 bytes (CJK BMP, other BMP) + /// 0xD800 ≤ c < 0xDC00 (high surrogate) → 4 bytes (whole pair encoded here) + /// 0xDC00 ≤ c < 0xE000 (low surrogate) → 0 bytes (absorbed by paired high surrogate) + /// + /// + /// SIMD per-block: 5 popcount-on-threshold-mask operations + /// (< 0x80, < 0x800, < 0xD800, < 0xDC00, < 0xE000). Closed-form aggregation: + /// bytes = 3*N - ascii - c_lt_0x800 + highSur - 3*lowSur (N = chars in the block) + /// where ascii = popcount(c < 0x80), c_lt_0x800 = popcount(c < 0x800), + /// highSur = popcount(c < 0xDC00) - popcount(c < 0xD800), + /// lowSur = popcount(c < 0xE000) - popcount(c < 0xDC00). + /// + /// Both highSur and lowSur must be counted independently — feature-equivalent + /// to the per-char model (high → 4 bytes, low → 0 bytes). A natural-looking shortcut + /// (lowSur == highSur for well-formed UTF-16) is FALSE within a single SIMD chunk when + /// a surrogate pair straddles the chunk boundary; over the whole string the counts equalize + /// but per-block they don't. Across-the-boundary correctness: a high surrogate counted in + /// chunk N contributes 4 bytes there; its low surrogate (in chunk N+1) contributes 0 bytes — + /// total 4 bytes per pair regardless of where the boundary falls. + /// + /// Pairs with EncodeUtf8SinglePass for two-pass [VarUInt][bytes] writes in + /// cold-fallback paths (e.g. WriteFixStrDirect's non-ASCII fallback in + /// BinarySerializationContext). 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int GetUtf8ByteCount(ReadOnlySpan src) + { + var byteCount = 0; + var i = 0; + var n = src.Length; + ref ushort srcRef = ref Unsafe.As(ref MemoryMarshal.GetReference(src)); + + // SIMD path 1: Vector512 (32 chars/iter) on AVX-512BW hosts + if (Avx512BW.IsSupported && n >= Vector512.Count) + { + var v_0x80 = Vector512.Create((ushort)0x80); + var v_0x800 = Vector512.Create((ushort)0x800); + var v_0xD800 = Vector512.Create((ushort)0xD800); + var v_0xDC00 = Vector512.Create((ushort)0xDC00); + var v_0xE000 = Vector512.Create((ushort)0xE000); + + do + { + var v = Vector512.LoadUnsafe(ref srcRef, (uint)i); + + var c_lt_0x80 = BitOperations.PopCount(Vector512.LessThan(v, v_0x80).ExtractMostSignificantBits()); + var c_lt_0x800 = BitOperations.PopCount(Vector512.LessThan(v, v_0x800).ExtractMostSignificantBits()); + var c_lt_0xD800 = BitOperations.PopCount(Vector512.LessThan(v, v_0xD800).ExtractMostSignificantBits()); + var c_lt_0xDC00 = BitOperations.PopCount(Vector512.LessThan(v, v_0xDC00).ExtractMostSignificantBits()); + var c_lt_0xE000 = BitOperations.PopCount(Vector512.LessThan(v, v_0xE000).ExtractMostSignificantBits()); + + var highSur = c_lt_0xDC00 - c_lt_0xD800; + var lowSur = c_lt_0xE000 - c_lt_0xDC00; + byteCount += 3 * Vector512.Count - c_lt_0x80 - c_lt_0x800 + highSur - 3 * lowSur; + + i += Vector512.Count; + } while (n - i >= Vector512.Count); + } + + // SIMD path 2: Vector256 (16 chars/iter) on AVX2 hosts; also handles AVX-512 tail < 32 chars + if (Vector256.IsHardwareAccelerated && n - i >= Vector256.Count) + { + var v_0x80 = Vector256.Create((ushort)0x80); + var v_0x800 = Vector256.Create((ushort)0x800); + var v_0xD800 = Vector256.Create((ushort)0xD800); + var v_0xDC00 = Vector256.Create((ushort)0xDC00); + var v_0xE000 = Vector256.Create((ushort)0xE000); + + do + { + var v = Vector256.LoadUnsafe(ref srcRef, (uint)i); + + var c_lt_0x80 = BitOperations.PopCount(Vector256.LessThan(v, 
v_0x80).ExtractMostSignificantBits()); + var c_lt_0x800 = BitOperations.PopCount(Vector256.LessThan(v, v_0x800).ExtractMostSignificantBits()); + var c_lt_0xD800 = BitOperations.PopCount(Vector256.LessThan(v, v_0xD800).ExtractMostSignificantBits()); + var c_lt_0xDC00 = BitOperations.PopCount(Vector256.LessThan(v, v_0xDC00).ExtractMostSignificantBits()); + var c_lt_0xE000 = BitOperations.PopCount(Vector256.LessThan(v, v_0xE000).ExtractMostSignificantBits()); + + var highSur = c_lt_0xDC00 - c_lt_0xD800; + var lowSur = c_lt_0xE000 - c_lt_0xDC00; + byteCount += 3 * Vector256.Count - c_lt_0x80 - c_lt_0x800 + highSur - 3 * lowSur; + + i += Vector256.Count; + } while (n - i >= Vector256.Count); + } + + // SIMD path 3: Vector128 (8 chars/iter) on Apple Silicon NEON, WASM SIMD, legacy SSE2; + // also handles tail < 16 from higher tiers. Cross-platform via Vector128.IsHardwareAccelerated. + if (Vector128.IsHardwareAccelerated && n - i >= Vector128.Count) + { + var v_0x80 = Vector128.Create((ushort)0x80); + var v_0x800 = Vector128.Create((ushort)0x800); + var v_0xD800 = Vector128.Create((ushort)0xD800); + var v_0xDC00 = Vector128.Create((ushort)0xDC00); + var v_0xE000 = Vector128.Create((ushort)0xE000); + + do + { + var v = Vector128.LoadUnsafe(ref srcRef, (uint)i); + + var c_lt_0x80 = BitOperations.PopCount(Vector128.LessThan(v, v_0x80).ExtractMostSignificantBits()); + var c_lt_0x800 = BitOperations.PopCount(Vector128.LessThan(v, v_0x800).ExtractMostSignificantBits()); + var c_lt_0xD800 = BitOperations.PopCount(Vector128.LessThan(v, v_0xD800).ExtractMostSignificantBits()); + var c_lt_0xDC00 = BitOperations.PopCount(Vector128.LessThan(v, v_0xDC00).ExtractMostSignificantBits()); + var c_lt_0xE000 = BitOperations.PopCount(Vector128.LessThan(v, v_0xE000).ExtractMostSignificantBits()); + + var highSur = c_lt_0xDC00 - c_lt_0xD800; + var lowSur = c_lt_0xE000 - c_lt_0xDC00; + byteCount += 3 * Vector128.Count - c_lt_0x80 - c_lt_0x800 + highSur - 3 * lowSur; + + i += Vector128.Count; + 
} while (n - i >= Vector128.Count); + } + + // Scalar tail (and fallback for non-SIMD hardware). + // CRITICAL: must use the SAME per-char accounting model as the SIMD path so that surrogate + // pairs split across a SIMD/scalar boundary count correctly. The SIMD path counts each char + // independently — high surrogate → 4 bytes, low surrogate → 0 bytes. The scalar tail must + // do the same (i += 1 per char, NOT i += 2 on high surrogate). If the scalar tail + // double-consumed surrogate pairs (i += 2 on high), a high surrogate landing in the last + // SIMD chunk would be counted there as 4 bytes, then its low surrogate in the scalar tail + // would re-trigger the surrogate branch and add 4 more bytes (with i += 2 advancing past + // an unrelated next char). Net: +4 byte miscount per split-pair. + while (i < n) + { + var c = Unsafe.Add(ref srcRef, i); + if (c < 0x80) + { + byteCount += 1; + } + else if (c < 0x800) + { + byteCount += 2; + } + else if (c < 0xD800) + { + byteCount += 3; // BMP below surrogate range + } + else if (c < 0xDC00) + { + byteCount += 4; // high surrogate → owns the 4-byte encoding for the pair + } + else if (c < 0xE000) + { + // low surrogate → 0 bytes (the paired high surrogate already accounted for the 4) + } + else + { + byteCount += 3; // BMP at or above 0xE000 + } + i += 1; + } + + return byteCount; + } + /// /// Counts UTF-16 chars produced by decoding the given UTF-8 byte span. 
/// diff --git a/AyCode.Core/Serializers/PropertyMetadataBase.cs b/AyCode.Core/Serializers/PropertyMetadataBase.cs index a95e624..dd8370b 100644 --- a/AyCode.Core/Serializers/PropertyMetadataBase.cs +++ b/AyCode.Core/Serializers/PropertyMetadataBase.cs @@ -99,7 +99,15 @@ public abstract class PropertyMetadataBase [DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.PublicProperties)] Type declaringType) { Name = prop.Name; - NameUtf8 = Encoding.UTF8.GetBytes(prop.Name); + // Ctor-once init: SIMD path via Utf8Transcoder (GetUtf8ByteCount + EncodeUtf8SinglePass) + // bypasses Encoding.UTF8 virtual-dispatch + encoder-fallback overhead. Ascii.FromUtf16 + // would be slightly faster for the (overwhelmingly common) ASCII property name case, but + // the symmetric Utf8Transcoder API keeps this consistent with the binary serializer's + // writer-side BCL-free policy and handles non-ASCII property names without a fallback. + var nameByteCount = Utf8Transcoder.GetUtf8ByteCount(prop.Name.AsSpan()); + var nameBytes = new byte[nameByteCount]; + Utf8Transcoder.EncodeUtf8SinglePass(prop.Name.AsSpan(), nameBytes); + NameUtf8 = nameBytes; DeclaringType = declaringType; PropertyType = prop.PropertyType; diff --git a/AyCode.Core/docs/BINARY/BINARY_TODO.md b/AyCode.Core/docs/BINARY/BINARY_TODO.md index 81aa3f3..f32a8f2 100644 --- a/AyCode.Core/docs/BINARY/BINARY_TODO.md +++ b/AyCode.Core/docs/BINARY/BINARY_TODO.md @@ -692,6 +692,7 @@ JIT/AOT path-selection via `[Intrinsic]` `IsSupported` static booleans — non-s |-------|--------|-----------|-----------|-----------|--------| | 1 | `CountUtf8Chars` (decode 1st pass) | ✅ done | ✅ existing | ✅ done | ✅ existing | | 2 | `EncodeUtf8SinglePass` Phase 1 (ASCII narrow) | ✅ done | ✅ existing | ✅ done | ✅ existing | +| 2.5 | `DecodeUtf8SinglePass` scalar run-length decoder (multi-byte baseline) | — | — | — | ⏳ TODO | | 3a | `DecodeUtf8SinglePass` multi-byte transcoder (Vector512) | ⏳ TODO | bail-out only | bail-out only | ✅ existing | | 3b 
| `DecodeUtf8SinglePass` multi-byte transcoder (Vector256) | — | 🔍 **deferred — see note** | bail-out only | ✅ existing | | 3c | `DecodeUtf8SinglePass` multi-byte transcoder (Vector128) | — | — | ⏳ TODO | ✅ existing | @@ -731,6 +732,54 @@ The cascading tail-handler hierarchy (existing in Phase 1+2) carries over: AVX-5 The Vector128 path is the **WASM and Apple Silicon target** — without it both platforms fell back to scalar (1 byte/iter). With Phase 1+2 landed, WASM and Apple Silicon now run the UTF-8 hot path at 16 byte/iter (16× scalar speedup on the count + ASCII narrow operations). +### Phase 2.5 — scalar run-length decoder (multi-byte baseline, pre-Phase 3 prototype) + +Targets the `DecodeUtf8SinglePass` switch-jumptable per-char dispatch on multi-byte content. Current scalar Phase (jumptable) re-dispatches every char; a run-length-aware scalar decoder runs a tight branchless inner loop on homogeneous runs (long ASCII run, long 2-byte Latin/Cyrillic run, long 3-byte CJK BMP run), with the existing single-codepoint scalar branch as mixed-edge fallback. 
+ +**Algorithm sketch**: +``` +while (s < src.Length) +{ + // 1) ASCII run (0xxxxxxx) — already handled by Phase 1 SIMD prefix; this is tail + int asciiStart = s; + while (s < src.Length && src[s] < 0x80) s++; + if (s > asciiStart) { WriteAsciiRun(src.Slice(asciiStart, s-asciiStart), dst, ref d); continue; } + + // 2) 2-byte run (110xxxxx 10xxxxxx) — Hungarian / Cyrillic / Greek / Hebrew / Arabic + int twoStart = s; + while (s + 1 < src.Length && Is2ByteLead(src[s]) && IsCont(src[s+1])) s += 2; + if (s > twoStart) { Decode2ByteRun(src.Slice(twoStart, s-twoStart), dst, ref d); continue; } + + // 3) 3-byte run (1110xxxx 10xxxxxx 10xxxxxx) — CJK BMP, other 3-byte BMP scripts + int threeStart = s; + while (s + 2 < src.Length && Is3ByteLead(src[s]) && IsCont(src[s+1]) && IsCont(src[s+2])) s += 3; + if (s > threeStart) { Decode3ByteRun(src.Slice(threeStart, s-threeStart), dst, ref d); continue; } + + // 4) Mixed-edge fallback (typically 4-byte surrogate pair or single transition char) + DecodeSingleCodePoint(src, ref s, dst, ref d); +} +``` + +**Why P2.5 — scalar baseline before SIMD multi-byte (Phase 3a-3c)**: +- 1-2h prototyping cost vs 6-10h Phase 3 SIMD work +- A/B benchmark on Repeated cell decides whether the run-length structure already wins on Magyar mixed (`KözösCímke` pattern) — if it does, Phase 3 lifts further; if not, Phase 3 SIMD is the only win path +- Documents the "switch-jumptable bottleneck on Hungarian benchmark" hypothesis without committing to the larger SIMD effort +- The `Decode2ByteRun` / `Decode3ByteRun` scalar-batch implementations also serve as algorithm references for the Phase 3 SIMD versions (clear semantics first, optimize after) + +**Expected payoff** (per content class, ratio vs current switch-jumptable): +- Long CJK BMP (3-byte run, e.g. 
`你好世界` ×30): ~20-40% Deser improvement (long homogeneous run, biggest jumptable savings) +- Long 2-byte run (`árvíztűrő` ×10+): ~5-15% improvement +- Magyar mixed (`KözösCímke`, `sötét` — short alternating runs): 0-5% (run-detection overhead may eat the savings on short runs) +- Long ASCII (≥32 byte): 0% (Phase 1 SIMD prefix already handles) +- Emoji (4-byte): 0% (mixed-edge fallback unchanged) + +**Risk** — the existing switch-jumptable JIT optimization is strong; Magyar mixed text (1-2 char runs) may not show net gain. Implementation must be **isolated prototype first** (alongside the live `DecodeUtf8SinglePass`, not replacing it), with A/B benchmark comparing the two before any switch. + +**Acceptance (Phase 2.5)**: +- Repeated cell Compact Deser ratio ≤ 1.0 vs MemPack on AVX2 hosts (parity with current measurement, no regression) +- Round-trip tests pass on all UTF-8 content classes (ASCII / 2-byte / 3-byte BMP / 4-byte surrogate-pair) +- A/B benchmark shows ≥ 5% Deser improvement on Repeated OR ≥ 10% on Large cell — else Phase 2.5 stays in TODO as documented dead-end (negative result is also valuable: confirms the jumptable is fast enough, focus moves entirely to Phase 3) + ### Phase 3 implementation outline - Insert SIMD multi-byte branches at `DecodeUtf8SinglePass` entry, **before** the existing ASCII-prefix bail-out loops: @@ -777,6 +826,38 @@ The Vector128 path is the **WASM and Apple Silicon target** — without it both - Local `dotnet test` covers correctness; per-tier benchmarks measure the multi-byte speedup - Phase 1+2 (AVX-512BW + Vector128 in `CountUtf8Chars` + `EncodeUtf8SinglePass` Phase 1) **landed 2026-05-05** — covered by existing round-trip tests, no regression on non-AVX-512 hosts (validated on AVX2-host bench) +## ACCORE-BIN-T-H2Q6: Fixed-width dual-length string header (Small/Medium/Big) for 1-pass decode +**Priority:** P1 · **Type:** Wire-format + Performance · **Related:** `DecodeUtf8SinglePass`, `CountUtf8Chars`, `WriteStringWithDispatch`, 
`ReadStringUtf8` + +Current Compact string decode uses two-pass flow for non-ASCII payloads (`CountUtf8Chars` + `DecodeUtf8SinglePass`). +Planned direction: remove VarUInt-based string-length path for the new string wire variant, and carry both lengths in a fixed-width header so deserialize can allocate target `string` immediately and decode in a single pass. + +### Planned format tiers + +- **Small**: packed `uint16` (`charLen:8 | utf8Len:8`) +- **Medium**: packed `uint32` (`charLen:16 | utf8Len:16`) +- **Big**: `uint32 charLen + uint32 utf8Len` + +Writer picks the smallest fitting tier; reader dispatches by marker and reads fixed-width lengths (no VarUInt loop for string length metadata). + +### Why + +- Removes `CountUtf8Chars` pass on the new markers (1-pass decode path) +- Keeps decode branch profile stable (fixed-size header reads) +- Maintains range safety with explicit Big overflow path + +### Constraints captured from current benchmark context + +- Performance evaluation target is non-ASCII-heavy data (ASCII-shortcuts intentionally not primary) +- Wire-format backward compatibility is not required for this development phase + +### Acceptance + +- New string markers implemented for Small/Medium/Big tiers +- Deserialize path for these markers performs single-pass decode without `CountUtf8Chars` +- Existing round-trip tests pass, plus new boundary tests for tier transitions +- Benchmark report includes before/after for Compact mode on non-ASCII dataset (Ser/Deser/RT + Size) + ## ACCORE-BIN-T-S5L8: Sentinel-length encoding for strings (wire-size optimization, both modes) **Priority:** P3 · **Type:** Wire-format optimization · **Related:** `AcBinarySerializer.WriteString`, `AcBinaryDeserializer.ReadValue` string dispatch @@ -1019,3 +1100,114 @@ Reader-side: SGen-generated code drops the per-property `ReadByte()` + `IsTinyIn - Schema-evolution fragility documented in `BINARY_FEATURES.md` (alongside the existing `PropertySkip` / default-omission caveat from 
`ACCORE-BIN-I-D9Y2`) - Opt-in flag with default `false` (preserves marker-driven default; consumers explicitly opt in for frozen-schema scenarios) +## ACCORE-BIN-T-V4N3: Symmetric `GetUtf8ByteCount` API + writer-side BCL bypass (cold path) +**Priority:** P3 · **Type:** Performance · **Status:** Closed (2026-05-06) · **Related:** `EncodeUtf8SinglePass`, `WriteStringUtf8Internal`, `PropertyMetadataBase.NameUtf8` + +Symmetric byte-count helper for `EncodeUtf8SinglePass`, paired with writer-side BCL `Encoding.UTF8.GetBytes` / `GetByteCount` removal across all cold-path call sites. `Utf8Transcoder.GetUtf8ByteCount(ReadOnlySpan<char>)` SIMD impl (Vector512 / Vector256 / Vector128 / scalar tier hierarchy, 5-popcount closed-form aggregation handling chunk-split surrogate pairs correctly). + +**Implementation summary**: +- `Utf8Transcoder.GetUtf8ByteCount` SIMD impl with closed-form `bytes = 3*N - ascii - c_lt_0x800 + highSur - 3*lowSur` aggregation +- `Utf8TranscoderTests` extended (29 new tests covering ASCII / Hungarian / CJK / emoji / boundary 0-64, plus surrogate-pair-split-across-SIMD-chunks regression coverage) +- `WriteStringUtf8Internal` (`BinarySerializationContext.cs:875`) refactored from BCL two-pass to single-pass D-2 layout (worst-case `length*4` allocate + `EncodeUtf8SinglePass` + VarUInt backfill); the `4×` worst-case capacity is amortized by the buffer growth doubling strategy (`Math.Max(buffer.Length*2, position+needed)` + ArrayPool bucket-rounding to next power-of-2) +- Cold path cleanup: `AcBinarySerializer.AnalyzeStringInternCandidates` (analysis log) and `PropertyMetadataBase.NameUtf8` ctor-once init both migrated to `Utf8Transcoder` + +### Resolution + +Landed 2026-05-06. All `Utf8TranscoderTests` pass (55/55). Binary test suite unchanged (222 pass / 13 pre-existing GuidIId failures, untouched). 
+ +**Critical observation surfaced during the audit**: `WriteStringUtf8Internal` has only one caller (`WriteFixStrDirect`), and `WriteFixStrDirect` itself is **not called anywhere in the codebase** — no core call site, no SourceGenerator template hit (verified against `AcBinarySourceGenerator.cs` lines 706/724/1492/1514 — generator emits `WriteStringGenerated` and `context.WriteStringUtf8` (the public method at line 659, not `WriteStringUtf8Internal`)), no test, no reflection path. The V4N3 implementation therefore landed cleanly but its hot-path benchmark impact is limited to the two cold-path init sites. Dead-code disposition tracked as `ACCORE-BIN-T-V4N5`. + +**Algorithmic correctness lesson** — the initial 4-popcount formula (`3*N - c_lt_0x80 - c_lt_0x800 - 2*highSur`) was wrong on chunks where a surrogate pair straddles the SIMD chunk boundary (it implicitly assumed `lowSur == highSur` per chunk, which is true over the whole well-formed string but NOT per chunk). Fix: 5-popcount closed-form (`3*N - ascii - c_lt_0x800 + highSur - 3*lowSur`), with the scalar tail using the same per-char accounting model (`i += 1` per char regardless of role; high → 4, low → 0, BMP → 3, two-byte → 2, ASCII → 1). Caught by `GetUtf8ByteCount_MultipleEmojiBoundary_MatchesBcl` and `GetUtf8ByteCount_BoundaryAsciiToEmoji_MatchesBcl` regression tests — exactly the `prefixLen` 1, 7 boundaries that exercise chunk-split surrogate pairs. + +## ACCORE-BIN-T-V4N4: NativeAOT-specific inlining / codegen audit on hot UTF-8 path +**Priority:** P2 · **Type:** Performance · **Related:** `EncodeUtf8SinglePass`, `DecodeUtf8SinglePass`, `WriteStringWithDispatch`, `Utf8Transcoder` SIMD path + +Hypothesis: NativeAOT (the benchmark target environment) does not match Tier 1 JIT optimization quality on the UTF-8 hot path, despite `[MethodImpl(AggressiveInlining)]` hints. 
Symptoms in 2026-05-05 / 2026-05-06 benchmarks: + +- Repeated cell: persistent 8-11% Compact deficit vs MemPack (Hungarian content + repeated string pattern) +- Compact Ser/Deser cells: patchy run-to-run results (4-7/10 cell wins, 3-6 noise/loss bands) +- Per-method Compact speedups on the Medium/Large/Deep cells are **consistent** (-22% to -28% vs MemPack), which suggests a JIT/AOT inlining difference on the Repeated cell — there the `WriteStringWithDispatch` short-lane is called many times on the 10× repeated strings + +**Suspect mechanisms (ranked by likelihood)**: + +1. **AOT inline budget**. NativeAOT is more conservative than the Tier 1 JIT in respecting `AggressiveInlining` for large method bodies. `EncodeUtf8SinglePass` (~190 lines, 4 SIMD paths + scalar), `DecodeUtf8SinglePass` (~120 lines), `GetUtf8ByteCount` (~120 lines) may exceed the AOT inline budget at hot call sites (`WriteStringWithDispatch` short-lane, `ReadString` decode callback). If the AOT compiler emits a `call` instead of inlining, every iteration of the Repeated 10-string loop pays the call overhead. + +2. **`[Intrinsic]` `IsSupported` constant folding**. `Avx512BW.IsSupported`, `Vector512.IsHardwareAccelerated`, `Vector256.IsHardwareAccelerated`, `Vector128.IsHardwareAccelerated` should constant-fold per host on AOT. Verify via disasm — if any remain runtime checks, every iteration pays the branch cost (3 nested `if`-s in each Utf8Transcoder method). + +3. **`Vector256.LessThan` unsigned compare emulation**. No native `pcmpltw_unsigned` on AVX2; JIT/AOT lowers to `pminuw` + `pcmpeqw`. Cost amortized over many chars in long content but can dominate on short Magyar runs (`KözösCímke` ~6 runs of 2-3 chars). Less likely if (1) holds — the inlining hit dwarfs the per-instruction emulation cost. + +4. **Method size cascade**. The Utf8Transcoder method bodies grew with the V4N3 `GetUtf8ByteCount` addition. 
Adjacent methods in the same source file may have lost inlining at SGen-generated callers due to AOT compilation-unit heuristics (file-locality affects inline cost models on some AOT codegen). + +**Investigation steps (no code changes — diagnostic phase first)**: + +1. NativeAOT publish dump: + ``` + dotnet publish AyCode.Core.Serializers.Console -c Release -r win-x64 -p:PublishAot=true + dumpbin /disasm <published-exe> > disasm.txt + ``` +2. Locate `EncodeUtf8SinglePass`, `DecodeUtf8SinglePass`, `GetUtf8ByteCount`, `CountUtf8Chars` symbols in the disasm +3. Verify constant folding on `IsSupported` checks — no run-time CMP/JMP at the path-selector branches; the dead branches are eliminated +4. Verify inlining at `WriteStringWithDispatch` / `ReadString` callers — if `call` instructions to the transcoder methods remain, inlining failed +5. Method size inspection — large method bodies hint at inline-eligibility issues; large prologue/epilogue at hot call sites is a tell +6. Cross-compare with Tier 1 JIT disasm (run with `DOTNET_TieredCompilation=0` + `DOTNET_TC_QuickJit=0` to force Tier 1, dump the JIT-tier disasm via WinDbg or `BenchmarkDotNet`'s `[DisassemblyDiagnoser]`) to confirm the gap is AOT-specific rather than algorithmic + +**Possible fixes (Open until disasm confirms which apply)**: + +- **A. Method split** — `EncodeUtf8SinglePass` → small dispatcher + per-tier inner methods (each Vector512 / Vector256 / Vector128 / scalar in its own AOT-inline-friendly small method). Same for `DecodeUtf8SinglePass`. The dispatcher stays small enough to inline at the hot call site; the dead-branch tier methods are never called on a given host. +- **B. `[MethodImpl(NoInlining)]` on cold tiers** — paradoxical tactic that can REDUCE the hot-path code emitted at the call site by preventing the AOT from speculatively considering the dead branches as inlining candidates. +- **C. Per-target ISA build** — if the benchmark environment has a fixed ISA (e.g. 
AVX2 baseline), use `<IlcInstructionSet>` in `csproj` to constant-fold the `IsSupported` checks at AOT compile time. Alternative: separate per-ISA AOT publish artifacts. +- **D. Manual hot-path inlining** — for the Repeated cell, hand-inline `EncodeUtf8SinglePass` short-string lane into `WriteStringWithDispatch` FixStr path (≤31 byte case). Trades code-size for hot-path speed. +- **E. Algorithm change** — if the AOT can't inline the SIMD bodies efficiently, a smaller scalar-only fast path for short strings (≤31 byte) bypassing the SIMD setup might be faster on AOT than on JIT (where Tier 1 is fine with the SIMD path inlined). + +### Why P2 + +- Repeated benchmark cell is the canonical witness for the **i18n production deploy** narrative — public NuGet release narrative depends on parity-or-better against MemPack across all cells (cloud / desktop / mobile / Blazor WASM) +- AOT-specific tuning is high-leverage on the hot path — JIT-only optimizations will not match +- Disasm validation is the prerequisite for any of the fix directions; without it, any change is speculative and risks reintroducing a 2c-style regression + +### Acceptance + +- Disasm report confirms (or refutes) inlining + constant-fold hypotheses on the hot UTF-8 path +- If hypotheses confirmed: the chosen fix delivers Repeated Compact Ser+Deser ratio ≤ 1.0 vs MemPack on the AOT benchmark target +- No regression on Small / Medium / Large / Deep cells (or net positive) +- Fix maintains cross-tier SIMD correctness (round-trip tests pass on all UTF-8 content classes); both `Utf8TranscoderTests` and the binary test suite stay green + +### Trigger + +- Pre-NuGet release: i18n claim cannot ship with an 8-11% gap on a representative cell +- Disasm + bench correlation step before any code change (no speculative refactoring) + +## ACCORE-BIN-T-V4N5: Dead-code review — `WriteFixStrDirect` + `WriteStringUtf8Internal` +**Priority:** P3 · **Type:** Refactor / hygiene · **Related:** `BinarySerializationContext.WriteFixStrDirect` (line 
832), `WriteStringUtf8Internal` (line 875) + +V4N3 audit surfaced two methods with no callers in the entire workspace: + +- `WriteFixStrDirect(string)` — public method, no call site (no core, no SourceGenerator template, no test, no reflection / Expression-compile) +- `WriteStringUtf8Internal(string)` — private method called only from `WriteFixStrDirect`'s non-ASCII fallback branch + +The pair forms a closed dead loop (`WriteFixStrDirect` → `WriteStringUtf8Internal`), but no entry point reaches `WriteFixStrDirect`. The public-API `WriteStringUtf8` (line 659) is the live equivalent and is called from the SourceGenerator template (polymorphism path: assembly-qualified type-name write). The hot-path string-write goes through `WriteStringWithDispatch` (line 734) which uses the M3R7 marker-dispatch — NOT through this dead pair. + +### Disposition options (decide pre-NuGet release) + +1. **Delete both methods** — pure dead-code cleanup; reduces public surface, removes maintenance burden, simplifies onboarding. Functionality is fully covered by `WriteStringWithDispatch` (M3R7 marker-dispatch — emits `FixStr` / `FixStrAscii` directly with proper ASCII detection via `bytesWritten == charLength` after `EncodeUtf8SinglePass`). +2. **Activate `WriteFixStrDirect` for property-name writes** — SGen could emit `WriteFixStrDirect(propName)` instead of `WriteStringWithDispatch(propName)` for known-short, often-ASCII property names — saving the marker-dispatch overhead. Requires SGen template change + benchmark validation that the saving is real (likely marginal — property names are typically <31 char ASCII, so M3R7 already takes the FixStrAscii fast path with one byte-write to `_buffer`). The pre-encoded `NameUtf8` byte[] on `PropertyMetadataBase` already provides a faster path (`WriteFixStrBytes` at line 853) which the SGen / runtime writer could use directly. +3. **Defer** — leave as-is, document as dead code, revisit when the codebase has another reason to touch this area. 
+ +### Why P3 + +- No correctness or perf impact in either direction (dead code is dead — no consumer affected) +- Cleanup vs activation is a low-stakes choice; benchmark would decide if option 2 has real saving +- Surfaced during V4N3 work, not blocking the NuGet release + +### Acceptance + +- Decision recorded (delete / activate / defer) with rationale +- If "delete": grep across workspace confirms zero callers post-removal; binary test suite unchanged (still 222 pass / 13 pre-existing failures) +- If "activate": SGen template change + benchmark validation showing ≥ 2% Ser improvement on a representative cell (otherwise revert to "delete") +- Documentation in `BINARY_IMPLEMENTATION.md` updated (or the old reference removed if both methods are deleted) + +### Trigger + +- Pre-NuGet release housekeeping pass +- Or: any future refactor that touches `BinarySerializationContext` string-write methods (then decide rather than leave the dead pair behind) +