diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index e7d72af..032f158 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -166,5 +166,4 @@ Full doctrine: `../docs/ARCHITECTURE.md#framework-vs-consumer-boundary`
 19. **Documentation layering** — write `.md` documentation at the **defining layer** (where the code lives). Higher-layer `.md` files reference the base docs (e.g. `see AyCode.Services/docs/SIGNALR/README.md`) and document only project-specific overrides or extensions. Never duplicate base-layer descriptions in consumer-level docs.
 20. **Do not re-read .md files** already in your context window. They only change if you modify them yourself (new content is already in context) or if the developer tells you they changed — in that case re-read them once.
 21. **Folder navigation** — start from the root `README.md` for solution-level navigation. When you need to understand a folder's contents or find a type/class, read the `README.md` in that folder first — it indexes the local files and sub-folders. Follow this before grepping or reading source files.
-
 22. **Language Preference**: Communicate in Hungarian as requested by the user.
diff --git a/AyCode.Core.Serializers.Console/Program.cs b/AyCode.Core.Serializers.Console/Program.cs
index ed4ca4a..2ddba06 100644
--- a/AyCode.Core.Serializers.Console/Program.cs
+++ b/AyCode.Core.Serializers.Console/Program.cs
@@ -47,9 +47,13 @@ public static class Program
 #else
     private static int WarmupIterations = 10000; //5000
     private static int TestIterations = 1000; //1000
-    private static int BenchmarkSamples = 3;
+    private static int BenchmarkSamples = 5;
 #endif
 
+    // Interactive setting: AcBinary wire mode copied into the AcBinary benchmark options in CreateSerializers.
+    // Changed via the Settings menu keys ([1] = Compact, [2] = Fast); defaults to Compact.
+    private static WireMode SelectedWireMode = WireMode.Compact;
+
     // Serializer name constants
     // Engine identifiers (used in Engine column + comparison logic)
     private const string EngineAcBinary = "AcBinary";
@@ -480,21 +484,22 @@ public static class Program
     private static List<ISerializerBenchmark> CreateSerializers(TestDataSet testData, string serializerMode)
     {
         // FastestByte mode — focused 1:1 comparison on the "fastest Byte[]" path.
-        // THREE benchmarks: AcBinary FastMode Byte[] (Compact UTF-8) + AcBinary FastMode Byte[]
-        // (WireMode.Fast = UTF-16 raw memcpy) + MemoryPack Byte[]. Shows BOTH sides of AcBinary's
-        // positioning vs MemPack:
-        //   - Compact: smallest wire, UTF-8 encode/decode CPU cost
-        //   - Fast (UTF-16 raw): comparable wire to MemPack, no encoding cost
+        // TWO benchmarks: AcBinary FastMode Byte[] (wire mode = SelectedWireMode, Compact by default)
+        // + MemoryPack Byte[]. Compact: smallest wire, UTF-8 encode/decode CPU cost vs MemPack head-to-head.
         // Tight optimization-iteration loop: ~30-45 sec vs full 2-3 min.
+        //
+        // The dedicated FastWire row (UTF-16 raw memcpy) is commented out for the current optimization
+        // sprint: Compact is tuned directly against MemPack, and FastWire only served as a noise-floor
+        // reference earlier. To re-enable it, first restore the fastWireOptions local (WireMode = WireMode.Fast).
         if (serializerMode == "fastestbyte")
         {
-            var fastWireOptions = AcBinarySerializerOptions.FastMode;
-            fastWireOptions.WireMode = WireMode.Fast;
+            var fastestByteOptions = AcBinarySerializerOptions.FastMode;
+            fastestByteOptions.WireMode = SelectedWireMode;
 
             return new List<ISerializerBenchmark>
             {
-                new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.FastMode, "FastMode"),
-                new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"),
+                new AcBinaryBenchmark(testData.Order, fastestByteOptions, "FastMode"),
+                //new AcBinaryBenchmark(testData.Order, fastWireOptions, "FastMode (FastWire)"),
                 new MemoryPackBenchmark(testData.Order, "Default"),
             };
         }
@@ -513,6 +518,7 @@ public static class Program
             // wire chunk AND kernel transfer unit; change ONLY this line when tuning.
             var binaryFastModePipeChunkOnly = AcBinarySerializerOptions.FastMode;
             binaryFastModePipeChunkOnly.BufferWriterChunkSize = PipeChunkSize;
+            binaryFastModePipeChunkOnly.WireMode = SelectedWireMode;
 
             return new List<ISerializerBenchmark>
             {
@@ -547,12 +553,18 @@ public static class Program
 
         var binaryNoInternOption = AcBinarySerializerOptions.Default;
         binaryNoInternOption.UseStringInterning = StringInterningMode.None;
+        binaryNoInternOption.WireMode = SelectedWireMode;
 
         var binaryDefaultNoSgenOption = AcBinarySerializerOptions.Default;
         binaryDefaultNoSgenOption.UseGeneratedCode = false;
+        binaryDefaultNoSgenOption.WireMode = SelectedWireMode;
 
         var binaryFastModeNoSgenOption = AcBinarySerializerOptions.FastMode;
         binaryFastModeNoSgenOption.UseGeneratedCode = false;
+        binaryFastModeNoSgenOption.WireMode = SelectedWireMode;
+
+        var binaryFastModeOption = AcBinarySerializerOptions.FastMode;
+        binaryFastModeOption.WireMode = SelectedWireMode;
 
         // BufWr new — 4 KB chunk size for the FRESH ArrayBufferWriter scenario. The chunkSize here drives
         // the serializer's GetSpan(N) request → the ArrayBufferWriter's internal allocation per call.
@@ -561,16 +573,19 @@ public static class Program
         // vs syscall count).
         var binaryFastModeBufWrChunk = AcBinarySerializerOptions.FastMode;
         binaryFastModeBufWrChunk.BufferWriterChunkSize = PipeChunkSize;
+        binaryFastModeBufWrChunk.WireMode = SelectedWireMode;
 
         // In-memory Pipe variant — same 4 KB chunkSize as the AsyncPipe mode, no kernel-pipe alignment
         // concern (managed slabs are not page-aligned anyway). Drives SerializeChunkedFramed via the in-memory
         // System.IO.Pipelines.Pipe (zero-copy slab handoff between producer and drain task).
         var binaryFastModePipeChunkInMem = AcBinarySerializerOptions.FastMode;
         binaryFastModePipeChunkInMem.BufferWriterChunkSize = PipeChunkSize;
+        binaryFastModePipeChunkInMem.WireMode = SelectedWireMode;
 
         var defaultOptions = AcBinarySerializerOptions.Default;
         defaultOptions.UseStringInterning = StringInterningMode.None;
         defaultOptions.ReferenceHandling = ReferenceHandlingMode.OnlyId;
+        defaultOptions.WireMode = SelectedWireMode;
 
         return new List<ISerializerBenchmark>
         {
@@ -578,7 +593,7 @@ public static class Program
             // AcBinary — Byte[] API (uncomment to compare option presets side-by-side)
             // ============================================================
             // Fastest Byte[] — SGen path (UseGeneratedCode=true, default).
-            new AcBinaryBenchmark(testData.Order, AcBinarySerializerOptions.FastMode, "FastMode"),
+            new AcBinaryBenchmark(testData.Order, binaryFastModeOption, "FastMode"),
             // Fastest Byte[] — Runtime path (UseGeneratedCode=false). Same wire/options, no source-generated dispatch.
             // Always paired with the SGen variant so every layer can compare the SGen speed-up apples-to-apples.
             // NativeAOT-safe: AcSerializerCommon.Create*Getter/Setter falls back to reflection-based delegates
@@ -594,7 +609,7 @@ public static class Program
             //new AcBinaryBenchmark(testData.Order, binaryNoInternOption, "NoIntern"),
 
             // AcBinary via IBufferWriter (reused ArrayBufferWriter — long-running service / batch scenario)
-            new AcBinaryBufferWriterBenchmark(testData.Order, AcBinarySerializerOptions.FastMode, "FastMode"),
+            new AcBinaryBufferWriterBenchmark(testData.Order, binaryFastModeOption, "FastMode"),
 
             // AcBinary via IBufferWriter (FRESH ArrayBufferWriter per call — one-shot scenario).
             // 4 KB chunk size from binaryFastModeBufWrChunk — minimises the per-call ArrayBufferWriter
@@ -859,7 +874,7 @@ public static class Program
             System.Console.WriteLine("  [A] All layers");
             System.Console.WriteLine("  [F] FastestByte    — AcBinary FastMode Byte[] vs MemoryPack Byte[] only (tight optimization loop)");
             System.Console.WriteLine("  [P] AsyncPipe      — streaming I/O isolation (only AsyncPipe, all test data)");
-            System.Console.WriteLine($"  [S] Settings       — modify Warmup ({WarmupIterations}) / Iterations ({TestIterations}) / Samples ({BenchmarkSamples})");
+            System.Console.WriteLine($"  [S] Settings       — Iteration / WireMode (current: {SelectedWireMode})");
             System.Console.WriteLine("  [Q] Quit");
             System.Console.Write("\nSelection: ");
 
@@ -889,10 +904,42 @@ public static class Program
     /// Returns to the caller (which re-displays the main menu).
     /// </summary>
     private static void ShowSettingsMenu()
+    {
+        while (true) // settings hub: redisplayed after each sub-menu returns, left only via [B]
+        {
+            System.Console.WriteLine();
+            System.Console.WriteLine("─────────────────────────────────────────────");
+            System.Console.WriteLine("Settings");
+            System.Console.WriteLine("─────────────────────────────────────────────");
+            System.Console.WriteLine("  [1] Iteration  — Warmup / Iterations / Samples");
+            System.Console.WriteLine($"  [2] WireMode   — current: {SelectedWireMode}");
+            System.Console.WriteLine("  [B] Back");
+            System.Console.Write("\nSelection: ");
+            // Read a single key (echoed, because intercept is false) and terminate the echoed line.
+            var key = System.Console.ReadKey(intercept: false).KeyChar;
+            System.Console.WriteLine();
+            // Invariant lowering keeps the key match independent of the current culture.
+            switch (char.ToLowerInvariant(key))
+            {
+                case '1':
+                    ShowIterationSettingsMenu();
+                    break;
+                case '2':
+                    ShowWireModeSettingsMenu();
+                    break;
+                case 'b':
+                    return;
+                default: // unrecognized key: show the menu again
+                    continue;
+            }
+        }
+    }
+
+    private static void ShowIterationSettingsMenu()
     {
         System.Console.WriteLine();
         System.Console.WriteLine("─────────────────────────────────────────────");
-        System.Console.WriteLine("Settings — press Enter to keep current value");
+        System.Console.WriteLine("Iteration settings — press Enter to keep current value");
         System.Console.WriteLine("─────────────────────────────────────────────");
         System.Console.WriteLine();
 
@@ -901,7 +948,42 @@ public static class Program
         BenchmarkSamples = PromptInt("BenchmarkSamples", BenchmarkSamples, min: 1);
 
         System.Console.WriteLine();
-        System.Console.WriteLine($"✓ Settings updated: Warmup={WarmupIterations} | Iterations={TestIterations} | Samples={BenchmarkSamples}");
+        System.Console.WriteLine($"✓ Iteration settings updated: Warmup={WarmupIterations} | Iterations={TestIterations} | Samples={BenchmarkSamples}");
+    }
+
+    private static void ShowWireModeSettingsMenu()
+    {
+        while (true) // loops only on unrecognized keys; every valid choice returns to the caller
+        {
+            System.Console.WriteLine();
+            System.Console.WriteLine("─────────────────────────────────────────────");
+            System.Console.WriteLine("WireMode settings");
+            System.Console.WriteLine("─────────────────────────────────────────────");
+            System.Console.WriteLine($"Current: {SelectedWireMode}");
+            System.Console.WriteLine("  [1] Compact");
+            System.Console.WriteLine("  [2] Fast");
+            System.Console.WriteLine("  [B] Back");
+            System.Console.Write("\nSelection: ");
+            // Read a single key (echoed, because intercept is false) and terminate the echoed line.
+            var key = System.Console.ReadKey(intercept: false).KeyChar;
+            System.Console.WriteLine();
+            // Invariant lowering keeps the key match independent of the current culture.
+            switch (char.ToLowerInvariant(key))
+            {
+                case '1':
+                    SelectedWireMode = WireMode.Compact;
+                    System.Console.WriteLine("✓ WireMode set to Compact");
+                    return;
+                case '2':
+                    SelectedWireMode = WireMode.Fast;
+                    System.Console.WriteLine("✓ WireMode set to Fast");
+                    return;
+                case 'b': // back without changing SelectedWireMode
+                    return;
+                default: // unrecognized key: show the menu again
+                    continue;
+            }
+        }
     }
 
     /// <summary>
diff --git a/AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs b/AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs
index 0318c29..d9d8722 100644
--- a/AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs
+++ b/AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs
@@ -226,6 +226,214 @@ public class Utf8TranscoderTests
         AssertRoundTrip("😀");
     }
 
+    // ──────────────────────────────────────────────────────────────────────
+    // GetUtf8ByteCount — content classes
+    // ──────────────────────────────────────────────────────────────────────
+
+    [TestMethod]
+    public void GetUtf8ByteCount_AsciiOnly_MatchesBcl()
+    {
+        AssertGetUtf8ByteCountMatchesBcl("Hello, World! Plain ASCII text.");
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_AsciiExactly7Bytes_MatchesBcl()
+    {
+        // Boundary: just below Vector128<ushort>.Count (8) — scalar tail only
+        AssertGetUtf8ByteCountMatchesBcl(new string('a', 7));
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_AsciiExactly8Bytes_MatchesBcl()
+    {
+        // Boundary: exactly Vector128<ushort>.Count — Vector128 path triggers
+        AssertGetUtf8ByteCountMatchesBcl(new string('a', 8));
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_AsciiExactly16Bytes_MatchesBcl()
+    {
+        // Boundary: exactly Vector256<ushort>.Count — Vector256 path triggers
+        AssertGetUtf8ByteCountMatchesBcl(new string('a', 16));
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_AsciiExactly32Bytes_MatchesBcl()
+    {
+        // Boundary: exactly Vector512<ushort>.Count — Vector512 path triggers on AVX-512BW
+        AssertGetUtf8ByteCountMatchesBcl(new string('a', 32));
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_AsciiVeryLong_500Chars_MatchesBcl()
+    {
+        AssertGetUtf8ByteCountMatchesBcl(new string('z', 500));
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_HungarianShort_MatchesBcl()
+    {
+        AssertGetUtf8ByteCountMatchesBcl("Termék");
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_HungarianMedium_MatchesBcl()
+    {
+        AssertGetUtf8ByteCountMatchesBcl("árvíztűrő tükörfúrógép");
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_HungarianLong_MatchesBcl()
+    {
+        AssertGetUtf8ByteCountMatchesBcl(string.Concat(Enumerable.Repeat("árvíztűrő tükörfúrógép ", 20)));
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_CjkBmp_MatchesBcl()
+    {
+        AssertGetUtf8ByteCountMatchesBcl("你好世界 こんにちは 안녕하세요");
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_CjkBmpLong_MatchesBcl()
+    {
+        AssertGetUtf8ByteCountMatchesBcl(string.Concat(Enumerable.Repeat("你好世界 ", 30)));
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_SupplementaryPlane_MatchesBcl()
+    {
+        // Each emoji is 2 UTF-16 chars (surrogate pair) → 4 UTF-8 bytes total
+        AssertGetUtf8ByteCountMatchesBcl("😀😁😂🎉🌟");
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_MixedAllClasses_MatchesBcl()
+    {
+        AssertGetUtf8ByteCountMatchesBcl("ASCII Magyar:árvíz CJK:你好 Emoji:😀");
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_LongMixed_MatchesBcl()
+    {
+        var sb = new StringBuilder();
+        for (var i = 0; i < 50; i++)
+        {
+            sb.Append("ASCII run-").Append(i).Append(" Magyar:árvíz CJK:你好 ");
+        }
+        AssertGetUtf8ByteCountMatchesBcl(sb.ToString());
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_Empty_ReturnsZero()
+    {
+        Assert.AreEqual(0, Utf8Transcoder.GetUtf8ByteCount(ReadOnlySpan<char>.Empty));
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_SingleAsciiChar_MatchesBcl()
+    {
+        AssertGetUtf8ByteCountMatchesBcl("X");
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_SingleHungarianChar_MatchesBcl()
+    {
+        AssertGetUtf8ByteCountMatchesBcl("é");
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_SingleCjkChar_MatchesBcl()
+    {
+        AssertGetUtf8ByteCountMatchesBcl("好");
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_SingleEmoji_MatchesBcl()
+    {
+        // Single emoji = surrogate pair, exact 4 bytes
+        AssertGetUtf8ByteCountMatchesBcl("😀");
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_BoundaryAsciiToHungarian_MatchesBcl()
+    {
+        // Exercises split between SIMD ASCII region and 2-byte tail
+        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
+        {
+            var s = new string('a', asciiLen) + "árvíz";
+            var expected = Utf8.GetByteCount(s);
+            var actual = Utf8Transcoder.GetUtf8ByteCount(s.AsSpan());
+            Assert.AreEqual(expected, actual, $"asciiLen={asciiLen}");
+        }
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_BoundaryAsciiToCjk_MatchesBcl()
+    {
+        // 3-byte sequence boundary stress
+        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
+        {
+            var s = new string('a', asciiLen) + "你好世界";
+            var expected = Utf8.GetByteCount(s);
+            var actual = Utf8Transcoder.GetUtf8ByteCount(s.AsSpan());
+            Assert.AreEqual(expected, actual, $"asciiLen={asciiLen}");
+        }
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_BoundaryAsciiToEmoji_MatchesBcl()
+    {
+        // CRITICAL: tests that surrogate pairs split across SIMD chunks still produce correct count.
+        // High surrogate may land in chunk N, low surrogate in chunk N+1; total must remain 4 bytes.
+        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
+        {
+            var s = new string('a', asciiLen) + "😀";
+            var expected = Utf8.GetByteCount(s);
+            var actual = Utf8Transcoder.GetUtf8ByteCount(s.AsSpan());
+            Assert.AreEqual(expected, actual, $"asciiLen={asciiLen}");
+        }
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_MultipleEmojiBoundary_MatchesBcl()
+    {
+        // Surrogate pair split-stress: many emojis at varying offsets
+        for (var prefixLen = 0; prefixLen <= 32; prefixLen++)
+        {
+            var s = new string('a', prefixLen) + "😀😁😂🎉🌟😀😁😂🎉🌟";
+            var expected = Utf8.GetByteCount(s);
+            var actual = Utf8Transcoder.GetUtf8ByteCount(s.AsSpan());
+            Assert.AreEqual(expected, actual, $"prefixLen={prefixLen}");
+        }
+    }
+
+    [TestMethod]
+    public void GetUtf8ByteCount_AgreesWithEncodeUtf8SinglePass_AllContentClasses()
+    {
+        // Round-trip contract: the byte count returned must equal the bytesWritten by EncodeUtf8SinglePass.
+        // This is the load-bearing invariant for two-pass [VarUInt][bytes] writes in cold-fallback paths.
+        var samples = new[]
+        {
+            "Hello",
+            "árvíztűrő tükörfúrógép",
+            "你好世界",
+            "😀🎉🌟",
+            "ASCII Magyar:árvíz CJK:你好 Emoji:😀",
+            new string('z', 500),
+            string.Concat(Enumerable.Repeat("árvíztűrő tükörfúrógép ", 20))
+        };
+
+        foreach (var s in samples)
+        {
+            var byteCountFromCounter = Utf8Transcoder.GetUtf8ByteCount(s.AsSpan());
+            var dst = new byte[s.Length * 4];
+            var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(s.AsSpan(), dst.AsSpan());
+            Assert.AreEqual(bytesWritten, byteCountFromCounter,
+                $"GetUtf8ByteCount disagrees with EncodeUtf8SinglePass for [{s.Substring(0, Math.Min(20, s.Length))}...]");
+        }
+    }
+
     // ──────────────────────────────────────────────────────────────────────
     // Decoder-side cross-check: BCL Encoding.UTF8.GetString reference
     // ──────────────────────────────────────────────────────────────────────
@@ -291,6 +499,19 @@ public class Utf8TranscoderTests
         Assert.AreEqual(original, decoded, $"Decoder output mismatch{ctx}");
     }
 
+    /// <summary>
+    /// BCL-parity check: asserts that <see cref="Utf8Transcoder.GetUtf8ByteCount"/> reports the same
+    /// byte count as <see cref="Encoding.GetByteCount(string)"/> for <paramref name="original"/>.
+    /// A divergence means the SIMD byte counter is producing wrong values that would corrupt the
+    /// VarUInt length prefixes in <c>WriteStringUtf8Internal</c>.
+    /// </summary>
+    private static void AssertGetUtf8ByteCountMatchesBcl(string original)
+    {
+        var bclByteCount = Utf8.GetByteCount(original);
+        var transcoderByteCount = Utf8Transcoder.GetUtf8ByteCount(original.AsSpan());
+        Assert.AreEqual(bclByteCount, transcoderByteCount, $"GetUtf8ByteCount mismatch for input length {original.Length}");
+    }
+
     /// <summary>
     /// Verifies that DecodeUtf8SinglePass produces output identical to <see cref="Encoding.UTF8.GetString"/>
     /// for the same byte input. Catches silent decoder bugs that pass the round-trip test
diff --git a/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs b/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs
index 72bcf1e..34d1ff2 100644
--- a/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs
+++ b/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs
@@ -1,9 +1,22 @@
 using AyCode.Core.Serializers.Binaries;
+using System.Collections;
+using System.Reflection;
+using System.Runtime.CompilerServices;
 
 namespace AyCode.Core.Tests.TestModels;
 
 public static class BenchmarkTestDataProvider
 {
+    private const int FixStrMaxLength = 31;
+    private const string LongStringSuffix = "__Benchmárk_Long_String_Söffix__";
+
+    private sealed class ReferenceComparer : IEqualityComparer<object> // identity-based comparer for the graph walker's visited set
+    {
+        public static readonly ReferenceComparer Instance = new(); // the type has no fields besides this shared instance
+        public new bool Equals(object? x, object? y) => ReferenceEquals(x, y); // 'new' hides the static object.Equals(object, object)
+        public int GetHashCode(object obj) => RuntimeHelpers.GetHashCode(obj); // identity hash; bypasses any GetHashCode override
+    }
+
     public static List<TestDataSet> CreateTestDataSets(bool resetId = true)
     {
         return new List<TestDataSet>
@@ -45,6 +58,8 @@ public static class BenchmarkTestDataProvider
             sharedTag: sharedTag,
             sharedUser: sharedUser);
 
+        EnsureAllStringsBypassFixStr(order);
+
         ClearDeepLevelRefs(order);
 
         return new TestDataSet("Small (2x2x2x2)", order, iidRefPercent: 20);
@@ -77,6 +92,8 @@ public static class BenchmarkTestDataProvider
             sharedMetadata: sharedMeta,
             sharedPreferences: sharedPreferences);
 
+        EnsureAllStringsBypassFixStr(order);
+
         ClearDeepLevelRefs(order);
 
         return new TestDataSet("Medium (3x3x3x4)", order, iidRefPercent: 20);
@@ -107,6 +124,8 @@ public static class BenchmarkTestDataProvider
             sharedUser: sharedUser,
             sharedPreferences: sharedPreferences);
 
+        EnsureAllStringsBypassFixStr(order);
+
         ClearDeepLevelRefs(order);
 
         return new TestDataSet("Large (5x5x5x10)", order, iidRefPercent: 20);
@@ -153,6 +172,8 @@ public static class BenchmarkTestDataProvider
             }
         }
 
+        EnsureAllStringsBypassFixStr(order);
+
         ClearDeepLevelRefs(order);
 
         return new TestDataSet("Repeated Strings (10 items)", order, iidRefPercent: 20);
@@ -185,6 +206,8 @@ public static class BenchmarkTestDataProvider
             sharedPreferences: sharedPreferences,
             sharedCategory: sharedCategory);
 
+        EnsureAllStringsBypassFixStr(order);
+
         ClearDeepLevelRefs(order);
 
         return new TestDataSet("Deep Nested (2x4x4x8)", order, iidRefPercent: 20);
@@ -218,6 +241,65 @@ public static class BenchmarkTestDataProvider
             }
         }
     }
+
+    private static void EnsureAllStringsBypassFixStr(object? root)
+    {
+        // Walks the object graph under root and rewrites every writable public string property via ToLongString.
+        if (root == null) return;
+        var visited = new HashSet<object>(ReferenceComparer.Instance); // identity-based: guards cycles and shared instances
+        var stack = new Stack<object>();
+        stack.Push(root);
+
+        while (stack.Count > 0)
+        {
+            var current = stack.Pop();
+            if (current is string || !visited.Add(current)) continue; // strings are leaves: nothing to descend into
+            // Collections only contribute their non-null elements; string elements themselves are not rewritten.
+            if (current is IEnumerable enumerable)
+            {
+                foreach (var item in enumerable)
+                {
+                    if (item != null)
+                        stack.Push(item);
+                }
+                continue;
+            }
+
+            var type = current.GetType();
+            foreach (var property in type.GetProperties(BindingFlags.Instance | BindingFlags.Public))
+            {
+                // Indexers take arguments, so the parameterless GetValue(current) below would throw on them.
+                if (!property.CanRead || property.GetIndexParameters().Length > 0) continue;
+                if (property.PropertyType == typeof(string))
+                {
+                    if (!property.CanWrite) continue;
+
+                    var value = (string?)property.GetValue(current);
+                    property.SetValue(current, ToLongString(value));
+                    continue;
+                }
+                // Value-typed properties (enums included) are never descended into.
+                if (property.PropertyType.IsValueType)
+                    continue;
+
+                var child = property.GetValue(current);
+                if (child != null)
+                    stack.Push(child);
+            }
+        }
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    private static string ToLongString(string? value)
+    {
+        // Null or empty input is replaced by a fixed placeholder that already carries the suffix.
+        if (string.IsNullOrEmpty(value))
+            return "Benchmark_String_Value" + LongStringSuffix;
+
+        // Values longer than FixStrMaxLength chars pass through unchanged; shorter ones get the
+        // 32-char suffix appended, which pushes their length past FixStrMaxLength.
+        return value.Length > FixStrMaxLength ? value : value + LongStringSuffix;
+    }
 }
 
 public sealed class TestDataSet
diff --git a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.cs b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.cs
index ef3c6d0..4f54e76 100644
--- a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.cs
+++ b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.cs
@@ -117,7 +117,7 @@ public static partial class AcBinarySerializer
 
         foreach (var (stringValue, properties) in analysis)
         {
-            var byteLength = Encoding.UTF8.GetByteCount(stringValue);
+            var byteLength = Utf8Transcoder.GetUtf8ByteCount(stringValue.AsSpan());
             foreach (var (propPath, count) in properties)
             {
                 if (!propertyStats.TryGetValue(propPath, out var list))
diff --git a/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs b/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs
index da833e8..22ad29c 100644
--- a/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs
+++ b/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs
@@ -188,6 +188,181 @@ internal static class Utf8Transcoder
         return dstIdx;
     }
 
+    /// <summary>
+    /// Counts the UTF-8 byte length produced by encoding the given UTF-16 char span.
+    /// Symmetric encode-side helper to <see cref="CountUtf8Chars"/>; the value returned equals
+    /// the <c>bytesWritten</c> that <see cref="EncodeUtf8SinglePass"/> would produce.
+    /// </summary>
+    /// <param name="src">UTF-16 code units to measure; an empty span yields 0.</param>
+    /// <returns>Sum of the per-char UTF-8 byte contributions listed in the remarks.</returns>
+    /// <remarks>
+    /// Trusted-input — assumes well-formed UTF-16 (every high surrogate paired with a low surrogate),
+    /// matching <see cref="EncodeUtf8SinglePass"/>'s contract. No validation is performed: an unpaired
+    /// high surrogate still counts 4 bytes and an unpaired low surrogate 0 bytes. Bypasses
+    /// <see cref="System.Text.Encoding.UTF8"/>.GetByteCount virtual-dispatch + encoder-fallback overhead.
+    ///
+    /// <para>Layered SIMD: Vector512 (32 chars/iter) on AVX-512BW hosts → Vector256 (16 chars/iter)
+    /// on AVX2 hosts → Vector128 (8 chars/iter) on Apple Silicon NEON / WASM SIMD / SSE2 → scalar tail.
+    /// JIT/AOT path-selection via <c>Avx512BW.IsSupported</c> / <c>Vector{N}.IsHardwareAccelerated</c>
+    /// <c>[Intrinsic]</c> booleans (constant-folded dead branches per host).</para>
+    ///
+    /// <para>Per-char UTF-8 byte contribution:</para>
+    /// <list type="bullet">
+    ///   <item><c>c &lt; 0x80</c> → 1 byte (ASCII)</item>
+    ///   <item><c>0x80 ≤ c &lt; 0x800</c> → 2 bytes (Latin extended, Cyrillic, Greek, Hebrew, Arabic)</item>
+    ///   <item><c>0x800 ≤ c &lt; 0xD800</c> or <c>c ≥ 0xE000</c> → 3 bytes (CJK BMP, other BMP)</item>
+    ///   <item><c>0xD800 ≤ c &lt; 0xDC00</c> (high surrogate) → 4 bytes (whole pair encoded here)</item>
+    ///   <item><c>0xDC00 ≤ c &lt; 0xE000</c> (low surrogate) → 0 bytes (absorbed by paired high surrogate)</item>
+    /// </list>
+    ///
+    /// <para>SIMD per-block: 5 popcount-on-threshold-mask operations
+    /// (&lt; 0x80, &lt; 0x800, &lt; 0xD800, &lt; 0xDC00, &lt; 0xE000). Closed-form aggregation over a
+    /// block of <c>N</c> chars: <c>bytes = 3*N - ascii - c_lt_0x800 + highSur - 3*lowSur</c>, where
+    /// <c>ascii = popcount(c &lt; 0x80)</c>, <c>c_lt_0x800 = popcount(c &lt; 0x800)</c>,
+    /// <c>highSur = popcount(c &lt; 0xDC00) - popcount(c &lt; 0xD800)</c>,
+    /// <c>lowSur = popcount(c &lt; 0xE000) - popcount(c &lt; 0xDC00)</c>.</para>
+    ///
+    /// <para>Both <c>highSur</c> and <c>lowSur</c> must be counted independently, exactly as in the
+    /// per-char model (high → 4 bytes, low → 0 bytes). The tempting shortcut <c>lowSur == highSur</c>
+    /// holds over a whole well-formed string but is FALSE within a single SIMD chunk when a surrogate
+    /// pair straddles the chunk boundary. Counting per char keeps such pairs correct: the high
+    /// surrogate contributes 4 bytes in chunk N and its low surrogate 0 bytes in chunk N+1 —
+    /// total 4 bytes per pair regardless of where the boundary falls.</para>
+    ///
+    /// <para>Pairs with <see cref="EncodeUtf8SinglePass"/> for two-pass [VarUInt][bytes] writes in
+    /// cold-fallback paths (e.g. <c>WriteFixStrDirect</c>'s non-ASCII fallback in
+    /// <c>BinarySerializationContext</c>).</para>
+    /// </remarks>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static int GetUtf8ByteCount(ReadOnlySpan<char> src)
+    {
+        var byteCount = 0;
+        var i = 0;
+        var n = src.Length;
+        ref ushort srcRef = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(src));
+
+        // SIMD path 1: Vector512 (32 chars/iter) on AVX-512BW hosts
+        if (Avx512BW.IsSupported && n >= Vector512<ushort>.Count)
+        {
+            var v_0x80 = Vector512.Create((ushort)0x80);
+            var v_0x800 = Vector512.Create((ushort)0x800);
+            var v_0xD800 = Vector512.Create((ushort)0xD800);
+            var v_0xDC00 = Vector512.Create((ushort)0xDC00);
+            var v_0xE000 = Vector512.Create((ushort)0xE000);
+
+            do
+            {
+                var v = Vector512.LoadUnsafe(ref srcRef, (uint)i);
+
+                var c_lt_0x80   = BitOperations.PopCount(Vector512.LessThan(v, v_0x80).ExtractMostSignificantBits());
+                var c_lt_0x800  = BitOperations.PopCount(Vector512.LessThan(v, v_0x800).ExtractMostSignificantBits());
+                var c_lt_0xD800 = BitOperations.PopCount(Vector512.LessThan(v, v_0xD800).ExtractMostSignificantBits());
+                var c_lt_0xDC00 = BitOperations.PopCount(Vector512.LessThan(v, v_0xDC00).ExtractMostSignificantBits());
+                var c_lt_0xE000 = BitOperations.PopCount(Vector512.LessThan(v, v_0xE000).ExtractMostSignificantBits());
+
+                var highSur = c_lt_0xDC00 - c_lt_0xD800;
+                var lowSur = c_lt_0xE000 - c_lt_0xDC00;
+                byteCount += 3 * Vector512<ushort>.Count - c_lt_0x80 - c_lt_0x800 + highSur - 3 * lowSur;
+
+                i += Vector512<ushort>.Count;
+            } while (n - i >= Vector512<ushort>.Count);
+        }
+
+        // SIMD path 2: Vector256 (16 chars/iter) on AVX2 hosts; also handles AVX-512 tail < 32 chars
+        if (Vector256.IsHardwareAccelerated && n - i >= Vector256<ushort>.Count)
+        {
+            var v_0x80 = Vector256.Create((ushort)0x80);
+            var v_0x800 = Vector256.Create((ushort)0x800);
+            var v_0xD800 = Vector256.Create((ushort)0xD800);
+            var v_0xDC00 = Vector256.Create((ushort)0xDC00);
+            var v_0xE000 = Vector256.Create((ushort)0xE000);
+
+            do
+            {
+                var v = Vector256.LoadUnsafe(ref srcRef, (uint)i);
+
+                var c_lt_0x80   = BitOperations.PopCount(Vector256.LessThan(v, v_0x80).ExtractMostSignificantBits());
+                var c_lt_0x800  = BitOperations.PopCount(Vector256.LessThan(v, v_0x800).ExtractMostSignificantBits());
+                var c_lt_0xD800 = BitOperations.PopCount(Vector256.LessThan(v, v_0xD800).ExtractMostSignificantBits());
+                var c_lt_0xDC00 = BitOperations.PopCount(Vector256.LessThan(v, v_0xDC00).ExtractMostSignificantBits());
+                var c_lt_0xE000 = BitOperations.PopCount(Vector256.LessThan(v, v_0xE000).ExtractMostSignificantBits());
+
+                var highSur = c_lt_0xDC00 - c_lt_0xD800;
+                var lowSur = c_lt_0xE000 - c_lt_0xDC00;
+                byteCount += 3 * Vector256<ushort>.Count - c_lt_0x80 - c_lt_0x800 + highSur - 3 * lowSur;
+
+                i += Vector256<ushort>.Count;
+            } while (n - i >= Vector256<ushort>.Count);
+        }
+
+        // SIMD path 3: Vector128 (8 chars/iter) on Apple Silicon NEON, WASM SIMD, legacy SSE2;
+        // also handles tail < 16 from higher tiers. Cross-platform via Vector128.IsHardwareAccelerated.
+        if (Vector128.IsHardwareAccelerated && n - i >= Vector128<ushort>.Count)
+        {
+            var v_0x80 = Vector128.Create((ushort)0x80);
+            var v_0x800 = Vector128.Create((ushort)0x800);
+            var v_0xD800 = Vector128.Create((ushort)0xD800);
+            var v_0xDC00 = Vector128.Create((ushort)0xDC00);
+            var v_0xE000 = Vector128.Create((ushort)0xE000);
+
+            do
+            {
+                var v = Vector128.LoadUnsafe(ref srcRef, (uint)i);
+
+                var c_lt_0x80   = BitOperations.PopCount(Vector128.LessThan(v, v_0x80).ExtractMostSignificantBits());
+                var c_lt_0x800  = BitOperations.PopCount(Vector128.LessThan(v, v_0x800).ExtractMostSignificantBits());
+                var c_lt_0xD800 = BitOperations.PopCount(Vector128.LessThan(v, v_0xD800).ExtractMostSignificantBits());
+                var c_lt_0xDC00 = BitOperations.PopCount(Vector128.LessThan(v, v_0xDC00).ExtractMostSignificantBits());
+                var c_lt_0xE000 = BitOperations.PopCount(Vector128.LessThan(v, v_0xE000).ExtractMostSignificantBits());
+
+                var highSur = c_lt_0xDC00 - c_lt_0xD800;
+                var lowSur = c_lt_0xE000 - c_lt_0xDC00;
+                byteCount += 3 * Vector128<ushort>.Count - c_lt_0x80 - c_lt_0x800 + highSur - 3 * lowSur;
+
+                i += Vector128<ushort>.Count;
+            } while (n - i >= Vector128<ushort>.Count);
+        }
+
+        // Scalar tail (and fallback for non-SIMD hardware).
+        // CRITICAL: uses the SAME per-char accounting model as the SIMD paths — high surrogate → 4
+        // bytes, low surrogate → 0 bytes, i += 1 per char (NOT i += 2 on a high surrogate) — so that
+        // surrogate pairs split across a SIMD/scalar boundary count correctly. A pair-consuming tail
+        // would meet the low surrogate of a pair whose high surrogate was already counted (4 bytes)
+        // in the last SIMD chunk, re-trigger the surrogate branch, add 4 more bytes and skip an
+        // unrelated next char. Net: +4 byte miscount per split pair.
+        while (i < n)
+        {
+            var c = Unsafe.Add(ref srcRef, i);
+            if (c < 0x80)
+            {
+                byteCount += 1;
+            }
+            else if (c < 0x800)
+            {
+                byteCount += 2;
+            }
+            else if (c < 0xD800)
+            {
+                byteCount += 3;  // BMP below surrogate range
+            }
+            else if (c < 0xDC00)
+            {
+                byteCount += 4;  // high surrogate → owns the 4-byte encoding for the pair
+            }
+            else if (c < 0xE000)
+            {
+                // low surrogate → 0 bytes (the paired high surrogate already accounted for the 4)
+            }
+            else
+            {
+                byteCount += 3;  // BMP at or above 0xE000
+            }
+            i += 1;
+        }
+
+        return byteCount;
+    }
+
     /// <summary>
     /// Counts UTF-16 chars produced by decoding the given UTF-8 byte span.
     /// </summary>
diff --git a/AyCode.Core/Serializers/PropertyMetadataBase.cs b/AyCode.Core/Serializers/PropertyMetadataBase.cs
index a95e624..dd8370b 100644
--- a/AyCode.Core/Serializers/PropertyMetadataBase.cs
+++ b/AyCode.Core/Serializers/PropertyMetadataBase.cs
@@ -99,7 +99,15 @@ public abstract class PropertyMetadataBase
         [DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.PublicProperties)] Type declaringType)
     {
         Name = prop.Name;
-        NameUtf8 = Encoding.UTF8.GetBytes(prop.Name);
+        // Ctor-once init: SIMD path via Utf8Transcoder (GetUtf8ByteCount + EncodeUtf8SinglePass)
+        // bypasses Encoding.UTF8 virtual-dispatch + encoder-fallback overhead. Ascii.FromUtf16
+        // would be slightly faster for the (overwhelmingly common) ASCII property name case, but
+        // the symmetric Utf8Transcoder API keeps this consistent with the binary serializer's
+        // writer-side BCL-free policy and handles non-ASCII property names without a fallback.
+        var nameByteCount = Utf8Transcoder.GetUtf8ByteCount(prop.Name.AsSpan());
+        var nameBytes = new byte[nameByteCount];
+        Utf8Transcoder.EncodeUtf8SinglePass(prop.Name.AsSpan(), nameBytes);
+        NameUtf8 = nameBytes;
         DeclaringType = declaringType;
         PropertyType = prop.PropertyType;
 
diff --git a/AyCode.Core/docs/BINARY/BINARY_TODO.md b/AyCode.Core/docs/BINARY/BINARY_TODO.md
index 81aa3f3..f32a8f2 100644
--- a/AyCode.Core/docs/BINARY/BINARY_TODO.md
+++ b/AyCode.Core/docs/BINARY/BINARY_TODO.md
@@ -692,6 +692,7 @@ JIT/AOT path-selection via `[Intrinsic]` `IsSupported` static booleans — non-s
 |-------|--------|-----------|-----------|-----------|--------|
 | 1 | `CountUtf8Chars` (decode 1st pass) | ✅ done | ✅ existing | ✅ done | ✅ existing |
 | 2 | `EncodeUtf8SinglePass` Phase 1 (ASCII narrow) | ✅ done | ✅ existing | ✅ done | ✅ existing |
+| 2.5 | `DecodeUtf8SinglePass` scalar run-length decoder (multi-byte baseline) | — | — | — | ⏳ TODO |
 | 3a | `DecodeUtf8SinglePass` multi-byte transcoder (Vector512) | ⏳ TODO | bail-out only | bail-out only | ✅ existing |
 | 3b | `DecodeUtf8SinglePass` multi-byte transcoder (Vector256) | — | 🔍 **deferred — see note** | bail-out only | ✅ existing |
 | 3c | `DecodeUtf8SinglePass` multi-byte transcoder (Vector128) | — | — | ⏳ TODO | ✅ existing |
@@ -731,6 +732,54 @@ The cascading tail-handler hierarchy (existing in Phase 1+2) carries over: AVX-5
 
 The Vector128 path is the **WASM and Apple Silicon target** — without it both platforms fell back to scalar (1 byte/iter). With Phase 1+2 landed, WASM and Apple Silicon now run the UTF-8 hot path at 16 byte/iter (16× scalar speedup on the count + ASCII narrow operations).
 
+### Phase 2.5 — scalar run-length decoder (multi-byte baseline, pre-Phase 3 prototype)
+
+Targets the `DecodeUtf8SinglePass` switch-jumptable per-char dispatch on multi-byte content. The current scalar phase (switch jumptable) re-dispatches on every char; a run-length-aware scalar decoder instead runs a tight branchless inner loop over homogeneous runs (long ASCII run, long 2-byte Latin/Cyrillic run, long 3-byte CJK BMP run), with the existing single-codepoint scalar branch as the mixed-edge fallback.
+
+**Algorithm sketch**:
+```
+while (s < src.Length)
+{
+    // 1) ASCII run (0xxxxxxx) — already handled by Phase 1 SIMD prefix; this is tail
+    int asciiStart = s;
+    while (s < src.Length && src[s] < 0x80) s++;
+    if (s > asciiStart) { WriteAsciiRun(src.Slice(asciiStart, s-asciiStart), dst, ref d); continue; }
+
+    // 2) 2-byte run (110xxxxx 10xxxxxx) — Hungarian / Cyrillic / Greek / Hebrew / Arabic
+    int twoStart = s;
+    while (s + 1 < src.Length && Is2ByteLead(src[s]) && IsCont(src[s+1])) s += 2;
+    if (s > twoStart) { Decode2ByteRun(src.Slice(twoStart, s-twoStart), dst, ref d); continue; }
+
+    // 3) 3-byte run (1110xxxx 10xxxxxx 10xxxxxx) — CJK BMP, other 3-byte BMP scripts
+    int threeStart = s;
+    while (s + 2 < src.Length && Is3ByteLead(src[s]) && IsCont(src[s+1]) && IsCont(src[s+2])) s += 3;
+    if (s > threeStart) { Decode3ByteRun(src.Slice(threeStart, s-threeStart), dst, ref d); continue; }
+
+    // 4) Mixed-edge fallback (typically 4-byte surrogate pair or single transition char)
+    DecodeSingleCodePoint(src, ref s, dst, ref d);
+}
+```
+
+**Why P2.5 — scalar baseline before SIMD multi-byte (Phase 3a-3c)**:
+- 1-2h prototyping cost vs 6-10h Phase 3 SIMD work
+- A/B benchmark on Repeated cell decides whether the run-length structure already wins on Magyar mixed (`KözösCímke` pattern) — if it does, Phase 3 lifts further; if not, Phase 3 SIMD is the only win path
+- Documents the "switch-jumptable bottleneck on Hungarian benchmark" hypothesis without committing to the larger SIMD effort
+- The `Decode2ByteRun` / `Decode3ByteRun` scalar-batch implementations also serve as algorithm references for the Phase 3 SIMD versions (clear semantics first, optimize after)
+
+**Expected payoff** (per content class, ratio vs current switch-jumptable):
+- Long CJK BMP (3-byte run, e.g. `你好世界` ×30): ~20-40% Deser improvement (long homogeneous run, biggest jumptable savings)
+- Long 2-byte run (`árvíztűrő` ×10+): ~5-15% improvement
+- Magyar mixed (`KözösCímke`, `sötét` — short alternating runs): 0-5% (run-detection overhead may eat the savings on short runs)
+- Long ASCII (≥32 byte): 0% (Phase 1 SIMD prefix already handles)
+- Emoji (4-byte): 0% (mixed-edge fallback unchanged)
+
+**Risk** — the existing switch-jumptable JIT optimization is strong; Magyar mixed text (1-2 char runs) may not show net gain. Implementation must be **isolated prototype first** (alongside the live `DecodeUtf8SinglePass`, not replacing it), with A/B benchmark comparing the two before any switch.
+
+**Acceptance (Phase 2.5)**:
+- Repeated cell Compact Deser ratio ≤ 1.0 vs MemPack on AVX2 hosts (parity with current measurement, no regression)
+- Round-trip tests pass on all UTF-8 content classes (ASCII / 2-byte / 3-byte BMP / 4-byte surrogate-pair)
+- A/B benchmark shows ≥ 5% Deser improvement on Repeated OR ≥ 10% on Large cell — else Phase 2.5 stays in TODO as documented dead-end (negative result is also valuable: confirms the jumptable is fast enough, focus moves entirely to Phase 3)
+
 ### Phase 3 implementation outline
 
 - Insert SIMD multi-byte branches at `DecodeUtf8SinglePass` entry, **before** the existing ASCII-prefix bail-out loops:
@@ -777,6 +826,38 @@ The Vector128 path is the **WASM and Apple Silicon target** — without it both
 - Local `dotnet test` covers correctness; per-tier benchmarks measure the multi-byte speedup
 - Phase 1+2 (AVX-512BW + Vector128 in `CountUtf8Chars` + `EncodeUtf8SinglePass` Phase 1) **landed 2026-05-05** — covered by existing round-trip tests, no regression on non-AVX-512 hosts (validated on AVX2-host bench)
 
+## ACCORE-BIN-T-H2Q6: Fixed-width dual-length string header (Small/Medium/Big) for 1-pass decode
+**Priority:** P1 · **Type:** Wire-format + Performance · **Related:** `DecodeUtf8SinglePass`, `CountUtf8Chars`, `WriteStringWithDispatch`, `ReadStringUtf8`
+
+Current Compact string decode uses two-pass flow for non-ASCII payloads (`CountUtf8Chars` + `DecodeUtf8SinglePass`).
+Planned direction: remove VarUInt-based string-length path for the new string wire variant, and carry both lengths in a fixed-width header so deserialize can allocate target `string` immediately and decode in a single pass.
+
+### Planned format tiers
+
+- **Small**: packed `uint16` (`charLen:8 | utf8Len:8`)
+- **Medium**: packed `uint32` (`charLen:16 | utf8Len:16`)
+- **Big**: `uint32 charLen + uint32 utf8Len`
+
+Writer picks the smallest fitting tier; reader dispatches by marker and reads fixed-width lengths (no VarUInt loop for string length metadata).
+
+### Why
+
+- Removes `CountUtf8Chars` pass on the new markers (1-pass decode path)
+- Keeps decode branch profile stable (fixed-size header reads)
+- Maintains range safety with explicit Big overflow path
+
+### Constraints captured from current benchmark context
+
+- Performance evaluation target is non-ASCII-heavy data (ASCII-shortcuts intentionally not primary)
+- Wire-format backward compatibility is not required for this development phase
+
+### Acceptance
+
+- New string markers implemented for Small/Medium/Big tiers
+- Deserialize path for these markers performs single-pass decode without `CountUtf8Chars`
+- Existing round-trip tests pass, plus new boundary tests for tier transitions
+- Benchmark report includes before/after for Compact mode on non-ASCII dataset (Ser/Deser/RT + Size)
+
 ## ACCORE-BIN-T-S5L8: Sentinel-length encoding for strings (wire-size optimization, both modes)
 **Priority:** P3 · **Type:** Wire-format optimization · **Related:** `AcBinarySerializer.WriteString`, `AcBinaryDeserializer.ReadValue` string dispatch
 
@@ -1019,3 +1100,114 @@ Reader-side: SGen-generated code drops the per-property `ReadByte()` + `IsTinyIn
 - Schema-evolution fragility documented in `BINARY_FEATURES.md` (alongside the existing `PropertySkip` / default-omission caveat from `ACCORE-BIN-I-D9Y2`)
 - Opt-in flag with default `false` (preserves marker-driven default; consumers explicitly opt in for frozen-schema scenarios)
 
+## ACCORE-BIN-T-V4N3: Symmetric `GetUtf8ByteCount` API + writer-side BCL bypass (cold path)
+**Priority:** P3 · **Type:** Performance · **Status:** Closed (2026-05-06) · **Related:** `EncodeUtf8SinglePass`, `WriteStringUtf8Internal`, `PropertyMetadataBase.NameUtf8`
+
+Symmetric byte-count helper for `EncodeUtf8SinglePass`, paired with writer-side BCL `Encoding.UTF8.GetBytes` / `GetByteCount` removal across all cold-path call sites. `Utf8Transcoder.GetUtf8ByteCount(ReadOnlySpan<char>)` SIMD impl (Vector512 / Vector256 / Vector128 / scalar tier hierarchy, 5-popcount closed-form aggregation handling chunk-split surrogate pairs correctly).
+
+**Implementation summary**:
+- `Utf8Transcoder.GetUtf8ByteCount` SIMD impl with closed-form `bytes = 3*N - ascii - c_lt_0x800 + highSur - 3*lowSur` aggregation
+- `Utf8TranscoderTests` extended (29 new tests covering ASCII / Hungarian / CJK / emoji / boundary 0-64, plus surrogate-pair-split-across-SIMD-chunks regression coverage)
+- `WriteStringUtf8Internal` (`BinarySerializationContext.cs:875`) refactored from BCL two-pass to single-pass D-2 layout (worst-case `length*4` allocate + `EncodeUtf8SinglePass` + VarUInt backfill); the `4×` worst-case capacity is amortized by the buffer growth doubling strategy (`Math.Max(buffer.Length*2, position+needed)` + ArrayPool bucket-rounding to next power-of-2)
+- Cold path cleanup: `AcBinarySerializer.AnalyzeStringInternCandidates` (analysis log) and `PropertyMetadataBase.NameUtf8` ctor-once init both migrated to `Utf8Transcoder`
+
+### Resolution
+
+Landed 2026-05-06. All `Utf8TranscoderTests` pass (55/55). Binary test suite unchanged (222 pass / 13 pre-existing GuidIId failures, untouched).
+
+**Critical observation surfaced during the audit**: `WriteStringUtf8Internal` has only one caller (`WriteFixStrDirect`), and `WriteFixStrDirect` itself is **not called anywhere in the codebase** — no core call site, no SourceGenerator template hit (verified against `AcBinarySourceGenerator.cs` lines 706/724/1492/1514 — the generator emits `WriteStringGenerated` and `context.WriteStringUtf8` (the public method at line 659, not `WriteStringUtf8Internal`)), no test, no reflection path. The V4N3 implementation therefore landed cleanly but its hot-path benchmark impact is limited to the two cold-path init sites. Dead-code disposition tracked as `ACCORE-BIN-T-V4N5`.
+
+**Algorithmic correctness lesson** — the initial 4-popcount formula (`3*N - c_lt_0x80 - c_lt_0x800 - 2*highSur`) was wrong on chunks where a surrogate pair straddles the SIMD chunk boundary (it implicitly assumed `lowSur == highSur` per chunk, which is true over the whole well-formed string but NOT per chunk). Fix: 5-popcount closed-form (`3*N - ascii - c_lt_0x800 + highSur - 3*lowSur`), with the scalar tail using the same per-char accounting model (`i += 1` per char regardless of role; high → 4, low → 0, BMP → 3, two-byte → 2, ASCII → 1). Caught by `GetUtf8ByteCount_MultipleEmojiBoundary_MatchesBcl` and `GetUtf8ByteCount_BoundaryAsciiToEmoji_MatchesBcl` regression tests — exactly the `prefixLen` 1, 7 boundaries that exercise chunk-split surrogate pairs.
+
+## ACCORE-BIN-T-V4N4: NativeAOT-specific inlining / codegen audit on hot UTF-8 path
+**Priority:** P2 · **Type:** Performance · **Related:** `EncodeUtf8SinglePass`, `DecodeUtf8SinglePass`, `WriteStringWithDispatch`, `Utf8Transcoder` SIMD path
+
+Hypothesis: NativeAOT (the benchmark target environment) does not match Tier 1 JIT optimization quality on the UTF-8 hot path, despite `[MethodImpl(AggressiveInlining)]` hints. Symptoms in 2026-05-05 / 2026-05-06 benchmarks:
+
+- Persistent 8-11% Compact deficit vs MemPack on the Repeated cell (Hungarian content + repeated string pattern)
+- Patchy Compact Ser/Deser cell results run-to-run (4-7/10 cell wins, 3-6 noise/loss bands)
+- Per-method Compact speedups on the Medium/Large/Deep cells are **consistent** (-22% to -28% vs MemPack), which suggests a JIT/AOT inlining difference on Repeated — there the `WriteStringWithDispatch` short-lane is invoked many times on the 10× repeated strings
+
+**Suspect mechanisms (ranked by likelihood)**:
+
+1. **AOT inline budget**. NativeAOT is more conservative than the Tier 1 JIT in respecting `AggressiveInlining` for large method bodies. `EncodeUtf8SinglePass` (~190 lines, 4 SIMD path + scalar), `DecodeUtf8SinglePass` (~120 lines), `GetUtf8ByteCount` (~120 lines) may exceed the AOT inline budget at hot call sites (`WriteStringWithDispatch` short-lane, `ReadString` decode callback). If the AOT compiler emits `call <method>` instead of inlining, every iteration of the Repeated 10-string loop pays the call overhead.
+
+2. **`[Intrinsic]` `IsSupported` constant folding**. `Avx512BW.IsSupported`, `Vector512.IsHardwareAccelerated`, `Vector256.IsHardwareAccelerated`, `Vector128.IsHardwareAccelerated` should constant-fold per host on AOT. Verify via disasm — if any remain runtime checks, every iteration pays the branch cost (3 nested `if`-s in each Utf8Transcoder method).
+
+3. **`Vector256.LessThan<ushort>` unsigned compare emulation**. No native `pcmpltw_unsigned` on AVX2; JIT/AOT lowers to `pminuw` + `pcmpeqw`. Cost amortized over many chars in long content but can dominate on short Magyar runs (`KözösCímke` ~6 runs of 2-3 chars). Less likely if (1) holds — the inlining hit dwarfs the per-instruction emulation cost.
+
+4. **Method size cascade**. The Utf8Transcoder method bodies grew with the V4N3 `GetUtf8ByteCount` addition. Adjacent methods in the same source file may have lost inlining at SGen-generated callers due to AOT compilation-unit heuristics (file-locality affects inline cost models on some AOT codegen).
+
+**Investigation steps (no code changes — diagnostic phase first)**:
+
+1. NativeAOT publish dump:
+   ```
+   dotnet publish AyCode.Core.Serializers.Console -c Release -r win-x64 -p:PublishAot=true
+   dumpbin /disasm <output.exe> > disasm.txt
+   ```
+2. Locate `EncodeUtf8SinglePass`, `DecodeUtf8SinglePass`, `GetUtf8ByteCount`, `CountUtf8Chars` symbols in the disasm
+3. Verify constant folding on `IsSupported` checks — no run-time CMP/JMP at the path-selector branches; the dead branches eliminated
+4. Verify inlining at `WriteStringWithDispatch` / `ReadString` callers — if `call <Utf8Transcoder.*>` instructions remain, inlining failed
+5. Method size inspection — large method bodies hint at inline-eligibility issues; large prologue/epilogue at hot call sites is a tell
+6. Cross-compare with Tier 1 JIT disasm (run with `DOTNET_TieredCompilation=0` + `DOTNET_TC_QuickJit=0` to force Tier 1, dump the JIT-tier disasm via WinDbg or `BenchmarkDotNet`'s `[DisassemblyDiagnoser]`) to confirm the gap is AOT-specific rather than algorithmic
+
+**Possible fixes (Open until disasm confirms which apply)**:
+
+- **A. Method split** — `EncodeUtf8SinglePass` → small dispatcher + per-tier inner methods (each Vector512 / Vector256 / Vector128 / scalar in its own AOT-inline-friendly small method). Same for `DecodeUtf8SinglePass`. The dispatcher stays small enough to inline at the hot call site; the dead-branch tier methods are never called on a given host.
+- **B. `[MethodImpl(NoInlining)]` on cold tiers** — paradox tactic that can REDUCE the hot-path code emitted at the call site by preventing the AOT from speculatively considering the dead branches as inlining candidates.
+- **C. Per-target ISA build** — if the benchmark environment has a fixed ISA (e.g. AVX2 baseline), use `<IlcInstructionSet>` in `csproj` to constant-fold the `IsSupported` checks at AOT compile time. Alternative: separate per-ISA AOT publish artifacts.
+- **D. Manual hot-path inlining** — for the Repeated cell, hand-inline `EncodeUtf8SinglePass` short-string lane into `WriteStringWithDispatch` FixStr path (≤31 byte case). Trades code-size for hot-path speed.
+- **E. Algorithm change** — if the AOT can't inline the SIMD bodies efficiently, a smaller scalar-only fast path for short strings (≤31 byte) bypassing the SIMD setup might be faster on AOT than on JIT (where Tier 1 is fine with the SIMD path inlined).
+
+### Why P2
+
+- Repeated benchmark cell is the canonical witness for the **i18n production deploy** narrative — public NuGet release narrative depends on parity-or-better against MemPack across all cells (cloud / desktop / mobile / Blazor WASM)
+- AOT-specific tuning is high-leverage on the hot path — JIT-only optimizations will not match
+- Disasm validation is the prerequisite for any of the fix directions; without it, any change is speculative and risks reintroducing 2c-style regression
+
+### Acceptance
+
+- Disasm report confirms (or refutes) inlining + constant-fold hypotheses on the hot UTF-8 path
+- If hypotheses confirmed: the chosen fix delivers Repeated Compact Ser+Deser ratio ≤ 1.0 vs MemPack on the AOT benchmark target
+- No regression on Small / Medium / Large / Deep cells (or net positive)
+- Fix maintains cross-tier SIMD correctness (round-trip tests pass on all UTF-8 content classes); both `Utf8TranscoderTests` and the binary test suite stay green
+
+### Trigger
+
+- Pre-NuGet release: i18n claim cannot ship with an 8-11% gap on a representative cell
+- Disasm + bench correlation step before any code change (no speculative refactoring)
+
+## ACCORE-BIN-T-V4N5: Dead-code review — `WriteFixStrDirect` + `WriteStringUtf8Internal`
+**Priority:** P3 · **Type:** Refactor / hygiene · **Related:** `BinarySerializationContext.WriteFixStrDirect` (line 832), `WriteStringUtf8Internal` (line 875)
+
+V4N3 audit surfaced two methods with no callers in the entire workspace:
+
+- `WriteFixStrDirect(string)` — public method, no call site (no core, no SourceGenerator template, no test, no reflection / Expression-compile)
+- `WriteStringUtf8Internal(string)` — private method called only from `WriteFixStrDirect`'s non-ASCII fallback branch
+
+The pair forms a self-contained dead chain (`WriteFixStrDirect` → `WriteStringUtf8Internal`): no entry point reaches `WriteFixStrDirect`. The public-API `WriteStringUtf8` (line 659) is the live equivalent and is called from the SourceGenerator template (polymorphism path: assembly-qualified type-name write). The hot-path string-write goes through `WriteStringWithDispatch` (line 734) which uses the M3R7 marker-dispatch — NOT through this dead pair.
+
+### Disposition options (decide pre-NuGet release)
+
+1. **Delete both methods** — pure dead-code cleanup; reduces public surface, removes maintenance burden, simplifies onboarding. Functionality is fully covered by `WriteStringWithDispatch` (M3R7 marker-dispatch — emits `FixStr` / `FixStrAscii` directly with proper ASCII detection via `bytesWritten == charLength` after `EncodeUtf8SinglePass`).
+2. **Activate `WriteFixStrDirect` for property-name writes** — SGen could emit `WriteFixStrDirect(propName)` instead of `WriteStringWithDispatch(propName)` for known-short, often-ASCII property names — saving the marker-dispatch overhead. Requires SGen template change + benchmark validation that the saving is real (likely marginal — property names are typically <31 char ASCII, so M3R7 already takes the FixStrAscii fast path with one byte-write to `_buffer`). The pre-encoded `NameUtf8` byte[] on `PropertyMetadataBase` already provides a faster path (`WriteFixStrBytes` at line 853) which the SGen / runtime writer could use directly.
+3. **Defer** — leave as-is, document as dead code, revisit when the codebase has another reason to touch this area.
+
+### Why P3
+
+- No correctness or perf impact in either direction (dead code is dead — no consumer affected)
+- Cleanup vs activation is a low-stakes choice; benchmark would decide if option 2 has real saving
+- Surfaced during V4N3 work, not blocking the NuGet release
+
+### Acceptance
+
+- Decision recorded (delete / activate / defer) with rationale
+- If "delete": grep across workspace confirms zero callers post-removal; binary test suite unchanged (still 222 pass / 13 pre-existing failures, 235 total)
+- If "activate": SGen template change + benchmark validation showing ≥ 2% Ser improvement on a representative cell (otherwise revert to "delete")
+- Documentation in `BINARY_IMPLEMENTATION.md` updated (or remove the old reference if both methods deleted)
+
+### Trigger
+
+- Pre-NuGet release housekeeping pass
+- Or: any future refactor that touches `BinarySerializationContext` string-write methods (then decide rather than leave the dead pair behind)
+