diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index 2e79706..db427c3 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -68,7 +68,12 @@
       "Bash(curl -s \"https://raw.githubusercontent.com/dotnet/runtime/main/src/libraries/System.IO.Pipelines/src/System/IO/Pipelines/StreamPipeWriter.cs\")",
       "WebFetch(domain:lemire.me)",
       "Bash(gh pr *)",
-      "Bash(gh api *)"
+      "Bash(gh api *)",
+      "Bash(ls -la 'C:\\\\Users\\\\Fullepi\\\\Downloads\\\\_baseline\\\\cpuprofiler' 2>&1 | head -30)",
+      "Bash(where PerfView.exe)",
+      "Bash(where dotnet-trace *)",
+      "Bash(dotnet tool *)",
+      "Bash(dotnet-trace convert *)"
     ]
   }
 }
diff --git a/AyCode.Core.Serializers.Console/Program.cs b/AyCode.Core.Serializers.Console/Program.cs
index 19f0b5a..b1b7907 100644
--- a/AyCode.Core.Serializers.Console/Program.cs
+++ b/AyCode.Core.Serializers.Console/Program.cs
@@ -161,6 +161,24 @@ public static class Program
                $"Compression={options.UseCompression}{extra}";
     }
 
+    /// <summary>
+    /// Returns MemoryPack serializer options aligned with <see cref="SelectedWireMode"/> for a fair
+    /// apples-to-apples wire-format comparison:
+    /// <list type="bullet">
+    ///   <item><see cref="WireMode.Compact"/> (and every other non-Fast mode) → <see cref="MemoryPackSerializerOptions.Default"/> (UTF-8) — for Compact both
+    ///   engines encode UTF-8, comparison is purely about header / tier / dispatch overhead.</item>
+    ///   <item><see cref="WireMode.Fast"/> → <see cref="MemoryPackSerializerOptions.Utf16"/> (UTF-16 raw memcpy) —
+    ///   both engines write UTF-16 raw bytes, so wire-size and CPU comparison reflect the same string-encoding family.</item>
+    /// </list>
+    /// Without this alignment the FastWire vs MemPack-default comparison conflates two unrelated dimensions
+    /// (UTF-16 raw vs UTF-8 encoded) and produces a misleading +40% wire-size delta that is structurally
+    /// the encoding-family difference, NOT an AcBinary-specific overhead.
+    /// </summary>
+    private static MemoryPackSerializerOptions GetMemPackOptions() =>
+        SelectedWireMode == WireMode.Fast
+            ? MemoryPackSerializerOptions.Utf16
+            : MemoryPackSerializerOptions.Default;
+
     /// <summary>
     /// Converts a total-time (in ms across <see cref="TestIterations"/>) into per-operation microseconds.
     /// Formula: <c>totalMs / iterations × 1000</c>. The benchmark stores <c>*TimeMs</c> as the cumulative
@@ -185,6 +203,63 @@ public static class Program
     private static double DesPerOp(BenchmarkResult r) => ToPerOpMicros(r.DeserializeTimeMs, r.DeserializeIterations);
     private static double RtPerOp(BenchmarkResult r) => ToPerOpMicros(r.RoundTripTimeMs, r.RoundTripIterations);
 
+    /// <summary>
+    /// Per-cell-paired aggregation of an overall comparison. Captures three different aggregation
+    /// strategies so the reader can judge whether the headline delta is dominated by one large cell
+    /// (arithmetic mean) or representative of typical workload (geometric mean / median).
+    /// </summary>
+    /// <param name="ArithMeanPct">Percent delta between the two arithmetic means, <c>(AcAvg / MpAvg - 1) * 100</c> — magnitude-weighted; biased toward the largest cell. Negative = AcBinary is lower.</param>
+    /// <param name="GeoMeanPct">Percent delta of the geometric mean of per-cell AcBinary/MemPack ratios — magnitude-neutral; each cell weighted equally.</param>
+    /// <param name="MedianPct">Percent delta of the median per-cell AcBinary/MemPack ratio — outlier-resistant.</param>
+    /// <param name="AcAvg">Arithmetic mean AcBinary value (µs/op or bytes).</param>
+    /// <param name="MpAvg">Arithmetic mean MemPack value.</param>
+    /// <param name="CellCount">Number of paired cells the statistics were computed over.</param>
+    private record OverallStats(double ArithMeanPct, double GeoMeanPct, double MedianPct, double AcAvg, double MpAvg, int CellCount);
+
+    /// <summary>
+    /// Computes arithmetic + geometric + median aggregation of an AcBinary-vs-MemPack comparison
+    /// across paired cells (joined by <c>TestDataName</c>). A cell is dropped from ALL stats when either
+    /// side is missing or its value is not strictly positive (the geometric mean takes <c>ln</c> of the ratio).
+    /// Returns null when either input list is empty or no paired cell survives that filter.
+    /// </summary>
+    private static OverallStats? ComputeOverallStats(
+        List<BenchmarkResult> acResults,
+        List<BenchmarkResult> mpResults,
+        Func<BenchmarkResult, double> getValue)
+    {
+        if (acResults.Count == 0 || mpResults.Count == 0) return null;
+
+        var pairs = (from ac in acResults
+                     join mp in mpResults on ac.TestDataName equals mp.TestDataName // NOTE(review): assumes TestDataName is unique within each list; duplicates would multiply pairs — confirm against callers
+                     let acV = getValue(ac)
+                     let mpV = getValue(mp)
+                     where acV > 0 && mpV > 0 // keeps only strictly positive pairs (NaN fails both comparisons too)
+                     select (ac: acV, mp: mpV)).ToList();
+
+        if (pairs.Count == 0) return null;
+
+        var acAvg = pairs.Average(p => p.ac);
+        var mpAvg = pairs.Average(p => p.mp);
+        var ratios = pairs.Select(p => p.ac / p.mp).ToList();
+
+        // Geometric mean: exp(avg(ln(ratios))) — numerically stable vs Π ratios then ^(1/N).
+        var geoMean = Math.Exp(ratios.Sum(Math.Log) / ratios.Count);
+
+        // Median (paired-ratio): for even N use the midpoint of the two middle values.
+        var sorted = ratios.OrderBy(r => r).ToList();
+        var median = sorted.Count % 2 == 1
+            ? sorted[sorted.Count / 2]
+            : (sorted[sorted.Count / 2 - 1] + sorted[sorted.Count / 2]) / 2.0;
+
+        return new OverallStats(
+            ArithMeanPct: (acAvg / mpAvg - 1) * 100,
+            GeoMeanPct: (geoMean - 1) * 100,
+            MedianPct: (median - 1) * 100,
+            AcAvg: acAvg,
+            MpAvg: mpAvg,
+            CellCount: ratios.Count);
+    }
+
     /// <summary>
     /// Formats a per-op micros value with its inter-sample range and CV-threshold marker as
     /// <c>"26.86 (24.5..29.1)"</c> or <c>"26.86 (24.5..29.1) ⚠️5.2%"</c>. Median first, range in parentheses,
@@ -1452,6 +1527,7 @@ public static class Program
     private sealed class MemoryPackBenchmark : ISerializerBenchmark
     {
         private readonly TestOrder _order;
+        private readonly MemoryPackSerializerOptions _options;
         private readonly byte[] _serialized;
 
         public string Engine => EngineMemoryPack;
@@ -1461,12 +1537,14 @@ public static class Program
         public int SerializedSize => _serialized.Length;
         public long SetupSerializeAllocBytes => 0;
         public long SetupDeserializeAllocBytes => 0;
+        public string? OptionsDescription => $"StringEncoding={_options.StringEncoding}";
 
         public MemoryPackBenchmark(TestOrder order, string optionsPreset)
         {
             _order = order;
             OptionsPreset = optionsPreset;
-            _serialized = MemoryPackSerializer.Serialize(order);
+            _options = GetMemPackOptions();
+            _serialized = MemoryPackSerializer.Serialize(order, _options);
         }
 
         public void Warmup(int iterations)
@@ -1479,15 +1557,15 @@ public static class Program
         }
 
         [MethodImpl(MethodImplOptions.NoInlining)]
-        public void Serialize() => MemoryPackSerializer.Serialize(_order);
+        public void Serialize() => MemoryPackSerializer.Serialize(_order, _options);
 
         [MethodImpl(MethodImplOptions.NoInlining)]
-        public void Deserialize() => MemoryPackSerializer.Deserialize<TestOrder>(_serialized);
+        public void Deserialize() => MemoryPackSerializer.Deserialize<TestOrder>(_serialized, _options);
 
         public bool VerifyRoundTrip()
         {
-            var bytes = MemoryPackSerializer.Serialize(_order);
-            var roundTripped = MemoryPackSerializer.Deserialize<TestOrder>(bytes);
+            var bytes = MemoryPackSerializer.Serialize(_order, _options);
+            var roundTripped = MemoryPackSerializer.Deserialize<TestOrder>(bytes, _options);
             return DeepEqualsViaJson(_order, roundTripped);
         }
     }
@@ -2422,6 +2500,7 @@ public static class Program
     private sealed class MemoryPackFreshBufferWriterBenchmark : ISerializerBenchmark
     {
         private readonly TestOrder _order;
+        private readonly MemoryPackSerializerOptions _options;
         private readonly byte[] _serialized;
 
         public string Engine => EngineMemoryPack;
@@ -2431,12 +2510,14 @@ public static class Program
         public int SerializedSize => _serialized.Length;
         public long SetupSerializeAllocBytes => 0;
         public long SetupDeserializeAllocBytes => 0;
+        public string? OptionsDescription => $"StringEncoding={_options.StringEncoding}";
 
         public MemoryPackFreshBufferWriterBenchmark(TestOrder order, string optionsPreset)
         {
             _order = order;
             OptionsPreset = optionsPreset;
-            _serialized = MemoryPackSerializer.Serialize(order);
+            _options = GetMemPackOptions();
+            _serialized = MemoryPackSerializer.Serialize(order, _options);
         }
 
         public void Warmup(int iterations)
@@ -2452,17 +2533,17 @@ public static class Program
         public void Serialize()
         {
             var abw = new ArrayBufferWriter<byte>();
-            MemoryPackSerializer.Serialize(abw, _order);
+            MemoryPackSerializer.Serialize(abw, _order, _options);
         }
 
         [MethodImpl(MethodImplOptions.NoInlining)]
-        public void Deserialize() => MemoryPackSerializer.Deserialize<TestOrder>(_serialized);
+        public void Deserialize() => MemoryPackSerializer.Deserialize<TestOrder>(_serialized, _options);
 
         public bool VerifyRoundTrip()
         {
             var abw = new ArrayBufferWriter<byte>();
-            MemoryPackSerializer.Serialize(abw, _order);
-            var roundTripped = MemoryPackSerializer.Deserialize<TestOrder>(abw.WrittenSpan.ToArray());
+            MemoryPackSerializer.Serialize(abw, _order, _options);
+            var roundTripped = MemoryPackSerializer.Deserialize<TestOrder>(abw.WrittenSpan.ToArray(), _options);
             return DeepEqualsViaJson(_order, roundTripped);
         }
     }
@@ -2535,6 +2616,7 @@ public static class Program
     private sealed class MemoryPackBufferWriterBenchmark : ISerializerBenchmark
     {
         private readonly TestOrder _order;
+        private readonly MemoryPackSerializerOptions _options;
         private readonly byte[] _serialized;
         private readonly ArrayBufferWriter<byte> _bufferWriter;
 
@@ -2545,12 +2627,14 @@ public static class Program
         public int SerializedSize => _serialized.Length;
         public long SetupSerializeAllocBytes { get; }
         public long SetupDeserializeAllocBytes => 0;
+        public string? OptionsDescription => $"StringEncoding={_options.StringEncoding}";
 
         public MemoryPackBufferWriterBenchmark(TestOrder order, string optionsPreset)
         {
             _order = order;
             OptionsPreset = optionsPreset;
-            _serialized = MemoryPackSerializer.Serialize(order);
+            _options = GetMemPackOptions();
+            _serialized = MemoryPackSerializer.Serialize(order, _options);
 
             // Serialize-side setup only — see AcBinaryBufferWriterBenchmark for the full rationale.
             GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
@@ -2573,17 +2657,17 @@ public static class Program
         public void Serialize()
         {
             _bufferWriter.ResetWrittenCount();
-            MemoryPackSerializer.Serialize(_bufferWriter, _order);
+            MemoryPackSerializer.Serialize(_bufferWriter, _order, _options);
         }
 
         [MethodImpl(MethodImplOptions.NoInlining)]
-        public void Deserialize() => MemoryPackSerializer.Deserialize<TestOrder>(_serialized);
+        public void Deserialize() => MemoryPackSerializer.Deserialize<TestOrder>(_serialized, _options);
 
         public bool VerifyRoundTrip()
         {
             _bufferWriter.ResetWrittenCount();
-            MemoryPackSerializer.Serialize(_bufferWriter, _order);
-            var roundTripped = MemoryPackSerializer.Deserialize<TestOrder>(_bufferWriter.WrittenSpan.ToArray());
+            MemoryPackSerializer.Serialize(_bufferWriter, _order, _options);
+            var roundTripped = MemoryPackSerializer.Deserialize<TestOrder>(_bufferWriter.WrittenSpan.ToArray(), _options);
             return DeepEqualsViaJson(_order, roundTripped);
         }
     }
@@ -2932,63 +3016,62 @@ public static class Program
 
         // All averages are over per-op µs (iter-independent). Batch-time averaging would mix rows
         // measured with different iter counts (post-calibration), producing meaningless numbers.
-        var memPackAvgSer = memPackSerResults.Count > 0 ? memPackSerResults.Average(r => SerPerOp(r)) : 0;
-        var memPackAvgDes = memPackDesResults.Average(r => DesPerOp(r));
-        var memPackAvgRt = memPackRtResults.Average(r => RtPerOp(r));
-        var memPackAvgSize = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray)).Average(r => r.SerializedSize);
-        var memPackAvgSerAlloc = memPackSerResults.Count > 0 ? memPackSerResults.Average(r => r.SerializeAllocBytesPerOp) : 0;
-        var memPackAvgDesAlloc = memPackDesResults.Count > 0 ? memPackDesResults.Average(r => r.DeserializeAllocBytesPerOp) : 0;
+        // Three aggregations per metric:
+        //   - Arithmetic mean (current behavior) — magnitude-weighted, biased toward Large cell.
+        //   - Geometric mean of per-cell ratios — magnitude-neutral, each cell weighted equally.
+        //   - Median of per-cell ratios — outlier-resistant.
+        // The geo/median variants surface when a single cell dominates the arithmetic average
+        // (typical when one cell's µs-per-op is an order of magnitude larger than the others).
+        var sizeAcResults = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen)).ToList();
+        var sizeMpResults = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray)).ToList();
 
-        var acBinaryAvgSer = acBinarySerResults.Count > 0 ? acBinarySerResults.Average(r => SerPerOp(r)) : 0;
-        var acBinaryAvgDes = acBinaryDesResults.Average(r => DesPerOp(r));
-        var acBinaryAvgRt = acBinaryRtResults.Average(r => RtPerOp(r));
-        var acBinaryAvgSize = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen)).Average(r => r.SerializedSize);
-        var acBinaryAvgSerAlloc = acBinarySerResults.Count > 0 ? acBinarySerResults.Average(r => r.SerializeAllocBytesPerOp) : 0;
-        var acBinaryAvgDesAlloc = acBinaryDesResults.Count > 0 ? acBinaryDesResults.Average(r => r.DeserializeAllocBytesPerOp) : 0;
+        var serStats = ComputeOverallStats(acBinarySerResults, memPackSerResults, SerPerOp);
+        var desStats = ComputeOverallStats(acBinaryDesResults, memPackDesResults, DesPerOp);
+        var rtStats = ComputeOverallStats(acBinaryRtResults, memPackRtResults, RtPerOp);
+        var sizeStats = ComputeOverallStats(sizeAcResults, sizeMpResults, r => r.SerializedSize);
+        var serAllocStats = ComputeOverallStats(acBinarySerResults, memPackSerResults, r => r.SerializeAllocBytesPerOp);
+        var desAllocStats = ComputeOverallStats(acBinaryDesResults, memPackDesResults, r => r.DeserializeAllocBytesPerOp);
 
         System.Console.WriteLine();
         System.Console.WriteLine($"── {"AcBinary (Byte[], SGen)"} vs {"MemoryPack (Byte[])"} (Overall) ──");
 
-        // Only show serialize comparison if data available
-        if (memPackAvgSer > 0 && acBinaryAvgSer > 0)
-        {
-            var serPctAll = (acBinaryAvgSer / memPackAvgSer - 1) * 100;
-            System.Console.ForegroundColor = serPctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
-            System.Console.WriteLine($"  Serialize:   {serPctAll:+0;-0}% ({acBinaryAvgSer:F2} µs/op vs {memPackAvgSer:F2} µs/op)");
-            System.Console.ResetColor();
-        }
+        WriteOverallLine("Serialize",   "µs/op", serStats);
+        WriteOverallLine("Deserialize", "µs/op", desStats);
+        WriteOverallLine("Round-trip",  "µs/op", rtStats);
+        WriteOverallLine("Size",        "B",     sizeStats, "F0");
+        WriteOverallLine("Ser Alloc",   "B/op",  serAllocStats, "F0");
+        WriteOverallLine("Des Alloc",   "B/op",  desAllocStats, "F0");
+    }
 
-        var desPctAll = (acBinaryAvgDes / memPackAvgDes - 1) * 100;
-        var rtPctAll = (acBinaryAvgRt / memPackAvgRt - 1) * 100;
-        var sizePctAll = (acBinaryAvgSize / memPackAvgSize - 1) * 100;
+    /// <summary>
+    /// Formats a percent delta with one decimal and an explicit sign for non-zero values (<c>+1.5%</c>, <c>-3.0%</c>, <c>0.0%</c>).
+    /// The number is left-padded to 6 chars before <c>%</c> is appended (7 chars total, e.g. <c> +12.3%</c>, <c>-100.0%</c>) for column alignment in the Overall block; formatting uses the invariant culture.
+    /// </summary>
+    private static string FormatPctSigned(double pct) => pct.ToString("+0.0;-0.0;0.0", System.Globalization.CultureInfo.InvariantCulture).PadLeft(6) + "%";
 
-        System.Console.ForegroundColor = desPctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
-        System.Console.WriteLine($"  Deserialize: {desPctAll:+0;-0}% ({acBinaryAvgDes:F2} µs/op vs {memPackAvgDes:F2} µs/op)");
+    /// <summary>
+    /// Writes one Overall console row: arith / geo / median percent deltas, the AcBinary and MemPack means
+    /// (formatted with <paramref name="fmt"/>, suffixed with <paramref name="unit"/>) and the paired-cell count.
+    /// Green when the geo-mean delta is &lt;= 0, red otherwise; prints nothing when <paramref name="stats"/> is null.
+    /// </summary>
+    private static void WriteOverallLine(string label, string unit, OverallStats? stats, string fmt = "F2")
+    {
+        if (stats == null) return;
+        // Color follows geo-mean (the magnitude-neutral signal). The arith-mean column may show a
+        // different sign when a single big cell dominates — that's exactly the signal we want to surface.
+        System.Console.ForegroundColor = stats.GeoMeanPct <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
+        System.Console.WriteLine($"  {label,-12} arith {FormatPctSigned(stats.ArithMeanPct)} │ geo {FormatPctSigned(stats.GeoMeanPct)} │ median {FormatPctSigned(stats.MedianPct)}   ({stats.AcAvg.ToString(fmt, System.Globalization.CultureInfo.InvariantCulture)} {unit} vs {stats.MpAvg.ToString(fmt, System.Globalization.CultureInfo.InvariantCulture)} {unit}, {stats.CellCount} cells)");
         System.Console.ResetColor();
+    }
 
-        System.Console.ForegroundColor = rtPctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
-        System.Console.WriteLine($"  Round-trip:  {rtPctAll:+0;-0}% ({acBinaryAvgRt:F2} µs/op vs {memPackAvgRt:F2} µs/op)");
-        System.Console.ResetColor();
-
-        System.Console.ForegroundColor = sizePctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
-        System.Console.WriteLine($"  Size:        {sizePctAll:+0;-0}% ({acBinaryAvgSize:F0} B vs {memPackAvgSize:F0} B)");
-        System.Console.ResetColor();
-
-        // Allocation comparison: byte[] API allocates the output array on both sides — delta shows serializer-overhead diff.
-        if (memPackAvgSerAlloc > 0 && acBinaryAvgSerAlloc > 0)
-        {
-            var serAllocPct = (acBinaryAvgSerAlloc / memPackAvgSerAlloc - 1) * 100;
-            System.Console.ForegroundColor = serAllocPct <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
-            System.Console.WriteLine($"  Ser Alloc:   {serAllocPct:+0;-0}% ({acBinaryAvgSerAlloc:F0} B/op vs {memPackAvgSerAlloc:F0} B/op)");
-            System.Console.ResetColor();
-        }
-        if (memPackAvgDesAlloc > 0 && acBinaryAvgDesAlloc > 0)
-        {
-            var desAllocPct = (acBinaryAvgDesAlloc / memPackAvgDesAlloc - 1) * 100;
-            System.Console.ForegroundColor = desAllocPct <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
-            System.Console.WriteLine($"  Des Alloc:   {desAllocPct:+0;-0}% ({acBinaryAvgDesAlloc:F0} B/op vs {memPackAvgDesAlloc:F0} B/op)");
-            System.Console.ResetColor();
-        }
+    /// <summary>
+    /// Plain-text twin of <see cref="WriteOverallLine"/>: appends the same row to <paramref name="sb"/> with no console
+    /// color and ASCII <c>|</c> column separators. Used by the .log and .LLM file writers; no-op when <paramref name="stats"/> is null.
+    /// </summary>
+    private static void AppendOverallLine(StringBuilder sb, string label, string unit, OverallStats? stats, string fmt = "F2")
+    {
+        if (stats == null) return;
+        sb.AppendLine($"  {label,-12} arith {FormatPctSigned(stats.ArithMeanPct)} | geo {FormatPctSigned(stats.GeoMeanPct)} | median {FormatPctSigned(stats.MedianPct)}   ({stats.AcAvg.ToString(fmt, System.Globalization.CultureInfo.InvariantCulture)} {unit} vs {stats.MpAvg.ToString(fmt, System.Globalization.CultureInfo.InvariantCulture)} {unit}, {stats.CellCount} cells)");
     }
 
     private static void SaveResults(List<BenchmarkResult> results, List<TestDataSet> testDataSets)
@@ -3143,39 +3226,17 @@ public static class Program
             return;
         }
 
-        if (memPackSerResults2.Count > 0 && acBinarySerResults2.Count > 0)
-        {
-            // Per-op µs averages (iter-independent) — see comment above the parallel block in PrintSummary.
-            var memPackAvgSer2 = memPackSerResults2.Average(r => SerPerOp(r));
-            var acBinaryAvgSer2 = acBinarySerResults2.Average(r => SerPerOp(r));
-            var memPackAvgSerAlloc2 = memPackSerResults2.Average(r => r.SerializeAllocBytesPerOp);
-            var acBinaryAvgSerAlloc2 = acBinarySerResults2.Average(r => r.SerializeAllocBytesPerOp);
-            sb.AppendLine($"  Serialize:   {((acBinaryAvgSer2 / memPackAvgSer2 - 1) * 100):+0;-0}% ({acBinaryAvgSer2:F2} µs/op vs {memPackAvgSer2:F2} µs/op)");
-            if (memPackAvgSerAlloc2 > 0)
-                sb.AppendLine($"  Ser Alloc:   {((acBinaryAvgSerAlloc2 / memPackAvgSerAlloc2 - 1) * 100):+0;-0}% ({acBinaryAvgSerAlloc2:F0} B/op vs {memPackAvgSerAlloc2:F0} B/op)");
-        }
+        // Per-cell-paired aggregation: arithmetic / geometric / median. See PrintSummary's parallel
+        // block + the OverallStats record for the rationale (per-cell ratio vs magnitude-weighted mean).
+        var sizeAcResults2 = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen)).ToList();
+        var sizeMpResults2 = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray)).ToList();
 
-        if (memPackDesResults2.Count > 0 && acBinaryDesResults2.Count > 0)
-        {
-            var memPackAvgDes2 = memPackDesResults2.Average(r => DesPerOp(r));
-            var acBinaryAvgDes2 = acBinaryDesResults2.Average(r => DesPerOp(r));
-            var memPackAvgDesAlloc2 = memPackDesResults2.Average(r => r.DeserializeAllocBytesPerOp);
-            var acBinaryAvgDesAlloc2 = acBinaryDesResults2.Average(r => r.DeserializeAllocBytesPerOp);
-            sb.AppendLine($"  Deserialize: {((acBinaryAvgDes2 / memPackAvgDes2 - 1) * 100):+0;-0}% ({acBinaryAvgDes2:F2} µs/op vs {memPackAvgDes2:F2} µs/op)");
-            if (memPackAvgDesAlloc2 > 0)
-                sb.AppendLine($"  Des Alloc:   {((acBinaryAvgDesAlloc2 / memPackAvgDesAlloc2 - 1) * 100):+0;-0}% ({acBinaryAvgDesAlloc2:F0} B/op vs {memPackAvgDesAlloc2:F0} B/op)");
-        }
-
-        if (memPackRtResults2.Count > 0 && acBinaryRtResults2.Count > 0)
-        {
-            var memPackAvgRt2 = memPackRtResults2.Average(r => RtPerOp(r));
-            var acBinaryAvgRt2 = acBinaryRtResults2.Average(r => RtPerOp(r));
-            sb.AppendLine($"  Round-trip:  {((acBinaryAvgRt2 / memPackAvgRt2 - 1) * 100):+0;-0}% ({acBinaryAvgRt2:F2} µs/op vs {memPackAvgRt2:F2} µs/op)");
-        }
-
-        var memPackAvgSize2 = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray)).Average(r => r.SerializedSize);
-        var acBinaryAvgSize2 = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen)).Average(r => r.SerializedSize);
-        sb.AppendLine($"  Size:        {((acBinaryAvgSize2 / memPackAvgSize2 - 1) * 100):+0;-0}% ({acBinaryAvgSize2:F0} B vs {memPackAvgSize2:F0} B)");
+        AppendOverallLine(sb, "Serialize",   "µs/op", ComputeOverallStats(acBinarySerResults2, memPackSerResults2, SerPerOp));
+        AppendOverallLine(sb, "Ser Alloc",   "B/op",  ComputeOverallStats(acBinarySerResults2, memPackSerResults2, r => r.SerializeAllocBytesPerOp), "F0");
+        AppendOverallLine(sb, "Deserialize", "µs/op", ComputeOverallStats(acBinaryDesResults2, memPackDesResults2, DesPerOp));
+        AppendOverallLine(sb, "Des Alloc",   "B/op",  ComputeOverallStats(acBinaryDesResults2, memPackDesResults2, r => r.DeserializeAllocBytesPerOp), "F0");
+        AppendOverallLine(sb, "Round-trip",  "µs/op", ComputeOverallStats(acBinaryRtResults2, memPackRtResults2, RtPerOp));
+        AppendOverallLine(sb, "Size",        "B",     ComputeOverallStats(sizeAcResults2, sizeMpResults2, r => r.SerializedSize), "F0");
 
         File.WriteAllText(logFilePath, sb.ToString(), Utf8NoBom);
         System.Console.WriteLine($"✓ Results saved to: {logFilePath}");
@@ -3253,6 +3314,36 @@ public static class Program
             }
         }
 
+        // Overall AcBinary (SGen, Byte[]) vs MemoryPack (Byte[]) comparison — same three aggregations
+        // as the .log / console output (arithmetic / geometric / median of per-cell ratios). The
+        // arith mean is magnitude-weighted (Large cell dominates); geo/median are per-cell-equal
+        // signals. Adding this lets an LLM diagnose whether a headline delta is a real overall
+        // win/loss or a single-cell artifact.
+        var memPackByteArrayResults = results.Where(r => r.Engine == EngineMemoryPack && r.IoMode == IoByteArray).ToList();
+        var acBinarySGenByteArrayResults = results.Where(r => r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen).ToList();
+        var memPackSerResultsLlm = memPackByteArrayResults.Where(r => r.SerializeTimeMs > 0).ToList();
+        var memPackDesResultsLlm = memPackByteArrayResults.Where(r => r.DeserializeTimeMs > 0).ToList();
+        var memPackRtResultsLlm = memPackByteArrayResults.Where(r => r.RoundTripTimeMs > 0).ToList();
+        var acBinarySerResultsLlm = acBinarySGenByteArrayResults.Where(r => r.SerializeTimeMs > 0).ToList();
+        var acBinaryDesResultsLlm = acBinarySGenByteArrayResults.Where(r => r.DeserializeTimeMs > 0).ToList();
+        var acBinaryRtResultsLlm = acBinarySGenByteArrayResults.Where(r => r.RoundTripTimeMs > 0).ToList();
+
+        if (memPackRtResultsLlm.Count > 0 && acBinaryRtResultsLlm.Count > 0)
+        {
+            sb.AppendLine();
+            sb.AppendLine("## Overall: AcBinary (Byte[], SGen) vs MemoryPack (Byte[])");
+            sb.AppendLine();
+            sb.AppendLine("Three aggregations of per-cell results: **arith** = arithmetic mean of µs/op (magnitude-weighted, Large cell dominates); **geo** = geometric mean of per-cell ratios (each cell weighted equally); **median** = median of per-cell ratios (outlier-resistant). Negative % = AcBinary faster/smaller; positive % = MemPack faster/smaller. The geo/median variants surface when a single big cell skews the arithmetic mean.");
+            sb.AppendLine();
+            sb.AppendLine("```");
+            AppendOverallLine(sb, "Serialize",   "µs/op", ComputeOverallStats(acBinarySerResultsLlm, memPackSerResultsLlm, SerPerOp));
+            AppendOverallLine(sb, "Ser Alloc",   "B/op",  ComputeOverallStats(acBinarySerResultsLlm, memPackSerResultsLlm, r => r.SerializeAllocBytesPerOp), "F0");
+            AppendOverallLine(sb, "Deserialize", "µs/op", ComputeOverallStats(acBinaryDesResultsLlm, memPackDesResultsLlm, DesPerOp));
+            AppendOverallLine(sb, "Des Alloc",   "B/op",  ComputeOverallStats(acBinaryDesResultsLlm, memPackDesResultsLlm, r => r.DeserializeAllocBytesPerOp), "F0");
+            AppendOverallLine(sb, "Round-trip",  "µs/op", ComputeOverallStats(acBinaryRtResultsLlm, memPackRtResultsLlm, RtPerOp));
+            AppendOverallLine(sb, "Size",        "B",     ComputeOverallStats(acBinarySGenByteArrayResults, memPackByteArrayResults, r => r.SerializedSize), "F0");
+            sb.AppendLine("```");
+        }
 
         File.WriteAllText(filePath, sb.ToString(), Utf8NoBom);
         System.Console.WriteLine($"✓ LLM results saved to: {filePath}");
diff --git a/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.cs b/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.cs
index 31137b5..09b6611 100644
--- a/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.cs
+++ b/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.cs
@@ -18,6 +18,32 @@ public class AcBinarySourceGenerator : IIncrementalGenerator
 {
     private const string AttributeName = "AyCode.Core.Serializers.Attributes.AcBinarySerializableAttribute";
 
+    // ────────────────────────────────────────────────────────────────────────────────────────────
+    // TEMPORARY (2026-05-08) — A/B test feature gates for hot-path overhead measurement.
+    //
+    // The generated SGen `WriteProperties` / `ScanObject` methods emit two kinds of overhead-blocks
+    // that are unconditionally present today but rarely exercised in typical workloads:
+    //
+    //   1. PropertyFilter guard (`UsePropertyFilter`) — every non-markerless property emit-site
+    //      checks `context.HasPropertyFilter` + filter-context allocation + lambda-call.
+    //      The benchmark workload never sets a property-filter → branch is always false →
+    //      pure overhead (CPU cycles + i-cache pressure on the hot path).
+    //
+    //   2. Polymorphic object-with-type-name emit (`UsePolymorphType`) — `System.Object` declared
+    //      properties emit `ObjectWithTypeName` marker + `WriteStringUtf8(AssemblyQualifiedName)`
+    //      under `!context.UseMetadata`. Same: rarely used in typical DTO graphs.
+    //
+    // Setting either to `false` skips the corresponding emit when the generator runs → leaner generated
+    // code (the bench then measures the delta vs MemPack, which has neither feature). WARNING: while a gate is
+    // `false` its guard is not emitted at all, so e.g. a property filter set at runtime is not consulted there.
+    // `static readonly` (not `const`) keeps the gated emit `if (...)` blocks from raising CS0162 (unreachable code).
+    // Long-term: these flags will move to `[AcBinarySerializable(UsePropertyFilter = false, ...)]`
+    // attribute properties so consumers can opt out per type. Until then, keep both `false` for
+    // benchmark-vs-MemPack measurements; flip to `true` for production where the features are needed.
+    // ────────────────────────────────────────────────────────────────────────────────────────────
+    private static readonly bool UsePropertyFilter = false;
+    private static readonly bool UsePolymorphType = false;
+
     private static readonly DiagnosticDescriptor CircularReferenceWarning = new(
         id: "ACBIN001",
         title: "Circular reference detected",
@@ -672,16 +698,21 @@ public class AcBinarySourceGenerator : IIncrementalGenerator
         }
 
         // All non-markerless properties: emit PropertyFilter guard
-        // When filter returns false, write PropertySkip and skip the property write
-        sb.AppendLine($"{i}if (context.HasPropertyFilter)");
-        sb.AppendLine($"{i}{{");
-        sb.AppendLine($"{i}    var fc_{p.Name} = new BinaryPropertyFilterContext(obj, typeof({fullTypeName}), \"{p.Name}\", typeof({p.TypeNameForTypeof}), static o => (({fullTypeName})o).{p.Name});");
-        sb.AppendLine($"{i}    if (!context.PropertyFilter!(in fc_{p.Name}))");
-        sb.AppendLine($"{i}    {{");
-        sb.AppendLine($"{i}        context.WriteByte(BinaryTypeCode.PropertySkip);");
-        sb.AppendLine($"{i}        goto skip_{p.Name};");
-        sb.AppendLine($"{i}    }}");
-        sb.AppendLine($"{i}}}");
+        // When filter returns false, write PropertySkip and skip the property write.
+        // Gated by `UsePropertyFilter` (TEMPORARY const) — `false` skips emit entirely → leaner
+        // generated code on benchmark workloads where no property-filter is ever set.
+        if (UsePropertyFilter)
+        {
+            sb.AppendLine($"{i}if (context.HasPropertyFilter)");
+            sb.AppendLine($"{i}{{");
+            sb.AppendLine($"{i}    var fc_{p.Name} = new BinaryPropertyFilterContext(obj, typeof({fullTypeName}), \"{p.Name}\", typeof({p.TypeNameForTypeof}), static o => (({fullTypeName})o).{p.Name});");
+            sb.AppendLine($"{i}    if (!context.PropertyFilter!(in fc_{p.Name}))");
+            sb.AppendLine($"{i}    {{");
+            sb.AppendLine($"{i}        context.WriteByte(BinaryTypeCode.PropertySkip);");
+            sb.AppendLine($"{i}        goto skip_{p.Name};");
+            sb.AppendLine($"{i}    }}");
+            sb.AppendLine($"{i}}}");
+        }
 
         // Nullable value types always use markered path (need Null marker)
         if (IsNullableVTKind(p.TypeKind))
@@ -715,14 +746,21 @@ public class AcBinarySourceGenerator : IIncrementalGenerator
                     // System.Object property: runtime type unknown at compile time.
                     // Write ObjectWithTypeName prefix so deserializer can resolve the concrete type.
                     // Use value.GetType() for runtime type dispatch (not typeof(object)).
+                    // Gated by `UsePolymorphType` (TEMPORARY const) — `false` skips the type-name emit
+                    // entirely (deser will use the property's declared type, which is `object` so the
+                    // round-trip would fail on polymorphic instances; safe ONLY when the workload is
+                    // known not to use polymorphic object-typed properties — true for the benchmark).
                     sb.AppendLine($"{i}if ({a} == null) context.WriteByte(BinaryTypeCode.PropertySkip);");
                     sb.AppendLine($"{i}else");
                     sb.AppendLine($"{i}{{");
-                    sb.AppendLine($"{i}    if (!context.UseMetadata)");
-                    sb.AppendLine($"{i}    {{");
-                    sb.AppendLine($"{i}        context.WriteByte(BinaryTypeCode.ObjectWithTypeName);");
-                    sb.AppendLine($"{i}        context.WriteStringUtf8({a}.GetType().AssemblyQualifiedName!);");
-                    sb.AppendLine($"{i}    }}");
+                    if (UsePolymorphType)
+                    {
+                        sb.AppendLine($"{i}    if (!context.UseMetadata)");
+                        sb.AppendLine($"{i}    {{");
+                        sb.AppendLine($"{i}        context.WriteByte(BinaryTypeCode.ObjectWithTypeName);");
+                        sb.AppendLine($"{i}        context.WriteStringUtf8({a}.GetType().AssemblyQualifiedName!);");
+                        sb.AppendLine($"{i}    }}");
+                    }
                     sb.AppendLine($"{i}    AcBinarySerializer.WriteValueGenerated({a}, {a}.GetType(), context, depth);");
                     sb.AppendLine($"{i}}}");
                 }
@@ -881,8 +919,9 @@ public class AcBinarySourceGenerator : IIncrementalGenerator
         var a = $"obj.{p.Name}";
 
         // PropertyFilter: must match write pass — if filter skips property, scan must skip too
-        // Only for non-markerless properties (matching EmitProp behavior)
-        if (!IsMarkerless(p.TypeKind))
+        // Only for non-markerless properties (matching EmitProp behavior).
+        // Gated by `UsePropertyFilter` (TEMPORARY const) — same A/B flag as the writer pass.
+        if (UsePropertyFilter && !IsMarkerless(p.TypeKind))
         {
             sb.AppendLine($"{i}if (context.HasPropertyFilter)");
             sb.AppendLine($"{i}{{");
@@ -1849,7 +1888,7 @@ public class AcBinarySourceGenerator : IIncrementalGenerator
         sb.AppendLine($"{i}    {{");
         sb.AppendLine($"{i}        if (context.FastWire)");
         sb.AppendLine($"{i}        {{");
-        sb.AppendLine($"{i}            var fwlen = (int)context.ReadVarUInt();");
+        sb.AppendLine($"{i}            var fwlen = context.ReadInt32Unsafe();");
         sb.AppendLine($"{i}            {a} = fwlen == 0 ? string.Empty : context.ReadStringUtf8(fwlen);");
         sb.AppendLine($"{i}        }}");
         sb.AppendLine($"{i}        else");
diff --git a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs
index d741c7f..832c91f 100644
--- a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs
+++ b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs
@@ -128,6 +128,20 @@ public static partial class AcBinaryDeserializer
             return value;
         }
 
+        /// <summary>
+        /// Loads a 4-byte signed integer from the current position with one unaligned read, in the
+        /// platform's native byte order (little-endian on x86/x64), and advances the cursor by four bytes.
+        /// The FastWire string readers use it to fetch the fixed-width <c>charLen:int32</c> prefix.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public int ReadInt32Unsafe()
+        {
+            EnsureAvailable(sizeof(int));
+            ref var first = ref _buffer[_position];
+            _position += sizeof(int);
+            return Unsafe.ReadUnaligned<int>(ref first);
+        }
+
         /// <summary>
         /// Reads an 8-byte unsigned integer (little-endian on Intel/AMD, native-endian elsewhere).
         /// Used by H2Q6 <c>StringBig</c> reader to grab packed <c>charLen:32 | utf8Len:32</c> in a single load.
diff --git a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs
index 5490bff..fe20de4 100644
--- a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs
+++ b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs
@@ -1157,8 +1157,9 @@ public static partial class AcBinaryDeserializer
     {
         if (context.FastWire)
         {
-            // Mode-shared marker: FastWire payload is [VarUInt charCount][UTF-16 raw bytes]
-            var charLenF = (int)context.ReadVarUInt();
+            // Mode-shared marker: FastWire payload is [charLen:int32 LE][UTF-16 raw bytes]
+            // Fix-int charLen (matches MemPack WriteUtf16 shape) — single 4-byte read, no VarUInt loop.
+            var charLenF = context.ReadInt32Unsafe();
             return context.ReadStringUtf8(charLenF);
         }
 
diff --git a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs
index 034a406..5072623 100644
--- a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs
+++ b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs
@@ -499,11 +499,14 @@ public static partial class AcBinarySerializer
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static int VarUIntSize(uint value)
         {
-            if (value < 0x80) return 1;
-            if (value < 0x4000) return 2;
-            if (value < 0x200000) return 3;
-            if (value < 0x10000000) return 4;
-            return 5;
+            return value switch  // size 1..5; each threshold below is one further 7-bit step
+            {
+                < 0x80 => 1,        // below 2^7
+                < 0x4000 => 2,      // below 2^14
+                < 0x200000 => 3,    // below 2^21
+                < 0x10000000 => 4,  // below 2^28
+                _ => 5              // 2^28 and above, up to uint.MaxValue
+            };
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -750,13 +753,16 @@ public static partial class AcBinarySerializer
 
             if (FastWire)
             {
-                // FastWire: [StringSmall marker][VarUInt charCount][UTF-16 raw bytes]
-                // Marker value 91 is mode-shared (Compact StringSmall vs FastWire string marker);
-                // reader dispatches by deserializer mode, NOT by re-interpreting the marker.
-                WriteByte(BinaryTypeCode.StringSmall);
+                // FastWire: [StringSmall marker:1][charLen:int32 LE][UTF-16 raw bytes]
+                // Fix-int header (no tier-dispatch, no VarUInt branch loop) — matches MemPack `WriteUtf16`
+                // shape (which emits a fix `int` length). Marker + length go out as ONE 8-byte ulong store of
+                // which only the low 5 bytes are kept; symmetric ReadInt32Unsafe on the reader.
                 var byteLenF = charLength * 2;  // safe: charLength ≤ 0x1FFFFFFF guarantees no overflow
-                WriteVarUInt((uint)charLength);
-                EnsureCapacity(byteLenF);
+                EnsureCapacity(8 + byteLenF);  // 8, not 7: the ulong store below touches 8 bytes even when byteLenF == 0
+                var fwPos = _position;
+                var packed = (ulong)BinaryTypeCode.StringSmall | ((ulong)(uint)charLength << 8);
+                Unsafe.WriteUnaligned<ulong>(ref _buffer[fwPos], packed);
+                _position = fwPos + 5;
                 MemoryMarshal.AsBytes(value.AsSpan()).CopyTo(_buffer.AsSpan(_position, byteLenF));
                 _position += byteLenF;
                 return;
@@ -772,10 +778,12 @@ public static partial class AcBinarySerializer
             //      reserve was Medium (5 byte) — body is left-shifted by 2 bytes to compact.
             var maxBytes = charLength * 4;
 
-            int reserveHeader;
-            if (charLength <= 63)         reserveHeader = 3;
-            else if (charLength <= 16383) reserveHeader = 5;
-            else                           reserveHeader = 9;
+            int reserveHeader = charLength switch
+            {
+                <= 63 => 3,
+                <= 16383 => 5,
+                _ => 9
+            };
 
             EnsureCapacity(reserveHeader + maxBytes);
 
diff --git a/AyCode.Core/docs/BINARY/BINARY_TODO.md b/AyCode.Core/docs/BINARY/BINARY_TODO.md
index 5042ae3..2a6dbcd 100644
--- a/AyCode.Core/docs/BINARY/BINARY_TODO.md
+++ b/AyCode.Core/docs/BINARY/BINARY_TODO.md
@@ -756,6 +756,8 @@ The optimization-value signal proved below the bench noise floor on the availabl
 
 **Re-evaluable as of 2026-05-07 per `ACCORE-BIN-T-D9X3`** — bench stabilization removes the noise-floor that made the original signal unmeasurable; retest before any code change. (Charset bias remains — pair with `ACCORE-BIN-T-C5R8` for CJK validation.)
 
+**Retested 2026-05-08 — REGRESSION CONFIRMED** (Latin1Long charset, stabilized bench): adding the do-while inner loop on both 2-byte and 3-byte tiers in `DecodeUtf8SinglePass` produced **+5-8pp Deser regression on every cell** vs. the switch-jumptable baseline (Small +7.8pp, Medium +7.1pp, Large +5.5pp, Repeated +7.4pp, Deep +4.9pp). Reverted to switch-jumptable single-decode same day. The V4N2 entry's original prediction held: "Magyar mixed (KözösCímke, sötét — short alternating runs): 0-5% (run-detection overhead may eat the savings on short runs)" — Latin1Long suffix has 1-2 char average run length, well below the run-detection break-even point. **Phase 2.5 is dead on Magyar mixed.** CJK retest still untried, but Phase 2.5 is now obsoleted by `ACCORE-BIN-T-K7M3` (the decoder hot path runs `Utf8.ToUtf16` BCL static API, not `DecodeUtf8SinglePass`).
+
 **Below: original Phase 2.5 design notes preserved as documentation.** Implementation details remain accurate even though the implementation was reverted.
 
 ---
@@ -1217,7 +1219,7 @@ Reader-side: SGen-generated code drops the per-property `ReadByte()` + `IsTinyIn
 - Opt-in flag with default `false` (preserves marker-driven default; consumers explicitly opt in for frozen-schema scenarios)
 
 ## ACCORE-BIN-T-V4N3: Symmetric `GetUtf8ByteCount` API + writer-side BCL kihagyás (cold path)
-**Priority:** P3 · **Type:** Performance · **Status:** Closed (2026-05-06) · **Related:** `EncodeUtf8SinglePass`, `WriteStringUtf8Internal`, `PropertyMetadataBase.NameUtf8`
+**Priority:** P3 · **Type:** Performance · **Status:** Superseded (2026-05-08, by `ACCORE-BIN-T-K7M3`) — landed Closed 2026-05-06; subsequent A/B against modern `Utf8.FromUtf16` / `Utf8.ToUtf16` showed the BCL modern API outperforms the custom transcoder on every benchmark cell, leading to full hot-path switch in K7M3 · **Related:** `EncodeUtf8SinglePass`, `WriteStringUtf8Internal`, `PropertyMetadataBase.NameUtf8`, `ACCORE-BIN-T-K7M3` (hot-path BCL switch)
 
 Symmetric byte-count helper for `EncodeUtf8SinglePass`, paired with writer-side BCL `Encoding.UTF8.GetBytes` / `GetByteCount` removal across all cold-path call sites. `Utf8Transcoder.GetUtf8ByteCount(ReadOnlySpan<char>)` SIMD impl (Vector512 / Vector256 / Vector128 / scalar tier hierarchy, 5-popcount closed-form aggregation handling chunk-split surrogate pairs correctly).
 
@@ -1235,6 +1237,12 @@ Landed 2026-05-06. All `Utf8TranscoderTests` pass (55/55). Binary test suite unc
 
 **Algorithmic correctness lesson** — the initial 4-popcount formula (`3*N - c_lt_0x80 - c_lt_0x800 - 2*highSur`) was wrong on chunks where a surrogate pair straddles the SIMD chunk boundary (it implicitly assumed `lowSur == highSur` per chunk, which is true over the whole well-formed string but NOT per chunk). Fix: 5-popcount closed-form (`3*N - ascii - c_lt_0x800 + highSur - 3*lowSur`), with the scalar tail using the same per-char accounting model (`i += 1` per char regardless of role; high → 4, low → 0, BMP → 3, two-byte → 2, ASCII → 1). Caught by `GetUtf8ByteCount_MultipleEmojiBoundary_MatchesBcl` and `GetUtf8ByteCount_BoundaryAsciiToEmoji_MatchesBcl` regression tests — exactly the `prefixLen` 1, 7 boundaries that exercise chunk-split surrogate pairs.
 
+### Superseded by `ACCORE-BIN-T-K7M3` (2026-05-08)
+
+The V4N3 audit measured the custom transcoder against the **legacy `Encoding.UTF8.GetBytes`** API and won. **It did NOT measure against the modern `System.Text.Unicode.Utf8.FromUtf16` / `Utf8.ToUtf16` static API** (.NET Core 3.0+, used by MemoryPack source-gen). Once `D9X3` stabilized the bench, a direct A/B revealed the BCL modern API outperforms the custom transcoder on **every** cell (Ser deficit -14 to -22pp, Deser flips from behind to ahead). All 8 hot-path call sites switched to BCL in `K7M3`. The `Utf8Transcoder.cs` file is fully commented out — preserved as historical reference.
+
+The V4N3 algorithmic correctness work (5-popcount surrogate-pair-split-across-chunks closed-form) remains a **valid algorithmic contribution**, but no longer load-bearing on the hot path.
+
 ## ACCORE-BIN-T-V4N4: NativeAOT-specific inlining / codegen audit on hot UTF-8 path
 **Priority:** P2 · **Type:** Performance · **Status:** Reverted (2026-05-07) — bench instability made the optimization signal unmeasurable · **Related:** `EncodeUtf8SinglePass`, `DecodeUtf8SinglePass`, `WriteStringWithDispatch`, `Utf8Transcoder` SIMD path
 
@@ -1343,7 +1351,9 @@ A V4N4 audit **konklúziója** változatlan érvényes (constant-fold OK, reader
 
 **Re-evaluable as of 2026-05-07 per `ACCORE-BIN-T-D9X3`** — bench stabilization removes the noise-floor that made the original signal unmeasurable; retest before any code change.
 
+**Obsoleted (2026-05-08) by `ACCORE-BIN-T-K7M3`** — the writer hot path no longer calls the custom `EncodeUtf8SinglePass` at all (`WriteStringWithDispatch` was switched to `Utf8.FromUtf16` BCL). The "AOT method-split / inlining audit" target (`Utf8Transcoder` body method-size in NativeAOT inline budget) is moot — the BCL `Utf8.FromUtf16` is a single static method with its own AOT-friendly inline footprint, and the audit's hypothesis space (Vector256 `IsSupported` constant-fold, lambda delegate cache) was correct for the prior code but no longer applies. The V4N4 disasm methodology remains a **valid technique** for future investigations of generic specialization / inline failures, but the specific hot-path target it analyzed is gone.
+
 ## ACCORE-BIN-T-V4N5: Dead-code review — `WriteFixStrDirect` + `WriteStringUtf8Internal`
 **Priority:** P3 · **Type:** Refactor / hygiene · **Status:** Closed (2026-05-06) · **Related:** `BinarySerializationContext.cs`
 
 V4N3 audit surfaced two methods with no callers in the entire workspace:
@@ -1992,4 +2002,443 @@ Header line updated:
 - Before: `Iterations: 1000 | Warmup: 10000 | Samples: 10 (median) | ...`
 - After: `Iterations: per-cell adaptive (target ~250 ms/sample) | Warmup: 10000 | Samples: 10 (median) + 1 pilot discarded | ... | UnstableCV threshold: 3%`
 
+## ACCORE-BIN-T-K7M3: Hot-path UTF-8 transcoder switch — `Utf8Transcoder` → BCL `Utf8.FromUtf16` / `Utf8.ToUtf16`
+**Priority:** P1 · **Type:** Performance · **Status:** Closed (2026-05-08) · **Related:** `ACCORE-BIN-T-V4N3` (custom transcoder origin), `ACCORE-BIN-T-V4N2` (Phase 3 SIMD multi-byte), `ACCORE-BIN-T-V4N4` (Reverted method-split), `ACCORE-BIN-T-D9X3` (bench stabilization that made the comparison measurable)
+
+The custom `Utf8Transcoder` (V4N3) was originally implemented to bypass `System.Text.Encoding.UTF8.GetBytes` virtual-dispatch + EncoderFallback overhead. The V4N3 audit measured wins vs. the **legacy `Encoding.UTF8`** API. **What it did NOT measure**: the modern `System.Text.Unicode.Utf8.FromUtf16` / `Utf8.ToUtf16` API (.NET Core 3.0+, tier-1 optimized, used by MemoryPack `WriteUtf8` / `ReadUtf8` paths internally). Once the bench stabilized (D9X3), a direct A/B comparison surfaced that the BCL modern API consistently outperforms the custom transcoder on the binary serializer's hot path.
+
+### Bench A/B (Latin1Long charset, FastMode SGen Compact)
+
+| Cell | Ser delta vs MemPack — custom (`EncodeUtf8SinglePass`) | Ser delta vs MemPack — BCL (`Utf8.FromUtf16`) | Improvement |
+|------|--------------------------------------------------------|------------------------------------------------|-------------|
+| Small | +28.5% | +7.3% | **-21pp** |
+| Medium | +23.8% | +3.1% | -21pp |
+| Large | +19.6% | +5.1% | -14pp |
+| Repeated | +28.8% | +10.9% | -18pp |
+| Deep | +23.1% | +0.6% | -22pp |
+
+| Cell | Deser delta vs MemPack — custom (`DecodeUtf8SinglePass`) | Deser delta vs MemPack — BCL (`Utf8.ToUtf16`) | Improvement |
+|------|---------------------------------------------------------|------------------------------------------------|-------------|
+| Small | +17.6% | -1.2% (parity) | -19pp |
+| Medium | +12.8% | -4.7% (AcBinary wins) | -17pp |
+| Large | +4.9% | -10.3% (AcBinary wins) | -15pp |
+| Repeated | +16.9% | -1.6% (parity) | -18pp |
+| Deep | +7.0% | -9.0% (AcBinary wins) | -16pp |
+
+The Deser side flipped from "consistently behind" to "wins on 3 of 5 cells, parity on 2". The Ser side closed the deficit from +20-29% to 0-11%. **Both sides** show a measurable improvement on **every** cell.
+
+### Why the custom transcoder lost
+
+The V4N3 implementation included a 4-tier SIMD ASCII prefix path (Vector512BW / Vector256 / Vector128 / scalar) plus a DWORD ASCII batch + scalar 4-branch multi-byte fallback. **All correct, all SIMD-tuned**. But:
+
+1. **`Utf8.FromUtf16` is also SIMD-tuned in .NET 9** — the .NET team rewrote it on top of `System.Text.Unicode.Utf8` primitives that share infrastructure with `Ascii.IsValid` / `Latin1.GetString`. AOT-publish-friendly, branch-friendly, no virtual dispatch (the `Utf8` API is static, not via an `Encoding` instance with virtual-method-table).
+2. **The custom transcoder's ASCII prefix path bails out on first non-ASCII byte** — on multi-byte content (Latin extended / Cyrillic / CJK) the SIMD path runs only for the leading ASCII span, then the entire remainder falls into per-char scalar 4-branch dispatch. The BCL `Utf8.FromUtf16` SIMD-batches multi-byte content too (different algorithm — the BCL doesn't bail on first non-ASCII).
+3. **AOT inline budget**: the custom transcoder's body grew with the V4N3 / V4N4 / V4N5 additions; in NativeAOT publish the call sites in `WriteStringWithDispatch` / `ReadString*` did NOT inline (V4N4 disasm audit confirmed). The BCL `Utf8.FromUtf16` is a single static method with a tighter call-site footprint.
+
+### Resolution
+
+Landed 2026-05-08. The 8 production hot-path call sites of `Utf8Transcoder.*` switched to BCL:
+
+| File / line | Before | After |
+|---|---|---|
+| `AcBinarySerializer.cs:120` | `Utf8Transcoder.GetUtf8ByteCount` | `Encoding.UTF8.GetByteCount` |
+| `AcBinarySerializer.BinarySerializationContext.cs:694` | `Utf8Transcoder.EncodeUtf8SinglePass` | `Utf8.FromUtf16(...)` |
+| `AcBinarySerializer.BinarySerializationContext.cs:784` | `Utf8Transcoder.EncodeUtf8SinglePass` | `Utf8.FromUtf16(...)` |
+| `AcBinarySerializer.BinarySerializationContext.cs:901` | `Utf8Transcoder.EncodeUtf8SinglePass` | `Utf8.FromUtf16(...)` |
+| `AcBinaryDeserializer.BinaryDeserializationContext.Read.cs:523` | `Utf8Transcoder.CountUtf8Chars` | `Encoding.UTF8.GetCharCount` |
+| `AcBinaryDeserializer.BinaryDeserializationContext.Read.cs:527` | `Utf8Transcoder.DecodeUtf8SinglePass` | `Utf8.ToUtf16(...)` |
+| `AcBinaryDeserializer.BinaryDeserializationContext.Read.cs:565` | `Utf8Transcoder.DecodeUtf8SinglePass` | `Utf8.ToUtf16(...)` |
+| `PropertyMetadataBase.cs:104-109` (ctor-once) | `Utf8Transcoder.GetUtf8ByteCount` + `EncodeUtf8SinglePass` (two-pass) | `Encoding.UTF8.GetBytes(string)` (single-pass with exact-size byte[] return) |
+
+The count-only call sites (`GetByteCount` / `GetCharCount`) stay on the **legacy** `Encoding.UTF8` API — `System.Text.Unicode.Utf8` has no count-only equivalent (only `FromUtf16` / `ToUtf16` which encode + count combined). For pure count, the legacy API is the optimal tool (single SIMD-tuned scan, no encode/decode work).
+
+The `Utf8Transcoder.cs` file remains in the repo but **fully commented out** — the class definition is preserved as historical reference / future reactivation if a workload ever surfaces where it could win again. `Utf8TranscoderTests.cs` is not currently exercising live code.
+
+### Lesson — the V4N3 audit's blind spot
+
+The V4N3 (custom transcoder) audit compared against **legacy `Encoding.UTF8.GetBytes`** and won. **The audit did NOT compare against `Utf8.FromUtf16`** (the modern API, .NET Core 3.0+). On modern runtime the BCL has two UTF-8 transcoders: a legacy one (instance-method on `Encoding`, virtual dispatch) and a modern one (static `Utf8.FromUtf16` / `Utf8.ToUtf16`). MemoryPack uses the modern one — that's what we should have been comparing against from the start.
+
+**Generalizable lesson**: when measuring a custom implementation against a "BCL baseline", verify which BCL API is used by the actual competition (here: MemoryPack source-gen). The `Encoding.UTF8.*` instance API and `System.Text.Unicode.Utf8` static API are different generations of the same logical operation; treating them as interchangeable hides the comparison's scope.
+
+### Why P1
+
+- Closed the FastMode Compact mode Ser deficit from +20-29% to ≤11% on every cell (Latin1Long benchmark)
+- Flipped the Deser side from a +5-18% deficit to **AcBinary winning on 3 of 5 cells** (-5% to -10%), parity on 2 (Latin1Long benchmark)
+- One-time fixed cost (8 production call-site replacements) — every future bench profits
+- Removed a load-bearing ~600-line custom SIMD module from the maintained surface area; future maintainers don't need to reason about Vector512BW / cross-lane shuffle / 5-popcount surrogate-pair correctness — the BCL handles it
+
+### Follow-up — `Utf8Transcoder.cs` cleanup
+
+The file is fully commented out. Either:
+- **Delete** entirely (preferred for repo cleanliness) — `Utf8TranscoderTests.cs` then needs deletion or revival as a regression-only guard
+- **Keep** the comment-block as historical reference, with a header comment pointing to this entry
+
+Decision deferred — the comment-block does no harm to build / runtime. Address when the next docs-archive sweep runs.
+
+## ACCORE-BIN-T-P3X7: Profile-driven Compact-mode Ser optimalizációs roadmap (post-K7M3 hot-path analysis)
+**Priority:** P2 · **Type:** Performance roadmap · **Status:** Open · **Related:** `ACCORE-BIN-T-K7M3` (BCL UTF-8 transcoder switch — előfeltétele), `ACCORE-BIN-T-D9X3` (bench stabilization), `ACCORE-BIN-T-S2X9` (markerless schema lane — primitív property-marker már kivezetve a SGen-ben), `ACCORE-BIN-T-V4N4` (audit methodológia hivatkozás)
+
+A 2026-05-08 VS Performance Profiler session (4 sec range, AcBinary FastMode Serialize, Latin1Long charset, FastWire mode) konkrét hot-path-decomposition-t adott a K7M3 BCL-csere utáni állapotról. A string-encoding már nem akadály (a `Utf8.FromUtf16` SIMD-tuned), a fennmaradó AcBinary-specific overhead azonosítható.
+
+### Profile session adatok (Self CPU%)
+
+| Self CPU% | Function | Category |
+|---|---|---|
+| 39.77% | `System.Buffer._Memmove` | Közös MemPack-kel (UTF-16 raw + return-time `byte[]`-copy) — **NEM AcBinary-spec** |
+| **10.03%** | `AcBinarySerializer.Serialize<T>` | Top-level (context-acquire, type lookup, return-alloc) |
+| **7.48%** | `TestMeasurementPoint_GeneratedWriter.WriteProperties` | SGen template (legkisebb levél típus, ~12500 hívás Large cellán) |
+| **5.31%** | `WriteStringWithDispatch` | String hot path |
+| **3.23%** | `TestMeasurement_GeneratedWriter.WriteProperties` | SGen |
+| **1.66%** | `WriteVarUIntMultiByteUnsafe` | VarUInt int-property encode |
+| 1.10% | `TestPallet_GeneratedWriter.WriteProperties` | SGen |
+| 0.39% | `TestOrderItem_GeneratedWriter.WriteProperties` | SGen |
+| 0.32% | `SharedUser_GeneratedWriter.WriteProperties` | SGen |
+| 0.05% | `ArrayBinaryOutput.Grow` | Buffer-grow (ritka, kicsi probléma) |
+
+**Total SGen `WriteProperties` Self CPU**: ~12.6% — a legnagyobb AcBinary-specific surface.
+
+A `AcBinarySerializer.Serialize<T>` line-szintű drill-down (`AcBinarySerializer.cs:312-335`):
+- `WriteObject(value, wrapper, context, 0)` Total: 28.05% — a teljes serializációs fa (SGen + Writer hot path)
+- `context.Output.ToArray(context._buffer, context._position)` Total: **47.37%** — final `byte[]`-alloc + content-memcpy (= a 39.77% `_Memmove` Self nagy része)
+
+### MemPack-összehasonlítás (referenciaként)
+
+A MemPack `Serialize<T>(T value)` mechanizmus:
+1. **`[ThreadStatic]`** writer-state — nincs pool-bérlés, nincs lock, nincs concurrent dictionary lookup
+2. **`ReusableLinkedArrayBufferWriter`** — linked chunk-list (4 KB → 8 KB → 16 KB geometriai); buffer-grow = új chunk hozzáadása, **nincs memcpy a régi adaton**
+3. **`ToArrayAndReset()`** — végén alloc + chunks → byte[] memcpy (közös overhead az AcBinary-vel)
+
+Az AcBinary `AcquireArrayOutputContext(options)` pool-bérlés + lineáris `byte[]` `Array.Resize` + `Output.ToArray(...)` — két memcpy-cost (grow + return), de a grow ritka.
+
+### Sorrendezett optimalizációs ötletek
+
+#### A. SGen `WriteProperties` — ensure-capacity batching (várt: -1-3pp Ser, **revíziós becslés**)
+
+Jelenlegi SGen-template per-property emit (mindegyiknél külön ensure):
+```csharp
+context.WriteVarInt(obj.Id);                    // ensure(5) + write(1-5)
+context.WriteByte(BinaryTypeCode.Object);        // ensure(1) + write(1)
+context.WriteVarInt((int)obj.Status);            // ensure(5) + write(1-5)
+context.WriteRaw(obj.Weight);                     // ensure(8) + write(8)
+```
+
+Csoportosított ensure pattern:
+```csharp
+context.EnsureCapacity(maxBytesForGroup);        // worst-case sum, 1× hívás
+context.WriteVarIntUnsafe(obj.Id);                // no ensure (csak buffer write)
+context.WriteByteUnsafe(BinaryTypeCode.Object);   // no ensure
+context.WriteVarIntUnsafe((int)obj.Status);
+context.WriteRawUnsafe(obj.Weight);
+```
+
+A `AcBinarySourceGenerator.cs` `WriteProperties` template-jét kell módosítani:
+1. Property-listából contiguous primitív csoportok kinyerése (Object/Collection property-knél megszakítva — mély rekurzió, méret nem előre kiszámítható)
+2. Csoportonként worst-case-size compute compile-time-on (a primitív type-ok mérete fix vagy worst-case ismert)
+3. Egyetlen `EnsureCapacity(sum)` + bulk `*Unsafe` write-ok
+
+`*Unsafe` írók szükségessége: `WriteVarUIntUnsafe` már létezik. **`WriteByteUnsafe`, `WriteRawUnsafe<T>`** valószínűleg hozzá kell adni a `BinarySerializationContext`-hez.
+
+**Becslés-revízió (2026-05-08)**: az eredeti -4-6pp becslés felső volt. Egy `EnsureCapacity` inline-olva ~1-2 ns/call (a hot path-on a branch-prediction perfekt — sosem jut el a Grow-hoz). 10 property × 1.5 ns = ~15 ns / object megtakarítás batch-eléssel — Latin1Long Large cell 1250 instance × 13 ns = ~16 µs / 120 µs Ser ≈ **~13% felső**, de **csak az ensure-szám csökkenéséből**. A SGen `WriteProperties` Self CPU 12.6%-a **NEM csak** ensure-check; tartalmaz `HasPropertyFilter` branch-check, null-check + depth-check dispatch, `Unsafe.As<T>` cast, etc. — lásd **F**. Az ensure-batching önmagában reálisan **1-3pp Ser javulás**.
+
+**Wire-formátum változatlan**, backward-kompatibilis, kis kockázat. Hatás minden cellán mérhető (TestOrder cell-szerkezet ~100+ primitív property per Object-instance).
+
+#### B. `WriteStringWithDispatch` Compact ág batch-write (várt: -1-2pp Ser)
+
+A FastWire ágat már `K7M3`-ban + a 2026-05-08 batch-write fixxel egyetlen ensure + direct-write-ra alakítottuk. A **Compact ág** ugyanaz a 3-step pattern (post-encode tier-shift `CopyTo` ha `actualHeader < reserveHeader`, plus header-write a tier alapján). A Compact ágon is alkalmazható batch-write — egyetlen `EnsureCapacity` a worst-case-tier-szel + direct header-write a `Utf8.FromUtf16` után.
+
+#### C. Thread-static context (várt: -2-4pp Ser, NAGY refactor)
+
+A `AcquireArrayOutputContext(options)` pool-bérlés overhead-jét mérsékelheti a MemPack `[ThreadStatic]` mintázat. A jelenlegi pool-bérlés:
+- Pool dictionary lookup (lehet, lock-os)
+- Context-state init / reset minden hívásnál
+
+Thread-static cseréje:
+- Per-thread cached context, nincs lock
+- Context-reset minden hívásnál ugyanaz, de a `state` allokáció egyszer fut
+
+**Refactor szempontok**:
+- A `BinarySerializationContext` state-tárolása nem thread-safe önmagában — pool-bérlés vagy thread-static mind a single-thread használatot biztosítja
+- Az `options` paraméter érintheti a state-init logikát — multi-options scenárió esetén a thread-static state-t reset-elni kell
+- Concurrent serialize hívások (több thread egyidejű) — minden thread saját state-tel rendelkezne; nincs cross-thread sharing igény
+
+#### D. Linked-array buffer chunk strategy (kicsi hatás, NAGY refactor)
+
+A MemPack `ReusableLinkedArrayBufferWriter` linked chunk-list helyettesíti a lineáris `byte[]`-grow stratégiát. Buffer-grow = új chunk hozzáadása (no memcpy a régi adaton).
+
+**A profile szerint a `ArrayBinaryOutput.Grow` Self CPU csak 0.05%** — a buffer-grow ritkán fut, a default kapacitás elég nagy a Large cell-hez. **Kicsi hatás, nagy refactor**. Alacsony prioritás.
+
+#### F. SGen `HasPropertyFilter` lift-out a `WriteProperties` method elejére (várt: -2-4pp Ser)
+
+A jelenlegi SGen-template **minden property-emit előtt** ellenőrzi a property-filter-t:
+```csharp
+public void WriteProperties<TOutput>(object value, ...)
+{
+    var obj = Unsafe.As<TestPallet>(value);
+
+    if (context.HasPropertyFilter)                     // ← MINDEN property-en check!
+    {
+        var fc_Category = new BinaryPropertyFilterContext(obj, ..., "Category", ...);
+        if (!context.PropertyFilter!(in fc_Category)) {
+            context.WriteByte(BinaryTypeCode.PropertySkip);
+            goto skip_Category;
+        }
+    }
+    if (obj.Category == null) context.WriteByte(BinaryTypeCode.PropertySkip);
+    else if (depth > context.MaxDepth) context.WriteByte(BinaryTypeCode.Null);
+    else { context.WriteByte(BinaryTypeCode.Object); ...WriteProperties... }
+    skip_Category:;
+
+    if (context.HasPropertyFilter) { /* same for Inspector */ }   // ← újra!
+    // ... 10× ismétlés property-listán
+}
+```
+
+A `HasPropertyFilter` per-property branch-check **TestOrder benchmark workload-on mindig false** (a benchmark nem használ property-filter-t). De a check minden property-en lefut — kód-cache-ben benne van, branch-predict ugyan jó, **mégis CPU cycle**.
+
+Optimalizáció — kétpályás SGen kódgenerálás:
+```csharp
+public void WriteProperties<TOutput>(object value, ..., int depth)
+{
+    var obj = Unsafe.As<TestPallet>(value);
+
+    if (context.HasPropertyFilter)
+    {
+        WritePropertiesWithFilter(obj, context, depth);    // ritka path — full per-property check
+        return;
+    }
+
+    // Fast path — NO filter check anywhere
+    if (obj.Category == null) context.WriteByte(BinaryTypeCode.PropertySkip);
+    else if (depth > context.MaxDepth) context.WriteByte(BinaryTypeCode.Null);
+    else { ... }
+    // (no skip_Category goto — never needed)
+
+    context.WriteVarInt(obj.Id);                       // primitív, no filter check
+    // ... rest of properties without HasPropertyFilter check
+}
+
+// Külön emit-elt method ritka path-ra:
+private static void WritePropertiesWithFilter<TOutput>(TestPallet obj, ..., int depth)
+{
+    // Full per-property filter-aware kód (the current behavior)
+}
+```
+
+A `AcBinarySourceGenerator.cs`-t kell módosítani:
+1. A `WriteProperties` method elején egyetlen `HasPropertyFilter` check
+2. Két különböző code-path emit:
+   - **Fast path** (default — no filter): nincs per-property `if (context.HasPropertyFilter)` check, nincs filter-context allokáció + lambda-call, nincs `goto skip_X`
+   - **Slow path** (filter aware — separate static method): a jelenlegi viselkedés
+
+**Várt nyereség**: a fast path ~10 elimináció / object × 1-2 ns / branch ≈ ~10-20 ns / object. Latin1Long Large cell 1250 instance × 18 ns = ~22 µs / 120 µs Ser ≈ **~18% felső becslés**; reálisan **2-4pp Ser javulás** (a kód-bloat növekedés és a JIT inlinelés-ráhatás miatt mérséklődik).
+
+**Kombinálható az A-val**: az **A + F** együtt **3-7pp javulás** célozható meg — a SGen `WriteProperties` 12.6% Self CPU jelentős csökkenése.
+
+**Wire-formátum változatlan**, kód-méret kicsivel nő (két path-ot generál minden type-on), de a fast path a JIT-tel jobban inlinelhető.
+
+#### G. SGen `WriteProperties` null/depth/object-ref kombinálás (kapcsolt az F-hez)
+
+A komplex (Object) property-knél a 3-ágú dispatch:
+```csharp
+if (obj.X == null) context.WriteByte(BinaryTypeCode.PropertySkip);
+else if (depth > context.MaxDepth) context.WriteByte(BinaryTypeCode.Null);
+else { context.WriteByte(BinaryTypeCode.Object); X_GeneratedWriter.Instance.WriteProperties(...); }
+```
+
+Ez minden komplex property-en fut. Lehetséges optimalizáció: a `depth > MaxDepth` check egy method-szintű branch-szé alakítása (egyszer ellenőrizni a method elején, aztán a property-szintű ágat egyszerűsíteni). De ez **kis hatás**, és a `MaxDepth` limit jellemzően nem érintett (a legtöbb workload-on `depth < MaxDepth`).
+
+Alacsony prio, F-tel kombinált.
+
+#### E. `WriteVarUIntMultiByteUnsafe` (1.66% Self) → fix-int (várható: -1pp Ser, **NEM javasolt önmagában**)
+
+A `WriteVarInt` (signed int property-encode, ZigZag + VarUInt) kódolás a SGen-template-ekben gyakori (Id, Status, TrayCount, stb.). A multi-byte ág 1.66% Self CPU.
+
+Fix-int (4 byte) cseréje wire-méret-növekedéssel jár (kis int-eken +3 byte / property), ami a wire-formátum kompaktság-előnyét rontja. **Csak `ACCORE-BIN-T-S2X9` markerless lane kontextusban** érdemes — ahol a property-marker eltávolításával együtt fix-int kicserélése wire-szempontból kompenzálódik.
+
+### Közös, NEM AcBinary-spec overhead — nem optimalizálható
+
+A `Buffer._Memmove` 39.77% Self CPU + a `Output.ToArray()` 47.37% Total **a return-time `byte[]`-alloc + content-memcpy**, ami minden `byte[] Serialize(T)` hívásnál fut. **Mindkét engine fizeti** (MemPack `ToArrayAndReset()` is alloc + memcpy a chunkokból). Az API contract (`byte[] Serialize(T)`) miatt elkerülhetetlen.
+
+**Aki teljesítményt akar**, használja a `IBufferWriter<byte>` overload-ot (`AcBinaryBufferWriterBenchmark` vs `MemoryPackBufferWriterBenchmark` apples-to-apples a benchmarkban — mindkét engine ugyanezt csinálja).
+
+### Acceptance (per-section)
+
+- **A** (SGen ensure-batching): Latin1Long FastWire bench AcBinary Ser delta vs MemPack -1-3pp javulás minden cellán
+- **F** (HasPropertyFilter lift-out): Latin1Long Ser delta -2-4pp; **A + F együtt** SGen `WriteProperties` Self CPU ≤ 8% (jelenleg ~12.6%)
+- **G** (null/depth/object-ref kombinálás): kis hatás, F-tel kombinált
+- **B** (WriteStringWithDispatch Compact batch-write): Latin1Long Compact bench AcBinary Ser delta vs MemPack ≤ +5% minden cellán
+- **C** (Thread-static context): `Serialize<T>` Self CPU ≤ 6% (jelenleg ~10%)
+- **D** (Linked-array): nem prioritás — buffer-grow Self CPU már ≤ 0.05%
+- **E** (VarInt → fix-int): csak az `S2X9` markerless lane sprint kontextusában mérni
+
+### Sorrend
+
+1. **A + F kombinálva** — SGen `WriteProperties` template átfogó refactor (ensure-batching + HasPropertyFilter lift-out + esetleg G null/depth-combine). Együtt **~3-7pp Ser javulás** várt minden cellán. Izolált változtatás csak `AcBinarySourceGenerator.cs`-en, wire-format változatlan.
+2. **B** — ~1-2pp javulás, ugyanaz a pattern mint a `K7M3` FastWire batch-write
+3. **C** — ~2-4pp, de NAGY refactor (thread-safety, pool semantics felülvizsgálat)
+4. **D** — alacsony prioritás (kis hatás, nagy refactor)
+5. **E** — csak `S2X9` kontextusban
+
+### Trigger
+
+- **A + F** → most azonnal implementálható; ezek a SGen template-en belül kombinálandók (egyetlen template-átdolgozás kétségtelenül jobb, mint külön refactor-körök). Minden további mérés ettől függ.
+- **B** → A+F után, hasonló pattern alkalmazása más writer-helyen
+- **C** → ha a `Serialize<T>` Self CPU (~10%) továbbra is dominál A+F+B után
+- **D, E** → opcionális, az A/F/B/C eredmények alapján
+
+## ACCORE-BIN-T-Q5T2: Önleíró wire-formátum — duplikált object-marker-ek + UTF-16 string marker (per-type/property encoding choice)
+**Priority:** P2 · **Type:** Architecture / Performance · **Status:** Open · **Related:** `ACCORE-BIN-T-P3X7` (profile-driven roadmap — kis-adat slowdown diagnózis), `ACCORE-BIN-T-K7M3` (BCL UTF-8 transcoder — előfeltétele), `ACCORE-BIN-T-S2X9` (markerless schema lane), `ACCORE-BIN-T-V4N2` (UTF-8 SIMD)
+
+A 2026-05-08 design-session során merült fel mint válasz a kis-adat-slowdown problémára és az `if (FastWire)` / `if (UseMetadata)` runtime-branch-ek széles jelenlétére. Cél: a wire-mode kivezetése a globális header-ből, **per-object/per-property encoding-szabadság** attribute-tal, megőrizve a SGen↔Runtime wire-kompatibilitást.
+
+### LLM Context (cold-start)
+
+Egy fresh session olvasásához ez a kontextus elég:
+
+**Wire-modell**: AcBinary két párhuzamos serializációs path-ot futtat — **SGen** (compile-time generált, `[AcBinarySerializable]` típusokra) és **Runtime** (reflection + `Expression.Compile`). **Mindkettő ugyanazt a wire-t produkálja és olvassa** (interop garancia, `BINARY_SGEN.md` "Hybrid Execution Model").
+
+**Markerless body**: object scope-on belül a primitív property-k (int, long, double, …) **közvetlenül** írnak a wire-be, marker-byte nélkül. A reader a sorrendet compile-time schema-ból (SGen) vagy `OrderedProperties` metadata-ból (Runtime) tudja. A wire object-prefix-szel kezdődik (1-byte marker), majd markerless body.
+
+**Meglévő object-marker család** (`AcBinarySerializer.BinarySerializationContext.cs` writer-ek + `AcBinaryDeserializer.cs` reader-dispatch switch):
+- `Object` — sima first-occurrence
+- `ObjectWithTypeName` — polimorf (`runtimeType != declaredType`)
+- `ObjectFullMarkerIId` / `ObjectFullMarkerAll` — `RefHandling=IId|All` first-occurrence
+- `ObjectRef` / `ObjectRefIId` — subsequent (csak ID, **NEM duplikálódik** — nincs primitív property körülötte)
+
+**OPT-OUT minta** (jelenlegi konvenció): default SGen flexibilis — minden runtime-branch-et generál (pl. `if (context.UseRefHandling)`). Class-attribute disable-eli a feature-t → SGen omitti a branch-et → drasztikus optimum. Q5T2 ezt a mintát terjeszti ki **encoding-választásra**.
+
+**Naming-konvenció**: PascalCase, suffix-variánsok (`Object` → `ObjectVarUInt`, `String` → `StringUtf16`). NEM `Object_NoZZ`, NEM `ObjVU`.
+
+### Motiváció
+
+A jelenlegi `AcBinaryOptions.WireMode` (`Fast` vs `Compact`) **payload-szintű globális flag**:
+- A kódban sok `if (FastWire) { ... } else { ... }` branch (lásd `WriteVarInt` 514. sor, `WriteStringWithDispatch`, `WriteValueNonPrimitive`, property-writers)
+- A fejlesztő nem optimalizálhat granuláris szinten (pl. `[NoZZ]` egy hot type-ra, default másnak)
+- Schema-evolúciós szempontból: ha a szerver attribute-ot változtat egy type-on, a klienseknek (akár régebbi verzió) **rekomp nélkül** olvasniuk kell az új wire-t
+
+A `ACCORE-BIN-T-P3X7` profile-bench mérése szerint a kis-adat slowdown (Latin1Long Small +2.6%, Medium +1.5% AcBinary lassulás MemPack-hez képest) jelentős részben a VarUInt per-call overhead-ből származik (ZigZag shift + multi-byte branch loop). A type-szintű `[IntEncoding=VarUInt]` attribute-tal a fejlesztő a non-negative property-ket VarUInt-NoZigZag-ra állíthatja → ZigZag shift kiesik, kis-adatra mérhető nyereség.
+
+### Wire-formátum design
+
+**5 új `BinaryTypeCode` marker** (naming TBD: `*VarUInt` vagy `*NoZZ` suffix, implementációkor véglegesítendő):
+
+| Új marker | Cél | Alkalmazási hely |
+|---|---|---|
+| `ObjectVarUInt` | Object scope primitive int/long/enum-jai NoZigZag VarUInt encoding-ban | sima object first-occurrence |
+| `ObjectWithTypeNameVarUInt` | Polimorf first-occurrence NoZZ-variánsa | `runtimeType != declaredType` esetén |
+| `ObjectFullMarkerIIdVarUInt` | `RefHandling=IId` first-occurrence NoZZ-variánsa | csak first; subsequent `ObjectRefIId` változatlan |
+| `ObjectFullMarkerAllVarUInt` | `RefHandling=All` first-occurrence NoZZ-variánsa | csak first; subsequent `ObjectRef` változatlan |
+| `StringUtf16` | UTF-16 encoded string content (property-szintű) | bárhol egy string property emit-jénél |
+
+**Wire-példa**:
+```
+[ObjectVarUInt marker]                  ← scope-szintű: int-property-k VarUInt-NoZZ
+  WriteVarUInt(obj.Id)                   ← markerless body, encoding a marker alapján
+  WriteVarUInt(obj.Status)
+  [String marker] UTF-8(obj.Notes)        ← default UTF-8
+  [StringUtf16 marker] UTF-16(obj.Name)   ← property-szintű override
+```
+
+**Byte-szintű példa** (`Order { Id=42, Status=3, Notes="ok" }`, class-szintű `IntEncoding=VarUInt`):
+- Default ZigZag wire: `[Object]` `[0x54]` (VarInt 42 ZigZag: `((42<<1)^(42>>31))=84`) `[0x06]` (VarInt 3 ZigZag: 6) `[String]` `[0x02]` `0x6F 0x6B`
+- New VarUInt wire: `[ObjectVarUInt]` `[0x2A]` (VarUInt 42 raw: `0x2A`) `[0x03]` (VarUInt 3 raw: `0x03`) `[String]` `[0x02]` `0x6F 0x6B`
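+- A body-sorrend változatlan, és ebben a példában a byte-szám is (mindkét érték 1 byte-on elfér mindkét kódolásban); csak az encoding-szabályok mások. Általános esetben a NoZigZag VarUInt nemnegatív értékekre rövidebb is lehet, mert a ZigZag megduplázza az értéket. Stringek ugyanúgy markered (UTF-8 default itt). String-encoding override esetén `[StringUtf16]` `[char-count]` `[2-byte-per-char]`.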
+
+A primitive property-k körüli wire **markerless marad** — a body-encoding-ot az object-marker határozza meg, nem per-property byte. Wire-bloat csak ott van, ahol most is van marker (object-prefix, string-marker).
+
+### Attribute design
+
+**Object-szintű** (mert object-marker is object-szintű):
+```csharp
+[AcBinarySerializable(IntEncoding = IntEncoding.VarUInt)]
+public class Order { ... }
+```
+
+**Property-szintű** (csak string-en, mert string-marker is per-property):
+```csharp
+public class Order {
+    [AcBinaryEncoding(StringEncoding.Utf16)]
+    public string CustomerName { get; set; }
+}
+```
+
+**Új public API elemek**:
+- `AcBinaryEncodingAttribute` (target: `Class | Property`)
+- `IntEncoding` enum (`Default` = ZigZag VarInt, `VarUInt` = NoZigZag)
+- `StringEncoding` enum (`Default` = UTF-8, `Utf16` = UTF-16)
+- `AcBinaryOptions.IntEncoding` és `AcBinaryOptions.StringEncoding` runtime fallback opciók
+
+### Encoding-választás precedenciája (writer-side)
+
+1. **Property attribute** (legerősebb) — pl. `[AcBinaryEncoding(StringEncoding.Utf16)]`
+2. **Class attribute** — pl. `[AcBinarySerializable(IntEncoding=VarUInt)]`
+3. **`AcBinaryOptions` runtime opció** — pl. `options.StringEncoding = Utf16`
+4. **Built-in default** — ZigZag-VarInt + UTF-8
+
+### Szerepkörök és path-ok
+
+| Path | Encoding-választás |
+|---|---|
+| **SGen writer (with attribute)** | Compile-time pinned, hard-coded marker + encoding emit (NO runtime branch) — a meglévő OPT-OUT minta (mint `RefHandling`/`Interning` disable) |
+| **SGen writer (no attribute)** | Runtime branch a `context.IntEncoding`/`context.StringEncoding` option-en — két path generálódik, runtime dönt |
+| **SGen reader** | **Marker-dispatch** (NEM hard-coded marker-expect — runtime-on dönti el, hogy `Object` vagy `ObjectVarUInt` érkezett, és annak megfelelően olvas) |
+| **Runtime writer (reflection-based)** | Reflection-attribute-read + option fallback + default fallback — ugyanaz a precedencia mint SGen-nél |
+| **Runtime reader** | Marker-dispatch (universal — nincs attribute / option használat encoding-döntésre, csak a marker-byte) |
+
+⚠️ **SGen reader marker-dispatch KÖTELEZŐ** (NEM hard-coded marker-expect). Konkrét scenario amit ez kezel:
+
+> Szerver Runtime-mode-ban serializálja `Order`-t. Az `Order` osztályon a szerver-deploy óta **változott az attribute** (új deploy hozott `[IntEncoding=VarUInt]`-ot). Szerver Runtime writer reflection-ből olvassa az új attribute-ot → `ObjectVarUInt` markert emit-el a wire-be.
+>
+> Régi kliens **rekomp nélkül** kapja a payload-ot. Ha a kliens SGen reader-e hard-coded `Object`-marker-expect-tel olvasna → **panik / mismatch**.
+>
+> Marker-dispatch-szel a kliens helyesen dekódol bármelyik markert, függetlenül attól, hogy a kliens-oldali compile-time `Order` type-on volt-e az attribute.
+
+Ez biztosítja a **"server-side attribute-change doesn't break clients"** garanciát.
+
+### Kompatibilitási garanciák
+
+| Interakció | Eredmény |
+|---|---|
+| SGen-write (NoZZ attr) → SGen-read | OK (marker-dispatch) |
+| SGen-write (NoZZ attr) → Runtime-read | OK (marker-dispatch) |
+| Runtime-write (option=NoZZ) → SGen-read | OK (marker-dispatch) |
+| Runtime-write (option=NoZZ) → Runtime-read | OK (marker-dispatch) |
+| Server-attribute-changed → old client (no recompile) | OK — kliens csak a marker-t olvassa |
+| Mixed payload (egyik object NoZZ, másik default) | OK — minden object-marker önálló scope |
+
+### Implementációs lépések
+
+1. **`BinaryTypeCode` const-bővítés** — 5 új byte-érték (range-allokáció: a meglévő enum szervezése alapján a következő szabad slot-okba). Wire-format spec frissítés `BINARY_FORMAT.md`-ben.
+2. **`AcBinaryEncodingAttribute` + `IntEncoding` + `StringEncoding` enum-ok** — új fájlok az `AyCode.Core/Serializers/Binaries/` mappában.
+3. **`AcBinaryOptions.IntEncoding` + `AcBinaryOptions.StringEncoding`** opciók hozzáadása (default = `Default`).
+4. **`WriteStringUtf16` / `ReadStringUtf16` context-helper-ek** — `MemoryMarshal.Cast<char,byte>` direct copy + length-prefix (VarUInt char-count).
+5. **Runtime writer reflection** — `BinarySerializeTypeMetadata` cache: `IntEncoding`, `StringEncoding`-per-property flag-ek (attribute-alapján). Encoding-emit a precedencia szerint.
+6. **SGen writer template** — attribute-feldolgozás `EmitWriteValue`-ban: ha attribute → compile-time hard-coded emit; ha nincs → runtime-branch emit a `context` option-en.
+7. **SGen reader template** — `EmitReadValue` marker-dispatch-szel (object-marker scope-encoding-mode tracking + string-marker per-property dispatch).
+8. **Runtime reader update** — object-marker dispatch a scope-encoding-state-be (pl. `BinaryDeserializationContext.CurrentIntEncoding`), string-marker per-property dispatch.
+9. **Cross-mode tesztek** — minden write-read kombináció (SGen↔SGen, SGen↔Runtime, Runtime↔SGen, Runtime↔Runtime) minden encoding-kombinációban (default, attr-only, option-only, attr+option, mixed payload).
+10. **Doc**: `BINARY_FORMAT.md` wire-format spec, `BINARY_OPTIONS.md` új opciók, `BINARY_SGEN.md` precedencia + szerepkörök táblázat.
+
+### Acceptance
+
+- 5 új BinaryTypeCode marker, naming-konvenció dokumentált
+- `AcBinaryEncodingAttribute` + 2 enum + 2 opció extension working
+- Round-trip teszt minden cross-mode kombinációban zöld
+- Wire-bloat default-encoding-on **0 byte** (nincs új per-property marker)
+- Latin1Long Small bench: AcBinary `[IntEncoding=VarUInt]` típuson a slowdown ≤ MemPack +0.5pp (jelenleg +2.6%)
+- `BINARY_FORMAT.md`/`BINARY_OPTIONS.md`/`BINARY_SGEN.md` szinkronban a wire- és attribute-világgal
+- A meglévő `WireMode=Fast/Compact` distinction-ek kompatibilisek maradnak (vagy migrálódnak az új encoding-attribute-okra — külön döntés implementációkor)
+
+### Trigger / Sorrend
+
+Implementáció **ne kezdődjön** azonnal — az `ACCORE-BIN-T-P3X7` A+F szekciói (SGen ensure-batching + HasPropertyFilter lift-out) **előbb mérendők**. Ha az A+F már lehozza a SGen `WriteProperties` Self CPU-t ≤ 8%-ra, és a kis-adat slowdown ettől már ≤ +1pp, akkor ez a Q5T2 entry **alacsony prioritásra** kerül. Ha a kis-adat slowdown az A+F után is megmarad → Q5T2 implementáció **érdemi**.
+
+Egyéb prerekvizit: `ACCORE-BIN-T-W9F1` (compile-time metadata) szinkronizálás — a Runtime writer reflection-attribute-read-je beleilleszthető a generált metadata-ba, ezzel a runtime path is gyorsabb attribute-alapú encoding-választás-on.
+
+### Open kérdések (implementációkor eldöntendő)
+
+- **Marker naming**: `ObjectVarUInt` (semantic, az encoding alapján) vagy `ObjectNoZZ` (rövidebb)?
+- **`[AcBinarySerializable]`-on belül** vegyük fel a `IntEncoding` paramétert, vagy **külön `[AcBinaryEncoding]` attribute** legyen object-szinten is (és a `[AcBinarySerializable]` változatlan)?
+- **`AcBinaryOptions.WireMode` jövője**: a régi `Fast`/`Compact` enum migrálódjon az új `IntEncoding`/`StringEncoding`-ra (BC-break) vagy maradjon mint shortcut-default?
+