diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index 2e79706..db427c3 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -68,7 +68,12 @@
"Bash(curl -s \"https://raw.githubusercontent.com/dotnet/runtime/main/src/libraries/System.IO.Pipelines/src/System/IO/Pipelines/StreamPipeWriter.cs\")",
"WebFetch(domain:lemire.me)",
"Bash(gh pr *)",
- "Bash(gh api *)"
+ "Bash(gh api *)",
+ "Bash(ls -la 'C:\\\\Users\\\\Fullepi\\\\Downloads\\\\_baseline\\\\cpuprofiler' 2>&1 | head -30)",
+ "Bash(where PerfView.exe)",
+ "Bash(where dotnet-trace *)",
+ "Bash(dotnet tool *)",
+ "Bash(dotnet-trace convert *)"
]
}
}
diff --git a/AyCode.Core.Serializers.Console/Program.cs b/AyCode.Core.Serializers.Console/Program.cs
index 19f0b5a..b1b7907 100644
--- a/AyCode.Core.Serializers.Console/Program.cs
+++ b/AyCode.Core.Serializers.Console/Program.cs
@@ -161,6 +161,24 @@ public static class Program
$"Compression={options.UseCompression}{extra}";
}
+ /// <summary>
+ /// Returns MemoryPack serializer options aligned with <see cref="SelectedWireMode"/> for a fair
+ /// apples-to-apples wire-format comparison:
+ /// <list type="bullet">
+ /// <item>Any mode other than <see cref="WireMode.Fast"/> → <see cref="MemoryPackSerializerOptions.Default"/> (UTF-8) — both
+ /// engines encode UTF-8, comparison is purely about header / tier / dispatch overhead.</item>
+ /// <item><see cref="WireMode.Fast"/> → <see cref="MemoryPackSerializerOptions.Utf16"/> (UTF-16 raw memcpy) —
+ /// both engines write UTF-16 raw bytes, so wire-size and CPU comparison reflect the same string-encoding family.</item>
+ /// </list>
+ /// Without this alignment the FastWire vs MemPack-default comparison conflates two unrelated dimensions
+ /// (UTF-16 raw vs UTF-8 encoded) and produces a misleading +40% wire-size delta that is structurally
+ /// the encoding-family difference, NOT an AcBinary-specific overhead.
+ /// </summary>
+ private static MemoryPackSerializerOptions GetMemPackOptions() =>
+ SelectedWireMode == WireMode.Fast
+ ? MemoryPackSerializerOptions.Utf16
+ : MemoryPackSerializerOptions.Default;
+
///
/// Converts a total-time (in ms across ) into per-operation microseconds.
/// Formula: totalMs / iterations × 1000. The benchmark stores *TimeMs as the cumulative
@@ -185,6 +203,63 @@ public static class Program
private static double DesPerOp(BenchmarkResult r) => ToPerOpMicros(r.DeserializeTimeMs, r.DeserializeIterations);
private static double RtPerOp(BenchmarkResult r) => ToPerOpMicros(r.RoundTripTimeMs, r.RoundTripIterations);
+ /// <summary>
+ /// Per-cell-paired aggregation of an overall comparison. Captures three different aggregation
+ /// strategies so the reader can judge whether the headline delta is dominated by one large cell
+ /// (arithmetic mean) or representative of typical workload (geometric mean / median).
+ /// </summary>
+ /// <param name="ArithMeanPct">Percent delta between the arithmetic means (AcBinary vs MemPack) — magnitude-weighted; biased toward Large cell.</param>
+ /// <param name="GeoMeanPct">Percent delta from the geometric mean of per-cell ratios — magnitude-neutral; each cell weighted equally.</param>
+ /// <param name="MedianPct">Percent delta from the median of per-cell ratios — outlier-resistant.</param>
+ /// <param name="AcAvg">Arithmetic mean AcBinary value (µs/op or bytes).</param>
+ /// <param name="MpAvg">Arithmetic mean MemPack value (same unit as <paramref name="AcAvg"/>).</param>
+ /// <param name="CellCount">Number of paired cells contributing to the geo/median.</param>
+ private record OverallStats(double ArithMeanPct, double GeoMeanPct, double MedianPct, double AcAvg, double MpAvg, int CellCount);
+
+ /// <summary>
+ /// Computes arithmetic + geometric + median aggregation of an AcBinary-vs-MemPack comparison
+ /// across paired cells (joined by TestDataName). Per-cell pairing is required for the
+ /// geo/median variants — a cell missing on either side, or whose value is not strictly positive, is dropped from all stats.
+ /// </summary>
+ /// <returns>The aggregated stats, or <c>null</c> when no paired cell has a valid value.</returns>
+ private static OverallStats? ComputeOverallStats(
+ List<BenchmarkResult> acResults,
+ List<BenchmarkResult> mpResults,
+ Func<BenchmarkResult, double> getValue)
+ {
+ if (acResults.Count == 0 || mpResults.Count == 0) return null;
+
+ var pairs = (from ac in acResults
+ join mp in mpResults on ac.TestDataName equals mp.TestDataName
+ let acV = getValue(ac)
+ let mpV = getValue(mp)
+ where acV > 0 && mpV > 0
+ select (ac: acV, mp: mpV)).ToList();
+
+ if (pairs.Count == 0) return null;
+
+ var acAvg = pairs.Average(p => p.ac);
+ var mpAvg = pairs.Average(p => p.mp);
+ var ratios = pairs.Select(p => p.ac / p.mp).ToList();
+
+ // Geometric mean: exp(avg(ln(ratios))) — numerically stable vs Π ratios then ^(1/N).
+ var geoMean = Math.Exp(ratios.Sum(Math.Log) / ratios.Count);
+
+ // Median (paired-ratio): for even N use the midpoint of the two middle values.
+ var sorted = ratios.OrderBy(r => r).ToList();
+ var median = sorted.Count % 2 == 1
+ ? sorted[sorted.Count / 2]
+ : (sorted[sorted.Count / 2 - 1] + sorted[sorted.Count / 2]) / 2.0;
+
+ return new OverallStats(
+ ArithMeanPct: (acAvg / mpAvg - 1) * 100,
+ GeoMeanPct: (geoMean - 1) * 100,
+ MedianPct: (median - 1) * 100,
+ AcAvg: acAvg,
+ MpAvg: mpAvg,
+ CellCount: ratios.Count);
+ }
+
///
/// Formats a per-op micros value with its inter-sample range and CV-threshold marker as
/// "26.86 (24.5..29.1)" or "26.86 (24.5..29.1) ⚠️5.2%". Median first, range in parentheses,
@@ -1452,6 +1527,7 @@ public static class Program
private sealed class MemoryPackBenchmark : ISerializerBenchmark
{
private readonly TestOrder _order;
+ private readonly MemoryPackSerializerOptions _options;
private readonly byte[] _serialized;
public string Engine => EngineMemoryPack;
@@ -1461,12 +1537,14 @@ public static class Program
public int SerializedSize => _serialized.Length;
public long SetupSerializeAllocBytes => 0;
public long SetupDeserializeAllocBytes => 0;
+ public string? OptionsDescription => $"StringEncoding={_options.StringEncoding}";
public MemoryPackBenchmark(TestOrder order, string optionsPreset)
{
_order = order;
OptionsPreset = optionsPreset;
- _serialized = MemoryPackSerializer.Serialize(order);
+ _options = GetMemPackOptions();
+ _serialized = MemoryPackSerializer.Serialize(order, _options);
}
public void Warmup(int iterations)
@@ -1479,15 +1557,15 @@ public static class Program
}
[MethodImpl(MethodImplOptions.NoInlining)]
- public void Serialize() => MemoryPackSerializer.Serialize(_order);
+ public void Serialize() => MemoryPackSerializer.Serialize(_order, _options);
[MethodImpl(MethodImplOptions.NoInlining)]
- public void Deserialize() => MemoryPackSerializer.Deserialize(_serialized);
+ public void Deserialize() => MemoryPackSerializer.Deserialize(_serialized, _options);
public bool VerifyRoundTrip()
{
- var bytes = MemoryPackSerializer.Serialize(_order);
- var roundTripped = MemoryPackSerializer.Deserialize(bytes);
+ var bytes = MemoryPackSerializer.Serialize(_order, _options);
+ var roundTripped = MemoryPackSerializer.Deserialize(bytes, _options);
return DeepEqualsViaJson(_order, roundTripped);
}
}
@@ -2422,6 +2500,7 @@ public static class Program
private sealed class MemoryPackFreshBufferWriterBenchmark : ISerializerBenchmark
{
private readonly TestOrder _order;
+ private readonly MemoryPackSerializerOptions _options;
private readonly byte[] _serialized;
public string Engine => EngineMemoryPack;
@@ -2431,12 +2510,14 @@ public static class Program
public int SerializedSize => _serialized.Length;
public long SetupSerializeAllocBytes => 0;
public long SetupDeserializeAllocBytes => 0;
+ public string? OptionsDescription => $"StringEncoding={_options.StringEncoding}";
public MemoryPackFreshBufferWriterBenchmark(TestOrder order, string optionsPreset)
{
_order = order;
OptionsPreset = optionsPreset;
- _serialized = MemoryPackSerializer.Serialize(order);
+ _options = GetMemPackOptions();
+ _serialized = MemoryPackSerializer.Serialize(order, _options);
}
public void Warmup(int iterations)
@@ -2452,17 +2533,17 @@ public static class Program
public void Serialize()
{
var abw = new ArrayBufferWriter();
- MemoryPackSerializer.Serialize(abw, _order);
+ MemoryPackSerializer.Serialize(abw, _order, _options);
}
[MethodImpl(MethodImplOptions.NoInlining)]
- public void Deserialize() => MemoryPackSerializer.Deserialize(_serialized);
+ public void Deserialize() => MemoryPackSerializer.Deserialize(_serialized, _options);
public bool VerifyRoundTrip()
{
var abw = new ArrayBufferWriter();
- MemoryPackSerializer.Serialize(abw, _order);
- var roundTripped = MemoryPackSerializer.Deserialize(abw.WrittenSpan.ToArray());
+ MemoryPackSerializer.Serialize(abw, _order, _options);
+ var roundTripped = MemoryPackSerializer.Deserialize(abw.WrittenSpan.ToArray(), _options);
return DeepEqualsViaJson(_order, roundTripped);
}
}
@@ -2535,6 +2616,7 @@ public static class Program
private sealed class MemoryPackBufferWriterBenchmark : ISerializerBenchmark
{
private readonly TestOrder _order;
+ private readonly MemoryPackSerializerOptions _options;
private readonly byte[] _serialized;
private readonly ArrayBufferWriter _bufferWriter;
@@ -2545,12 +2627,14 @@ public static class Program
public int SerializedSize => _serialized.Length;
public long SetupSerializeAllocBytes { get; }
public long SetupDeserializeAllocBytes => 0;
+ public string? OptionsDescription => $"StringEncoding={_options.StringEncoding}";
public MemoryPackBufferWriterBenchmark(TestOrder order, string optionsPreset)
{
_order = order;
OptionsPreset = optionsPreset;
- _serialized = MemoryPackSerializer.Serialize(order);
+ _options = GetMemPackOptions();
+ _serialized = MemoryPackSerializer.Serialize(order, _options);
// Serialize-side setup only — see AcBinaryBufferWriterBenchmark for the full rationale.
GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
@@ -2573,17 +2657,17 @@ public static class Program
public void Serialize()
{
_bufferWriter.ResetWrittenCount();
- MemoryPackSerializer.Serialize(_bufferWriter, _order);
+ MemoryPackSerializer.Serialize(_bufferWriter, _order, _options);
}
[MethodImpl(MethodImplOptions.NoInlining)]
- public void Deserialize() => MemoryPackSerializer.Deserialize(_serialized);
+ public void Deserialize() => MemoryPackSerializer.Deserialize(_serialized, _options);
public bool VerifyRoundTrip()
{
_bufferWriter.ResetWrittenCount();
- MemoryPackSerializer.Serialize(_bufferWriter, _order);
- var roundTripped = MemoryPackSerializer.Deserialize(_bufferWriter.WrittenSpan.ToArray());
+ MemoryPackSerializer.Serialize(_bufferWriter, _order, _options);
+ var roundTripped = MemoryPackSerializer.Deserialize(_bufferWriter.WrittenSpan.ToArray(), _options);
return DeepEqualsViaJson(_order, roundTripped);
}
}
@@ -2932,63 +3016,62 @@ public static class Program
// All averages are over per-op µs (iter-independent). Batch-time averaging would mix rows
// measured with different iter counts (post-calibration), producing meaningless numbers.
- var memPackAvgSer = memPackSerResults.Count > 0 ? memPackSerResults.Average(r => SerPerOp(r)) : 0;
- var memPackAvgDes = memPackDesResults.Average(r => DesPerOp(r));
- var memPackAvgRt = memPackRtResults.Average(r => RtPerOp(r));
- var memPackAvgSize = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray)).Average(r => r.SerializedSize);
- var memPackAvgSerAlloc = memPackSerResults.Count > 0 ? memPackSerResults.Average(r => r.SerializeAllocBytesPerOp) : 0;
- var memPackAvgDesAlloc = memPackDesResults.Count > 0 ? memPackDesResults.Average(r => r.DeserializeAllocBytesPerOp) : 0;
+ // Three aggregations per metric:
+ // - Arithmetic mean (current behavior) — magnitude-weighted, biased toward Large cell.
+ // - Geometric mean of per-cell ratios — magnitude-neutral, each cell weighted equally.
+ // - Median of per-cell ratios — outlier-resistant.
+ // The geo/median variants surface when a single cell dominates the arithmetic average
+ // (typical when one cell's µs-per-op is an order of magnitude larger than the others).
+ var sizeAcResults = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen)).ToList();
+ var sizeMpResults = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray)).ToList();
- var acBinaryAvgSer = acBinarySerResults.Count > 0 ? acBinarySerResults.Average(r => SerPerOp(r)) : 0;
- var acBinaryAvgDes = acBinaryDesResults.Average(r => DesPerOp(r));
- var acBinaryAvgRt = acBinaryRtResults.Average(r => RtPerOp(r));
- var acBinaryAvgSize = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen)).Average(r => r.SerializedSize);
- var acBinaryAvgSerAlloc = acBinarySerResults.Count > 0 ? acBinarySerResults.Average(r => r.SerializeAllocBytesPerOp) : 0;
- var acBinaryAvgDesAlloc = acBinaryDesResults.Count > 0 ? acBinaryDesResults.Average(r => r.DeserializeAllocBytesPerOp) : 0;
+ var serStats = ComputeOverallStats(acBinarySerResults, memPackSerResults, SerPerOp);
+ var desStats = ComputeOverallStats(acBinaryDesResults, memPackDesResults, DesPerOp);
+ var rtStats = ComputeOverallStats(acBinaryRtResults, memPackRtResults, RtPerOp);
+ var sizeStats = ComputeOverallStats(sizeAcResults, sizeMpResults, r => r.SerializedSize);
+ var serAllocStats = ComputeOverallStats(acBinarySerResults, memPackSerResults, r => r.SerializeAllocBytesPerOp);
+ var desAllocStats = ComputeOverallStats(acBinaryDesResults, memPackDesResults, r => r.DeserializeAllocBytesPerOp);
System.Console.WriteLine();
System.Console.WriteLine($"── {"AcBinary (Byte[], SGen)"} vs {"MemoryPack (Byte[])"} (Overall) ──");
- // Only show serialize comparison if data available
- if (memPackAvgSer > 0 && acBinaryAvgSer > 0)
- {
- var serPctAll = (acBinaryAvgSer / memPackAvgSer - 1) * 100;
- System.Console.ForegroundColor = serPctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
- System.Console.WriteLine($" Serialize: {serPctAll:+0;-0}% ({acBinaryAvgSer:F2} µs/op vs {memPackAvgSer:F2} µs/op)");
- System.Console.ResetColor();
- }
+ WriteOverallLine("Serialize", "µs/op", serStats);
+ WriteOverallLine("Deserialize", "µs/op", desStats);
+ WriteOverallLine("Round-trip", "µs/op", rtStats);
+ WriteOverallLine("Size", "B", sizeStats, "F0");
+ WriteOverallLine("Ser Alloc", "B/op", serAllocStats, "F0");
+ WriteOverallLine("Des Alloc", "B/op", desAllocStats, "F0");
+ }
- var desPctAll = (acBinaryAvgDes / memPackAvgDes - 1) * 100;
- var rtPctAll = (acBinaryAvgRt / memPackAvgRt - 1) * 100;
- var sizePctAll = (acBinaryAvgSize / memPackAvgSize - 1) * 100;
+ /// <summary>
+ /// Formats a signed percent delta with explicit sign for positive values (`+1.5%`, `-3.0%`, `0.0%`).
+ /// The numeric part is left-padded to 6 chars and followed by `%`, i.e. 7 chars total (e.g. ` +12.3%`, `-100.0%`), for column alignment in the Overall block.
+ /// </summary>
+ private static string FormatPctSigned(double pct) => pct.ToString("+0.0;-0.0;0.0", System.Globalization.CultureInfo.InvariantCulture).PadLeft(6) + "%";
- System.Console.ForegroundColor = desPctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
- System.Console.WriteLine($" Deserialize: {desPctAll:+0;-0}% ({acBinaryAvgDes:F2} µs/op vs {memPackAvgDes:F2} µs/op)");
+ /// <summary>
+ /// Renders one Overall row with arith / geo / median deltas + AcBinary/MemPack absolute means.
+ /// Color is driven by the geometric-mean delta (magnitude-neutral signal). Skips silently when
+ /// <paramref name="stats"/> is null (no paired data).
+ /// </summary>
+ private static void WriteOverallLine(string label, string unit, OverallStats? stats, string fmt = "F2")
+ {
+ if (stats == null) return;
+ // Color follows geo-mean (the magnitude-neutral signal). The arith-mean column may show a
+ // different sign when a single big cell dominates — that's exactly the signal we want to surface.
+ System.Console.ForegroundColor = stats.GeoMeanPct <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
+ System.Console.WriteLine($" {label,-12} arith {FormatPctSigned(stats.ArithMeanPct)} │ geo {FormatPctSigned(stats.GeoMeanPct)} │ median {FormatPctSigned(stats.MedianPct)} ({stats.AcAvg.ToString(fmt, System.Globalization.CultureInfo.InvariantCulture)} {unit} vs {stats.MpAvg.ToString(fmt, System.Globalization.CultureInfo.InvariantCulture)} {unit}, {stats.CellCount} cells)");
System.Console.ResetColor();
+ }
- System.Console.ForegroundColor = rtPctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
- System.Console.WriteLine($" Round-trip: {rtPctAll:+0;-0}% ({acBinaryAvgRt:F2} µs/op vs {memPackAvgRt:F2} µs/op)");
- System.Console.ResetColor();
-
- System.Console.ForegroundColor = sizePctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
- System.Console.WriteLine($" Size: {sizePctAll:+0;-0}% ({acBinaryAvgSize:F0} B vs {memPackAvgSize:F0} B)");
- System.Console.ResetColor();
-
- // Allocation comparison: byte[] API allocates the output array on both sides — delta shows serializer-overhead diff.
- if (memPackAvgSerAlloc > 0 && acBinaryAvgSerAlloc > 0)
- {
- var serAllocPct = (acBinaryAvgSerAlloc / memPackAvgSerAlloc - 1) * 100;
- System.Console.ForegroundColor = serAllocPct <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
- System.Console.WriteLine($" Ser Alloc: {serAllocPct:+0;-0}% ({acBinaryAvgSerAlloc:F0} B/op vs {memPackAvgSerAlloc:F0} B/op)");
- System.Console.ResetColor();
- }
- if (memPackAvgDesAlloc > 0 && acBinaryAvgDesAlloc > 0)
- {
- var desAllocPct = (acBinaryAvgDesAlloc / memPackAvgDesAlloc - 1) * 100;
- System.Console.ForegroundColor = desAllocPct <= 0 ? ConsoleColor.Green : ConsoleColor.Red;
- System.Console.WriteLine($" Des Alloc: {desAllocPct:+0;-0}% ({acBinaryAvgDesAlloc:F0} B/op vs {memPackAvgDesAlloc:F0} B/op)");
- System.Console.ResetColor();
- }
+ /// <summary>
+ /// Same as <see cref="WriteOverallLine"/> but appends to a <see cref="StringBuilder"/> (no color).
+ /// Used by the .log and .LLM file writers.
+ /// </summary>
+ private static void AppendOverallLine(StringBuilder sb, string label, string unit, OverallStats? stats, string fmt = "F2")
+ {
+ if (stats == null) return;
+ sb.AppendLine($" {label,-12} arith {FormatPctSigned(stats.ArithMeanPct)} | geo {FormatPctSigned(stats.GeoMeanPct)} | median {FormatPctSigned(stats.MedianPct)} ({stats.AcAvg.ToString(fmt, System.Globalization.CultureInfo.InvariantCulture)} {unit} vs {stats.MpAvg.ToString(fmt, System.Globalization.CultureInfo.InvariantCulture)} {unit}, {stats.CellCount} cells)");
}
private static void SaveResults(List results, List testDataSets)
@@ -3143,39 +3226,17 @@ public static class Program
return;
}
- if (memPackSerResults2.Count > 0 && acBinarySerResults2.Count > 0)
- {
- // Per-op µs averages (iter-independent) — see comment above the parallel block in PrintSummary.
- var memPackAvgSer2 = memPackSerResults2.Average(r => SerPerOp(r));
- var acBinaryAvgSer2 = acBinarySerResults2.Average(r => SerPerOp(r));
- var memPackAvgSerAlloc2 = memPackSerResults2.Average(r => r.SerializeAllocBytesPerOp);
- var acBinaryAvgSerAlloc2 = acBinarySerResults2.Average(r => r.SerializeAllocBytesPerOp);
- sb.AppendLine($" Serialize: {((acBinaryAvgSer2 / memPackAvgSer2 - 1) * 100):+0;-0}% ({acBinaryAvgSer2:F2} µs/op vs {memPackAvgSer2:F2} µs/op)");
- if (memPackAvgSerAlloc2 > 0)
- sb.AppendLine($" Ser Alloc: {((acBinaryAvgSerAlloc2 / memPackAvgSerAlloc2 - 1) * 100):+0;-0}% ({acBinaryAvgSerAlloc2:F0} B/op vs {memPackAvgSerAlloc2:F0} B/op)");
- }
+ // Per-cell-paired aggregation: arithmetic / geometric / median. See PrintSummary's parallel
+ // block + the OverallStats record for the rationale (per-cell ratio vs magnitude-weighted mean).
+ var sizeAcResults2 = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen)).ToList();
+ var sizeMpResults2 = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray)).ToList();
- if (memPackDesResults2.Count > 0 && acBinaryDesResults2.Count > 0)
- {
- var memPackAvgDes2 = memPackDesResults2.Average(r => DesPerOp(r));
- var acBinaryAvgDes2 = acBinaryDesResults2.Average(r => DesPerOp(r));
- var memPackAvgDesAlloc2 = memPackDesResults2.Average(r => r.DeserializeAllocBytesPerOp);
- var acBinaryAvgDesAlloc2 = acBinaryDesResults2.Average(r => r.DeserializeAllocBytesPerOp);
- sb.AppendLine($" Deserialize: {((acBinaryAvgDes2 / memPackAvgDes2 - 1) * 100):+0;-0}% ({acBinaryAvgDes2:F2} µs/op vs {memPackAvgDes2:F2} µs/op)");
- if (memPackAvgDesAlloc2 > 0)
- sb.AppendLine($" Des Alloc: {((acBinaryAvgDesAlloc2 / memPackAvgDesAlloc2 - 1) * 100):+0;-0}% ({acBinaryAvgDesAlloc2:F0} B/op vs {memPackAvgDesAlloc2:F0} B/op)");
- }
-
- if (memPackRtResults2.Count > 0 && acBinaryRtResults2.Count > 0)
- {
- var memPackAvgRt2 = memPackRtResults2.Average(r => RtPerOp(r));
- var acBinaryAvgRt2 = acBinaryRtResults2.Average(r => RtPerOp(r));
- sb.AppendLine($" Round-trip: {((acBinaryAvgRt2 / memPackAvgRt2 - 1) * 100):+0;-0}% ({acBinaryAvgRt2:F2} µs/op vs {memPackAvgRt2:F2} µs/op)");
- }
-
- var memPackAvgSize2 = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray)).Average(r => r.SerializedSize);
- var acBinaryAvgSize2 = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen)).Average(r => r.SerializedSize);
- sb.AppendLine($" Size: {((acBinaryAvgSize2 / memPackAvgSize2 - 1) * 100):+0;-0}% ({acBinaryAvgSize2:F0} B vs {memPackAvgSize2:F0} B)");
+ AppendOverallLine(sb, "Serialize", "µs/op", ComputeOverallStats(acBinarySerResults2, memPackSerResults2, SerPerOp));
+ AppendOverallLine(sb, "Ser Alloc", "B/op", ComputeOverallStats(acBinarySerResults2, memPackSerResults2, r => r.SerializeAllocBytesPerOp), "F0");
+ AppendOverallLine(sb, "Deserialize", "µs/op", ComputeOverallStats(acBinaryDesResults2, memPackDesResults2, DesPerOp));
+ AppendOverallLine(sb, "Des Alloc", "B/op", ComputeOverallStats(acBinaryDesResults2, memPackDesResults2, r => r.DeserializeAllocBytesPerOp), "F0");
+ AppendOverallLine(sb, "Round-trip", "µs/op", ComputeOverallStats(acBinaryRtResults2, memPackRtResults2, RtPerOp));
+ AppendOverallLine(sb, "Size", "B", ComputeOverallStats(sizeAcResults2, sizeMpResults2, r => r.SerializedSize), "F0");
File.WriteAllText(logFilePath, sb.ToString(), Utf8NoBom);
System.Console.WriteLine($"✓ Results saved to: {logFilePath}");
@@ -3253,6 +3314,36 @@ public static class Program
}
}
+ // Overall AcBinary (SGen, Byte[]) vs MemoryPack (Byte[]) comparison — same three aggregations
+ // as the .log / console output (arithmetic / geometric / median of per-cell ratios). The
+ // arith mean is magnitude-weighted (Large cell dominates); geo/median are per-cell-equal
+ // signals. Adding this lets an LLM diagnose whether a headline delta is a real overall
+ // win/loss or a single-cell artifact.
+ var memPackByteArrayResults = results.Where(r => r.Engine == EngineMemoryPack && r.IoMode == IoByteArray).ToList();
+ var acBinarySGenByteArrayResults = results.Where(r => r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen).ToList();
+ var memPackSerResultsLlm = memPackByteArrayResults.Where(r => r.SerializeTimeMs > 0).ToList();
+ var memPackDesResultsLlm = memPackByteArrayResults.Where(r => r.DeserializeTimeMs > 0).ToList();
+ var memPackRtResultsLlm = memPackByteArrayResults.Where(r => r.RoundTripTimeMs > 0).ToList();
+ var acBinarySerResultsLlm = acBinarySGenByteArrayResults.Where(r => r.SerializeTimeMs > 0).ToList();
+ var acBinaryDesResultsLlm = acBinarySGenByteArrayResults.Where(r => r.DeserializeTimeMs > 0).ToList();
+ var acBinaryRtResultsLlm = acBinarySGenByteArrayResults.Where(r => r.RoundTripTimeMs > 0).ToList();
+
+ if (memPackRtResultsLlm.Count > 0 && acBinaryRtResultsLlm.Count > 0)
+ {
+ sb.AppendLine();
+ sb.AppendLine("## Overall: AcBinary (Byte[], SGen) vs MemoryPack (Byte[])");
+ sb.AppendLine();
+ sb.AppendLine("Three aggregations of per-cell results: **arith** = arithmetic mean of µs/op (magnitude-weighted, Large cell dominates); **geo** = geometric mean of per-cell ratios (each cell weighted equally); **median** = median of per-cell ratios (outlier-resistant). Negative % = AcBinary faster/smaller; positive % = MemPack faster/smaller. The geo/median variants surface when a single big cell skews the arithmetic mean.");
+ sb.AppendLine();
+ sb.AppendLine("```");
+ AppendOverallLine(sb, "Serialize", "µs/op", ComputeOverallStats(acBinarySerResultsLlm, memPackSerResultsLlm, SerPerOp));
+ AppendOverallLine(sb, "Ser Alloc", "B/op", ComputeOverallStats(acBinarySerResultsLlm, memPackSerResultsLlm, r => r.SerializeAllocBytesPerOp), "F0");
+ AppendOverallLine(sb, "Deserialize", "µs/op", ComputeOverallStats(acBinaryDesResultsLlm, memPackDesResultsLlm, DesPerOp));
+ AppendOverallLine(sb, "Des Alloc", "B/op", ComputeOverallStats(acBinaryDesResultsLlm, memPackDesResultsLlm, r => r.DeserializeAllocBytesPerOp), "F0");
+ AppendOverallLine(sb, "Round-trip", "µs/op", ComputeOverallStats(acBinaryRtResultsLlm, memPackRtResultsLlm, RtPerOp));
+ AppendOverallLine(sb, "Size", "B", ComputeOverallStats(acBinarySGenByteArrayResults, memPackByteArrayResults, r => r.SerializedSize), "F0");
+ sb.AppendLine("```");
+ }
File.WriteAllText(filePath, sb.ToString(), Utf8NoBom);
System.Console.WriteLine($"✓ LLM results saved to: {filePath}");
diff --git a/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.cs b/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.cs
index 31137b5..09b6611 100644
--- a/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.cs
+++ b/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.cs
@@ -18,6 +18,32 @@ public class AcBinarySourceGenerator : IIncrementalGenerator
{
private const string AttributeName = "AyCode.Core.Serializers.Attributes.AcBinarySerializableAttribute";
+ // ────────────────────────────────────────────────────────────────────────────────────────────
+ // TEMPORARY (2026-05-08) — A/B test feature gates for hot-path overhead measurement.
+ //
+ // The generated SGen `WriteProperties` / `ScanObject` methods emit two kinds of overhead-blocks
+ // that are unconditionally present today but rarely exercised in typical workloads:
+ //
+ // 1. PropertyFilter guard (`UsePropertyFilter`) — every non-markerless property emit-site
+ // checks `context.HasPropertyFilter` + filter-context allocation + lambda-call.
+ // The benchmark workload never sets a property-filter → branch is always false →
+ // pure overhead (CPU cycles + i-cache pressure on the hot path).
+ //
+ // 2. Polymorphic object-with-type-name emit (`UsePolymorphType`) — `System.Object` declared
+ // properties emit `ObjectWithTypeName` marker + `WriteStringUtf8(AssemblyQualifiedName)`
+ // under `!context.UseMetadata`. Same: rarely used in typical DTO graphs.
+ //
+ // Setting either to `false` skips the corresponding emit at compile time → leaner generated
+ // code. The bench measures the actual delta vs MemPack apples-to-apples (which has neither
+ // of these features).
+ //
+ // Long-term: these flags will move to `[AcBinarySerializable(UsePropertyFilter = false, ...)]`
+ // attribute properties so consumers can opt out per type. Until then, `false` is for benchmark-vs-MemPack
+ // measurements ONLY: the generated code then emits no property-filter guards and no polymorphic type names, so both MUST be `true` for production builds that need those features.
+ // ────────────────────────────────────────────────────────────────────────────────────────────
+ private const bool UsePropertyFilter = false;
+ private const bool UsePolymorphType = false;
+
private static readonly DiagnosticDescriptor CircularReferenceWarning = new(
id: "ACBIN001",
title: "Circular reference detected",
@@ -672,16 +698,21 @@ public class AcBinarySourceGenerator : IIncrementalGenerator
}
// All non-markerless properties: emit PropertyFilter guard
- // When filter returns false, write PropertySkip and skip the property write
- sb.AppendLine($"{i}if (context.HasPropertyFilter)");
- sb.AppendLine($"{i}{{");
- sb.AppendLine($"{i} var fc_{p.Name} = new BinaryPropertyFilterContext(obj, typeof({fullTypeName}), \"{p.Name}\", typeof({p.TypeNameForTypeof}), static o => (({fullTypeName})o).{p.Name});");
- sb.AppendLine($"{i} if (!context.PropertyFilter!(in fc_{p.Name}))");
- sb.AppendLine($"{i} {{");
- sb.AppendLine($"{i} context.WriteByte(BinaryTypeCode.PropertySkip);");
- sb.AppendLine($"{i} goto skip_{p.Name};");
- sb.AppendLine($"{i} }}");
- sb.AppendLine($"{i}}}");
+ // When filter returns false, write PropertySkip and skip the property write.
+ // Gated by `UsePropertyFilter` (TEMPORARY const) — `false` skips emit entirely → leaner
+ // generated code on benchmark workloads where no property-filter is ever set.
+ if (UsePropertyFilter)
+ {
+ sb.AppendLine($"{i}if (context.HasPropertyFilter)");
+ sb.AppendLine($"{i}{{");
+ sb.AppendLine($"{i} var fc_{p.Name} = new BinaryPropertyFilterContext(obj, typeof({fullTypeName}), \"{p.Name}\", typeof({p.TypeNameForTypeof}), static o => (({fullTypeName})o).{p.Name});");
+ sb.AppendLine($"{i} if (!context.PropertyFilter!(in fc_{p.Name}))");
+ sb.AppendLine($"{i} {{");
+ sb.AppendLine($"{i} context.WriteByte(BinaryTypeCode.PropertySkip);");
+ sb.AppendLine($"{i} goto skip_{p.Name};");
+ sb.AppendLine($"{i} }}");
+ sb.AppendLine($"{i}}}");
+ }
// Nullable value types always use markered path (need Null marker)
if (IsNullableVTKind(p.TypeKind))
@@ -715,14 +746,21 @@ public class AcBinarySourceGenerator : IIncrementalGenerator
// System.Object property: runtime type unknown at compile time.
// Write ObjectWithTypeName prefix so deserializer can resolve the concrete type.
// Use value.GetType() for runtime type dispatch (not typeof(object)).
+ // Gated by `UsePolymorphType` (TEMPORARY const) — `false` skips the type-name emit
+ // entirely (deser will use the property's declared type, which is `object` so the
+ // round-trip would fail on polymorphic instances; safe ONLY when the workload is
+ // known not to use polymorphic object-typed properties — true for the benchmark).
sb.AppendLine($"{i}if ({a} == null) context.WriteByte(BinaryTypeCode.PropertySkip);");
sb.AppendLine($"{i}else");
sb.AppendLine($"{i}{{");
- sb.AppendLine($"{i} if (!context.UseMetadata)");
- sb.AppendLine($"{i} {{");
- sb.AppendLine($"{i} context.WriteByte(BinaryTypeCode.ObjectWithTypeName);");
- sb.AppendLine($"{i} context.WriteStringUtf8({a}.GetType().AssemblyQualifiedName!);");
- sb.AppendLine($"{i} }}");
+ if (UsePolymorphType)
+ {
+ sb.AppendLine($"{i} if (!context.UseMetadata)");
+ sb.AppendLine($"{i} {{");
+ sb.AppendLine($"{i} context.WriteByte(BinaryTypeCode.ObjectWithTypeName);");
+ sb.AppendLine($"{i} context.WriteStringUtf8({a}.GetType().AssemblyQualifiedName!);");
+ sb.AppendLine($"{i} }}");
+ }
sb.AppendLine($"{i} AcBinarySerializer.WriteValueGenerated({a}, {a}.GetType(), context, depth);");
sb.AppendLine($"{i}}}");
}
@@ -881,8 +919,9 @@ public class AcBinarySourceGenerator : IIncrementalGenerator
var a = $"obj.{p.Name}";
// PropertyFilter: must match write pass — if filter skips property, scan must skip too
- // Only for non-markerless properties (matching EmitProp behavior)
- if (!IsMarkerless(p.TypeKind))
+ // Only for non-markerless properties (matching EmitProp behavior).
+ // Gated by `UsePropertyFilter` (TEMPORARY const) — same A/B flag as the writer pass.
+ if (UsePropertyFilter && !IsMarkerless(p.TypeKind))
{
sb.AppendLine($"{i}if (context.HasPropertyFilter)");
sb.AppendLine($"{i}{{");
@@ -1849,7 +1888,7 @@ public class AcBinarySourceGenerator : IIncrementalGenerator
sb.AppendLine($"{i} {{");
sb.AppendLine($"{i} if (context.FastWire)");
sb.AppendLine($"{i} {{");
- sb.AppendLine($"{i} var fwlen = (int)context.ReadVarUInt();");
+ sb.AppendLine($"{i} var fwlen = context.ReadInt32Unsafe();");
sb.AppendLine($"{i} {a} = fwlen == 0 ? string.Empty : context.ReadStringUtf8(fwlen);");
sb.AppendLine($"{i} }}");
sb.AppendLine($"{i} else");
diff --git a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs
index d741c7f..832c91f 100644
--- a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs
+++ b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs
@@ -128,6 +128,20 @@ public static partial class AcBinaryDeserializer
return value;
}
+ /// <summary>
+ /// Reads a 4-byte signed integer (little-endian on Intel/AMD, native-endian elsewhere) and advances the read position by 4.
+ /// Symmetric with <c>Unsafe.WriteUnaligned&lt;int&gt;</c> on the writer side. Used by FastWire
+ /// StringSmall reader to grab charLen:int32. The value is returned as-is (no sign/range check here).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public int ReadInt32Unsafe()
+ {
+ EnsureAvailable(4);
+ var value = Unsafe.ReadUnaligned<int>(ref _buffer[_position]);
+ _position += 4;
+ return value;
+ }
+
///
/// Reads an 8-byte unsigned integer (little-endian on Intel/AMD, native-endian elsewhere).
/// Used by H2Q6 StringBig reader to grab packed charLen:32 | utf8Len:32 in a single load.
diff --git a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs
index 5490bff..fe20de4 100644
--- a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs
+++ b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs
@@ -1157,8 +1157,9 @@ public static partial class AcBinaryDeserializer
{
if (context.FastWire)
{
- // Mode-shared marker: FastWire payload is [VarUInt charCount][UTF-16 raw bytes]
- var charLenF = (int)context.ReadVarUInt();
+ // Mode-shared marker: FastWire payload is [charLen:int32 LE][UTF-16 raw bytes]
+ // Fix-int charLen (matches MemPack WriteUtf16 shape) — single 4-byte read, no VarUInt loop.
+ var charLenF = context.ReadInt32Unsafe();
return context.ReadStringUtf8(charLenF);
}
diff --git a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs
index 034a406..5072623 100644
--- a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs
+++ b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs
@@ -499,11 +499,14 @@ public static partial class AcBinarySerializer
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int VarUIntSize(uint value)
{
- if (value < 0x80) return 1;
- if (value < 0x4000) return 2;
- if (value < 0x200000) return 3;
- if (value < 0x10000000) return 4;
- return 5;
+ return value switch
+ {
+ < 0x80 => 1,
+ < 0x4000 => 2,
+ < 0x200000 => 3,
+ < 0x10000000 => 4,
+ _ => 5
+ };
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -750,13 +753,16 @@ public static partial class AcBinarySerializer
if (FastWire)
{
- // FastWire: [StringSmall marker][VarUInt charCount][UTF-16 raw bytes]
- // Marker value 91 is mode-shared (Compact StringSmall vs FastWire string marker);
- // reader dispatches by deserializer mode, NOT by re-interpreting the marker.
- WriteByte(BinaryTypeCode.StringSmall);
+ // FastWire: [StringSmall marker:1][charLen:int32 LE][UTF-16 raw bytes]
+ // Fix-int header (no tier-dispatch, no VarUInt branch loop) — matches MemPack `WriteUtf16`
+ // shape (which emits a fix `int` length). Single Unsafe.WriteUnaligned store on the
+ // writer; symmetric ReadInt32Unsafe on the reader.
var byteLenF = charLength * 2; // safe: charLength ≤ 0x1FFFFFFF guarantees no overflow
- WriteVarUInt((uint)charLength);
- EnsureCapacity(byteLenF);
+ EnsureCapacity(7 + byteLenF);
+ var fwPos = _position;
+ var packed = (ulong)BinaryTypeCode.StringSmall | ((ulong)(uint)charLength << 8);
+ Unsafe.WriteUnaligned(ref _buffer[fwPos], packed);
+ _position = fwPos + 5;
MemoryMarshal.AsBytes(value.AsSpan()).CopyTo(_buffer.AsSpan(_position, byteLenF));
_position += byteLenF;
return;
@@ -772,10 +778,12 @@ public static partial class AcBinarySerializer
// reserve was Medium (5 byte) — body is left-shifted by 2 bytes to compact.
var maxBytes = charLength * 4;
- int reserveHeader;
- if (charLength <= 63) reserveHeader = 3;
- else if (charLength <= 16383) reserveHeader = 5;
- else reserveHeader = 9;
+ int reserveHeader = charLength switch
+ {
+ <= 63 => 3,
+ <= 16383 => 5,
+ _ => 9
+ };
EnsureCapacity(reserveHeader + maxBytes);
diff --git a/AyCode.Core/docs/BINARY/BINARY_TODO.md b/AyCode.Core/docs/BINARY/BINARY_TODO.md
index 5042ae3..2a6dbcd 100644
--- a/AyCode.Core/docs/BINARY/BINARY_TODO.md
+++ b/AyCode.Core/docs/BINARY/BINARY_TODO.md
@@ -756,6 +756,8 @@ The optimization-value signal proved below the bench noise floor on the availabl
**Re-evaluable as of 2026-05-07 per `ACCORE-BIN-T-D9X3`** — bench stabilization removes the noise-floor that made the original signal unmeasurable; retest before any code change. (Charset bias remains — pair with `ACCORE-BIN-T-C5R8` for CJK validation.)
+**Retested 2026-05-08 — REGRESSION CONFIRMED** (Latin1Long charset, stabilized bench): adding the do-while inner loop on both 2-byte and 3-byte tiers in `DecodeUtf8SinglePass` produced **+5-8pp Deser regression on every cell** vs. the switch-jumptable baseline (Small +7.8pp, Medium +7.1pp, Large +5.5pp, Repeated +7.4pp, Deep +4.9pp). Reverted to switch-jumptable single-decode same day. The V4N2 entry's original prediction held: "Magyar mixed (KözösCímke, sötét — short alternating runs): 0-5% (run-detection overhead may eat the savings on short runs)" — Latin1Long suffix has 1-2 char average run length, well below the run-detection break-even point. **Phase 2.5 is dead on Magyar mixed.** CJK retest still untried, but Phase 2.5 is now obsoleted by `ACCORE-BIN-T-K7M3` (the decoder hot path runs `Utf8.ToUtf16` BCL static API, not `DecodeUtf8SinglePass`).
+
**Below: original Phase 2.5 design notes preserved as documentation.** Implementation details remain accurate even though the implementation was reverted.
---
@@ -1217,7 +1219,7 @@ Reader-side: SGen-generated code drops the per-property `ReadByte()` + `IsTinyIn
- Opt-in flag with default `false` (preserves marker-driven default; consumers explicitly opt in for frozen-schema scenarios)
## ACCORE-BIN-T-V4N3: Symmetric `GetUtf8ByteCount` API + writer-side BCL kihagyás (cold path)
-**Priority:** P3 · **Type:** Performance · **Status:** Closed (2026-05-06) · **Related:** `EncodeUtf8SinglePass`, `WriteStringUtf8Internal`, `PropertyMetadataBase.NameUtf8`
+**Priority:** P3 · **Type:** Performance · **Status:** Superseded (2026-05-08, by `ACCORE-BIN-T-K7M3`) — landed Closed 2026-05-06; subsequent A/B against modern `Utf8.FromUtf16` / `Utf8.ToUtf16` showed the BCL modern API outperforms the custom transcoder on every benchmark cell, leading to full hot-path switch in K7M3 · **Related:** `EncodeUtf8SinglePass`, `WriteStringUtf8Internal`, `PropertyMetadataBase.NameUtf8`, `ACCORE-BIN-T-K7M3` (hot-path BCL switch)
Symmetric byte-count helper for `EncodeUtf8SinglePass`, paired with writer-side BCL `Encoding.UTF8.GetBytes` / `GetByteCount` removal across all cold-path call sites. `Utf8Transcoder.GetUtf8ByteCount(ReadOnlySpan<char>)` SIMD impl (Vector512 / Vector256 / Vector128 / scalar tier hierarchy, 5-popcount closed-form aggregation handling chunk-split surrogate pairs correctly).
@@ -1235,6 +1237,12 @@ Landed 2026-05-06. All `Utf8TranscoderTests` pass (55/55). Binary test suite unc
**Algorithmic correctness lesson** — the initial 4-popcount formula (`3*N - c_lt_0x80 - c_lt_0x800 - 2*highSur`) was wrong on chunks where a surrogate pair straddles the SIMD chunk boundary (it implicitly assumed `lowSur == highSur` per chunk, which is true over the whole well-formed string but NOT per chunk). Fix: 5-popcount closed-form (`3*N - ascii - c_lt_0x800 + highSur - 3*lowSur`), with the scalar tail using the same per-char accounting model (`i += 1` per char regardless of role; high → 4, low → 0, BMP → 3, two-byte → 2, ASCII → 1). Caught by `GetUtf8ByteCount_MultipleEmojiBoundary_MatchesBcl` and `GetUtf8ByteCount_BoundaryAsciiToEmoji_MatchesBcl` regression tests — exactly the `prefixLen` 1, 7 boundaries that exercise chunk-split surrogate pairs.
+### Superseded by `ACCORE-BIN-T-K7M3` (2026-05-08)
+
+The V4N3 audit measured the custom transcoder against the **legacy `Encoding.UTF8.GetBytes`** API and won. **Did NOT measure against the modern `System.Text.Unicode.Utf8.FromUtf16` / `Utf8.ToUtf16` static API** (.NET 7+, used by MemoryPack source-gen). Once `D9X3` stabilized the bench, a direct A/B revealed the BCL modern API outperforms the custom transcoder on **every** cell (Ser deficit -14 to -22pp, Deser flips from behind to ahead). All 8 hot-path call sites switched to BCL in `K7M3`. The `Utf8Transcoder.cs` file is fully commented out — preserved as historical reference.
+
+The V4N3 algorithmic correctness work (5-popcount surrogate-pair-split-across-chunks closed-form) remains a **valid algorithmic contribution**, but no longer load-bearing on the hot path.
+
## ACCORE-BIN-T-V4N4: NativeAOT-specific inlining / codegen audit on hot UTF-8 path
**Priority:** P2 · **Type:** Performance · **Status:** Reverted (2026-05-07) — bench instability made the optimization signal unmeasurable · **Related:** `EncodeUtf8SinglePass`, `DecodeUtf8SinglePass`, `WriteStringWithDispatch`, `Utf8Transcoder` SIMD path
@@ -1343,7 +1351,8 @@ A V4N4 audit **konklúziója** változatlan érvényes (constant-fold OK, reader
**Re-evaluable as of 2026-05-07 per `ACCORE-BIN-T-D9X3`** — bench stabilization removes the noise-floor that made the original signal unmeasurable; retest before any code change.
+**Obsoleted (2026-05-08) by `ACCORE-BIN-T-K7M3`** — the writer hot path no longer calls the custom `EncodeUtf8SinglePass` at all (`WriteStringWithDispatch` was switched to `Utf8.FromUtf16` BCL). The "AOT method-split / inlining audit" target (`Utf8Transcoder` body method-size in NativeAOT inline budget) is moot — the BCL `Utf8.FromUtf16` is a single static method with its own AOT-friendly inline footprint, and the audit's hypothesis space (Vector256 `IsSupported` constant-fold, lambda delegate cache) was correct for the prior code but no longer applies. The V4N4 disasm methodology remains a **valid technique** for future investigations of generic specialization / inline failures, but the specific hot-path target it analyzed is gone.
## ACCORE-BIN-T-V4N5: Dead-code review — `WriteFixStrDirect` + `WriteStringUtf8Internal`
**Priority:** P3 · **Type:** Refactor / hygiene · **Status:** Closed (2026-05-06) · **Related:** `BinarySerializationContext.cs`
V4N3 audit surfaced two methods with no callers in the entire workspace:
@@ -1992,4 +2000,443 @@ Header line updated:
- Before: `Iterations: 1000 | Warmup: 10000 | Samples: 10 (median) | ...`
- After: `Iterations: per-cell adaptive (target ~250 ms/sample) | Warmup: 10000 | Samples: 10 (median) + 1 pilot discarded | ... | UnstableCV threshold: 3%`
+## ACCORE-BIN-T-K7M3: Hot-path UTF-8 transcoder switch — `Utf8Transcoder` → BCL `Utf8.FromUtf16` / `Utf8.ToUtf16`
+**Priority:** P1 · **Type:** Performance · **Status:** Closed (2026-05-08) · **Related:** `ACCORE-BIN-T-V4N3` (custom transcoder origin), `ACCORE-BIN-T-V4N2` (Phase 3 SIMD multi-byte), `ACCORE-BIN-T-V4N4` (Reverted method-split), `ACCORE-BIN-T-D9X3` (bench stabilization that made the comparison measurable)
+
+The custom `Utf8Transcoder` (V4N3) was originally implemented to bypass `System.Text.Encoding.UTF8.GetBytes` virtual-dispatch + EncoderFallback overhead. The V4N3 audit measured wins vs. the **legacy `Encoding.UTF8`** API. **What it did NOT measure**: the modern `System.Text.Unicode.Utf8.FromUtf16` / `Utf8.ToUtf16` API (.NET 7+, tier-1 optimized, used by MemoryPack `WriteUtf8` / `ReadUtf8` paths internally). Once the bench stabilized (D9X3), a direct A/B comparison surfaced that the BCL modern API consistently outperforms the custom transcoder on the binary serializer's hot path.
+
+### Bench A/B (Latin1Long charset, FastMode SGen Compact)
+
+| Cell | Ser delta vs MemPack — custom (`EncodeUtf8SinglePass`) | Ser delta vs MemPack — BCL (`Utf8.FromUtf16`) | Improvement |
+|------|--------------------------------------------------------|------------------------------------------------|-------------|
+| Small | +28.5% | +7.3% | **-21pp** |
+| Medium | +23.8% | +3.1% | -21pp |
+| Large | +19.6% | +5.1% | -14pp |
+| Repeated | +28.8% | +10.9% | -18pp |
+| Deep | +23.1% | +0.6% | -22pp |
+
+| Cell | Deser delta vs MemPack — custom (`DecodeUtf8SinglePass`) | Deser delta vs MemPack — BCL (`Utf8.ToUtf16`) | Improvement |
+|------|---------------------------------------------------------|------------------------------------------------|-------------|
+| Small | +17.6% | -1.2% (parity) | -19pp |
+| Medium | +12.8% | -4.7% (AcBinary wins) | -17pp |
+| Large | +4.9% | -10.3% (AcBinary wins) | -15pp |
+| Repeated | +16.9% | -1.6% (parity) | -18pp |
+| Deep | +7.0% | -9.0% (AcBinary wins) | -16pp |
+
+The Deser side flipped from "consistently behind" to "wins on 3 of 5 cells, parity on 2". The Ser side closed the deficit from +20-29% to 0-11%. **Both sides** show a measurable improvement on **every** cell.
+
+### Why the custom transcoder lost
+
+The V4N3 implementation included a 4-tier SIMD ASCII prefix path (Vector512BW / Vector256 / Vector128 / scalar) plus a DWORD ASCII batch + scalar 4-branch multi-byte fallback. **All correct, all SIMD-tuned**. But:
+
+1. **`Utf8.FromUtf16` is also SIMD-tuned in .NET 9** — the .NET team rewrote it on top of `System.Text.Unicode.Utf8` primitives that share infrastructure with `Ascii.IsValid` / `Latin1.GetString`. AOT-publish-friendly, branch-friendly, no virtual dispatch (the `Utf8` API is static, not via an `Encoding` instance with virtual-method-table).
+2. **The custom transcoder's ASCII prefix path bails out on first non-ASCII byte** — on multi-byte content (Latin extended / Cyrillic / CJK) the SIMD path runs only for the leading ASCII span, then the entire remainder falls into per-char scalar 4-branch dispatch. The BCL `Utf8.FromUtf16` SIMD-batches multi-byte content too (different algorithm — the BCL doesn't bail on first non-ASCII).
+3. **AOT inline budget**: the custom transcoder's body grew with the V4N3 / V4N4 / V4N5 additions; in NativeAOT publish the call sites in `WriteStringWithDispatch` / `ReadString*` did NOT inline (V4N4 disasm audit confirmed). The BCL `Utf8.FromUtf16` is a single static method with a tighter call-site footprint.
+
+### Resolution
+
+Landed 2026-05-08. The 8 production hot-path call sites of `Utf8Transcoder.*` switched to BCL:
+
+| File / line | Before | After |
+|---|---|---|
+| `AcBinarySerializer.cs:120` | `Utf8Transcoder.GetUtf8ByteCount` | `Encoding.UTF8.GetByteCount` |
+| `AcBinarySerializer.BinarySerializationContext.cs:694` | `Utf8Transcoder.EncodeUtf8SinglePass` | `Utf8.FromUtf16(...)` |
+| `AcBinarySerializer.BinarySerializationContext.cs:784` | `Utf8Transcoder.EncodeUtf8SinglePass` | `Utf8.FromUtf16(...)` |
+| `AcBinarySerializer.BinarySerializationContext.cs:901` | `Utf8Transcoder.EncodeUtf8SinglePass` | `Utf8.FromUtf16(...)` |
+| `AcBinaryDeserializer.BinaryDeserializationContext.Read.cs:523` | `Utf8Transcoder.CountUtf8Chars` | `Encoding.UTF8.GetCharCount` |
+| `AcBinaryDeserializer.BinaryDeserializationContext.Read.cs:527` | `Utf8Transcoder.DecodeUtf8SinglePass` | `Utf8.ToUtf16(...)` |
+| `AcBinaryDeserializer.BinaryDeserializationContext.Read.cs:565` | `Utf8Transcoder.DecodeUtf8SinglePass` | `Utf8.ToUtf16(...)` |
+| `PropertyMetadataBase.cs:104-109` (ctor-once) | `Utf8Transcoder.GetUtf8ByteCount` + `EncodeUtf8SinglePass` (two-pass) | `Encoding.UTF8.GetBytes(string)` (single-pass with exact-size byte[] return) |
+
+The count-only call sites (`GetByteCount` / `GetCharCount`) stay on the **legacy** `Encoding.UTF8` API — `System.Text.Unicode.Utf8` has no count-only equivalent (only `FromUtf16` / `ToUtf16` which encode + count combined). For pure count, the legacy API is the optimal tool (single SIMD-tuned scan, no encode/decode work).
+
+The `Utf8Transcoder.cs` file remains in the repo but **fully commented out** — the class definition is preserved as historical reference / future reactivation if a workload ever surfaces where it could win again. `Utf8TranscoderTests.cs` is not currently exercising live code.
+
+### Lesson — the V4N3 audit's blind spot
+
+The V4N3 (custom transcoder) audit compared against **legacy `Encoding.UTF8.GetBytes`** and won. **The audit did NOT compare against `Utf8.FromUtf16`** (the modern API, .NET 7+). On modern runtime the BCL has two UTF-8 transcoders: a legacy one (instance-method on `Encoding`, virtual dispatch) and a modern one (static `Utf8.FromUtf16` / `Utf8.ToUtf16`). MemoryPack uses the modern one — that's what we should have been comparing against from the start.
+
+**Generalizable lesson**: when measuring a custom implementation against a "BCL baseline", verify which BCL API is used by the actual competition (here: MemoryPack source-gen). The `Encoding.UTF8.*` instance API and `System.Text.Unicode.Utf8` static API are different generations of the same logical operation; treating them as interchangeable hides the comparison's scope.
+
+### Why P1
+
+- Closed the FastMode Compact mode Ser deficit from +20-29% to ≤11% on every cell (Latin1Long benchmark)
+- Flipped the Deser side from a +5-18% deficit to **AcBinary winning on 3 of 5 cells** (by 5-10%), parity on 2 (Latin1Long benchmark)
+- One-time fixed cost (replacing 8 production call sites) — every future bench profits
+- Removed a load-bearing ~600-line custom SIMD module from the maintained surface area; future maintainers don't need to reason about Vector512BW / cross-lane shuffle / 5-popcount surrogate-pair correctness — the BCL handles it
+
+### Follow-up — `Utf8Transcoder.cs` cleanup
+
+The file is fully commented out. Either:
+- **Delete** entirely (preferred for repo cleanliness) — `Utf8TranscoderTests.cs` then needs deletion or revival as a regression-only guard
+- **Keep** the comment-block as historical reference, with a header comment pointing to this entry
+
+Decision deferred — the comment-block does no harm to build / runtime. Address when the next docs-archive sweep runs.
+
+## ACCORE-BIN-T-P3X7: Profile-driven Compact-mode Ser optimalizációs roadmap (post-K7M3 hot-path analysis)
+**Priority:** P2 · **Type:** Performance roadmap · **Status:** Open · **Related:** `ACCORE-BIN-T-K7M3` (BCL UTF-8 transcoder switch — előfeltétele), `ACCORE-BIN-T-D9X3` (bench stabilization), `ACCORE-BIN-T-S2X9` (markerless schema lane — primitív property-marker már kivezetve a SGen-ben), `ACCORE-BIN-T-V4N4` (audit methodológia hivatkozás)
+
+A 2026-05-08 VS Performance Profiler session (4 sec range, AcBinary FastMode Serialize, Latin1Long charset, FastWire mode) konkrét hot-path-decomposition-t adott a K7M3 BCL-csere utáni állapotról. A string-encoding már nem akadály (a `Utf8.FromUtf16` SIMD-tuned), a fennmaradó AcBinary-specific overhead azonosítható.
+
+### Profile session adatok (Self CPU%)
+
+| Self CPU% | Function | Category |
+|---|---|---|
+| 39.77% | `System.Buffer._Memmove` | Közös MemPack-kel (UTF-16 raw + return-time `byte[]`-copy) — **NEM AcBinary-spec** |
+| **10.03%** | `AcBinarySerializer.Serialize` | Top-level (context-acquire, type lookup, return-alloc) |
+| **7.48%** | `TestMeasurementPoint_GeneratedWriter.WriteProperties` | SGen template (legkisebb levél típus, ~12500 hívás Large cellán) |
+| **5.31%** | `WriteStringWithDispatch` | String hot path |
+| **3.23%** | `TestMeasurement_GeneratedWriter.WriteProperties` | SGen |
+| **1.66%** | `WriteVarUIntMultiByteUnsafe` | VarUInt int-property encode |
+| 1.10% | `TestPallet_GeneratedWriter.WriteProperties` | SGen |
+| 0.39% | `TestOrderItem_GeneratedWriter.WriteProperties` | SGen |
+| 0.32% | `SharedUser_GeneratedWriter.WriteProperties` | SGen |
+| 0.05% | `ArrayBinaryOutput.Grow` | Buffer-grow (ritka, kicsi probléma) |
+
+**Total SGen `WriteProperties` Self CPU**: ~12.6% — a legnagyobb AcBinary-specific surface.
+
+A `AcBinarySerializer.Serialize` line-szintű drill-down (`AcBinarySerializer.cs:312-335`):
+- `WriteObject(value, wrapper, context, 0)` Total: 28.05% — a teljes serializációs fa (SGen + Writer hot path)
+- `context.Output.ToArray(context._buffer, context._position)` Total: **47.37%** — final `byte[]`-alloc + content-memcpy (= a 39.77% `_Memmove` Self nagy része)
+
+### MemPack-összehasonlítás (referenciaként)
+
+A MemPack `Serialize(T value)` mechanizmus:
+1. **`[ThreadStatic]`** writer-state — nincs pool-bérlés, nincs lock, nincs concurrent dictionary lookup
+2. **`ReusableLinkedArrayBufferWriter`** — linked chunk-list (4 KB → 8 KB → 16 KB geometriai); buffer-grow = új chunk hozzáadása, **nincs memcpy a régi adaton**
+3. **`ToArrayAndReset()`** — végén alloc + chunks → byte[] memcpy (közös overhead az AcBinary-vel)
+
+Az AcBinary `AcquireArrayOutputContext(options)` pool-bérlés + lineáris `byte[]` `Array.Resize` + `Output.ToArray(...)` — két memcpy-cost (grow + return), de a grow ritka.
+
+### Sorrendezett optimalizációs ötletek
+
+#### A. SGen `WriteProperties` — ensure-capacity batching (várt: -1-3pp Ser, **revíziós becslés**)
+
+Jelenlegi SGen-template per-property emit (mindegyiknél külön ensure):
+```csharp
+context.WriteVarInt(obj.Id); // ensure(5) + write(1-5)
+context.WriteByte(BinaryTypeCode.Object); // ensure(1) + write(1)
+context.WriteVarInt((int)obj.Status); // ensure(5) + write(1-5)
+context.WriteRaw(obj.Weight); // ensure(8) + write(8)
+```
+
+Csoportosított ensure pattern:
+```csharp
+context.EnsureCapacity(maxBytesForGroup); // worst-case sum, 1× hívás
+context.WriteVarIntUnsafe(obj.Id); // no ensure (csak buffer write)
+context.WriteByteUnsafe(BinaryTypeCode.Object); // no ensure
+context.WriteVarIntUnsafe((int)obj.Status);
+context.WriteRawUnsafe(obj.Weight);
+```
+
+A `AcBinarySourceGenerator.cs` `WriteProperties` template-jét kell módosítani:
+1. Property-listából contiguous primitív csoportok kinyerése (Object/Collection property-knél megszakítva — mély rekurzió, méret nem előre kiszámítható)
+2. Csoportonként worst-case-size compute compile-time-on (a primitív type-ok mérete fix vagy worst-case ismert)
+3. Egyetlen `EnsureCapacity(sum)` + bulk `*Unsafe` write-ok
+
+`*Unsafe` írók szükségessége: `WriteVarUIntUnsafe` már létezik. **`WriteByteUnsafe`, `WriteRawUnsafe`** valószínűleg hozzá kell adni a `BinarySerializationContext`-hez.
+
+**Becslés-revízió (2026-05-08)**: az eredeti -4-6pp becslés felső volt. Egy `EnsureCapacity` inline-olva ~1-2 ns/call (a hot path-on a branch-prediction perfekt — sosem jut el a Grow-hoz). 10 property × 1.5 ns = ~15 ns / object megtakarítás batch-eléssel — Latin1Long Large cell 1250 instance × 13 ns = ~16 µs / 120 µs Ser ≈ **~13% felső**, de **csak az ensure-szám csökkenéséből**. A SGen `WriteProperties` Self CPU 12.6%-a **NEM csak** ensure-check; tartalmaz `HasPropertyFilter` branch-check, null-check + depth-check dispatch, `Unsafe.As` cast, etc. — lásd **F**. Az ensure-batching önmagában reálisan **1-3pp Ser javulás**.
+
+**Wire-formátum változatlan**, backward-kompatibilis, kis kockázat. Hatás minden cellán mérhető (TestOrder cell-szerkezet ~100+ primitív property per Object-instance).
+
+#### B. `WriteStringWithDispatch` Compact ág batch-write (várt: -1-2pp Ser)
+
+A FastWire ágat már `K7M3`-ban + a 2026-05-08 batch-write fixxel egyetlen ensure + direct-write-ra alakítottuk. A **Compact ág** ugyanaz a 3-step pattern (post-encode tier-shift `CopyTo` ha `actualHeader < reserveHeader`, plus header-write a tier alapján). A Compact ágon is alkalmazható batch-write — egyetlen `EnsureCapacity` a worst-case-tier-szel + direct header-write a `Utf8.FromUtf16` után.
+
+#### C. Thread-static context (várt: -2-4pp Ser, NAGY refactor)
+
+A `AcquireArrayOutputContext(options)` pool-bérlés overhead-jét mérsékelheti a MemPack `[ThreadStatic]` mintázat. A jelenlegi pool-bérlés:
+- Pool dictionary lookup (lehet, lock-os)
+- Context-state init / reset minden hívásnál
+
+Thread-static cseréje:
+- Per-thread cached context, nincs lock
+- Context-reset minden hívásnál ugyanaz, de a `state` allokáció egyszer fut
+
+**Refactor szempontok**:
+- A `BinarySerializationContext` state-tárolása nem thread-safe önmagában — pool-bérlés vagy thread-static mind a single-thread használatot biztosítja
+- Az `options` paraméter érintheti a state-init logikát — multi-options scenárió esetén a thread-static state-t reset-elni kell
+- Concurrent serialize hívások (több thread egyidejű) — minden thread saját state-tel rendelkezne; nincs cross-thread sharing igény
+
+#### D. Linked-array buffer chunk strategy (kicsi hatás, NAGY refactor)
+
+A MemPack `ReusableLinkedArrayBufferWriter` linked chunk-list helyettesíti a lineáris `byte[]`-grow stratégiát. Buffer-grow = új chunk hozzáadása (no memcpy a régi adaton).
+
+**A profile szerint a `ArrayBinaryOutput.Grow` Self CPU csak 0.05%** — a buffer-grow ritkán fut, a default kapacitás elég nagy a Large cell-hez. **Kicsi hatás, nagy refactor**. Alacsony prioritás.
+
+#### F. SGen `HasPropertyFilter` lift-out a `WriteProperties` method elejére (várt: -2-4pp Ser)
+
+A jelenlegi SGen-template **minden property-emit előtt** ellenőrzi a property-filter-t:
+```csharp
+public void WriteProperties(object value, ...)
+{
+ var obj = Unsafe.As<TestPallet>(value);
+
+ if (context.HasPropertyFilter) // ← MINDEN property-en check!
+ {
+ var fc_Category = new BinaryPropertyFilterContext(obj, ..., "Category", ...);
+ if (!context.PropertyFilter!(in fc_Category)) {
+ context.WriteByte(BinaryTypeCode.PropertySkip);
+ goto skip_Category;
+ }
+ }
+ if (obj.Category == null) context.WriteByte(BinaryTypeCode.PropertySkip);
+ else if (depth > context.MaxDepth) context.WriteByte(BinaryTypeCode.Null);
+ else { context.WriteByte(BinaryTypeCode.Object); ...WriteProperties... }
+ skip_Category:;
+
+ if (context.HasPropertyFilter) { /* same for Inspector */ } // ← újra!
+ // ... 10× ismétlés property-listán
+}
+```
+
+A `HasPropertyFilter` per-property branch-check **TestOrder benchmark workload-on mindig false** (a benchmark nem használ property-filter-t). De a check minden property-en lefut — kód-cache-ben benne van, branch-predict ugyan jó, **mégis CPU cycle**.
+
+Optimalizáció — kétpályás SGen kódgenerálás:
+```csharp
+public void WriteProperties(object value, ..., int depth)
+{
+ var obj = Unsafe.As<TestPallet>(value);
+
+ if (context.HasPropertyFilter)
+ {
+ WritePropertiesWithFilter(obj, context, depth); // ritka path — full per-property check
+ return;
+ }
+
+ // Fast path — NO filter check anywhere
+ if (obj.Category == null) context.WriteByte(BinaryTypeCode.PropertySkip);
+ else if (depth > context.MaxDepth) context.WriteByte(BinaryTypeCode.Null);
+ else { ... }
+ // (no skip_Category goto — never needed)
+
+ context.WriteVarInt(obj.Id); // primitív, no filter check
+ // ... rest of properties without HasPropertyFilter check
+}
+
+// Külön emit-elt method ritka path-ra:
+private static void WritePropertiesWithFilter(TestPallet obj, ..., int depth)
+{
+ // Full per-property filter-aware kód (the current behavior)
+}
+```
+
+A `AcBinarySourceGenerator.cs`-t kell módosítani:
+1. A `WriteProperties` method elején egyetlen `HasPropertyFilter` check
+2. Két különböző code-path emit:
+ - **Fast path** (default — no filter): nincs per-property `if (context.HasPropertyFilter)` check, nincs filter-context allokáció + lambda-call, nincs `goto skip_X`
+ - **Slow path** (filter aware — separate static method): a jelenlegi viselkedés
+
+**Várt nyereség**: a fast path ~10 elimináció / object × 1-2 ns / branch ≈ ~15-20 ns / object. Latin1Long Large cell 1250 instance × 18 ns = ~22 µs / 120 µs Ser ≈ **~18% felső becslés**; reálisan **2-4pp Ser javulás** (a kód-bloat növekedés és a JIT inlinelés-ráhatás miatt mérséklődik).
+
+**Kombinálható az A-val**: az **A + F** együtt **3-7pp javulás** célozható meg — a SGen `WriteProperties` 12.6% Self CPU jelentős csökkenése.
+
+**Wire-formátum változatlan**, kód-méret kicsivel nő (két path-ot generál minden type-on), de a fast path a JIT-tel jobban inlinelhető.
+
+#### G. SGen `WriteProperties` null/depth/object-ref kombinálás (kapcsolt az F-hez)
+
+A komplex (Object) property-knél a 3-ágú dispatch:
+```csharp
+if (obj.X == null) context.WriteByte(BinaryTypeCode.PropertySkip);
+else if (depth > context.MaxDepth) context.WriteByte(BinaryTypeCode.Null);
+else { context.WriteByte(BinaryTypeCode.Object); X_GeneratedWriter.Instance.WriteProperties(...); }
+```
+
+Ez minden komplex property-en fut. Lehetséges optimalizáció: a `depth > MaxDepth` check method-szintű branch-csé alakítása (egyszer ellenőrizni a method elején, aztán a property-szintű ágat egyszerűsíteni). De ez **kis hatás**, és a `MaxDepth`-korlát jellemzően nem lép életbe (a legtöbb workload-on `depth < MaxDepth`).
+
+Alacsony prio, F-tel kombinált.
+
+#### E. `WriteVarUIntMultiByteUnsafe` (1.66% Self) → fix-int (várható: -1pp Ser, **NEM javasolt önmagában**)
+
+A `WriteVarInt` (signed int property-encode, ZigZag + VarUInt) kódolás a SGen-template-ekben gyakori (Id, Status, TrayCount, stb.). A multi-byte ág 1.66% Self CPU.
+
+Fix-int (4 byte) cseréje wire-méret-növekedéssel jár (kis int-eken +3 byte / property), ami a wire-formátum kompaktság-előnyét rontja. **Csak `ACCORE-BIN-T-S2X9` markerless lane kontextusban** érdemes — ahol a property-marker eltávolításával együtt fix-int kicserélése wire-szempontból kompenzálódik.
+
+### Közös, NEM AcBinary-spec overhead — nem optimalizálható
+
+A `Buffer._Memmove` 39.77% Self CPU + a `Output.ToArray()` 47.37% Total **a return-time `byte[]`-alloc + content-memcpy**, ami minden `byte[] Serialize(T)` hívásnál fut. **Mindkét engine fizeti** (MemPack `ToArrayAndReset()` is alloc + memcpy a chunkokból). Az API contract (`byte[] Serialize(T)`) miatt elkerülhetetlen.
+
+**Aki teljesítményt akar**, használja az `IBufferWriter<byte>` overload-ot (`AcBinaryBufferWriterBenchmark` vs `MemoryPackBufferWriterBenchmark` apples-to-apples a benchmarkban — mindkét engine ugyanezt csinálja).
+
+### Acceptance (per-section)
+
+- **A** (SGen ensure-batching): Latin1Long FastWire bench AcBinary Ser delta vs MemPack -1-3pp javulás minden cellán
+- **F** (HasPropertyFilter lift-out): Latin1Long Ser delta -2-4pp; **A + F együtt** SGen `WriteProperties` Self CPU ≤ 8% (jelenleg ~12.6%)
+- **G** (null/depth/object-ref kombinálás): kis hatás, F-tel kombinált
+- **B** (WriteStringWithDispatch Compact batch-write): Latin1Long Compact bench AcBinary Ser delta vs MemPack ≤ +5% minden cellán
+- **C** (Thread-static context): `Serialize` Self CPU ≤ 6% (jelenleg ~10%)
+- **D** (Linked-array): nem prioritás — buffer-grow Self CPU már ≤ 0.05%
+- **E** (VarInt → fix-int): csak az `S2X9` markerless lane sprint kontextusában mérni
+
+### Sorrend
+
+1. **A + F kombinálva** — SGen `WriteProperties` template átfogó refactor (ensure-batching + HasPropertyFilter lift-out + esetleg G null/depth-combine). Együtt **~3-7pp Ser javulás** várt minden cellán. Izolált változtatás csak `AcBinarySourceGenerator.cs`-en, wire-format változatlan.
+2. **B** — ~1-2pp javulás, ugyanaz a pattern mint a `K7M3` FastWire batch-write
+3. **C** — ~2-4pp, de NAGY refactor (thread-safety, pool semantics felülvizsgálat)
+4. **D** — alacsony prioritás (kis hatás, nagy refactor)
+5. **E** — csak `S2X9` kontextusban
+
+### Trigger
+
+- **A + F** → most azonnal implementálható; ezek a SGen template-en belül kombinálandók (egyetlen template-átdolgozás kétségtelenül jobb, mint külön refactor-körök). Minden további mérés ettől függ.
+- **B** → A+F után, hasonló pattern alkalmazása más writer-helyen
+- **C** → ha a Serialize Self CPU 10% továbbra is dominál A+F+B után
+- **D, E** → opcionális, az A/F/B/C eredmények alapján
+
+## ACCORE-BIN-T-Q5T2: Önleíró wire-formátum — duplikált object-marker-ek + UTF-16 string marker (per-type/property encoding choice)
+**Priority:** P2 · **Type:** Architecture / Performance · **Status:** Open · **Related:** `ACCORE-BIN-T-P3X7` (profile-driven roadmap — kis-adat slowdown diagnózis), `ACCORE-BIN-T-K7M3` (BCL UTF-8 transcoder — előfeltétele), `ACCORE-BIN-T-S2X9` (markerless schema lane), `ACCORE-BIN-T-V4N2` (UTF-8 SIMD)
+
+A 2026-05-08 design-session során merült fel mint válasz a kis-adat-slowdown problémára és az `if (FastWire)` / `if (UseMetadata)` runtime-branch-ek széles jelenlétére. Cél: a wire-mode kivezetése a globális header-ből, **per-object/per-property encoding-szabadság** attribute-tal, megőrizve a SGen↔Runtime wire-kompatibilitást.
+
+### LLM Context (cold-start)
+
+Egy fresh session olvasásához ez a kontextus elég:
+
+**Wire-modell**: AcBinary két párhuzamos serializációs path-ot futtat — **SGen** (compile-time generált, `[AcBinarySerializable]` típusokra) és **Runtime** (reflection + `Expression.Compile`). **Mindkettő ugyanazt a wire-t produkálja és olvassa** (interop garancia, `BINARY_SGEN.md` "Hybrid Execution Model").
+
+**Markerless body**: object scope-on belül a primitív property-k (int, long, double, …) **közvetlenül** írnak a wire-be, marker-byte nélkül. A reader a sorrendet compile-time schema-ból (SGen) vagy `OrderedProperties` metadata-ból (Runtime) tudja. A wire object-prefix-szel kezdődik (1-byte marker), majd markerless body.
+
+**Meglévő object-marker család** (`AcBinarySerializer.BinarySerializationContext.cs` writer-ek + `AcBinaryDeserializer.cs` reader-dispatch switch):
+- `Object` — sima first-occurrence
+- `ObjectWithTypeName` — polimorf (`runtimeType != declaredType`)
+- `ObjectFullMarkerIId` / `ObjectFullMarkerAll` — `RefHandling=IId|All` first-occurrence
+- `ObjectRef` / `ObjectRefIId` — subsequent (csak ID, **NEM duplikálódik** — nincs primitív property körülötte)
+
+**OPT-OUT minta** (jelenlegi konvenció): default SGen flexibilis — minden runtime-branch-et generál (pl. `if (context.UseRefHandling)`). Class-attribute disable-eli a feature-t → SGen omitti a branch-et → drasztikus optimum. Q5T2 ezt a mintát terjeszti ki **encoding-választásra**.
+
+**Naming-konvenció**: PascalCase, suffix-variánsok (`Object` → `ObjectVarUInt`, `String` → `StringUtf16`). NEM `Object_NoZZ`, NEM `ObjVU`.
+
+### Motiváció
+
+A jelenlegi `AcBinaryOptions.WireMode` (`Fast` vs `Compact`) **payload-szintű globális flag**:
+- A kódban sok `if (FastWire) { ... } else { ... }` branch (lásd `WriteVarInt` 514. sor, `WriteStringWithDispatch`, `WriteValueNonPrimitive`, property-writers)
+- A fejlesztő nem optimalizálhat granuláris szinten (pl. `[NoZZ]` egy hot type-ra, default másnak)
+- Schema-evolúciós szempontból: ha a szerver attribute-ot változtat egy type-on, a klienseknek (akár régebbi verzió) **rekomp nélkül** olvasniuk kell az új wire-t
+
+Az `ACCORE-BIN-T-P3X7` profile-bench mérése szerint a kis-adat slowdown (Latin1Long Small +2.6%, Medium +1.5% AcBinary lassulás MemPack-hez képest) jelentős részben a ZigZag VarInt per-call overhead-ből származik (ZigZag shift + multi-byte branch loop). A type-szintű `[IntEncoding=VarUInt]` attribute-tal a fejlesztő a non-negative property-ket VarUInt-NoZigZag-ra állíthatja → ZigZag shift kiesik, kis-adatra mérhető nyereség.
+
+### Wire-formátum design
+
+**5 új `BinaryTypeCode` marker** (naming TBD: `*VarUInt` vagy `*NoZZ` suffix, implementációkor véglegesítendő):
+
+| Új marker | Cél | Alkalmazási hely |
+|---|---|---|
+| `ObjectVarUInt` | Object scope primitive int/long/enum-jai NoZigZag VarUInt encoding-ban | sima object first-occurrence |
+| `ObjectWithTypeNameVarUInt` | Polimorf first-occurrence NoZZ-variánsa | `runtimeType != declaredType` esetén |
+| `ObjectFullMarkerIIdVarUInt` | `RefHandling=IId` first-occurrence NoZZ-variánsa | csak first; subsequent `ObjectRefIId` változatlan |
+| `ObjectFullMarkerAllVarUInt` | `RefHandling=All` first-occurrence NoZZ-variánsa | csak first; subsequent `ObjectRef` változatlan |
+| `StringUtf16` | UTF-16 encoded string content (property-szintű) | bárhol egy string property emit-jénél |
+
+**Wire-példa**:
+```
+[ObjectVarUInt marker] ← scope-szintű: int-property-k VarUInt-NoZZ
+ WriteVarUInt(obj.Id) ← markerless body, encoding a marker alapján
+ WriteVarUInt(obj.Status)
+ [String marker] UTF-8(obj.Notes) ← default UTF-8
+ [StringUtf16 marker] UTF-16(obj.Name) ← property-szintű override
+```
+
+**Byte-szintű példa** (`Order { Id=42, Status=3, Notes="ok" }`, class-szintű `IntEncoding=VarUInt`):
+- Default ZigZag wire: `[Object]` `[0x54]` (VarInt 42 ZigZag: `((42<<1)^(42>>31))=84`) `[0x06]` (VarInt 3 ZigZag: 6) `[String]` `[0x02]` `0x6F 0x6B`
+- New VarUInt wire: `[ObjectVarUInt]` `[0x2A]` (VarUInt 42 raw: `0x2A`) `[0x03]` (VarUInt 3 raw: `0x03`) `[String]` `[0x02]` `0x6F 0x6B`
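+- A body-sorrend változatlan, és ebben a példában a byte-szám is (nagyobb értékeknél a VarUInt rövidebb lehet, pl. 64–127 között 1 byte a ZigZag 2 byte-ja helyett); csak az encoding-szabályok mások. Stringek ugyanúgy markered (UTF-8 default itt). String-encoding override esetén `[StringUtf16]` `[char-count]` `[2-byte-per-char]`.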
+
+A primitive property-k körüli wire **markerless marad** — a body-encoding-ot az object-marker határozza meg, nem per-property byte. Wire-bloat csak ott van, ahol most is van marker (object-prefix, string-marker).
+
+### Attribute design
+
+**Object-szintű** (mert object-marker is object-szintű):
+```csharp
+[AcBinarySerializable(IntEncoding = IntEncoding.VarUInt)]
+public class Order { ... }
+```
+
+**Property-szintű** (csak string-en, mert string-marker is per-property):
+```csharp
+public class Order {
+ [AcBinaryEncoding(StringEncoding.Utf16)]
+ public string CustomerName { get; set; }
+}
+```
+
+**Új public API elemek**:
+- `AcBinaryEncodingAttribute` (target: `Class | Property`)
+- `IntEncoding` enum (`Default` = ZigZag VarInt, `VarUInt` = NoZigZag)
+- `StringEncoding` enum (`Default` = UTF-8, `Utf16` = UTF-16)
+- `AcBinaryOptions.IntEncoding` és `AcBinaryOptions.StringEncoding` runtime fallback opciók
+
+### Encoding-választás precedenciája (writer-side)
+
+1. **Property attribute** (legerősebb) — pl. `[AcBinaryEncoding(StringEncoding.Utf16)]`
+2. **Class attribute** — pl. `[AcBinarySerializable(IntEncoding=VarUInt)]`
+3. **`AcBinaryOptions` runtime opció** — pl. `options.StringEncoding = Utf16`
+4. **Built-in default** — ZigZag-VarInt + UTF-8
+
+### Szerepkörök és path-ok
+
+| Path | Encoding-választás |
+|---|---|
+| **SGen writer (with attribute)** | Compile-time pinned, hard-coded marker + encoding emit (NO runtime branch) — a meglévő OPT-OUT minta (mint `RefHandling`/`Interning` disable) |
+| **SGen writer (no attribute)** | Runtime branch a `context.IntEncoding`/`context.StringEncoding` option-en — két path generálódik, runtime dönt |
+| **SGen reader** | **Marker-dispatch** (NEM hard-coded marker-expect — runtime-on dönti el, hogy `Object` vagy `ObjectVarUInt` érkezett, és annak megfelelően olvas) |
+| **Runtime writer (reflection-based)** | Reflection-attribute-read + option fallback + default fallback — ugyanaz a precedencia mint SGen-nél |
+| **Runtime reader** | Marker-dispatch (universal — nincs attribute / option használat encoding-döntésre, csak a marker-byte) |
+
+⚠️ **SGen reader marker-dispatch KÖTELEZŐ** (NEM hard-coded marker-expect). Konkrét scenario amit ez kezel:
+
+> Szerver Runtime-mode-ban serializálja `Order`-t. Az `Order` osztályon a szerver-deploy óta **változott az attribute** (új deploy hozott `[IntEncoding=VarUInt]`-ot). Szerver Runtime writer reflection-ből olvassa az új attribute-ot → `ObjectVarUInt` markert emit-el a wire-be.
+>
+> Régi kliens **rekomp nélkül** kapja a payload-ot. Ha a kliens SGen reader-e hard-coded `Object`-marker-expect-tel olvasna → **panik / mismatch**.
+>
+> Marker-dispatch-szel a kliens helyesen dekódol bármelyik markert, függetlenül attól, hogy a kliens-oldali compile-time `Order` type-on volt-e az attribute.
+
+Ez biztosítja a **"server-side attribute-change doesn't break clients"** garanciát.
+
+### Kompatibilitási garanciák
+
+| Interakció | Eredmény |
+|---|---|
+| SGen-write (NoZZ attr) → SGen-read | OK (marker-dispatch) |
+| SGen-write (NoZZ attr) → Runtime-read | OK (marker-dispatch) |
+| Runtime-write (option=NoZZ) → SGen-read | OK (marker-dispatch) |
+| Runtime-write (option=NoZZ) → Runtime-read | OK (marker-dispatch) |
+| Server-attribute-changed → old client (no recompile) | OK — kliens csak a marker-t olvassa |
+| Mixed payload (egyik object NoZZ, másik default) | OK — minden object-marker önálló scope |
+
+### Implementációs lépések
+
+1. **`BinaryTypeCode` const-bővítés** — 5 új byte-érték (range-allokáció: a meglévő enum szervezése alapján a következő szabad slot-okba). Wire-format spec frissítés `BINARY_FORMAT.md`-ben.
+2. **`AcBinaryEncodingAttribute` + `IntEncoding` + `StringEncoding` enum-ok** — új fájlok az `AyCode.Core/Serializers/Binaries/` mappában.
+3. **`AcBinaryOptions.IntEncoding` + `AcBinaryOptions.StringEncoding`** opciók hozzáadása (default = `Default`).
+4. **`WriteStringUtf16` / `ReadStringUtf16` context-helper-ek** — `MemoryMarshal.Cast` direct copy + length-prefix (VarUInt char-count).
+5. **Runtime writer reflection** — `BinarySerializeTypeMetadata` cache: `IntEncoding`, `StringEncoding`-per-property flag-ek (attribute-alapján). Encoding-emit a precedencia szerint.
+6. **SGen writer template** — attribute-feldolgozás `EmitWriteValue`-ban: ha attribute → compile-time hard-coded emit; ha nincs → runtime-branch emit a `context` option-en.
+7. **SGen reader template** — `EmitReadValue` marker-dispatch-szel (object-marker scope-encoding-mode tracking + string-marker per-property dispatch).
+8. **Runtime reader update** — object-marker dispatch a scope-encoding-state-be (pl. `BinaryDeserializationContext.CurrentIntEncoding`), string-marker per-property dispatch.
+9. **Cross-mode tesztek** — minden write-read kombináció (SGen↔SGen, SGen↔Runtime, Runtime↔SGen, Runtime↔Runtime) minden encoding-kombinációban (default, attr-only, option-only, attr+option, mixed payload).
+10. **Doc**: `BINARY_FORMAT.md` wire-format spec, `BINARY_OPTIONS.md` új opciók, `BINARY_SGEN.md` precedencia + szerepkörök táblázat.
+
+### Acceptance
+
+- 5 új BinaryTypeCode marker, naming-konvenció dokumentált
+- `AcBinaryEncodingAttribute` + 2 enum + 2 opció extension working
+- Round-trip teszt minden cross-mode kombinációban zöld
+- Wire-bloat default-encoding-on **0 byte** (nincs új per-property marker)
+- Latin1Long Small bench: AcBinary `[IntEncoding=VarUInt]` típuson a slowdown ≤ MemPack +0.5pp (jelenleg +2.6%)
+- `BINARY_FORMAT.md`/`BINARY_OPTIONS.md`/`BINARY_SGEN.md` szinkronban a wire- és attribute-világgal
+- A meglévő `WireMode=Fast/Compact` distinction-ek kompatibilisek maradnak (vagy migrálódnak az új encoding-attribute-okra — külön döntés implementációkor)
+
+### Trigger / Sorrend
+
+Implementáció **ne kezdődjön** azonnal — az `ACCORE-BIN-T-P3X7` A+F szekciói (SGen ensure-batching + HasPropertyFilter lift-out) **előbb mérendők**. Ha az A+F már lehozza a SGen `WriteProperties` Self CPU-t ≤ 8%-ra, és a kis-adat slowdown ettől már ≤ +1pp, akkor ez a Q5T2 entry **alacsony prioritásra** kerül. Ha a kis-adat slowdown az A+F után is megmarad → Q5T2 implementáció **érdemi**.
+
+Egyéb prerekvizit: `ACCORE-BIN-T-W9F1` (compile-time metadata) szinkronizálás — a Runtime writer reflection-attribute-read-je beleilleszthető a generált metadata-ba, ezzel a runtime path-on is gyorsabb lesz az attribute-alapú encoding-választás.
+
+### Open kérdések (implementációkor eldöntendő)
+
+- **Marker naming**: `ObjectVarUInt` (semantic, az encoding alapján) vagy `ObjectNoZZ` (rövidebb)?
+- **`[AcBinarySerializable]`-on belül** vegyük fel a `IntEncoding` paramétert, vagy **külön `[AcBinaryEncoding]` attribute** legyen object-szinten is (és a `[AcBinarySerializable]` változatlan)?
+- **`AcBinaryOptions.WireMode` jövője**: a régi `Fast`/`Compact` enum migrálódjon az új `IntEncoding`/`StringEncoding`-ra (BC-break) vagy maradjon mint shortcut-default?
+