diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 2e79706..db427c3 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -68,7 +68,12 @@ "Bash(curl -s \"https://raw.githubusercontent.com/dotnet/runtime/main/src/libraries/System.IO.Pipelines/src/System/IO/Pipelines/StreamPipeWriter.cs\")", "WebFetch(domain:lemire.me)", "Bash(gh pr *)", - "Bash(gh api *)" + "Bash(gh api *)", + "Bash(ls -la 'C:\\\\Users\\\\Fullepi\\\\Downloads\\\\_baseline\\\\cpuprofiler' 2>&1 | head -30)", + "Bash(where PerfView.exe)", + "Bash(where dotnet-trace *)", + "Bash(dotnet tool *)", + "Bash(dotnet-trace convert *)" ] } } diff --git a/AyCode.Core.Serializers.Console/Program.cs b/AyCode.Core.Serializers.Console/Program.cs index 19f0b5a..b1b7907 100644 --- a/AyCode.Core.Serializers.Console/Program.cs +++ b/AyCode.Core.Serializers.Console/Program.cs @@ -161,6 +161,24 @@ public static class Program $"Compression={options.UseCompression}{extra}"; } + /// + /// Returns MemoryPack serializer options aligned with for a fair + /// apples-to-apples wire-format comparison: + /// + /// (UTF-8) — both + /// engines encode UTF-8, comparison is purely about header / tier / dispatch overhead. + /// (UTF-16 raw memcpy) — + /// both engines write UTF-16 raw bytes, so wire-size and CPU comparison reflect the same string-encoding family. + /// + /// Without this alignment the FastWire vs MemPack-default comparison conflates two unrelated dimensions + /// (UTF-16 raw vs UTF-8 encoded) and produces a misleading +40% wire-size delta that is structurally + /// the encoding-family difference, NOT an AcBinary-specific overhead. + /// + private static MemoryPackSerializerOptions GetMemPackOptions() => + SelectedWireMode == WireMode.Fast + ? MemoryPackSerializerOptions.Utf16 + : MemoryPackSerializerOptions.Default; + /// /// Converts a total-time (in ms across ) into per-operation microseconds. /// Formula: totalMs / iterations × 1000. 
The benchmark stores *TimeMs as the cumulative @@ -185,6 +203,63 @@ public static class Program private static double DesPerOp(BenchmarkResult r) => ToPerOpMicros(r.DeserializeTimeMs, r.DeserializeIterations); private static double RtPerOp(BenchmarkResult r) => ToPerOpMicros(r.RoundTripTimeMs, r.RoundTripIterations); + /// + /// Per-cell-paired aggregation of an overall comparison. Captures three different aggregation + /// strategies so the reader can judge whether the headline delta is dominated by one large cell + /// (arithmetic mean) or representative of typical workload (geometric mean / median). + /// + /// Arithmetic mean of µs/op — magnitude-weighted; biased toward Large cell. + /// Geometric mean of per-cell ratios — magnitude-neutral; each cell weighted equally. + /// Median of per-cell ratios — outlier-resistant. + /// Arithmetic mean AcBinary value (µs/op or bytes). + /// Arithmetic mean MemPack value. + /// Number of paired cells contributing to the geo/median. + private record OverallStats(double ArithMeanPct, double GeoMeanPct, double MedianPct, double AcAvg, double MpAvg, int CellCount); + + /// + /// Computes arithmetic + geometric + median aggregation of an AcBinary-vs-MemPack comparison + /// across paired cells (joined by TestDataName). Per-cell pairing is required for the + /// geo/median variants — a cell where AcBinary or MemPack is missing is dropped from all stats. + /// Returns null when no paired cell has a valid value. + /// + private static OverallStats? 
ComputeOverallStats( + List acResults, + List mpResults, + Func getValue) + { + if (acResults.Count == 0 || mpResults.Count == 0) return null; + + var pairs = (from ac in acResults + join mp in mpResults on ac.TestDataName equals mp.TestDataName + let acV = getValue(ac) + let mpV = getValue(mp) + where acV > 0 && mpV > 0 + select (ac: acV, mp: mpV)).ToList(); + + if (pairs.Count == 0) return null; + + var acAvg = pairs.Average(p => p.ac); + var mpAvg = pairs.Average(p => p.mp); + var ratios = pairs.Select(p => p.ac / p.mp).ToList(); + + // Geometric mean: exp(avg(ln(ratios))) — numerically stable vs Π ratios then ^(1/N). + var geoMean = Math.Exp(ratios.Sum(Math.Log) / ratios.Count); + + // Median (paired-ratio): for even N use the midpoint of the two middle values. + var sorted = ratios.OrderBy(r => r).ToList(); + var median = sorted.Count % 2 == 1 + ? sorted[sorted.Count / 2] + : (sorted[sorted.Count / 2 - 1] + sorted[sorted.Count / 2]) / 2.0; + + return new OverallStats( + ArithMeanPct: (acAvg / mpAvg - 1) * 100, + GeoMeanPct: (geoMean - 1) * 100, + MedianPct: (median - 1) * 100, + AcAvg: acAvg, + MpAvg: mpAvg, + CellCount: ratios.Count); + } + /// /// Formats a per-op micros value with its inter-sample range and CV-threshold marker as /// "26.86 (24.5..29.1)" or "26.86 (24.5..29.1) ⚠️5.2%". Median first, range in parentheses, @@ -1452,6 +1527,7 @@ public static class Program private sealed class MemoryPackBenchmark : ISerializerBenchmark { private readonly TestOrder _order; + private readonly MemoryPackSerializerOptions _options; private readonly byte[] _serialized; public string Engine => EngineMemoryPack; @@ -1461,12 +1537,14 @@ public static class Program public int SerializedSize => _serialized.Length; public long SetupSerializeAllocBytes => 0; public long SetupDeserializeAllocBytes => 0; + public string? 
OptionsDescription => $"StringEncoding={_options.StringEncoding}"; public MemoryPackBenchmark(TestOrder order, string optionsPreset) { _order = order; OptionsPreset = optionsPreset; - _serialized = MemoryPackSerializer.Serialize(order); + _options = GetMemPackOptions(); + _serialized = MemoryPackSerializer.Serialize(order, _options); } public void Warmup(int iterations) @@ -1479,15 +1557,15 @@ public static class Program } [MethodImpl(MethodImplOptions.NoInlining)] - public void Serialize() => MemoryPackSerializer.Serialize(_order); + public void Serialize() => MemoryPackSerializer.Serialize(_order, _options); [MethodImpl(MethodImplOptions.NoInlining)] - public void Deserialize() => MemoryPackSerializer.Deserialize(_serialized); + public void Deserialize() => MemoryPackSerializer.Deserialize(_serialized, _options); public bool VerifyRoundTrip() { - var bytes = MemoryPackSerializer.Serialize(_order); - var roundTripped = MemoryPackSerializer.Deserialize(bytes); + var bytes = MemoryPackSerializer.Serialize(_order, _options); + var roundTripped = MemoryPackSerializer.Deserialize(bytes, _options); return DeepEqualsViaJson(_order, roundTripped); } } @@ -2422,6 +2500,7 @@ public static class Program private sealed class MemoryPackFreshBufferWriterBenchmark : ISerializerBenchmark { private readonly TestOrder _order; + private readonly MemoryPackSerializerOptions _options; private readonly byte[] _serialized; public string Engine => EngineMemoryPack; @@ -2431,12 +2510,14 @@ public static class Program public int SerializedSize => _serialized.Length; public long SetupSerializeAllocBytes => 0; public long SetupDeserializeAllocBytes => 0; + public string? 
OptionsDescription => $"StringEncoding={_options.StringEncoding}"; public MemoryPackFreshBufferWriterBenchmark(TestOrder order, string optionsPreset) { _order = order; OptionsPreset = optionsPreset; - _serialized = MemoryPackSerializer.Serialize(order); + _options = GetMemPackOptions(); + _serialized = MemoryPackSerializer.Serialize(order, _options); } public void Warmup(int iterations) @@ -2452,17 +2533,17 @@ public static class Program public void Serialize() { var abw = new ArrayBufferWriter(); - MemoryPackSerializer.Serialize(abw, _order); + MemoryPackSerializer.Serialize(abw, _order, _options); } [MethodImpl(MethodImplOptions.NoInlining)] - public void Deserialize() => MemoryPackSerializer.Deserialize(_serialized); + public void Deserialize() => MemoryPackSerializer.Deserialize(_serialized, _options); public bool VerifyRoundTrip() { var abw = new ArrayBufferWriter(); - MemoryPackSerializer.Serialize(abw, _order); - var roundTripped = MemoryPackSerializer.Deserialize(abw.WrittenSpan.ToArray()); + MemoryPackSerializer.Serialize(abw, _order, _options); + var roundTripped = MemoryPackSerializer.Deserialize(abw.WrittenSpan.ToArray(), _options); return DeepEqualsViaJson(_order, roundTripped); } } @@ -2535,6 +2616,7 @@ public static class Program private sealed class MemoryPackBufferWriterBenchmark : ISerializerBenchmark { private readonly TestOrder _order; + private readonly MemoryPackSerializerOptions _options; private readonly byte[] _serialized; private readonly ArrayBufferWriter _bufferWriter; @@ -2545,12 +2627,14 @@ public static class Program public int SerializedSize => _serialized.Length; public long SetupSerializeAllocBytes { get; } public long SetupDeserializeAllocBytes => 0; + public string? 
OptionsDescription => $"StringEncoding={_options.StringEncoding}"; public MemoryPackBufferWriterBenchmark(TestOrder order, string optionsPreset) { _order = order; OptionsPreset = optionsPreset; - _serialized = MemoryPackSerializer.Serialize(order); + _options = GetMemPackOptions(); + _serialized = MemoryPackSerializer.Serialize(order, _options); // Serialize-side setup only — see AcBinaryBufferWriterBenchmark for the full rationale. GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect(); @@ -2573,17 +2657,17 @@ public static class Program public void Serialize() { _bufferWriter.ResetWrittenCount(); - MemoryPackSerializer.Serialize(_bufferWriter, _order); + MemoryPackSerializer.Serialize(_bufferWriter, _order, _options); } [MethodImpl(MethodImplOptions.NoInlining)] - public void Deserialize() => MemoryPackSerializer.Deserialize(_serialized); + public void Deserialize() => MemoryPackSerializer.Deserialize(_serialized, _options); public bool VerifyRoundTrip() { _bufferWriter.ResetWrittenCount(); - MemoryPackSerializer.Serialize(_bufferWriter, _order); - var roundTripped = MemoryPackSerializer.Deserialize(_bufferWriter.WrittenSpan.ToArray()); + MemoryPackSerializer.Serialize(_bufferWriter, _order, _options); + var roundTripped = MemoryPackSerializer.Deserialize(_bufferWriter.WrittenSpan.ToArray(), _options); return DeepEqualsViaJson(_order, roundTripped); } } @@ -2932,63 +3016,62 @@ public static class Program // All averages are over per-op µs (iter-independent). Batch-time averaging would mix rows // measured with different iter counts (post-calibration), producing meaningless numbers. - var memPackAvgSer = memPackSerResults.Count > 0 ? 
memPackSerResults.Average(r => SerPerOp(r)) : 0; - var memPackAvgDes = memPackDesResults.Average(r => DesPerOp(r)); - var memPackAvgRt = memPackRtResults.Average(r => RtPerOp(r)); - var memPackAvgSize = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray)).Average(r => r.SerializedSize); - var memPackAvgSerAlloc = memPackSerResults.Count > 0 ? memPackSerResults.Average(r => r.SerializeAllocBytesPerOp) : 0; - var memPackAvgDesAlloc = memPackDesResults.Count > 0 ? memPackDesResults.Average(r => r.DeserializeAllocBytesPerOp) : 0; + // Three aggregations per metric: + // - Arithmetic mean (current behavior) — magnitude-weighted, biased toward Large cell. + // - Geometric mean of per-cell ratios — magnitude-neutral, each cell weighted equally. + // - Median of per-cell ratios — outlier-resistant. + // The geo/median variants surface when a single cell dominates the arithmetic average + // (typical when one cell's µs-per-op is an order of magnitude larger than the others). + var sizeAcResults = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen)).ToList(); + var sizeMpResults = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray)).ToList(); - var acBinaryAvgSer = acBinarySerResults.Count > 0 ? acBinarySerResults.Average(r => SerPerOp(r)) : 0; - var acBinaryAvgDes = acBinaryDesResults.Average(r => DesPerOp(r)); - var acBinaryAvgRt = acBinaryRtResults.Average(r => RtPerOp(r)); - var acBinaryAvgSize = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen)).Average(r => r.SerializedSize); - var acBinaryAvgSerAlloc = acBinarySerResults.Count > 0 ? acBinarySerResults.Average(r => r.SerializeAllocBytesPerOp) : 0; - var acBinaryAvgDesAlloc = acBinaryDesResults.Count > 0 ? 
acBinaryDesResults.Average(r => r.DeserializeAllocBytesPerOp) : 0; + var serStats = ComputeOverallStats(acBinarySerResults, memPackSerResults, SerPerOp); + var desStats = ComputeOverallStats(acBinaryDesResults, memPackDesResults, DesPerOp); + var rtStats = ComputeOverallStats(acBinaryRtResults, memPackRtResults, RtPerOp); + var sizeStats = ComputeOverallStats(sizeAcResults, sizeMpResults, r => r.SerializedSize); + var serAllocStats = ComputeOverallStats(acBinarySerResults, memPackSerResults, r => r.SerializeAllocBytesPerOp); + var desAllocStats = ComputeOverallStats(acBinaryDesResults, memPackDesResults, r => r.DeserializeAllocBytesPerOp); System.Console.WriteLine(); System.Console.WriteLine($"── {"AcBinary (Byte[], SGen)"} vs {"MemoryPack (Byte[])"} (Overall) ──"); - // Only show serialize comparison if data available - if (memPackAvgSer > 0 && acBinaryAvgSer > 0) - { - var serPctAll = (acBinaryAvgSer / memPackAvgSer - 1) * 100; - System.Console.ForegroundColor = serPctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red; - System.Console.WriteLine($" Serialize: {serPctAll:+0;-0}% ({acBinaryAvgSer:F2} µs/op vs {memPackAvgSer:F2} µs/op)"); - System.Console.ResetColor(); - } + WriteOverallLine("Serialize", "µs/op", serStats); + WriteOverallLine("Deserialize", "µs/op", desStats); + WriteOverallLine("Round-trip", "µs/op", rtStats); + WriteOverallLine("Size", "B", sizeStats, "F0"); + WriteOverallLine("Ser Alloc", "B/op", serAllocStats, "F0"); + WriteOverallLine("Des Alloc", "B/op", desAllocStats, "F0"); + } - var desPctAll = (acBinaryAvgDes / memPackAvgDes - 1) * 100; - var rtPctAll = (acBinaryAvgRt / memPackAvgRt - 1) * 100; - var sizePctAll = (acBinaryAvgSize / memPackAvgSize - 1) * 100; + /// + /// Formats a signed percent delta with explicit sign for positive values (`+1.5%`, `-3.0%`, `0.0%`). + /// Padded to 7 chars (e.g. ` +12.3%`, `-100.0%`) for column alignment in the Overall block. 
+ /// + private static string FormatPctSigned(double pct) => pct.ToString("+0.0;-0.0;0.0", System.Globalization.CultureInfo.InvariantCulture).PadLeft(6) + "%"; - System.Console.ForegroundColor = desPctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red; - System.Console.WriteLine($" Deserialize: {desPctAll:+0;-0}% ({acBinaryAvgDes:F2} µs/op vs {memPackAvgDes:F2} µs/op)"); + /// + /// Renders one Overall row with arith / geo / median deltas + AcBinary/MemPack absolute means. + /// Color is driven by the geometric-mean delta (magnitude-neutral signal). Skips silently when + /// stats is null (no paired data). + /// + private static void WriteOverallLine(string label, string unit, OverallStats? stats, string fmt = "F2") + { + if (stats == null) return; + // Color follows geo-mean (the magnitude-neutral signal). The arith-mean column may show a + // different sign when a single big cell dominates — that's exactly the signal we want to surface. + System.Console.ForegroundColor = stats.GeoMeanPct <= 0 ? ConsoleColor.Green : ConsoleColor.Red; + System.Console.WriteLine($" {label,-12} arith {FormatPctSigned(stats.ArithMeanPct)} │ geo {FormatPctSigned(stats.GeoMeanPct)} │ median {FormatPctSigned(stats.MedianPct)} ({stats.AcAvg.ToString(fmt, System.Globalization.CultureInfo.InvariantCulture)} {unit} vs {stats.MpAvg.ToString(fmt, System.Globalization.CultureInfo.InvariantCulture)} {unit}, {stats.CellCount} cells)"); System.Console.ResetColor(); + } - System.Console.ForegroundColor = rtPctAll <= 0 ? ConsoleColor.Green : ConsoleColor.Red; - System.Console.WriteLine($" Round-trip: {rtPctAll:+0;-0}% ({acBinaryAvgRt:F2} µs/op vs {memPackAvgRt:F2} µs/op)"); - System.Console.ResetColor(); - - System.Console.ForegroundColor = sizePctAll <= 0 ? 
ConsoleColor.Green : ConsoleColor.Red; - System.Console.WriteLine($" Size: {sizePctAll:+0;-0}% ({acBinaryAvgSize:F0} B vs {memPackAvgSize:F0} B)"); - System.Console.ResetColor(); - - // Allocation comparison: byte[] API allocates the output array on both sides — delta shows serializer-overhead diff. - if (memPackAvgSerAlloc > 0 && acBinaryAvgSerAlloc > 0) - { - var serAllocPct = (acBinaryAvgSerAlloc / memPackAvgSerAlloc - 1) * 100; - System.Console.ForegroundColor = serAllocPct <= 0 ? ConsoleColor.Green : ConsoleColor.Red; - System.Console.WriteLine($" Ser Alloc: {serAllocPct:+0;-0}% ({acBinaryAvgSerAlloc:F0} B/op vs {memPackAvgSerAlloc:F0} B/op)"); - System.Console.ResetColor(); - } - if (memPackAvgDesAlloc > 0 && acBinaryAvgDesAlloc > 0) - { - var desAllocPct = (acBinaryAvgDesAlloc / memPackAvgDesAlloc - 1) * 100; - System.Console.ForegroundColor = desAllocPct <= 0 ? ConsoleColor.Green : ConsoleColor.Red; - System.Console.WriteLine($" Des Alloc: {desAllocPct:+0;-0}% ({acBinaryAvgDesAlloc:F0} B/op vs {memPackAvgDesAlloc:F0} B/op)"); - System.Console.ResetColor(); - } + /// + /// Same as but appends to a (no color). + /// Used by the .log and .LLM file writers. + /// + private static void AppendOverallLine(StringBuilder sb, string label, string unit, OverallStats? 
stats, string fmt = "F2") + { + if (stats == null) return; + sb.AppendLine($" {label,-12} arith {FormatPctSigned(stats.ArithMeanPct)} | geo {FormatPctSigned(stats.GeoMeanPct)} | median {FormatPctSigned(stats.MedianPct)} ({stats.AcAvg.ToString(fmt, System.Globalization.CultureInfo.InvariantCulture)} {unit} vs {stats.MpAvg.ToString(fmt, System.Globalization.CultureInfo.InvariantCulture)} {unit}, {stats.CellCount} cells)"); } private static void SaveResults(List results, List testDataSets) @@ -3143,39 +3226,17 @@ public static class Program return; } - if (memPackSerResults2.Count > 0 && acBinarySerResults2.Count > 0) - { - // Per-op µs averages (iter-independent) — see comment above the parallel block in PrintSummary. - var memPackAvgSer2 = memPackSerResults2.Average(r => SerPerOp(r)); - var acBinaryAvgSer2 = acBinarySerResults2.Average(r => SerPerOp(r)); - var memPackAvgSerAlloc2 = memPackSerResults2.Average(r => r.SerializeAllocBytesPerOp); - var acBinaryAvgSerAlloc2 = acBinarySerResults2.Average(r => r.SerializeAllocBytesPerOp); - sb.AppendLine($" Serialize: {((acBinaryAvgSer2 / memPackAvgSer2 - 1) * 100):+0;-0}% ({acBinaryAvgSer2:F2} µs/op vs {memPackAvgSer2:F2} µs/op)"); - if (memPackAvgSerAlloc2 > 0) - sb.AppendLine($" Ser Alloc: {((acBinaryAvgSerAlloc2 / memPackAvgSerAlloc2 - 1) * 100):+0;-0}% ({acBinaryAvgSerAlloc2:F0} B/op vs {memPackAvgSerAlloc2:F0} B/op)"); - } + // Per-cell-paired aggregation: arithmetic / geometric / median. See PrintSummary's parallel + // block + the OverallStats record for the rationale (per-cell ratio vs magnitude-weighted mean). 
+ var sizeAcResults2 = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen)).ToList(); + var sizeMpResults2 = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray)).ToList(); - if (memPackDesResults2.Count > 0 && acBinaryDesResults2.Count > 0) - { - var memPackAvgDes2 = memPackDesResults2.Average(r => DesPerOp(r)); - var acBinaryAvgDes2 = acBinaryDesResults2.Average(r => DesPerOp(r)); - var memPackAvgDesAlloc2 = memPackDesResults2.Average(r => r.DeserializeAllocBytesPerOp); - var acBinaryAvgDesAlloc2 = acBinaryDesResults2.Average(r => r.DeserializeAllocBytesPerOp); - sb.AppendLine($" Deserialize: {((acBinaryAvgDes2 / memPackAvgDes2 - 1) * 100):+0;-0}% ({acBinaryAvgDes2:F2} µs/op vs {memPackAvgDes2:F2} µs/op)"); - if (memPackAvgDesAlloc2 > 0) - sb.AppendLine($" Des Alloc: {((acBinaryAvgDesAlloc2 / memPackAvgDesAlloc2 - 1) * 100):+0;-0}% ({acBinaryAvgDesAlloc2:F0} B/op vs {memPackAvgDesAlloc2:F0} B/op)"); - } - - if (memPackRtResults2.Count > 0 && acBinaryRtResults2.Count > 0) - { - var memPackAvgRt2 = memPackRtResults2.Average(r => RtPerOp(r)); - var acBinaryAvgRt2 = acBinaryRtResults2.Average(r => RtPerOp(r)); - sb.AppendLine($" Round-trip: {((acBinaryAvgRt2 / memPackAvgRt2 - 1) * 100):+0;-0}% ({acBinaryAvgRt2:F2} µs/op vs {memPackAvgRt2:F2} µs/op)"); - } - - var memPackAvgSize2 = results.Where(r => (r.Engine == EngineMemoryPack && r.IoMode == IoByteArray)).Average(r => r.SerializedSize); - var acBinaryAvgSize2 = results.Where(r => (r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen)).Average(r => r.SerializedSize); - sb.AppendLine($" Size: {((acBinaryAvgSize2 / memPackAvgSize2 - 1) * 100):+0;-0}% ({acBinaryAvgSize2:F0} B vs {memPackAvgSize2:F0} B)"); + AppendOverallLine(sb, "Serialize", "µs/op", ComputeOverallStats(acBinarySerResults2, memPackSerResults2, SerPerOp)); + AppendOverallLine(sb, "Ser Alloc", "B/op", ComputeOverallStats(acBinarySerResults2, 
memPackSerResults2, r => r.SerializeAllocBytesPerOp), "F0"); + AppendOverallLine(sb, "Deserialize", "µs/op", ComputeOverallStats(acBinaryDesResults2, memPackDesResults2, DesPerOp)); + AppendOverallLine(sb, "Des Alloc", "B/op", ComputeOverallStats(acBinaryDesResults2, memPackDesResults2, r => r.DeserializeAllocBytesPerOp), "F0"); + AppendOverallLine(sb, "Round-trip", "µs/op", ComputeOverallStats(acBinaryRtResults2, memPackRtResults2, RtPerOp)); + AppendOverallLine(sb, "Size", "B", ComputeOverallStats(sizeAcResults2, sizeMpResults2, r => r.SerializedSize), "F0"); File.WriteAllText(logFilePath, sb.ToString(), Utf8NoBom); System.Console.WriteLine($"✓ Results saved to: {logFilePath}"); @@ -3253,6 +3314,36 @@ public static class Program } } + // Overall AcBinary (SGen, Byte[]) vs MemoryPack (Byte[]) comparison — same three aggregations + // as the .log / console output (arithmetic / geometric / median of per-cell ratios). The + // arith mean is magnitude-weighted (Large cell dominates); geo/median are per-cell-equal + // signals. Adding this lets an LLM diagnose whether a headline delta is a real overall + // win/loss or a single-cell artifact. 
+ var memPackByteArrayResults = results.Where(r => r.Engine == EngineMemoryPack && r.IoMode == IoByteArray).ToList(); + var acBinarySGenByteArrayResults = results.Where(r => r.Engine == EngineAcBinary && r.IoMode == IoByteArray && r.DispatchMode == ModeSGen).ToList(); + var memPackSerResultsLlm = memPackByteArrayResults.Where(r => r.SerializeTimeMs > 0).ToList(); + var memPackDesResultsLlm = memPackByteArrayResults.Where(r => r.DeserializeTimeMs > 0).ToList(); + var memPackRtResultsLlm = memPackByteArrayResults.Where(r => r.RoundTripTimeMs > 0).ToList(); + var acBinarySerResultsLlm = acBinarySGenByteArrayResults.Where(r => r.SerializeTimeMs > 0).ToList(); + var acBinaryDesResultsLlm = acBinarySGenByteArrayResults.Where(r => r.DeserializeTimeMs > 0).ToList(); + var acBinaryRtResultsLlm = acBinarySGenByteArrayResults.Where(r => r.RoundTripTimeMs > 0).ToList(); + + if (memPackRtResultsLlm.Count > 0 && acBinaryRtResultsLlm.Count > 0) + { + sb.AppendLine(); + sb.AppendLine("## Overall: AcBinary (Byte[], SGen) vs MemoryPack (Byte[])"); + sb.AppendLine(); + sb.AppendLine("Three aggregations of per-cell results: **arith** = arithmetic mean of µs/op (magnitude-weighted, Large cell dominates); **geo** = geometric mean of per-cell ratios (each cell weighted equally); **median** = median of per-cell ratios (outlier-resistant). Negative % = AcBinary faster/smaller; positive % = MemPack faster/smaller. 
The geo/median variants surface when a single big cell skews the arithmetic mean."); + sb.AppendLine(); + sb.AppendLine("```"); + AppendOverallLine(sb, "Serialize", "µs/op", ComputeOverallStats(acBinarySerResultsLlm, memPackSerResultsLlm, SerPerOp)); + AppendOverallLine(sb, "Ser Alloc", "B/op", ComputeOverallStats(acBinarySerResultsLlm, memPackSerResultsLlm, r => r.SerializeAllocBytesPerOp), "F0"); + AppendOverallLine(sb, "Deserialize", "µs/op", ComputeOverallStats(acBinaryDesResultsLlm, memPackDesResultsLlm, DesPerOp)); + AppendOverallLine(sb, "Des Alloc", "B/op", ComputeOverallStats(acBinaryDesResultsLlm, memPackDesResultsLlm, r => r.DeserializeAllocBytesPerOp), "F0"); + AppendOverallLine(sb, "Round-trip", "µs/op", ComputeOverallStats(acBinaryRtResultsLlm, memPackRtResultsLlm, RtPerOp)); + AppendOverallLine(sb, "Size", "B", ComputeOverallStats(acBinarySGenByteArrayResults, memPackByteArrayResults, r => r.SerializedSize), "F0"); + sb.AppendLine("```"); + } File.WriteAllText(filePath, sb.ToString(), Utf8NoBom); System.Console.WriteLine($"✓ LLM results saved to: {filePath}"); diff --git a/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.cs b/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.cs index 31137b5..09b6611 100644 --- a/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.cs +++ b/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.cs @@ -18,6 +18,32 @@ public class AcBinarySourceGenerator : IIncrementalGenerator { private const string AttributeName = "AyCode.Core.Serializers.Attributes.AcBinarySerializableAttribute"; + // ──────────────────────────────────────────────────────────────────────────────────────────── + // TEMPORARY (2026-05-08) — A/B test feature gates for hot-path overhead measurement. + // + // The generated SGen `WriteProperties` / `ScanObject` methods emit two kinds of overhead-blocks + // that are unconditionally present today but rarely exercised in typical workloads: + // + // 1. 
PropertyFilter guard (`UsePropertyFilter`) — every non-markerless property emit-site + // checks `context.HasPropertyFilter` + filter-context allocation + lambda-call. + // The benchmark workload never sets a property-filter → branch is always false → + // pure overhead (CPU cycles + i-cache pressure on the hot path). + // + // 2. Polymorphic object-with-type-name emit (`UsePolymorphType`) — `System.Object` declared + // properties emit `ObjectWithTypeName` marker + `WriteStringUtf8(AssemblyQualifiedName)` + // under `!context.UseMetadata`. Same: rarely used in typical DTO graphs. + // + // Setting either to `false` skips the corresponding emit at compile time → leaner generated + // code. The bench measures the actual delta vs MemPack apples-to-apples (which has neither + // of these features). + // + // Long-term: these flags will move to `[AcBinarySerializable(UsePropertyFilter = false, ...)]` + // attribute properties so consumers can opt out per type. Until then, keep both `false` for + // benchmark-vs-MemPack measurements; flip to `true` for production where the features are needed. 
+ // ──────────────────────────────────────────────────────────────────────────────────────────── + private const bool UsePropertyFilter = false; + private const bool UsePolymorphType = false; + private static readonly DiagnosticDescriptor CircularReferenceWarning = new( id: "ACBIN001", title: "Circular reference detected", @@ -672,16 +698,21 @@ public class AcBinarySourceGenerator : IIncrementalGenerator } // All non-markerless properties: emit PropertyFilter guard - // When filter returns false, write PropertySkip and skip the property write - sb.AppendLine($"{i}if (context.HasPropertyFilter)"); - sb.AppendLine($"{i}{{"); - sb.AppendLine($"{i} var fc_{p.Name} = new BinaryPropertyFilterContext(obj, typeof({fullTypeName}), \"{p.Name}\", typeof({p.TypeNameForTypeof}), static o => (({fullTypeName})o).{p.Name});"); - sb.AppendLine($"{i} if (!context.PropertyFilter!(in fc_{p.Name}))"); - sb.AppendLine($"{i} {{"); - sb.AppendLine($"{i} context.WriteByte(BinaryTypeCode.PropertySkip);"); - sb.AppendLine($"{i} goto skip_{p.Name};"); - sb.AppendLine($"{i} }}"); - sb.AppendLine($"{i}}}"); + // When filter returns false, write PropertySkip and skip the property write. + // Gated by `UsePropertyFilter` (TEMPORARY const) — `false` skips emit entirely → leaner + // generated code on benchmark workloads where no property-filter is ever set. 
+ if (UsePropertyFilter) + { + sb.AppendLine($"{i}if (context.HasPropertyFilter)"); + sb.AppendLine($"{i}{{"); + sb.AppendLine($"{i} var fc_{p.Name} = new BinaryPropertyFilterContext(obj, typeof({fullTypeName}), \"{p.Name}\", typeof({p.TypeNameForTypeof}), static o => (({fullTypeName})o).{p.Name});"); + sb.AppendLine($"{i} if (!context.PropertyFilter!(in fc_{p.Name}))"); + sb.AppendLine($"{i} {{"); + sb.AppendLine($"{i} context.WriteByte(BinaryTypeCode.PropertySkip);"); + sb.AppendLine($"{i} goto skip_{p.Name};"); + sb.AppendLine($"{i} }}"); + sb.AppendLine($"{i}}}"); + } // Nullable value types always use markered path (need Null marker) if (IsNullableVTKind(p.TypeKind)) @@ -715,14 +746,21 @@ public class AcBinarySourceGenerator : IIncrementalGenerator // System.Object property: runtime type unknown at compile time. // Write ObjectWithTypeName prefix so deserializer can resolve the concrete type. // Use value.GetType() for runtime type dispatch (not typeof(object)). + // Gated by `UsePolymorphType` (TEMPORARY const) — `false` skips the type-name emit + // entirely (deser will use the property's declared type, which is `object` so the + // round-trip would fail on polymorphic instances; safe ONLY when the workload is + // known not to use polymorphic object-typed properties — true for the benchmark). 
sb.AppendLine($"{i}if ({a} == null) context.WriteByte(BinaryTypeCode.PropertySkip);"); sb.AppendLine($"{i}else"); sb.AppendLine($"{i}{{"); - sb.AppendLine($"{i} if (!context.UseMetadata)"); - sb.AppendLine($"{i} {{"); - sb.AppendLine($"{i} context.WriteByte(BinaryTypeCode.ObjectWithTypeName);"); - sb.AppendLine($"{i} context.WriteStringUtf8({a}.GetType().AssemblyQualifiedName!);"); - sb.AppendLine($"{i} }}"); + if (UsePolymorphType) + { + sb.AppendLine($"{i} if (!context.UseMetadata)"); + sb.AppendLine($"{i} {{"); + sb.AppendLine($"{i} context.WriteByte(BinaryTypeCode.ObjectWithTypeName);"); + sb.AppendLine($"{i} context.WriteStringUtf8({a}.GetType().AssemblyQualifiedName!);"); + sb.AppendLine($"{i} }}"); + } sb.AppendLine($"{i} AcBinarySerializer.WriteValueGenerated({a}, {a}.GetType(), context, depth);"); sb.AppendLine($"{i}}}"); } @@ -881,8 +919,9 @@ public class AcBinarySourceGenerator : IIncrementalGenerator var a = $"obj.{p.Name}"; // PropertyFilter: must match write pass — if filter skips property, scan must skip too - // Only for non-markerless properties (matching EmitProp behavior) - if (!IsMarkerless(p.TypeKind)) + // Only for non-markerless properties (matching EmitProp behavior). + // Gated by `UsePropertyFilter` (TEMPORARY const) — same A/B flag as the writer pass. + if (UsePropertyFilter && !IsMarkerless(p.TypeKind)) { sb.AppendLine($"{i}if (context.HasPropertyFilter)"); sb.AppendLine($"{i}{{"); @@ -1849,7 +1888,7 @@ public class AcBinarySourceGenerator : IIncrementalGenerator sb.AppendLine($"{i} {{"); sb.AppendLine($"{i} if (context.FastWire)"); sb.AppendLine($"{i} {{"); - sb.AppendLine($"{i} var fwlen = (int)context.ReadVarUInt();"); + sb.AppendLine($"{i} var fwlen = context.ReadInt32Unsafe();"); sb.AppendLine($"{i} {a} = fwlen == 0 ? 
string.Empty : context.ReadStringUtf8(fwlen);"); sb.AppendLine($"{i} }}"); sb.AppendLine($"{i} else"); diff --git a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs index d741c7f..832c91f 100644 --- a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs +++ b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs @@ -128,6 +128,20 @@ public static partial class AcBinaryDeserializer return value; } + /// + /// Reads a 4-byte signed integer (little-endian on Intel/AMD, native-endian elsewhere). + /// Symmetric with Unsafe.WriteUnaligned<int> on the writer side. Used by FastWire + /// StringSmall reader to grab charLen:int32. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int ReadInt32Unsafe() + { + EnsureAvailable(4); + var value = Unsafe.ReadUnaligned(ref _buffer[_position]); + _position += 4; + return value; + } + /// /// Reads an 8-byte unsigned integer (little-endian on Intel/AMD, native-endian elsewhere). /// Used by H2Q6 StringBig reader to grab packed charLen:32 | utf8Len:32 in a single load. diff --git a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs index 5490bff..fe20de4 100644 --- a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs +++ b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs @@ -1157,8 +1157,9 @@ public static partial class AcBinaryDeserializer { if (context.FastWire) { - // Mode-shared marker: FastWire payload is [VarUInt charCount][UTF-16 raw bytes] - var charLenF = (int)context.ReadVarUInt(); + // Mode-shared marker: FastWire payload is [charLen:int32 LE][UTF-16 raw bytes] + // Fix-int charLen (matches MemPack WriteUtf16 shape) — single 4-byte read, no VarUInt loop. 
+ var charLenF = context.ReadInt32Unsafe(); return context.ReadStringUtf8(charLenF); } diff --git a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs index 034a406..5072623 100644 --- a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs +++ b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.BinarySerializationContext.cs @@ -499,11 +499,14 @@ public static partial class AcBinarySerializer [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int VarUIntSize(uint value) { - if (value < 0x80) return 1; - if (value < 0x4000) return 2; - if (value < 0x200000) return 3; - if (value < 0x10000000) return 4; - return 5; + return value switch + { + < 0x80 => 1, + < 0x4000 => 2, + < 0x200000 => 3, + < 0x10000000 => 4, + _ => 5 + }; } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -750,13 +753,16 @@ public static partial class AcBinarySerializer if (FastWire) { - // FastWire: [StringSmall marker][VarUInt charCount][UTF-16 raw bytes] - // Marker value 91 is mode-shared (Compact StringSmall vs FastWire string marker); - // reader dispatches by deserializer mode, NOT by re-interpreting the marker. - WriteByte(BinaryTypeCode.StringSmall); + // FastWire: [StringSmall marker:1][charLen:int32 LE][UTF-16 raw bytes] + // Fix-int header (no tier-dispatch, no VarUInt branch loop) — matches MemPack `WriteUtf16` + // shape (which emits a fix `int` length). Single Unsafe.WriteUnaligned store on the + // writer; symmetric ReadInt32Unsafe on the reader. 
var byteLenF = charLength * 2; // safe: charLength ≤ 0x1FFFFFFF guarantees no overflow - WriteVarUInt((uint)charLength); - EnsureCapacity(byteLenF); + EnsureCapacity(7 + byteLenF); + var fwPos = _position; + var packed = (ulong)BinaryTypeCode.StringSmall | ((ulong)(uint)charLength << 8); + Unsafe.WriteUnaligned(ref _buffer[fwPos], packed); + _position = fwPos + 5; MemoryMarshal.AsBytes(value.AsSpan()).CopyTo(_buffer.AsSpan(_position, byteLenF)); _position += byteLenF; return; @@ -772,10 +778,12 @@ public static partial class AcBinarySerializer // reserve was Medium (5 byte) — body is left-shifted by 2 bytes to compact. var maxBytes = charLength * 4; - int reserveHeader; - if (charLength <= 63) reserveHeader = 3; - else if (charLength <= 16383) reserveHeader = 5; - else reserveHeader = 9; + int reserveHeader = charLength switch + { + <= 63 => 3, + <= 16383 => 5, + _ => 9 + }; EnsureCapacity(reserveHeader + maxBytes); diff --git a/AyCode.Core/docs/BINARY/BINARY_TODO.md b/AyCode.Core/docs/BINARY/BINARY_TODO.md index 5042ae3..2a6dbcd 100644 --- a/AyCode.Core/docs/BINARY/BINARY_TODO.md +++ b/AyCode.Core/docs/BINARY/BINARY_TODO.md @@ -756,6 +756,8 @@ The optimization-value signal proved below the bench noise floor on the availabl **Re-evaluable as of 2026-05-07 per `ACCORE-BIN-T-D9X3`** — bench stabilization removes the noise-floor that made the original signal unmeasurable; retest before any code change. (Charset bias remains — pair with `ACCORE-BIN-T-C5R8` for CJK validation.) +**Retested 2026-05-08 — REGRESSION CONFIRMED** (Latin1Long charset, stabilized bench): adding the do-while inner loop on both 2-byte and 3-byte tiers in `DecodeUtf8SinglePass` produced **+5-8pp Deser regression on every cell** vs. the switch-jumptable baseline (Small +7.8pp, Medium +7.1pp, Large +5.5pp, Repeated +7.4pp, Deep +4.9pp). Reverted to switch-jumptable single-decode same day. 
The V4N2 entry's original prediction held: "Magyar mixed (KözösCímke, sötét — short alternating runs): 0-5% (run-detection overhead may eat the savings on short runs)" — Latin1Long suffix has 1-2 char average run length, well below the run-detection break-even point. **Phase 2.5 is dead on Magyar mixed.** CJK retest still untried, but Phase 2.5 is now obsoleted by `ACCORE-BIN-T-K7M3` (the decoder hot path runs `Utf8.ToUtf16` BCL static API, not `DecodeUtf8SinglePass`). + **Below: original Phase 2.5 design notes preserved as documentation.** Implementation details remain accurate even though the implementation was reverted. --- @@ -1217,7 +1219,7 @@ Reader-side: SGen-generated code drops the per-property `ReadByte()` + `IsTinyIn - Opt-in flag with default `false` (preserves marker-driven default; consumers explicitly opt in for frozen-schema scenarios) ## ACCORE-BIN-T-V4N3: Symmetric `GetUtf8ByteCount` API + writer-side BCL kihagyás (cold path) -**Priority:** P3 · **Type:** Performance · **Status:** Closed (2026-05-06) · **Related:** `EncodeUtf8SinglePass`, `WriteStringUtf8Internal`, `PropertyMetadataBase.NameUtf8` +**Priority:** P3 · **Type:** Performance · **Status:** Superseded (2026-05-08, by `ACCORE-BIN-T-K7M3`) — landed Closed 2026-05-06; subsequent A/B against modern `Utf8.FromUtf16` / `Utf8.ToUtf16` showed the BCL modern API outperforms the custom transcoder on every benchmark cell, leading to full hot-path switch in K7M3 · **Related:** `EncodeUtf8SinglePass`, `WriteStringUtf8Internal`, `PropertyMetadataBase.NameUtf8`, `ACCORE-BIN-T-K7M3` (hot-path BCL switch) Symmetric byte-count helper for `EncodeUtf8SinglePass`, paired with writer-side BCL `Encoding.UTF8.GetBytes` / `GetByteCount` removal across all cold-path call sites. `Utf8Transcoder.GetUtf8ByteCount(ReadOnlySpan)` SIMD impl (Vector512 / Vector256 / Vector128 / scalar tier hierarchy, 5-popcount closed-form aggregation handling chunk-split surrogate pairs correctly). 
@@ -1235,6 +1237,12 @@ Landed 2026-05-06. All `Utf8TranscoderTests` pass (55/55). Binary test suite unc **Algorithmic correctness lesson** — the initial 4-popcount formula (`3*N - c_lt_0x80 - c_lt_0x800 - 2*highSur`) was wrong on chunks where a surrogate pair straddles the SIMD chunk boundary (it implicitly assumed `lowSur == highSur` per chunk, which is true over the whole well-formed string but NOT per chunk). Fix: 5-popcount closed-form (`3*N - ascii - c_lt_0x800 + highSur - 3*lowSur`), with the scalar tail using the same per-char accounting model (`i += 1` per char regardless of role; high → 4, low → 0, BMP → 3, two-byte → 2, ASCII → 1). Caught by `GetUtf8ByteCount_MultipleEmojiBoundary_MatchesBcl` and `GetUtf8ByteCount_BoundaryAsciiToEmoji_MatchesBcl` regression tests — exactly the `prefixLen` 1, 7 boundaries that exercise chunk-split surrogate pairs. +### Superseded by `ACCORE-BIN-T-K7M3` (2026-05-08) + +The V4N3 audit measured the custom transcoder against the **legacy `Encoding.UTF8.GetBytes`** API and won. **Did NOT measure against the modern `System.Text.Unicode.Utf8.FromUtf16` / `Utf8.ToUtf16` static API** (.NET 7+, used by MemoryPack source-gen). Once `D9X3` stabilized the bench, a direct A/B revealed the BCL modern API outperforms the custom transcoder on **every** cell (Ser deficit -14 to -22pp, Deser flips from behind to ahead). All 8 hot-path call sites switched to BCL in `K7M3`. The `Utf8Transcoder.cs` file is fully commented out — preserved as historical reference. + +The V4N3 algorithmic correctness work (5-popcount surrogate-pair-split-across-chunks closed-form) remains a **valid algorithmic contribution**, but no longer load-bearing on the hot path. 
+ ## ACCORE-BIN-T-V4N4: NativeAOT-specific inlining / codegen audit on hot UTF-8 path **Priority:** P2 · **Type:** Performance · **Status:** Reverted (2026-05-07) — bench instability made the optimization signal unmeasurable · **Related:** `EncodeUtf8SinglePass`, `DecodeUtf8SinglePass`, `WriteStringWithDispatch`, `Utf8Transcoder` SIMD path @@ -1343,7 +1351,7 @@ A V4N4 audit **konklúziója** változatlan érvényes (constant-fold OK, reader **Re-evaluable as of 2026-05-07 per `ACCORE-BIN-T-D9X3`** — bench stabilization removes the noise-floor that made the original signal unmeasurable; retest before any code change. -## ACCORE-BIN-T-V4N5: Dead-code review — `WriteFixStrDirect` + `WriteStringUtf8Internal` +**Obsoleted (2026-05-08) by `ACCORE-BIN-T-K7M3`** — the writer hot path no longer calls the custom `EncodeUtf8SinglePass` at all (`WriteStringWithDispatch` was switched to `Utf8.FromUtf16` BCL). The "AOT method-split / inlining audit" target (`Utf8Transcoder` body method-size in NativeAOT inline budget) is moot — the BCL `Utf8.FromUtf16` is a single static method with its own AOT-friendly inline footprint, and the audit's hypothesis space (Vector256 `IsSupported` constant-fold, lambda delegate cache) was correct for the prior code but no longer applies. The V4N4 disasm methodology remains a **valid technique** for future investigations of generic specialization / inline failures, but the specific hot-path target it analyzed is gone. **Priority:** P3 · **Type:** Refactor / hygiene · **Status:** Closed (2026-05-06) · **Related:** `BinarySerializationContext.cs` V4N3 audit surfaced two methods with no callers in the entire workspace: @@ -1992,4 +2000,443 @@ Header line updated: - Before: `Iterations: 1000 | Warmup: 10000 | Samples: 10 (median) | ...` - After: `Iterations: per-cell adaptive (target ~250 ms/sample) | Warmup: 10000 | Samples: 10 (median) + 1 pilot discarded | ... 
| UnstableCV threshold: 3%` +## ACCORE-BIN-T-K7M3: Hot-path UTF-8 transcoder switch — `Utf8Transcoder` → BCL `Utf8.FromUtf16` / `Utf8.ToUtf16` +**Priority:** P1 · **Type:** Performance · **Status:** Closed (2026-05-08) · **Related:** `ACCORE-BIN-T-V4N3` (custom transcoder origin), `ACCORE-BIN-T-V4N2` (Phase 3 SIMD multi-byte), `ACCORE-BIN-T-V4N4` (Reverted method-split), `ACCORE-BIN-T-D9X3` (bench stabilization that made the comparison measurable) + +The custom `Utf8Transcoder` (V4N3) was originally implemented to bypass `System.Text.Encoding.UTF8.GetBytes` virtual-dispatch + EncoderFallback overhead. The V4N3 audit measured wins vs. the **legacy `Encoding.UTF8`** API. **What it did NOT measure**: the modern `System.Text.Unicode.Utf8.FromUtf16` / `Utf8.ToUtf16` API (.NET 7+, tier-1 optimized, used by MemoryPack `WriteUtf8` / `ReadUtf8` paths internally). Once the bench stabilized (D9X3), a direct A/B comparison surfaced that the BCL modern API consistently outperforms the custom transcoder on the binary serializer's hot path. 
+ +### Bench A/B (Latin1Long charset, FastMode SGen Compact) + +| Cell | Ser delta vs MemPack — custom (`EncodeUtf8SinglePass`) | Ser delta vs MemPack — BCL (`Utf8.FromUtf16`) | Improvement | +|------|--------------------------------------------------------|------------------------------------------------|-------------| +| Small | +28.5% | +7.3% | **-21pp** | +| Medium | +23.8% | +3.1% | -21pp | +| Large | +19.6% | +5.1% | -14pp | +| Repeated | +28.8% | +10.9% | -18pp | +| Deep | +23.1% | +0.6% | -22pp | + +| Cell | Deser delta vs MemPack — custom (`DecodeUtf8SinglePass`) | Deser delta vs MemPack — BCL (`Utf8.ToUtf16`) | Improvement | +|------|---------------------------------------------------------|------------------------------------------------|-------------| +| Small | +17.6% | -1.2% (paritás) | -19pp | +| Medium | +12.8% | -4.7% (AcBinary nyer) | -17pp | +| Large | +4.9% | -10.3% (AcBinary nyer) | -15pp | +| Repeated | +16.9% | -1.6% (paritás) | -18pp | +| Deep | +7.0% | -9.0% (AcBinary nyer) | -16pp | + +The Deser side flipped from "consistently behind" to "wins on 3 of 5 cells, paritás on 2". The Ser side closed the deficit from +20-29% to 0-11%. **Both sides** measurable improvement on **every** cell. + +### Why the custom transcoder lost + +The V4N3 implementation included a 4-tier SIMD ASCII prefix path (Vector512BW / Vector256 / Vector128 / scalar) plus a DWORD ASCII batch + scalar 4-branch multi-byte fallback. **All correct, all SIMD-tuned**. But: + +1. **`Utf8.FromUtf16` is also SIMD-tuned in .NET 9** — the .NET team rewrote it on top of `System.Text.Unicode.Utf8` primitives that share infrastructure with `Ascii.IsValid` / `Latin1.GetString`. AOT-publish-friendly, branch-friendly, no virtual dispatch (the `Utf8` API is static, not via an `Encoding` instance with virtual-method-table). +2. 
**The custom transcoder's ASCII prefix path bails out on first non-ASCII byte** — on multi-byte content (Latin extended / Cyrillic / CJK) the SIMD path runs only for the leading ASCII span, then the entire remainder falls into per-char scalar 4-branch dispatch. The BCL `Utf8.FromUtf16` SIMD-batches multi-byte content too (different algorithm — the BCL doesn't bail on first non-ASCII). +3. **AOT inline budget**: the custom transcoder's body grew with the V4N3 / V4N4 / V4N5 additions; in NativeAOT publish the call sites in `WriteStringWithDispatch` / `ReadString*` did NOT inline (V4N4 disasm audit confirmed). The BCL `Utf8.FromUtf16` is a single static method with a tighter call-site footprint. + +### Resolution + +Landed 2026-05-08. The 8 production hot-path call sites of `Utf8Transcoder.*` switched to BCL: + +| File / line | Before | After | +|---|---|---| +| `AcBinarySerializer.cs:120` | `Utf8Transcoder.GetUtf8ByteCount` | `Encoding.UTF8.GetByteCount` | +| `AcBinarySerializer.BinarySerializationContext.cs:694` | `Utf8Transcoder.EncodeUtf8SinglePass` | `Utf8.FromUtf16(...)` | +| `AcBinarySerializer.BinarySerializationContext.cs:784` | `Utf8Transcoder.EncodeUtf8SinglePass` | `Utf8.FromUtf16(...)` | +| `AcBinarySerializer.BinarySerializationContext.cs:901` | `Utf8Transcoder.EncodeUtf8SinglePass` | `Utf8.FromUtf16(...)` | +| `AcBinaryDeserializer.BinaryDeserializationContext.Read.cs:523` | `Utf8Transcoder.CountUtf8Chars` | `Encoding.UTF8.GetCharCount` | +| `AcBinaryDeserializer.BinaryDeserializationContext.Read.cs:527` | `Utf8Transcoder.DecodeUtf8SinglePass` | `Utf8.ToUtf16(...)` | +| `AcBinaryDeserializer.BinaryDeserializationContext.Read.cs:565` | `Utf8Transcoder.DecodeUtf8SinglePass` | `Utf8.ToUtf16(...)` | +| `PropertyMetadataBase.cs:104-109` (ctor-once) | `Utf8Transcoder.GetUtf8ByteCount` + `EncodeUtf8SinglePass` (two-pass) | `Encoding.UTF8.GetBytes(string)` (single-pass with exact-size byte[] return) | + +The count-only call sites (`GetByteCount` / 
`GetCharCount`) stay on the **legacy** `Encoding.UTF8` API — `System.Text.Unicode.Utf8` has no count-only equivalent (only `FromUtf16` / `ToUtf16` which encode + count combined). For pure count, the legacy API is the optimal tool (single SIMD-tuned scan, no encode/decode work). + +The `Utf8Transcoder.cs` file remains in the repo but **fully commented out** — the class definition is preserved as historical reference / future reactivation if a workload ever surfaces where it could win again. `Utf8TranscoderTests.cs` is not currently exercising live code. + +### Lesson — the V4N3 audit's blind spot + +The V4N3 (custom transcoder) audit compared against **legacy `Encoding.UTF8.GetBytes`** and won. **The audit did NOT compare against `Utf8.FromUtf16`** (the modern API, .NET 7+). On modern runtime the BCL has two UTF-8 transcoders: a legacy one (instance-method on `Encoding`, virtual dispatch) and a modern one (static `Utf8.FromUtf16` / `Utf8.ToUtf16`). MemoryPack uses the modern one — that's what we should have been comparing against from the start. + +**Generalizable lesson**: when measuring a custom implementation against a "BCL baseline", verify which BCL API is used by the actual competition (here: MemoryPack source-gen). The `Encoding.UTF8.*` instance API and `System.Text.Unicode.Utf8` static API are different generations of the same logical operation; treating them as interchangeable hides the comparison's scope. 
+ +### Why P1 + +- Closed the FastMode Compact mode Ser deficit from +20-29% to ≤11% on every cell (Latin1Long benchmark) +- Flipped the Deser side from a +5-18% deficit to **AcBinary winning on 3 of 5 cells**, parity on 2 (Latin1Long benchmark) +- One-time fixed cost (8 production call-site cseréje) — every future bench profits +- Removed a load-bearing ~600-line custom SIMD module from the maintained surface area; future maintainers don't need to reason about Vector512BW / cross-lane shuffle / 5-popcount surrogate-pair correctness — the BCL handles it + +### Follow-up — `Utf8Transcoder.cs` cleanup + +The file is fully commented out. Either: +- **Delete** entirely (preferred for repo cleanliness) — `Utf8TranscoderTests.cs` then needs deletion or revival as a regression-only guard +- **Keep** the comment-block as historical reference, with a header comment pointing to this entry + +Decision deferred — the comment-block does no harm to build / runtime. Address when the next docs-archive sweep runs. + +## ACCORE-BIN-T-P3X7: Profile-driven Compact-mode Ser optimalizációs roadmap (post-K7M3 hot-path analysis) +**Priority:** P2 · **Type:** Performance roadmap · **Status:** Open · **Related:** `ACCORE-BIN-T-K7M3` (BCL UTF-8 transcoder switch — előfeltétele), `ACCORE-BIN-T-D9X3` (bench stabilization), `ACCORE-BIN-T-S2X9` (markerless schema lane — primitív property-marker már kivezetve a SGen-ben), `ACCORE-BIN-T-V4N4` (audit methodológia hivatkozás) + +A 2026-05-08 VS Performance Profiler session (4 sec range, AcBinary FastMode Serialize, Latin1Long charset, FastWire mode) konkrét hot-path-decomposition-t adott a K7M3 BCL-csere utáni állapotról. A string-encoding már nem akadály (a `Utf8.FromUtf16` SIMD-tuned), a fennmaradó AcBinary-specific overhead azonosítható.
+ +### Profile session adatok (Self CPU%) + +| Self CPU% | Function | Category | +|---|---|---| +| 39.77% | `System.Buffer._Memmove` | Közös MemPack-kel (UTF-16 raw + return-time `byte[]`-copy) — **NEM AcBinary-spec** | +| **10.03%** | `AcBinarySerializer.Serialize` | Top-level (context-acquire, type lookup, return-alloc) | +| **7.48%** | `TestMeasurementPoint_GeneratedWriter.WriteProperties` | SGen template (legkisebb levél típus, ~12500 hívás Large cellán) | +| **5.31%** | `WriteStringWithDispatch` | String hot path | +| **3.23%** | `TestMeasurement_GeneratedWriter.WriteProperties` | SGen | +| **1.66%** | `WriteVarUIntMultiByteUnsafe` | VarUInt int-property encode | +| 1.10% | `TestPallet_GeneratedWriter.WriteProperties` | SGen | +| 0.39% | `TestOrderItem_GeneratedWriter.WriteProperties` | SGen | +| 0.32% | `SharedUser_GeneratedWriter.WriteProperties` | SGen | +| 0.05% | `ArrayBinaryOutput.Grow` | Buffer-grow (ritka, kicsi probléma) | + +**Total SGen `WriteProperties` Self CPU**: ~12.6% — a legnagyobb AcBinary-specific surface. + +A `AcBinarySerializer.Serialize` line-szintű drill-down (`AcBinarySerializer.cs:312-335`): +- `WriteObject(value, wrapper, context, 0)` Total: 28.05% — a teljes serializációs fa (SGen + Writer hot path) +- `context.Output.ToArray(context._buffer, context._position)` Total: **47.37%** — final `byte[]`-alloc + content-memcpy (= a 39.77% `_Memmove` Self nagy része) + +### MemPack-összehasonlítás (referenciaként) + +A MemPack `Serialize(T value)` mechanizmus: +1. **`[ThreadStatic]`** writer-state — nincs pool-bérlés, nincs lock, nincs concurrent dictionary lookup +2. **`ReusableLinkedArrayBufferWriter`** — linked chunk-list (4 KB → 8 KB → 16 KB geometriai); buffer-grow = új chunk hozzáadása, **nincs memcpy a régi adaton** +3.
**`ToArrayAndReset()`** — végén alloc + chunks → byte[] memcpy (közös overhead az AcBinary-vel) + +Az AcBinary `AcquireArrayOutputContext(options)` pool-bérlés + lineáris `byte[]` `Array.Resize` + `Output.ToArray(...)` — két memcpy-cost (grow + return), de a grow ritka. + +### Sorrendezett optimalizációs ötletek + +#### A. SGen `WriteProperties` — ensure-capacity batching (várt: -1-3pp Ser, **revíziós becslés**) + +Jelenlegi SGen-template per-property emit (mindenenkit külön ensure): +```csharp +context.WriteVarInt(obj.Id); // ensure(5) + write(1-5) +context.WriteByte(BinaryTypeCode.Object); // ensure(1) + write(1) +context.WriteVarInt((int)obj.Status); // ensure(5) + write(1-5) +context.WriteRaw(obj.Weight); // ensure(8) + write(8) +``` + +Csoportosított ensure pattern: +```csharp +context.EnsureCapacity(maxBytesForGroup); // worst-case sum, 1× hívás +context.WriteVarIntUnsafe(obj.Id); // no ensure (csak buffer write) +context.WriteByteUnsafe(BinaryTypeCode.Object); // no ensure +context.WriteVarIntUnsafe((int)obj.Status); +context.WriteRawUnsafe(obj.Weight); +``` + +A `AcBinarySourceGenerator.cs` `WriteProperties` template-jét kell módosítani: +1. Property-listából contiguous primitív csoportok kinyerése (Object/Collection property-knél megszakítva — mély rekurzió, méret nem előre kiszámítható) +2. Csoportonként worst-case-size compute compile-time-on (a primitív type-ok mérete fix vagy worst-case ismert) +3. Egyetlen `EnsureCapacity(sum)` + bulk `*Unsafe` write-ok + +`*Unsafe` írók szükségessége: `WriteVarUIntUnsafe` már létezik. **`WriteByteUnsafe`, `WriteRawUnsafe`** valószínűleg hozzá kell adni a `BinarySerializationContext`-hez. + +**Becslés-revízió (2026-05-08)**: az eredeti -4-6pp becslés felső volt. Egy `EnsureCapacity` inline-olva ~1-2 ns/call (a hot path-on a branch-prediction perfekt — sosem jut el a Grow-hoz). 
10 property × 1.5 ns = ~15 ns / object megtakarítás batch-eléssel — Latin1Long Large cell 1250 instance × 13 ns = ~16 µs / 120 µs Ser ≈ **~13% felső**, de **csak az ensure-szám csökkenéséből**. A SGen `WriteProperties` Self CPU 12.6%-a **NEM csak** ensure-check; tartalmaz `HasPropertyFilter` branch-check, null-check + depth-check dispatch, `Unsafe.As` cast, etc. — lásd **F**. Az ensure-batching önmagában reálisan **1-3pp Ser javulás**. + +**Wire-formátum változatlan**, backward-kompatibilis, kis kockázat. Hatás minden cellán mérhető (TestOrder cell-szerkezet ~100+ primitív property per Object-instance). + +#### B. `WriteStringWithDispatch` Compact ág batch-write (várt: -1-2pp Ser) + +A FastWire ágat már `K7M3`-ban + a 2026-05-08 batch-write fixxel egyetlen ensure + direct-write-ra alakítottuk. A **Compact ág** ugyanaz a 3-step pattern (post-encode tier-shift `CopyTo` ha `actualHeader < reserveHeader`, plus header-write a tier alapján). A Compact ágon is alkalmazható batch-write — egyetlen `EnsureCapacity` a worst-case-tier-szel + direct header-write a `Utf8.FromUtf16` után. + +#### C. Thread-static context (várt: -2-4pp Ser, NAGY refactor) + +A `AcquireArrayOutputContext(options)` pool-bérlés overhead-jét mérsékelheti a MemPack `[ThreadStatic]` mintázat. A jelenlegi pool-bérlés: +- Pool dictionary lookup (lehet, lock-os) +- Context-state init / reset minden hívásnál + +Thread-static cseréje: +- Per-thread cached context, nincs lock +- Context-reset minden hívásnál ugyanaz, de a `state` allokáció egyszer fut + +**Refactor szempontok**: +- A `BinarySerializationContext` state-tárolása nem thread-safe önmagában — pool-bérlés vagy thread-static mind a single-thread használatot biztosítja +- Az `options` paraméter érintheti a state-init logikát — multi-options scenárió esetén a thread-static state-t reset-elni kell +- Concurrent serialize hívások (több thread egyidejű) — minden thread saját state-tel rendelkezne; nincs cross-thread sharing igény + +#### D.
Linked-array buffer chunk strategy (kicsi hatás, NAGY refactor) + +A MemPack `ReusableLinkedArrayBufferWriter` linked chunk-list helyettesíti a lineáris `byte[]`-grow stratégiát. Buffer-grow = új chunk hozzáadása (no memcpy a régi adaton). + +**A profile szerint a `ArrayBinaryOutput.Grow` Self CPU csak 0.05%** — a buffer-grow ritkán fut, a default kapacitás elég nagy a Large cell-hez. **Kicsi hatás, nagy refactor**. Alacsony prioritás. + +#### F. SGen `HasPropertyFilter` lift-out a `WriteProperties` method elejére (várt: -2-4pp Ser) + +A jelenlegi SGen-template **minden property-emit előtt** ellenőrzi a property-filter-t: +```csharp +public void WriteProperties(object value, ...) +{ + var obj = Unsafe.As(value); + + if (context.HasPropertyFilter) // ← MINDEN property-en check! + { + var fc_Category = new BinaryPropertyFilterContext(obj, ..., "Category", ...); + if (!context.PropertyFilter!(in fc_Category)) { + context.WriteByte(BinaryTypeCode.PropertySkip); + goto skip_Category; + } + } + if (obj.Category == null) context.WriteByte(BinaryTypeCode.PropertySkip); + else if (depth > context.MaxDepth) context.WriteByte(BinaryTypeCode.Null); + else { context.WriteByte(BinaryTypeCode.Object); ...WriteProperties... } + skip_Category:; + + if (context.HasPropertyFilter) { /* same for Inspector */ } // ← újra! + // ... 10× ismétlés property-listán +} +``` + +A `HasPropertyFilter` per-property branch-check **TestOrder benchmark workload-on mindig false** (a benchmark nem használ property-filter-t). De a check minden property-en lefut — kód-cache-ben benne van, branch-predict ugyan jó, **mégis CPU cycle**. 
+ +Optimalizáció — kétpályás SGen kódgenerálás: +```csharp +public void WriteProperties(object value, ..., int depth) +{ + var obj = Unsafe.As(value); + + if (context.HasPropertyFilter) + { + WritePropertiesWithFilter(obj, context, depth); // ritka path — full per-property check + return; + } + + // Fast path — NO filter check anywhere + if (obj.Category == null) context.WriteByte(BinaryTypeCode.PropertySkip); + else if (depth > context.MaxDepth) context.WriteByte(BinaryTypeCode.Null); + else { ... } + // (no skip_Category goto — never needed) + + context.WriteVarInt(obj.Id); // primitív, no filter check + // ... rest of properties without HasPropertyFilter check +} + +// Külön emit-elt method ritka path-ra: +private static void WritePropertiesWithFilter(TestPallet obj, ..., int depth) +{ + // Full per-property filter-aware kód (the current behavior) +} +``` + +A `AcBinarySourceGenerator.cs`-t kell módosítani: +1. A `WriteProperties` method elején egyetlen `HasPropertyFilter` check +2. Két különböző code-path emit: + - **Fast path** (default — no filter): nincs per-property `if (context.HasPropertyFilter)` check, nincs filter-context allokáció + lambda-call, nincs `goto skip_X` + - **Slow path** (filter aware — separate static method): a jelenlegi viselkedés + +**Várt nyereség**: a fast path ~10 elimináció / object × 1-2 ns / branch ≈ ~15-20 ns / object. Latin1Long Large cell 1250 instance × 18 ns = ~22 µs / 120 µs Ser ≈ **~18% felső becslés**; reálisan **2-4pp Ser javulás** (a kód-bloat növekedés és a JIT inlinelés-ráhatás miatt mérséklődik). + +**Kombinálható az A-val**: az **A + F** együtt **3-7pp javulás** célozható meg — a SGen `WriteProperties` 12.6% Self CPU jelentős csökkenése. + +**Wire-formátum változatlan**, kód-méret kicsivel nő (két path-ot generál minden type-on), de a fast path a JIT-tel jobban inlinelhető. + +#### G. 
SGen `WriteProperties` null/depth/object-ref kombinálás (kapcsolt az F-hez) + +A komplex (Object) property-knél a 3-ágú dispatch: +```csharp +if (obj.X == null) context.WriteByte(BinaryTypeCode.PropertySkip); +else if (depth > context.MaxDepth) context.WriteByte(BinaryTypeCode.Null); +else { context.WriteByte(BinaryTypeCode.Object); X_GeneratedWriter.Instance.WriteProperties(...); } +``` + +Ez minden komplex property-en fut. Lehetséges optimalizáció: a `depth > MaxDepth` check egy method-szintű branch-szé alakítás (egyszer ellenőrizni a method elején, aztán a property-szintű ágat egyszerűsíteni). De ez **kis hatás** és a `MaxDepth` jellemzően nem érintő (a legtöbb workload-on `depth < MaxDepth`). + +Alacsony prio, F-tel kombinált. + +#### E. `WriteVarUIntMultiByteUnsafe` (1.66% Self) → fix-int (várható: -1pp Ser, **NEM javasolt önmagában**) + +A `WriteVarInt` (signed int property-encode, ZigZag + VarUInt) kódolás a SGen-template-ekben gyakori (Id, Status, TrayCount, stb.). A multi-byte ág 1.66% Self CPU. + +Fix-int (4 byte) cseréje wire-méret-növekedéssel jár (kis int-eken +3 byte / property), ami a wire-formátum kompaktság-előnyét rontja. **Csak `ACCORE-BIN-T-S2X9` markerless lane kontextusban** érdemes — ahol a property-marker eltávolításával együtt fix-int kicserélése wire-szempontból kompenzálódik. + +### Közös, NEM AcBinary-spec overhead — nem optimalizálható + +A `Buffer._Memmove` 39.77% Self CPU + a `Output.ToArray()` 47.37% Total **a return-time `byte[]`-alloc + content-memcpy**, ami minden `byte[] Serialize(T)` hívásnál fut. **Mindkét engine fizeti** (MemPack `ToArrayAndReset()` is alloc + memcpy a chunkokból). Az API contract (`byte[] Serialize(T)`) miatt elkerülhetetlen. + +**Aki teljesítményt akar**, használja a `IBufferWriter` overload-ot (`AcBinaryBufferWriterBenchmark` vs `MemoryPackBufferWriterBenchmark` apples-to-apples a benchmarkban — mindkét engine ugyanezt csinálja). 
+ +### Acceptance (per-section) + +- **A** (SGen ensure-batching): Latin1Long FastWire bench AcBinary Ser delta vs MemPack -1-3pp javulás minden cellán +- **F** (HasPropertyFilter lift-out): Latin1Long Ser delta -2-4pp; **A + F együtt** SGen `WriteProperties` Self CPU ≤ 8% (jelenleg ~12.6%) +- **G** (null/depth/object-ref kombinálás): kis hatás, F-tel kombinált +- **B** (WriteStringWithDispatch Compact batch-write): Latin1Long Compact bench AcBinary Ser delta vs MemPack ≤ +5% minden cellán +- **C** (Thread-static context): `Serialize` Self CPU ≤ 6% (jelenleg ~10%) +- **D** (Linked-array): nem prioritás — buffer-grow Self CPU már ≤ 0.05% +- **E** (VarInt → fix-int): csak az `S2X9` markerless lane sprint kontextusában mérni + +### Sorrend + +1. **A + F kombinálva** — SGen `WriteProperties` template átfogó refactor (ensure-batching + HasPropertyFilter lift-out + esetleg G null/depth-combine). Együtt **~3-7pp Ser javulás** várt minden cellán. Izolált változtatás csak `AcBinarySourceGenerator.cs`-en, wire-format változatlan. +2. **B** — ~1-2pp javulás, ugyanaz a pattern mint a `K7M3` FastWire batch-write +3. **C** — ~2-4pp, de NAGY refactor (thread-safety, pool semantics felülvizsgálat) +4. **D** — alacsony prioritás (kis hatás, nagy refactor) +5. **E** — csak `S2X9` kontextusban + +### Trigger + +- **A + F** → most azonnal implementálható; ezek a SGen template-en belül kombinálandók (egyetlen template-átdolgozás kétségtelenül jobb mint külön refactor-körök). Minden további mérés ettől függ.
+- **B** → A+F után, hasonló pattern alkalmazása más writer-helyen +- **C** → ha a Serialize Self CPU 10% továbbra is dominál A+F+B után +- **D, E** → opcionális, az A/F/B/C eredmények alapján + +## ACCORE-BIN-T-Q5T2: Önleíró wire-formátum — duplikált object-marker-ek + UTF-16 string marker (per-type/property encoding choice) +**Priority:** P2 · **Type:** Architecture / Performance · **Status:** Open · **Related:** `ACCORE-BIN-T-P3X7` (profile-driven roadmap — kis-adat slowdown diagnózis), `ACCORE-BIN-T-K7M3` (BCL UTF-8 transcoder — előfeltétele), `ACCORE-BIN-T-S2X9` (markerless schema lane), `ACCORE-BIN-T-V4N2` (UTF-8 SIMD) + +A 2026-05-08 design-session során merült fel mint válasz a kis-adat-slowdown problémára és az `if (FastWire)` / `if (UseMetadata)` runtime-branch-ek széles jelenlétére. Cél: a wire-mode kivezetése a globális header-ből, **per-object/per-property encoding-szabadság** attribute-tal, megőrizve a SGen↔Runtime wire-kompatibilitást. + +### LLM Context (cold-start) + +Egy fresh session olvasásához ez a kontextus elég: + +**Wire-modell**: AcBinary két párhuzamos serializációs path-ot futtat — **SGen** (compile-time generált, `[AcBinarySerializable]` típusokra) és **Runtime** (reflection + `Expression.Compile`). **Mindkettő ugyanazt a wire-t produkálja és olvassa** (interop garancia, `BINARY_SGEN.md` "Hybrid Execution Model"). + +**Markerless body**: object scope-on belül a primitív property-k (int, long, double, …) **közvetlenül** írnak a wire-be, marker-byte nélkül. A reader a sorrendet compile-time schema-ból (SGen) vagy `OrderedProperties` metadata-ból (Runtime) tudja. A wire object-prefix-szel kezdődik (1-byte marker), majd markerless body. 
+ +**Meglévő object-marker család** (`AcBinarySerializer.BinarySerializationContext.cs` writer-ek + `AcBinaryDeserializer.cs` reader-dispatch switch): +- `Object` — sima first-occurrence +- `ObjectWithTypeName` — polimorf (`runtimeType != declaredType`) +- `ObjectFullMarkerIId` / `ObjectFullMarkerAll` — `RefHandling=IId|All` first-occurrence +- `ObjectRef` / `ObjectRefIId` — subsequent (csak ID, **NEM duplikálódik** — nincs primitív property körülötte) + +**OPT-OUT minta** (jelenlegi konvenció): default SGen flexibilis — minden runtime-branch-et generál (pl. `if (context.UseRefHandling)`). Class-attribute disable-eli a feature-t → SGen omitti a branch-et → drasztikus optimum. Q5T2 ezt a mintát terjeszti ki **encoding-választásra**. + +**Naming-konvenció**: PascalCase, suffix-variánsok (`Object` → `ObjectVarUInt`, `String` → `StringUtf16`). NEM `Object_NoZZ`, NEM `ObjVU`. + +### Motiváció + +A jelenlegi `AcBinaryOptions.WireMode` (FastMode vs Compact) **payload-szintű globális flag**: +- A kódban sok `if (FastWire) { ... } else { ... }` branch (lásd `WriteVarInt` 514. sor, `WriteStringWithDispatch`, `WriteValueNonPrimitive`, property-writers) +- A fejlesztő nem optimalizálhat granuláris szinten (pl. `[NoZZ]` egy hot type-ra, default másnak) +- Schema-evolúciós szempontból: ha a szerver attribute-ot változtat egy type-on, a klienseknek (akár régebbi verzió) **rekomp nélkül** olvasniuk kell az új wire-t + +A `ACCORE-BIN-T-P3X7` profile-bench mérése szerint a kis-adat slowdown (Latin1Long Small +2.6%, Medium +1.5% AcBinary lassulás MemPack-hez képest) jelentős részben a VarUInt per-call overhead-ből származik (ZigZag shift + multi-byte branch loop). A type-szintű `[IntEncoding=VarUInt]` attribute-tal a fejlesztő a non-negative property-ket VarUInt-NoZigZag-ra állíthatja → ZigZag shift kiesik, kis-adatra mérhető nyereség. 
+ +### Wire-formátum design + +**5 új `BinaryTypeCode` marker** (naming TBD: `*VarUInt` vagy `*NoZZ` suffix, implementációkor véglegesítendő): + +| Új marker | Cél | Alkalmazási hely | +|---|---|---| +| `ObjectVarUInt` | Object scope primitive int/long/enum-jai NoZigZag VarUInt encoding-ban | sima object first-occurrence | +| `ObjectWithTypeNameVarUInt` | Polimorf first-occurrence NoZZ-variánsa | `runtimeType != declaredType` esetén | +| `ObjectFullMarkerIIdVarUInt` | `RefHandling=IId` first-occurrence NoZZ-variánsa | csak first; subsequent `ObjectRefIId` változatlan | +| `ObjectFullMarkerAllVarUInt` | `RefHandling=All` first-occurrence NoZZ-variánsa | csak first; subsequent `ObjectRef` változatlan | +| `StringUtf16` | UTF-16 encoded string content (property-szintű) | bárhol egy string property emit-jénél | + +**Wire-példa**: +``` +[ObjectVarUInt marker] ← scope-szintű: int-property-k VarUInt-NoZZ + WriteVarUInt(obj.Id) ← markerless body, encoding a marker alapján + WriteVarUInt(obj.Status) + [String marker] UTF-8(obj.Notes) ← default UTF-8 + [StringUtf16 marker] UTF-16(obj.Name) ← property-szintű override +``` + +**Byte-szintű példa** (`Order { Id=42, Status=3, Notes="ok" }`, class-szintű `IntEncoding=VarUInt`): +- Default ZigZag wire: `[Object]` `[0x54]` (VarInt 42 ZigZag: `((42<<1)^(42>>31))=84`) `[0x06]` (VarInt 3 ZigZag: 6) `[String]` `[0x02]` `0x6F 0x6B` +- New VarUInt wire: `[ObjectVarUInt]` `[0x2A]` (VarUInt 42 raw: `0x2A`) `[0x03]` (VarUInt 3 raw: `0x03`) `[String]` `[0x02]` `0x6F 0x6B` +- Body-sorrend és byte-szám változatlan; csak az encoding-szabályok mások. Stringek ugyanúgy markered (UTF-8 default itt). String-encoding override esetén `[StringUtf16]` `[char-count]` `[2-byte-per-char]`. + +A primitive property-k körüli wire **markerless marad** — a body-encoding-ot az object-marker határozza meg, nem per-property byte. Wire-bloat csak ott van, ahol most is van marker (object-prefix, string-marker). 
+ +### Attribute design + +**Object-szintű** (mert object-marker is object-szintű): +```csharp +[AcBinarySerializable(IntEncoding = IntEncoding.VarUInt)] +public class Order { ... } +``` + +**Property-szintű** (csak string-en, mert string-marker is per-property): +```csharp +public class Order { + [AcBinaryEncoding(StringEncoding.Utf16)] + public string CustomerName { get; set; } +} +``` + +**Új public API elemek**: +- `AcBinaryEncodingAttribute` (target: `Class | Property`) +- `IntEncoding` enum (`Default` = ZigZag VarInt, `VarUInt` = NoZigZag) +- `StringEncoding` enum (`Default` = UTF-8, `Utf16` = UTF-16) +- `AcBinaryOptions.IntEncoding` és `AcBinaryOptions.StringEncoding` runtime fallback opciók + +### Encoding-választás precedenciája (writer-side) + +1. **Property attribute** (legerősebb) — pl. `[AcBinaryEncoding(StringEncoding.Utf16)]` +2. **Class attribute** — pl. `[AcBinarySerializable(IntEncoding=VarUInt)]` +3. **`AcBinaryOptions` runtime opció** — pl. `options.StringEncoding = Utf16` +4. **Built-in default** — ZigZag-VarInt + UTF-8 + +### Szerepkörök és path-ok + +| Path | Encoding-választás | +|---|---| +| **SGen writer (with attribute)** | Compile-time pinned, hard-coded marker + encoding emit (NO runtime branch) — a meglévő OPT-OUT minta (mint `RefHandling`/`Interning` disable) | +| **SGen writer (no attribute)** | Runtime branch a `context.IntEncoding`/`context.StringEncoding` option-en — két path generálódik, runtime dönt | +| **SGen reader** | **Marker-dispatch** (NEM hard-coded marker-expect — runtime-on dönti el, hogy `Object` vagy `ObjectVarUInt` érkezett, és annak megfelelően olvas) | +| **Runtime writer (reflection-based)** | Reflection-attribute-read + option fallback + default fallback — ugyanaz a precedencia mint SGen-nél | +| **Runtime reader** | Marker-dispatch (universal — nincs attribute / option használat encoding-döntésre, csak a marker-byte) | + +⚠️ **SGen reader marker-dispatch KÖTELEZŐ** (NEM hard-coded marker-expect). 
Konkrét scenario, amit ez kezel: + +> Szerver Runtime-mode-ban serializálja `Order`-t. Az `Order` osztályon a szerver-deploy óta **változott az attribute** (új deploy hozott `[IntEncoding=VarUInt]`-ot). Szerver Runtime writer reflection-ből olvassa az új attribute-ot → `ObjectVarUInt` markert emit-el a wire-be. +> +> Régi kliens **rekomp nélkül** kapja a payload-ot. Ha a kliens SGen reader-e hard-coded `Object`-marker-expect-tel olvasna → **pánik / mismatch**. +> +> Marker-dispatch-szel a kliens helyesen dekódol bármelyik markert, függetlenül attól, hogy a kliens-oldali compile-time `Order` type-ján volt-e az attribute. + +Ez biztosítja a **"server-side attribute-change doesn't break clients"** garanciát. + +### Kompatibilitási garanciák + +| Interakció | Eredmény | +|---|---| +| SGen-write (NoZZ attr) → SGen-read | OK (marker-dispatch) | +| SGen-write (NoZZ attr) → Runtime-read | OK (marker-dispatch) | +| Runtime-write (option=NoZZ) → SGen-read | OK (marker-dispatch) | +| Runtime-write (option=NoZZ) → Runtime-read | OK (marker-dispatch) | +| Server-attribute-changed → old client (no recompile) | OK — kliens csak a marker-t olvassa | +| Mixed payload (egyik object NoZZ, másik default) | OK — minden object-marker önálló scope | + +### Implementációs lépések + +1. **`BinaryTypeCode` const-bővítés** — 5 új byte-érték (range-allokáció: a meglévő enum szervezése alapján a következő szabad slot-okba). Wire-format spec frissítés `BINARY_FORMAT.md`-ben. +2. **`AcBinaryEncodingAttribute` + `IntEncoding` + `StringEncoding` enum-ok** — új fájlok az `AyCode.Core/Serializers/Binaries/` mappában. +3. **`AcBinaryOptions.IntEncoding` + `AcBinaryOptions.StringEncoding`** opciók hozzáadása (default = `Default`). +4. **`WriteStringUtf16` / `ReadStringUtf16` context-helper-ek** — `MemoryMarshal.Cast` direct copy + length-prefix (VarUInt char-count). +5. 
**Runtime writer reflection** — `BinarySerializeTypeMetadata` cache: `IntEncoding`, `StringEncoding`-per-property flag-ek (attribute alapján). Encoding-emit a precedencia szerint. +6. **SGen writer template** — attribute-feldolgozás `EmitWriteValue`-ban: ha attribute → compile-time hard-coded emit; ha nincs → runtime-branch emit a `context` option-en. +7. **SGen reader template** — `EmitReadValue` marker-dispatch-szel (object-marker scope-encoding-mode tracking + string-marker per-property dispatch). +8. **Runtime reader update** — object-marker dispatch a scope-encoding-state-be (pl. `BinaryDeserializationContext.CurrentIntEncoding`), string-marker per-property dispatch. +9. **Cross-mode tesztek** — minden write-read kombináció (SGen↔SGen, SGen↔Runtime, Runtime↔SGen, Runtime↔Runtime) minden encoding-kombinációban (default, attr-only, option-only, attr+option, mixed payload). +10. **Doc**: `BINARY_FORMAT.md` wire-format spec, `BINARY_OPTIONS.md` új opciók, `BINARY_SGEN.md` precedencia + szerepkörök táblázat. + +### Acceptance + +- 5 új BinaryTypeCode marker, naming-konvenció dokumentált +- `AcBinaryEncodingAttribute` + 2 enum + 2 opció extension working +- Round-trip teszt minden cross-mode kombinációban zöld +- Wire-bloat default-encoding-on **0 byte** (nincs új per-property marker) +- Latin1Long Small bench: AcBinary `[IntEncoding=VarUInt]` típuson a slowdown ≤ MemPack +0.5pp (jelenleg +2.6%) +- `BINARY_FORMAT.md`/`BINARY_OPTIONS.md`/`BINARY_SGEN.md` szinkronban a wire- és attribute-világgal +- A meglévő `WireMode=Fast/Compact` distinction-ek kompatibilisek maradnak (vagy migrálódnak az új encoding-attribute-okra — külön döntés implementációkor) + +### Trigger / Sorrend + +Implementáció **ne kezdődjön** azonnal — az `ACCORE-BIN-T-P3X7` A+F szekciói (SGen ensure-batching + HasPropertyFilter lift-out) **előbb mérendők**. 
Ha az A+F már lehozza az SGen `WriteProperties` Self CPU-t ≤ 8%-ra, és a kis-adat slowdown ettől már ≤ +1pp, akkor ez a Q5T2 entry **alacsony prioritásra** kerül. Ha a kis-adat slowdown az A+F után is megmarad → a Q5T2 implementáció **indokolt**. + +Egyéb prerekvizit: `ACCORE-BIN-T-W9F1` (compile-time metadata) szinkronizálás — a Runtime writer reflection-attribute-read-je beleilleszthető a generált metadata-ba, így a runtime path attribute-alapú encoding-választása is gyorsabb lesz. + +### Open kérdések (implementációkor eldöntendő) + +- **Marker naming**: `ObjectVarUInt` (semantic, az encoding alapján) vagy `ObjectNoZZ` (rövidebb)? +- **`[AcBinarySerializable]`-on belül** vegyük fel az `IntEncoding` paramétert, vagy **külön `[AcBinaryEncoding]` attribute** legyen object-szinten is (és az `[AcBinarySerializable]` változatlan)? +- **`AcBinaryOptions.WireMode` jövője**: a régi `Fast`/`Compact` enum migrálódjon az új `IntEncoding`/`StringEncoding`-ra (BC-break) vagy maradjon mint shortcut-default? +