diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 6acd6b6..9a34168 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -116,7 +116,11 @@ "Bash(iconv -f UTF-16LE -t UTF-8 \"AyCode.Core.Serializers.SourceGenerator.csproj\")", "Bash(mv \"AyCode.Core.Serializers.SourceGenerator.csproj.utf8\" \"AyCode.Core.Serializers.SourceGenerator.csproj\")", "Bash(rm -rf .vs/AyCode.Core/v17 .vs/AyCode.Core/v16 .vs/ProjectEvaluation .vs/CopilotSnapshots)", - "Bash(find .vs -maxdepth 2 -type d)" + "Bash(find .vs -maxdepth 2 -type d)", + "Bash(git -C \"H:/Applications/Aycode/Source/AyCode.Core\" diff HEAD -- \"AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs\" \"AyCode.Core/Serializers/Binaries/AcBinarySerializer.cs\")", + "Bash(DOTNET_TieredCompilation=0 DOTNET_JitDisasm='*GeneratedWriter*' dotnet run --project AyCode.Benchmark/AyCode.Benchmark.csproj -c Release -- --jitasm)", + "Bash(echo \"EXIT=$?\")", + "Bash(awk -F: '$1>8289')" ] } } diff --git a/AyCode.Benchmark/AcBinaryVsMemPackBenchmark.cs b/AyCode.Benchmark/AcBinaryVsMemPackBenchmark.cs index 2dbe082..7d832d3 100644 --- a/AyCode.Benchmark/AcBinaryVsMemPackBenchmark.cs +++ b/AyCode.Benchmark/AcBinaryVsMemPackBenchmark.cs @@ -50,6 +50,13 @@ public class AcBinaryVsMemPackBenchmark [GlobalSetup] public void Setup() { + // BDN runs each benchmark in an isolated child process — the parent's charset selection (a static + // field) does NOT cross the process boundary, so the child would otherwise fall back to the + // compile-time default (Latin1Long). Pin the BDN serializer benchmark to Latin1Short here so its + // cells line up with the Console Latin1Short runs. (Mirrored in BdnSummaryAdapter.WriteResults + // for the parent process — .LLM charset label + Size(B) column.) + BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.Latin1Short; + var allTestData = BenchmarkTestDataProvider_All_False.CreateTestDataSets(); var testDataSet = (TestDataSet)allTestData.First(t => t.Name.StartsWith(TestData)); diff --git a/AyCode.Benchmark/BdnSummaryAdapter.cs b/AyCode.Benchmark/BdnSummaryAdapter.cs index 2396492..5bf8c7e 100644 --- a/AyCode.Benchmark/BdnSummaryAdapter.cs +++ b/AyCode.Benchmark/BdnSummaryAdapter.cs @@ -40,6 +40,12 @@ public static class BdnSummaryAdapter /// public static void WriteResults(Summary summary) { + // Parent-process counterpart of AcBinaryVsMemPackBenchmark.Setup's charset pin: the BDN child + // processes ran Latin1Short, but this adapter runs in the parent process where LongStringSuffix + // would still be the compile-time default (Latin1Long). Set it so GetCharsetName() labels the + // .LLM correctly AND the CreateTestDataSets()/CreateWorkload calls below compute matching Size(B). + BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.Latin1Short; + var allTestData = BenchmarkTestDataProvider_All_False.CreateTestDataSets(); var results = Translate(summary, allTestData); var ctx = CreateContext(); @@ -74,9 +80,10 @@ public static class BdnSummaryAdapter /// /// Looks up the human-readable name for the currently-active - /// charset. Mirrors Console's Configuration.GetCurrentCharsetName — Console's Menu sets the - /// charset before invoking the bench; for BDN the default charset (Latin1Long) is in effect unless the user - /// overrides at runtime. + /// charset. Mirrors Console's Configuration.GetCurrentCharsetName. The BDN serializer benchmark + /// pins the charset to Latin1Short — set in (parent process) and in + /// AcBinaryVsMemPackBenchmark.Setup (child process); see those sites for the process-isolation + /// rationale (BDN's per-benchmark child processes don't inherit a parent static-field mutation). /// private static string GetCharsetName() { diff --git a/AyCode.Benchmark/JitDisassemblyBenchmark.cs b/AyCode.Benchmark/JitDisassemblyBenchmark.cs index 431cc4d..6e5e1c5 100644 --- a/AyCode.Benchmark/JitDisassemblyBenchmark.cs +++ b/AyCode.Benchmark/JitDisassemblyBenchmark.cs @@ -1,91 +1,59 @@ +using AyCode.Core.Serializers; using AyCode.Core.Serializers.Binaries; using AyCode.Core.Tests.TestModels; -using BenchmarkDotNet.Attributes; -using BenchmarkDotNet.Diagnosers; -using BenchmarkDotNet.Jobs; namespace AyCode.Core.Benchmarks; /// -/// JIT disassembly benchmark for AcBinarySerializer hot path analysis. -/// Shows actual x64 assembly generated by the JIT to verify inlining decisions. -/// -/// Usage: dotnet run -c Release -- --filter *JitDisassemblyBenchmark* -/// Or from Program.cs: --jitasm -/// -/// Output: BenchmarkDotNet artifacts folder contains .asm files with full disassembly. -/// Look for: -/// - WritePropertyOrSkip / WritePropertyMarkerless: are they inlined or called? -/// - WriteInt32 / WriteFloat64Unsafe / etc.: inlined into the caller or separate calls? -/// - context parameter passing: register usage (RCX/RDX/R8/R9) +/// Direct JIT-disassembly harness for the AcBinary Large Serialize hot path — the cell where +/// the PGO-driven inline bistability shows up (~120 µs/op fast mode ⇄ ~142 µs/op slow mode, same +/// source, run-to-run). +/// +/// Not a BenchmarkDotNet benchmark. BDN's DisassemblyDiagnoser produced no output +/// ("No benchmarks were disassembled"); this harness leans on the runtime's own JIT disassembler +/// instead. builds the workload and exercises the Large Ser FastMode path — when the +/// process is launched with DOTNET_JitDisasm=<pattern> the JIT dumps the x64 assembly of +/// every matching method to stdout as it compiles them. +/// +/// Run it (via the --jitasm switch). Set: +/// +/// DOTNET_TieredCompilation=0 — each method compiled once, straight to full-opt Tier-1: +/// deterministic codegen, no tiering/PGO lottery (so the disasm is reproducible). +/// DOTNET_JitDisasm=<pattern> — e.g. *GeneratedWriter* for the SGen writer +/// hot loop. The un-inlined calls in that loop are the candidates for the PGO-flipped inline +/// site; pinning the right callee with [MethodImpl(AggressiveInlining)] locks in the fast mode. +/// +/// +/// The workload mirrors exactly — Large (5×5×5×10) +/// graph, AsciiShort charset, FastMode + Compact wire. /// -[SimpleJob(RuntimeMoniker.Net90)] -[DisassemblyDiagnoser(maxDepth: 4, printSource: true, exportGithubMarkdown: true)] -[MemoryDiagnoser(displayGenColumns: false)] -public class JitDisassemblyBenchmark +public sealed class JitDisassemblyBenchmark { - private TestOrder_All_True _order = null!; - private AcBinarySerializerOptions _fastModeOptions = null!; - private AcBinarySerializerOptions _defaultOptions = null!; - private byte[] _serializedFastMode = null!; - private byte[] _serializedDefault = null!; - - [GlobalSetup] - public void Setup() - { - TestDataFactory.ResetIdCounter(); - var sharedTag = TestDataFactory.CreateTag("SharedTag_All_True"); - var sharedUser = TestDataFactory.CreateUser("shareduser"); - - // Medium data: enough properties to show loop behavior, not too large for disassembly - _order = TestDataFactory.CreateOrder( - itemCount: 3, - palletsPerItem: 3, - measurementsPerPallet: 3, - pointsPerMeasurement: 4, - sharedTag: sharedTag, - sharedUser: sharedUser); - - _fastModeOptions = AcBinarySerializerOptions.FastMode; - _defaultOptions = AcBinarySerializerOptions.Default; - _serializedFastMode = AcBinarySerializer.Serialize(_order, _fastModeOptions); - _serializedDefault = AcBinarySerializer.Serialize(_order, _defaultOptions); - } - /// - /// FastMode serialize — no ref tracking, no string interning. + /// Builds the Large workload and JITs + exercises the Large Ser FastMode hot path. With + /// DOTNET_JitDisasm set, the JIT emits the matching methods' disassembly to stdout on + /// first compile; the loop guarantees every reachable serializer method is JIT-compiled (and, + /// if tiering is left on, promoted to Tier-1). /// - [Benchmark(Baseline = true)] - public byte[] Serialize_FastMode() + public void Run() { - return AcBinarySerializer.Serialize(_order, _fastModeOptions); - } + // Mirror AcBinaryVsMemPackBenchmark exactly: AsciiShort charset (where the Large-Ser bimodality + // was observed), Large (5×5×5×10) TestOrder_All_False graph, FastMode + Compact wire. + BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.AsciiShort; - /// - /// FastMode deserialize. - /// - [Benchmark] - public TestOrder_All_True Deserialize_FastMode() - { - return AcBinaryDeserializer.Deserialize(_serializedFastMode, _fastModeOptions); - } + var allTestData = BenchmarkTestDataProvider_All_False.CreateTestDataSets(); + var largeSet = (TestDataSet)allTestData.First(t => t.Name.StartsWith("Large")); + var order = largeSet.Order; - /// - /// Default serialize — ref tracking + string interning (scan pass + write pass). - /// Shows IdentityMap lookup overhead in hot path. - /// - [Benchmark] - public byte[] Serialize_Default() - { - return AcBinarySerializer.Serialize(_order, _defaultOptions); - } + var options = AcBinarySerializerOptions.FastMode; + options.WireMode = WireMode.Compact; - /// - /// Default deserialize — ref tracking + string interning. - /// - [Benchmark] - public TestOrder_All_True Deserialize_Default() - { - return AcBinaryDeserializer.Deserialize(_serializedDefault, _defaultOptions); + Console.WriteLine("=== JIT-DISASM HARNESS: Large Ser FastMode (TestOrder_All_False, AsciiShort) — start ==="); + + byte[] last = null!; + for (var i = 0; i < 50; i++) + last = AcBinarySerializer.Serialize(order, options); + + Console.WriteLine($"=== JIT-DISASM HARNESS: done — 50 Large Ser ops, last payload {last.Length} bytes ==="); } } diff --git a/AyCode.Benchmark/Program.cs b/AyCode.Benchmark/Program.cs index f36379a..bd7d97b 100644 --- a/AyCode.Benchmark/Program.cs +++ b/AyCode.Benchmark/Program.cs @@ -116,8 +116,11 @@ namespace AyCode.Benchmark if (args.Length > 0 && args[0] == "--jitasm") { - WithProcessStabilization(() => - RunBenchmark(config, benchmarkDir, memDiagDir, "JitDisassemblyBenchmark")); + // Direct JIT-disasm harness — NOT BenchmarkDotNet. BDN's DisassemblyDiagnoser produced + // nothing here ("No benchmarks were disassembled"); this leans on the runtime's own JIT + // disassembler instead. Launch with DOTNET_TieredCompilation=0 + DOTNET_JitDisasm= + // (e.g. *GeneratedWriter*) — the JIT dumps the matching methods' x64 asm to stdout. + new JitDisassemblyBenchmark().Run(); return; } diff --git a/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.GenReader.cs b/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.GenReader.cs index a7ccfd2..e68326c 100644 --- a/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.GenReader.cs +++ b/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.GenReader.cs @@ -99,15 +99,15 @@ public partial class AcBinarySourceGenerator return; } - // ACCORE-BIN-T-K9M3 Ötlet A (refined) — caller-driven hot/cold split. SGen-emit reads the marker - // byte locally + dispatches FastWire/PropertySkip checks at the call site; the shared - // BinaryDeserializationContext.TryReadStringProperty handles only the hot marker switch - // (small body → high inline confidence). Cold markers go through TryReadStringColdPath - // (AggressiveOptimization, Tier-1 direct). The || short-circuit ensures cold is called only - // when hot didn't match — common case has zero method-call overhead beyond the inlined Try body. - // PropertySkip lands in the cold path's "return false" sink, so the property is left at default - // (don't-touch contract preserved). enableInternString stays a no-op at the emit site (StringInterned - // sits inside the cold path body now — writer-side feature gating handles non-emission). + // ACCORE-BIN-T-K9M3 — caller-driven string marker dispatch. SGen-emit reads the marker byte + // locally + handles FastWire on a separate branch; BinaryDeserializationContext.TryReadStringProperty + // decodes every non-interning marker (FixStrAscii / StringAscii / StringSmall/Medium/Big / Null / + // StringEmpty) in one inlinable body. The 3 interning markers go through TryReadStringColdPath + // (AggressiveOptimization, Tier-1 direct). enableInternString gates the `|| TryReadStringColdPath` + // emit: interning-enabled types get the short-circuit; non-interning types omit the cold call + // entirely — the writer never produces interning markers for them, so TryReadStringProperty alone + // is total. PropertySkip / unknown → TryReadStringProperty returns false → property left at + // default (don't-touch contract preserved). if (p.TypeKind == PropertyTypeKind.String) { sb.AppendLine($"{i}if (context.FastWire)"); @@ -118,7 +118,10 @@ public partial class AcBinarySourceGenerator sb.AppendLine($"{i}{{"); sb.AppendLine($"{i} var tc_{p.Name} = context.ReadByte();"); sb.AppendLine($"{i} string? v_{p.Name};"); - sb.AppendLine($"{i} if (context.TryReadStringProperty(tc_{p.Name}, out v_{p.Name}) || context.TryReadStringColdPath(tc_{p.Name}, out v_{p.Name}))"); + if (enableInternString) + sb.AppendLine($"{i} if (context.TryReadStringProperty(tc_{p.Name}, out v_{p.Name}) || context.TryReadStringColdPath(tc_{p.Name}, out v_{p.Name}))"); + else + sb.AppendLine($"{i} if (context.TryReadStringProperty(tc_{p.Name}, out v_{p.Name}))"); sb.AppendLine($"{i} {{"); sb.AppendLine($"{i} {a} = v_{p.Name}!;"); sb.AppendLine($"{i} }}"); diff --git a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs index 6e4f645..c0d47f3 100644 --- a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs +++ b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs @@ -437,7 +437,7 @@ public static partial class AcBinaryDeserializer return null; // len < 0 (sentinel -1) } - [MethodImpl(MethodImplOptions.NoInlining)] + [MethodImpl(MethodImplOptions.NoInlining)] public string ReadStringUtf8(int length) { if (length == 0) @@ -692,10 +692,14 @@ public static partial class AcBinaryDeserializer var packed = ReadUInt64Unsafe(); var charLength = (int)(uint)packed; var byteLength = (int)(uint)(packed >> 32); + +#if DEBUG // Single bitwise-OR + sign-test catches negative casts from corrupted-wire uint values // (when the wire-side uint > Int32.MaxValue, the (int)(uint) cast yields a negative int). // Predict-friendly: always false on a valid wire. if ((charLength | byteLength) < 0) ThrowCorruptedBigWire(charLength, byteLength); +#endif + return ReadStringUtf8WithCharLen(charLength, byteLength); } @@ -775,86 +779,117 @@ public static partial class AcBinaryDeserializer } /// - /// ACCORE-BIN-T-K9M3 Ötlet A (refined) — property-level string **hot**-marker dispatch. - /// The caller is responsible for reading the marker byte and handling FastWire; this method - /// dispatches the hot markers only (FixStrAscii, StringSmall, Null, StringEmpty) inline. - /// Caller protocol (from SGen-emit): - /// - /// if (context.FastWire) { - /// obj.X = context.ReadStringUtf16Markerless()!; - /// } else { - /// var tc = context.ReadByte(); - /// string? v; - /// if (context.TryReadStringProperty(tc, out v) || context.TryReadStringColdPath(tc, out v)) { - /// obj.X = v!; - /// } - /// // else: PropertySkip / unknown marker → property left at default (don't-touch contract) - /// } - /// - /// Returns: true if a hot marker matched ( set — - /// includes deliberate null on ); false if the - /// marker is not in the hot set — caller short-circuits via || to - /// . - /// Body kept minimal so AggressiveInlining stays effective: only the marker dispatch - /// (4-case hot switch + FixStrAscii range check in default). FastWire short-circuit, ReadByte, - /// PropertySkip and cold-marker dispatch are all the caller's responsibility — splitting these - /// out of the body keeps the inliner's complexity-budget calculation favourable. ACCORE-BIN-T-K9M3 - /// Ötlet A v1 (which kept FastWire/PropertySkip/cold-call inside the body) regressed by ~3-5% on - /// the Des side because the JIT bailed on inlining; this refined split aims to fit the inline budget. + /// ACCORE-BIN-T-K9M3 — property-level string marker dispatch for all **non-interning** markers + /// (FixStrAscii, StringAscii, StringSmall/Medium/Big, Null, StringEmpty). The 3 interning markers + /// (StringInterned, StringInternFirst{Small,Medium}) are handled by the companion + /// , which the SGen-emit calls via || ONLY for + /// interning-enabled types — non-interning types omit the cold call entirely. + /// Size discipline: each UTF-8 tier case decodes only the header (charLen + utf8Len) + /// into locals; the heavy decode runs at a SINGLE post-switch site per family — + /// for the UTF-8 tiers, + /// for ASCII (discriminated by the byteLength < 0 sentinel). Adding a tier costs one tiny + /// header decode, not a duplicated decode body — so the 7-marker body still fits AggressiveInlining. + /// Caller reads the marker byte + handles FastWire on a separate branch (markerless decode); + /// by the time reaches here FastWire is guaranteed false. Returns true + /// if a non-interning marker matched ( set — incl. deliberate null on + /// ); false for an interning marker / PropertySkip / unknown. /// [MethodImpl(MethodImplOptions.AggressiveInlining)] internal bool TryReadStringProperty(byte tc, out string? value) { // Hot-path invariant: SGen-emit + property-marker callers MUST short-circuit FastWire on a // separate ag (markerless decode) — so by the time the marker byte reaches this switch, - // FastWire is guaranteed false. The StringSmall case therefore calls ReadStringSmallCompact - // directly. Mode-aware call sites (Dictionary key/value emit, runtime cross-type populate, - // TypeReaderTable lambda) inline the `FastWire ? FW : Compact` ternary themselves. + // FastWire is guaranteed false (the StringSmall body is the Compact-mode decode). value = null; + int charLength; + int byteLength; + switch (tc) { - case BinaryTypeCode.StringSmall: value = ReadStringSmallCompact(); return true; - case BinaryTypeCode.Null: return true; - case BinaryTypeCode.StringEmpty: value = string.Empty; return true; + case BinaryTypeCode.StringSmall: + { + // [charLen:8][utf8Len:8] + var header = ReadTwoBytesUnsafe(); + charLength = (byte)header; + byteLength = (byte)(header >> 8); + break; + } + case BinaryTypeCode.StringMedium: + { + // [charLen:16 LE][utf8Len:16 LE] — single uint load + var packed = ReadUInt32Unsafe(); + charLength = (ushort)packed; + byteLength = (ushort)(packed >> 16); + break; + } + case BinaryTypeCode.StringBig: + { + // [charLen:32 LE][utf8Len:32 LE] — single ulong load + corrupted-wire guard + var packed = ReadUInt64Unsafe(); + charLength = (int)(uint)packed; + byteLength = (int)(uint)(packed >> 32); + +#if DEBUG + if ((charLength | byteLength) < 0) ThrowCorruptedBigWire(charLength, byteLength); +#endif + + break; + } + case BinaryTypeCode.StringAscii: + // Long ASCII: [VarUInt byteLen]. byteLength = -1 sentinel → routes to the ASCII tail. + charLength = (int)ReadVarUInt(); + byteLength = -1; + + break; + case BinaryTypeCode.Null: + return true; + case BinaryTypeCode.StringEmpty: + value = string.Empty; + + return true; default: - // Hot path: FixStrAscii (short ASCII string values — property codes, IDs, names). + // FixStrAscii (short ASCII — property codes, IDs, names): the marker carries the length. if (BinaryTypeCode.IsFixStrAscii(tc)) { - var falen = BinaryTypeCode.DecodeFixStrAsciiLength(tc); - value = falen == 0 ? string.Empty : ReadAsciiBytesAsString(falen); - return true; + charLength = BinaryTypeCode.DecodeFixStrAsciiLength(tc); + byteLength = -1; // ASCII sentinel + break; } - break; + // Interning marker, PropertySkip, or unknown — caller continues via short-circuit || + // to TryReadStringColdPath (interning types) or leaves the property at default. + return false; } - // Cold marker, PropertySkip, or unknown — caller continues via short-circuit || - // to ; value left at null. - return false; + + // Single per-family decode site. ASCII (byteLength < 0): charLength IS the byte count + // (1:1 widen, no UTF-8 decode). UTF-8 tiers: 1-pass decode with both lengths from the wire. + value = byteLength < 0 ? ReadAsciiBytesAsString(charLength) : ReadStringUtf8WithCharLen(charLength, byteLength); + return true; } /// - /// Cold-path companion to . Dispatches the **cold** markers - /// (StringMedium / StringBig / StringAscii long / StringInterned / InternFirst*). Returns - /// true if a cold marker matched (caller assigns to the - /// property); false if the marker is or an - /// unknown / corrupted value (caller leaves the property untouched — the safer behaviour for - /// wire corruption). - /// forces Tier-1 direct compilation - /// — the body is too large for AggressiveInlining (6 marker cases + decode-helpers), but the - /// compile-once Tier-1 quality makes the rare-marker dispatch path predictable and tight. The - /// caller pays one method-call cost only when the wire actually carries a cold marker. + /// Interning-marker companion to — dispatches the 3 interning + /// markers only (StringInterned, StringInternFirstSmall, StringInternFirstMedium). Every other + /// string marker (FixStrAscii, StringAscii, StringSmall/Medium/Big, Null, StringEmpty) is handled + /// by ; this method is emitted into generated readers ONLY for + /// types whose string-interning feature flag is enabled — non-interning types skip it entirely + /// (the writer never produces interning markers for them, so + /// alone is total). + /// Returns true if an interning marker matched; false for + /// or an unknown / corrupted value (caller leaves the + /// property untouched — the safer behaviour for wire corruption). + /// forces Tier-1 direct compilation — + /// the caller pays one method-call cost only when the wire actually carries an interning marker. /// [MethodImpl(MethodImplOptions.AggressiveOptimization)] internal bool TryReadStringColdPath(byte tc, out string? value) { switch (tc) { - case BinaryTypeCode.StringMedium: value = ReadStringMedium(); return true; - case BinaryTypeCode.StringBig: value = ReadStringBig(); return true; - case BinaryTypeCode.StringAscii: value = ReadPlainStringAscii(); return true; case BinaryTypeCode.StringInterned: value = GetInternedString((int)ReadVarUInt()); return true; case BinaryTypeCode.StringInternFirstSmall: value = ReadAndRegisterInternedStringSmall(); return true; case BinaryTypeCode.StringInternFirstMedium: value = ReadAndRegisterInternedStringMedium(); return true; } + // PropertySkip OR unknown marker — caller leaves the property at default value // (safer than the previous silent null-assignment on unknown). value = null;