From b8d0d85c9977173e0c7d228b542d4008376d49a1 Mon Sep 17 00:00:00 2001 From: Loretta Date: Tue, 19 May 2026 12:58:22 +0200 Subject: [PATCH] Refactor charset profiles; split StringSmall decode paths - Benchmark charset profiles are now length-consistent: all *Short = 40 chars, all *Long = 280 chars, across ASCII, Latin1, CJK BMP, Cyrillic, and Mixed. - `CharsetSuffixes` was rewritten with new profiles and base-string repetition for compile-time constants. - Menu/configuration updated for new profiles, selection logic, and improved descriptions. - Docs updated to reflect new profiles, lengths, and serialization tier impacts. - `StringSmall` deserialization split into `ReadStringSmallCompact` and `ReadStringSmallFastWire`; all call sites now dispatch by mode, clarifying the hot path. - SGen codegen and runtime dispatch tables updated for the new decode split. - Binary marker docs clarified: only Intern/Metadata/Polymorph features are wire-symmetric for reader case omission; RefHandling is not. - Added `BINARY_STRICT_SGEN.md` planning doc for a SGen-only, attribute-required, AOT-friendly NuGet package. 
--- AyCode.Benchmark/BdnSummaryAdapter.cs | 11 +- .../Configuration.cs | 11 +- AyCode.Core.Serializers.Console/Menu.cs | 61 +++++++--- AyCode.Core.Serializers.Console/README.md | 25 ++-- .../AcBinarySourceGenerator.GenReader.cs | 5 +- .../TestModels/BenchmarkTestDataProvider.cs | 79 +++++++++---- ...lizer.BinaryDeserializationContext.Read.cs | 52 +++++--- .../Binaries/AcBinaryDeserializer.cs | 11 +- .../BINARY/BINARY_BYTECODE_OPTIMIZATION.md | 10 +- AyCode.Core/docs/BINARY/BINARY_STRICT_SGEN.md | 111 ++++++++++++++++++ AyCode.Core/docs/BINARY/BINARY_TODO.md | 55 +++++++-- AyCode.Core/docs/BINARY/README.md | 1 + 12 files changed, 347 insertions(+), 85 deletions(-) create mode 100644 AyCode.Core/docs/BINARY/BINARY_STRICT_SGEN.md diff --git a/AyCode.Benchmark/BdnSummaryAdapter.cs b/AyCode.Benchmark/BdnSummaryAdapter.cs index 41ce16c..746a970 100644 --- a/AyCode.Benchmark/BdnSummaryAdapter.cs +++ b/AyCode.Benchmark/BdnSummaryAdapter.cs @@ -82,11 +82,16 @@ public static class BdnSummaryAdapter return s switch { CharsetSuffixes.Latin1FixAscii => "Latin1FixAscii", + CharsetSuffixes.AsciiShort => "AsciiShort", + CharsetSuffixes.AsciiLong => "AsciiLong", CharsetSuffixes.Latin1Short => "Latin1Short", CharsetSuffixes.Latin1Long => "Latin1Long", - CharsetSuffixes.CjkBmp => "CjkBmp", - CharsetSuffixes.Cyrillic => "Cyrillic", - CharsetSuffixes.Mixed => "Mixed", + CharsetSuffixes.CjkBmpShort => "CjkBmpShort", + CharsetSuffixes.CjkBmpLong => "CjkBmpLong", + CharsetSuffixes.CyrillicShort => "CyrillicShort", + CharsetSuffixes.CyrillicLong => "CyrillicLong", + CharsetSuffixes.MixedShort => "MixedShort", + CharsetSuffixes.MixedLong => "MixedLong", _ => "Custom" }; } diff --git a/AyCode.Core.Serializers.Console/Configuration.cs b/AyCode.Core.Serializers.Console/Configuration.cs index 29a3da7..f456372 100644 --- a/AyCode.Core.Serializers.Console/Configuration.cs +++ b/AyCode.Core.Serializers.Console/Configuration.cs @@ -85,11 +85,16 @@ internal static class Configuration return s switch 
{ CharsetSuffixes.Latin1FixAscii => "Latin1FixAscii", + CharsetSuffixes.AsciiShort => "AsciiShort", + CharsetSuffixes.AsciiLong => "AsciiLong", CharsetSuffixes.Latin1Short => "Latin1Short", CharsetSuffixes.Latin1Long => "Latin1Long", - CharsetSuffixes.CjkBmp => "CjkBmp", - CharsetSuffixes.Cyrillic => "Cyrillic", - CharsetSuffixes.Mixed => "Mixed", + CharsetSuffixes.CjkBmpShort => "CjkBmpShort", + CharsetSuffixes.CjkBmpLong => "CjkBmpLong", + CharsetSuffixes.CyrillicShort => "CyrillicShort", + CharsetSuffixes.CyrillicLong => "CyrillicLong", + CharsetSuffixes.MixedShort => "MixedShort", + CharsetSuffixes.MixedLong => "MixedLong", _ => "Custom" }; } diff --git a/AyCode.Core.Serializers.Console/Menu.cs b/AyCode.Core.Serializers.Console/Menu.cs index 25f1aed..7dce829 100644 --- a/AyCode.Core.Serializers.Console/Menu.cs +++ b/AyCode.Core.Serializers.Console/Menu.cs @@ -107,12 +107,19 @@ internal static class Menu System.Console.WriteLine("─────────────────────────────────────────────"); System.Console.WriteLine($"Current: {Configuration.GetCurrentCharsetName()}"); System.Console.WriteLine(); - System.Console.WriteLine(" [1] Latin1FixAscii — empty suffix; short FixStr-fast-path stress (Latin1 baseline values stay short)"); - System.Console.WriteLine(" [2] Latin1Short — \" árvíztűrő tükörfúrógép\" (~24 char Hungarian mixed)"); - System.Console.WriteLine(" [3] Latin1Long — ~47-char Latin1 mixed (default; exceeds FixStr boundary)"); - System.Console.WriteLine(" [4] CjkBmp — CJK BMP (long 3-byte runs)"); - System.Console.WriteLine(" [5] Cyrillic — Russian Cyrillic (long 2-byte runs)"); - System.Console.WriteLine(" [6] Mixed — Hungarian + CJK + Cyrillic + emoji (full-spectrum + surrogate pairs)"); + System.Console.WriteLine(" All *Short = 40 char, all *Long = 280 char (= Short × 7) — length-consistent across charsets."); + System.Console.WriteLine(); + System.Console.WriteLine(" [1] Latin1FixAscii — empty suffix; baseline-only short values → FixStrAscii tier"); + 
System.Console.WriteLine(" [2] AsciiShort — 40 char pure ASCII (quic × 8) → StringAscii tier"); + System.Console.WriteLine(" [3] AsciiLong — 280 char pure ASCII → StringAscii tier"); + System.Console.WriteLine(" [4] Latin1Short — 40 char Hungarian (árví × 8) → StringSmall tier"); + System.Console.WriteLine(" [5] Latin1Long — 280 char Hungarian (default) → StringMedium tier"); + System.Console.WriteLine(" [6] CjkBmpShort — 40 char CJK BMP (3-byte runs) → StringSmall tier"); + System.Console.WriteLine(" [7] CjkBmpLong — 280 char CJK BMP → StringMedium tier"); + System.Console.WriteLine(" [8] CyrillicShort — 40 char Cyrillic (2-byte runs) → StringSmall tier"); + System.Console.WriteLine(" [9] CyrillicLong — 280 char Cyrillic → StringMedium tier"); + System.Console.WriteLine(" [0] MixedShort — 40 char multi-codepage → StringSmall tier"); + System.Console.WriteLine(" [A] MixedLong — 280 char multi-codepage → StringMedium tier"); System.Console.WriteLine(" [B] Back"); System.Console.Write("\nSelection: "); @@ -126,24 +133,44 @@ internal static class Menu System.Console.WriteLine("✓ Charset set to Latin1FixAscii"); return; case '2': + BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.AsciiShort; + System.Console.WriteLine("✓ Charset set to AsciiShort"); + return; + case '3': + BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.AsciiLong; + System.Console.WriteLine("✓ Charset set to AsciiLong"); + return; + case '4': BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.Latin1Short; System.Console.WriteLine("✓ Charset set to Latin1Short"); return; - case '3': + case '5': BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.Latin1Long; System.Console.WriteLine("✓ Charset set to Latin1Long"); return; - case '4': - BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.CjkBmp; - System.Console.WriteLine("✓ Charset set to CjkBmp"); - return; - case '5': - BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.Cyrillic; - 
System.Console.WriteLine("✓ Charset set to Cyrillic"); - return; case '6': - BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.Mixed; - System.Console.WriteLine("✓ Charset set to Mixed"); + BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.CjkBmpShort; + System.Console.WriteLine("✓ Charset set to CjkBmpShort"); + return; + case '7': + BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.CjkBmpLong; + System.Console.WriteLine("✓ Charset set to CjkBmpLong"); + return; + case '8': + BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.CyrillicShort; + System.Console.WriteLine("✓ Charset set to CyrillicShort"); + return; + case '9': + BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.CyrillicLong; + System.Console.WriteLine("✓ Charset set to CyrillicLong"); + return; + case '0': + BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.MixedShort; + System.Console.WriteLine("✓ Charset set to MixedShort"); + return; + case 'a': + BenchmarkTestDataProvider.LongStringSuffix = CharsetSuffixes.MixedLong; + System.Console.WriteLine("✓ Charset set to MixedLong"); return; case 'b': return; diff --git a/AyCode.Core.Serializers.Console/README.md b/AyCode.Core.Serializers.Console/README.md index fd4484a..80a7313 100644 --- a/AyCode.Core.Serializers.Console/README.md +++ b/AyCode.Core.Serializers.Console/README.md @@ -38,16 +38,23 @@ Workload + reporting types — `ISerializerBenchmark`, `BenchmarkResult`, `Bench ## Charset profiles (Menu → Settings → Charset) -Controls the `BenchmarkTestDataProvider.LongStringSuffix` — the string-tail appended to property values. Influences string-marker selection on the wire (FixStr vs StringSmall / Medium / Big), interning hit rates, and UTF-8 encode cost. +Controls the `BenchmarkTestDataProvider.LongStringSuffix` — the string-tail appended to property values. 
Influences string-marker selection on the wire (FixStrAscii vs StringSmall / Medium / Big / StringAscii), interning hit rates, and UTF-8 encode cost. -| Profile | Content | -|---|---| -| `Latin1FixAscii` | Empty suffix (short FixStr fast-path stress) | -| `Latin1Short` | "árvíztűrő tükörfúrógép" (~24 char Hungarian mixed) | -| `Latin1Long` | ~47-char Latin1 mixed (default) | -| `CjkBmp` | CJK BMP (3-byte UTF-8 runs) | -| `Cyrillic` | Russian Cyrillic (2-byte UTF-8 runs) | -| `Mixed` | Hungarian + CJK + Cyrillic + emoji (full-spectrum + surrogate pairs) | +**Consistent length across all charsets** (UTF-16 char count): every `*Short` = 40 char, every `*Long` = 280 char (= Short × 7). Isolates the workload variable to UTF-8 byte content per charset (1-byte ASCII vs 2-byte Latin1 / Cyrillic vs 3-byte CJK vs mixed) — wire-size and encode/decode cost differences are pure charset effects, not length effects. + +| Profile | UTF-16 char | UTF-8 byte (approx) | Tier | +|---|---|---|---| +| `Latin1FixAscii` | 0 | 0 | FixStrAscii / FixStr-equivalent (baseline-only) | +| `AsciiShort` | 40 | 40 | StringAscii (167) | +| `AsciiLong` | 280 | 280 | StringAscii (167) | +| `Latin1Short` | 40 | ~56 | StringSmall (91) | +| `Latin1Long` (**default**) | 280 | ~392 | StringMedium (94) | +| `CjkBmpShort` | 40 | ~104 | StringSmall | +| `CjkBmpLong` | 280 | ~728 | StringMedium | +| `CyrillicShort` | 40 | ~72 | StringSmall | +| `CyrillicLong` | 280 | ~504 | StringMedium | +| `MixedShort` | 40 | ~80 | StringSmall | +| `MixedLong` | 280 | ~560 | StringMedium | ## CLI diff --git a/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.GenReader.cs b/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.GenReader.cs index 4336163..a7ccfd2 100644 --- a/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.GenReader.cs +++ b/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.GenReader.cs @@ -237,7 +237,10 @@ public partial class AcBinarySourceGenerator // 
These markers are feature-independent: writer emits them on any string property regardless of // intern setting (intern is opt-in per-property via [AcStringIntern] + InternBit). sb.AppendLine($"{i} case BinaryTypeCode.StringSmall:"); - sb.AppendLine($"{i} {a} = context.ReadStringSmall();"); + // FastWire mode reuses the StringSmall (=91) marker but with a different body — emit + // inline ternary so call sites that can run in either mode (Dictionary key/value, runtime + // cross-type populate) dispatch without an extra method-frame. + sb.AppendLine($"{i} {a} = context.FastWire ? context.ReadStringSmallFastWire() : context.ReadStringSmallCompact();"); sb.AppendLine($"{i} break;"); sb.AppendLine($"{i} case BinaryTypeCode.StringMedium:"); sb.AppendLine($"{i} {a} = context.ReadStringMedium();"); diff --git a/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs b/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs index 5837619..89560d7 100644 --- a/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs +++ b/AyCode.Core.Tests/TestModels/BenchmarkTestDataProvider.cs @@ -18,33 +18,70 @@ namespace AyCode.Core.Tests.TestModels; /// public static class CharsetSuffixes { - /// Empty suffix — short Hungarian baseline strings (e.g. "SharedTag") stay short, hitting - /// the FixStr fast-path. Stress-test for FixStr / short-string code paths. Note: the baseline - /// property values remain Hungarian; only the suffix is empty. Despite the "FixAscii" name, this - /// option does NOT change baseline values to ASCII — it suppresses the suffix that would otherwise - /// push every property past the FixStr boundary. 
+ // ───────────────────────────────────────────────────────────────────────── + // Consistent length across all charsets (UTF-16 char count, NOT UTF-8 byte count): + // *Short = 40 char (5-char base × 8 repetitions) → StringSmall / StringAscii tier + // *Long = 280 char (Short × 7) → StringMedium / StringAscii tier + // + // Same length across charsets isolates the workload variable to UTF-8 byte content + // (1-byte ASCII vs 2-byte Latin1 / Cyrillic vs 3-byte CJK vs mixed) — wire-size and + // encode/decode cost differences are pure charset effects, not length effects. + // + // Const-concat for compile-time evaluation (usable as attribute / DataRow source). + // ───────────────────────────────────────────────────────────────────────── + + /// Empty suffix — baseline string property values stay short, hitting the + /// FixStrAscii / short-string fast-path. Stress-test for short-string code paths. public const string Latin1FixAscii = ""; - /// Short Latin1 mixed (Hungarian, ~24 char) — typical European i18n payload, short - /// multi-byte runs. Below the 32-char FixStr boundary on the suffix alone, but combined with - /// baseline values pushes every property past it. - public const string Latin1Short = " árvíztűrő tükörfúrógép"; + // ── Pure ASCII (every byte < 0x80) ── + // Tier: StringAscii (167) — byte→char SIMD widening, zero UTF-8 decode. + // UTF-8 byte count: 40 byte (Short), 280 byte (Long) — 1:1 char:byte. + private const string AsciiBase = " quic"; // 5 char ASCII + public const string AsciiShort = AsciiBase + AsciiBase + AsciiBase + AsciiBase + + AsciiBase + AsciiBase + AsciiBase + AsciiBase; // 40 char + public const string AsciiLong = AsciiShort + AsciiShort + AsciiShort + AsciiShort + + AsciiShort + AsciiShort + AsciiShort; // 280 char - /// Long Latin1 mixed (~47 char) — exceeds the 32-char FixStr boundary on the suffix alone, - /// exercising the StringSmall+ tier path with Latin1 mixed content (Hungarian accented letters). 
- public const string Latin1Long = " árvíztűrő tükörfúrógép a magyar betűzés tesztje"; + // ── Latin1 (Hungarian proxy — ISO-8859-1 + Latin-2 ő/ű) ── + // Tier: StringSmall (91) Short / StringMedium (94) Long. + // UTF-8 byte count: ~56 byte Short (5 char base = 7 byte UTF-8: space+á+r+v+í = 1+2+1+1+2), ~392 byte Long. + private const string Latin1Base = " árví"; // 5 char (space + á + r + v + í) — multi-byte mix + public const string Latin1Short = Latin1Base + Latin1Base + Latin1Base + Latin1Base + + Latin1Base + Latin1Base + Latin1Base + Latin1Base; // 40 char + public const string Latin1Long = Latin1Short + Latin1Short + Latin1Short + Latin1Short + + Latin1Short + Latin1Short + Latin1Short; // 280 char - /// CJK BMP (Chinese / Japanese / Korean Basic Multilingual Plane) — long homogeneous - /// 3-byte UTF-8 runs. Primary win region for V4N2 Phase 3 SIMD multi-byte transcoder work. - public const string CjkBmp = " 你好世界 こんにちは 안녕하세요"; + // ── CJK BMP (Chinese / Japanese / Korean Basic Multilingual Plane) ── + // Tier: StringSmall (91) Short / StringMedium (94) Long. + // UTF-8 byte count: ~104 byte Short (5 char base = 13 byte UTF-8: 1 ASCII space + 4×3-byte CJK), + // ~728 byte Long. Homogeneous 3-byte runs — primary win region for SIMD multi-byte transcoder. + private const string CjkBmpBase = " 你好世界"; // 5 char (space + 4 Chinese) + public const string CjkBmpShort = CjkBmpBase + CjkBmpBase + CjkBmpBase + CjkBmpBase + + CjkBmpBase + CjkBmpBase + CjkBmpBase + CjkBmpBase; // 40 char + public const string CjkBmpLong = CjkBmpShort + CjkBmpShort + CjkBmpShort + CjkBmpShort + + CjkBmpShort + CjkBmpShort + CjkBmpShort; // 280 char - /// Cyrillic (Russian / Ukrainian / etc.) — long homogeneous 2-byte runs, different shape - /// than Hungarian mixed (where 2-byte chars are short interspersed runs). - public const string Cyrillic = " Привет мир дорогой друг"; + // ── Cyrillic (Russian / Ukrainian) ── + // Tier: StringSmall (91) Short / StringMedium (94) Long. 
+ // UTF-8 byte count: ~72 byte Short (5 char base = 9 byte UTF-8: 1 ASCII + 4×2-byte Cyrillic), + // ~504 byte Long. Homogeneous 2-byte runs — different shape than Latin1 interspersed. + private const string CyrillicBase = " Прив"; // 5 char (space + 4 Cyrillic) + public const string CyrillicShort = CyrillicBase + CyrillicBase + CyrillicBase + CyrillicBase + + CyrillicBase + CyrillicBase + CyrillicBase + CyrillicBase; // 40 char + public const string CyrillicLong = CyrillicShort + CyrillicShort + CyrillicShort + CyrillicShort + + CyrillicShort + CyrillicShort + CyrillicShort; // 280 char - /// Mixed full-spectrum (Hungarian + CJK + Cyrillic + emoji surrogate pairs) — multi-tier - /// coverage in one payload. Stresses surrogate-pair handling in the UTF-8 transcoder. - public const string Mixed = " árvíz 你好 Привет 😀"; + // ── Mixed (multi-codepage in one payload) ── + // Tier: StringSmall (91) Short / StringMedium (94) Long. + // UTF-8 byte count: ~80 byte Short (5 char base = 10 byte UTF-8: 1 ASCII + 1×2-byte Hungarian + // + 1×3-byte CJK + 2×2-byte Cyrillic), ~560 byte Long. No surrogate pairs (keeps UTF-16 + // length predictable); cross-tier transcoder coverage in one payload. 
+ private const string MixedBase = " á你Пй"; // 5 char (space + Hungarian + Chinese + 2× Cyrillic) + public const string MixedShort = MixedBase + MixedBase + MixedBase + MixedBase + + MixedBase + MixedBase + MixedBase + MixedBase; // 40 char + public const string MixedLong = MixedShort + MixedShort + MixedShort + MixedShort + + MixedShort + MixedShort + MixedShort; // 280 char } // ============================================================================================ diff --git a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs index 4c23fcb..6e4f645 100644 --- a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs +++ b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.BinaryDeserializationContext.Read.cs @@ -625,30 +625,47 @@ public static partial class AcBinaryDeserializer } /// - /// H2Q6 StringSmall reader (Compact mode): wire [charLen:8][utf8Len:8][UTF-8 bytes] after the - /// marker has been consumed. 1-pass decode (no CountUtf8Chars). FastWire mode reuses the same - /// marker value (=91) but a different layout — [charLen:int32 LE][UTF-16 raw bytes]; this method - /// dispatches by FastWire flag. Single source of wire-decode shared by runtime TypeReaderTable, - /// cross-type populate, AND SGen-emit. + /// H2Q6 StringSmall reader — Compact-mode-only body: wire [charLen:8][utf8Len:8][UTF-8 bytes] + /// after the marker has been consumed. 1-pass decode (no CountUtf8Chars). + /// Call this directly when the call site has ALREADY established FastWire == false + /// (e.g. hot path, where the SGen-emit caller short-circuits + /// FastWire on a separate branch via ReadStringUtf16Markerless). Skips the redundant + /// FastWire branch — call sites that may run in either mode inline + /// FastWire ? 
ReadStringSmallFastWire() : ReadStringSmallCompact() ternary instead of a + /// shared dispatcher (no method-frame overhead). /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal string ReadStringSmall() + internal string ReadStringSmallCompact() { - if (FastWire) - { - // Mode-shared marker: FastWire payload is [charLen:int32 LE][UTF-16 raw bytes]. - // Fix-int charLen (matches MemPack WriteUtf16 shape) — single 4-byte read, no VarUInt loop. - var charLenF = ReadInt32Unsafe(); - return ReadStringUtf16(charLenF); - } + System.Diagnostics.Debug.Assert(!FastWire, "ReadStringSmallCompact called with FastWire=true — call sites that may run in FastWire mode must inline the `FastWire ? ReadStringSmallFastWire() : ReadStringSmallCompact()` ternary."); - // Compact mode — H2Q6 StringSmall: [charLen:8][utf8Len:8][bytes] + // H2Q6 StringSmall body: [charLen:8][utf8Len:8][UTF-8 bytes] var header = ReadTwoBytesUnsafe(); var charLength = (byte)header; var byteLength = (byte)(header >> 8); return ReadStringUtf8WithCharLen(charLength, byteLength); } + /// + /// H2Q6 StringSmall reader — FastWire-mode-only body: wire [charLen:int32 LE][UTF-16 raw bytes] + /// after the (mode-shared) marker has been consumed. Engaged only on the runtime + /// path when FastWire==true and the declared target + /// type is NOT string (the string-typed FastWire short-circuit in + /// bypasses the marker entirely via ReadStringUtf16Markerless). + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal string ReadStringSmallFastWire() + { + // Mode-shared marker (=91) FastWire payload — fix-int charLen (matches MemPack WriteUtf16 shape). + var charLenF = ReadInt32Unsafe(); + return ReadStringUtf16(charLenF); + } + + // No combined ReadStringSmall() dispatcher — every call site already has the FastWire flag + // in scope (compile-time invariant on the SGen-emit hot path; runtime field check on the + // dispatcher-callers). Call sites inline the ternary `FastWire ? 
ReadStringSmallFastWire() + // : ReadStringSmallCompact()` when they need mode-awareness, saving a method-call frame. + /// /// H2Q6 StringMedium reader: wire [charLen:16 LE][utf8Len:16 LE][UTF-8 bytes] after the marker /// has been consumed. 1-pass decode. Header read in a single uint load (vs 2 ushort loads). Shared @@ -788,10 +805,15 @@ public static partial class AcBinaryDeserializer [MethodImpl(MethodImplOptions.AggressiveInlining)] internal bool TryReadStringProperty(byte tc, out string? value) { + // Hot-path invariant: SGen-emit + property-marker callers MUST short-circuit FastWire on a + // separate branch (markerless decode) — so by the time the marker byte reaches this switch, + // FastWire is guaranteed false. The StringSmall case therefore calls ReadStringSmallCompact + // directly. Mode-aware call sites (Dictionary key/value emit, runtime cross-type populate, + // TypeReaderTable lambda) inline the `FastWire ? FW : Compact` ternary themselves. value = null; switch (tc) { - case BinaryTypeCode.StringSmall: value = ReadStringSmall(); return true; + case BinaryTypeCode.StringSmall: value = ReadStringSmallCompact(); return true; case BinaryTypeCode.Null: return true; case BinaryTypeCode.StringEmpty: value = string.Empty; return true; default: diff --git a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs index 29d4b8f..abb03d6 100644 --- a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs +++ b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs @@ -98,8 +98,9 @@ public static partial class AcBinaryDeserializer readers[BinaryTypeCode.Decimal] = static (ctx, _) => ctx.ReadDecimalUnsafe(); readers[BinaryTypeCode.Char] = static (ctx, _) => ctx.ReadCharUnsafe(); // H2Q6 non-ASCII tier readers (Compact mode): fixed-width header [charLen][utf8Len] + 1-pass decode. - // FastWire mode dispatches the StringSmall (=91) marker through the same handler — see ReadStringSmall. 
- readers[BinaryTypeCode.StringSmall] = static (ctx, _) => ctx.ReadStringSmall(); + // FastWire mode reuses the StringSmall (=91) marker but with a different body — inline ternary + // dispatches by ctx.FastWire (no method-frame overhead vs the old ReadStringSmall dispatcher). + readers[BinaryTypeCode.StringSmall] = static (ctx, _) => ctx.FastWire ? ctx.ReadStringSmallFastWire() : ctx.ReadStringSmallCompact(); readers[BinaryTypeCode.StringMedium] = static (ctx, _) => ctx.ReadStringMedium(); readers[BinaryTypeCode.StringBig] = static (ctx, _) => ctx.ReadStringBig(); readers[BinaryTypeCode.StringInterned] = static (ctx, _) => ctx.GetInternedString((int)ctx.ReadVarUInt()); @@ -146,7 +147,8 @@ public static partial class AcBinaryDeserializer // V4N5 cleanup (2026-05-06): CreateFixStrReader removed — non-ASCII short strings now use - // StringSmall tier reader (see ReadStringSmall below). + // StringSmall tier reader (see ReadStringSmallCompact + ReadStringSmallFastWire in + // BinaryDeserializationContext.Read.cs). /// /// Creates a reader for FixStrAscii with the given byte length (also char count, ASCII = 1:1). @@ -1049,7 +1051,8 @@ public static partial class AcBinaryDeserializer switch (typeCode) { case BinaryTypeCode.StringSmall: - propInfo.SetValue(target, context.ReadStringSmall()); + // FastWire reuses StringSmall (=91) marker — inline mode dispatch (no method-frame overhead). + propInfo.SetValue(target, context.FastWire ? 
context.ReadStringSmallFastWire() : context.ReadStringSmallCompact()); return true; case BinaryTypeCode.StringMedium: propInfo.SetValue(target, context.ReadStringMedium()); diff --git a/AyCode.Core/docs/BINARY/BINARY_BYTECODE_OPTIMIZATION.md b/AyCode.Core/docs/BINARY/BINARY_BYTECODE_OPTIMIZATION.md index 4044006..8e8735f 100644 --- a/AyCode.Core/docs/BINARY/BINARY_BYTECODE_OPTIMIZATION.md +++ b/AyCode.Core/docs/BINARY/BINARY_BYTECODE_OPTIMIZATION.md @@ -4,22 +4,24 @@ Working notes for a wire-format reorganization where the BinaryTypeCode marker s > Not a TODO entry. LLM-only working notes. Sibling to `BINARY_SGEN_OPTIMIZATION.md` (per-property emit condensation companion). +> ⚠️ **Needs revision** — several sections (esp. "Boundary aligns with feature flags", "Strict no-feature collapse") assume per-type `EnableRefHandlingFeature = false` removes `ObjectRef*` markers from the wire. That assumption is **false**: the runtime writer emits `ObjectRef*` based on child-type metadata, NOT the parent's per-type flag. The parent-flag governs only self-tracking emit. See `RefAwareEmitPredicate` in `AcBinarySourceGenerator.Models.cs` and `AcBinarySerializableAttribute.EnableRefHandlingFeature` doc for the correct scope. Re-evaluate the hot/cold boundary candidates accordingly: `EnableInternString` / `EnableMetadata` stripping is still wire-symmetric and valid; `EnableRefHandling` stripping is NOT. + ## Core concept The marker-byte tier reflects the wire-format's perf-architecture: hot markers serve the no-feature workload at maximum speed (small inline switch in SGen-emit), cold markers carry the feature-engaged variants behind a single method call. 
**Boundary aligns with feature flags:** -- `EnableRefHandling=false` → `ObjectRef*`, ref-aware element/dict wrappers never on wire +- ~~`EnableRefHandling=false` → `ObjectRef*`, ref-aware element/dict wrappers never on wire~~ — **not true**: the parent-flag controls only self-tracking emit; `ObjectRef*` markers may still appear on the wire from child-type ref-handling decisions. Excluded from the hot/cold boundary candidates. - `EnableInternString=false` → `StringInterned`, `StringInternFirst*` never on wire - `EnableMetadata=false` → `ObjectWithMetadata*` never on wire - `EnablePolymorphDetect=false` → `ObjectWithTypeName*`, FixObj slot fallback never on wire -A type with all features off emits **only hot markers**; the reader's cold branch is provably dead code for that type — JIT-foldable when SGen-emit can prove it at compile time. +A type with the symmetric features off (Intern / Metadata / Polymorph) emits **only hot markers** in those categories; the reader's cold branch for those is provably dead code for that type — JIT-foldable when SGen-emit can prove it at compile time. `RefHandling` does NOT participate in this collapse. ### Strict no-feature collapse — zero regression baseline -When SGen proves all four feature flags off for a type (`EnableRefHandling = EnableInternString = EnableMetadata = EnablePolymorphDetect = false`), the cold branch AND the `IsHot` check itself **both** drop from the generated emit: +When SGen proves the **symmetric** feature flags off for a type (`EnableInternString = EnableMetadata = EnablePolymorphDetect = false`) AND the type has no children with active ref-tracking (`ChildNeedsRefScan` / `ElementNeedsRefScan` / `DictValueNeedsRefScan` all false), the cold branch AND the `IsHot` check itself **both** drop from the generated emit: ```csharp // Strict no-feature emit — no IsHot check, no cold fallback @@ -237,7 +239,7 @@ Each category's Phase 1 step has its own disasm-verify checkpoint. 
If a category - Per-category cold method generic specialization vs `Type`-parameter erasure: bin-size vs Tier-1 reflection cost trade-off — measure before committing - Hot-range size budget: how many markers fit in the proposed hot range (e.g., 0x00-0x7F = 128 codes), accounting for FixStrAscii's 32-byte range consumption + TinyInt range -- ChunkedArray and other large-collection variants: which side of the boundary? (Hot if EnableRefHandling-independent, cold if feature-engaged) +- ChunkedArray and other large-collection variants: which side of the boundary? (Hot if independent of the symmetric stripping flags above, cold if feature-engaged) - Cost-benefit on the writer side: does the writer-side cold path (intern-cache hash + lookup, IdentityMap probe) carry the same +/- profile as the reader side? ## Cross-references diff --git a/AyCode.Core/docs/BINARY/BINARY_STRICT_SGEN.md b/AyCode.Core/docs/BINARY/BINARY_STRICT_SGEN.md new file mode 100644 index 0000000..7c8680a --- /dev/null +++ b/AyCode.Core/docs/BINARY/BINARY_STRICT_SGEN.md @@ -0,0 +1,111 @@ +# BINARY — Strict SGen NuGet package (working doc) + +Brainstorming / planning document for the **`AcBinary.Strict`** NuGet package — a SGen-only build target of the same source tree, opt-in for consumers who can guarantee `[AcBinarySerializable]` coverage and want max perf + min publish size (especially NativeAOT). Sibling to `BINARY_BYTECODE_OPTIMIZATION.md` + `BINARY_SGEN_OPTIMIZATION.md`. NOT a TODO entry — design notes. + +## Concept + +Two NuGet packages from the same source tree, distinguished by an MSBuild `DefineConstants`: + +| Package | Constants | Target audience | +|---|---|---| +| **`AcBinary`** (default) | (none) | Drop-in JSON-like UX — 3rd-party DTOs without attributes, gradual opt-in via `[AcBinarySerializable]` on hot-paths. The **market USP**. 
| +| **`AcBinary.Strict`** (opt-in) | `SGEN_ONLY` | Max perf + min AOT publish — every type must be `[AcBinarySerializable]`-decorated, Roslyn analyzer enforces at build time. | + +**Wire format**: identical. A wire emitted by `AcBinary.Strict` is fully readable by `AcBinary` (downward compatible). The reverse holds only when the Hybrid path emitted no runtime-only markers (polymorphic `ObjectWithTypeName`, etc.) — typically true when the consumer is fully `[AcBinarySerializable]`-decorated. + +## Market positioning — the Hybrid is NOT second-class + +Common reason teams avoid MemPack: **attributes are mandatory everywhere**, 3rd-party DTOs cannot be serialized, and adoption is all-or-nothing. The AcBinary Hybrid removes this barrier: +- 3rd-party type → runtime reflection fallback, works out of the box +- gradual opt-in: add `[AcBinarySerializable]` only to identified hot-paths over time +- no big-bang refactor — coexistence of SGen + reflection types in the same graph + +The Strict package is **the optional max-perf niche**, not a replacement. Consumers who have already disciplined every DTO (`[AcBinarySerializable]` on each, no polymorphic `object` properties, no runtime-typed collections) graduate to Strict for additional AOT-perf gains. + +## Performance gain — sources + +Naïve estimate: "remove the runtime fallback path" — JIT inline budget freed up, ~2-5% hot path gain. 
+ +Realistic estimate is **larger**, because the SGen-emit itself can simplify when no runtime-cooperation is required: + +| Aspect | Hybrid (current) | Strict-only | Gain estimate | +|---|---|---|---| +| Property complex-child write | `WriteObjectGenerated(value, type, ctx)` → `GetWrapper(type)` → `WriteObject` → `UseTypeReferenceHandling` check → `WriteObjectWithRefHandling/Meta/Properties` | `ChildClass_GeneratedWriter.Instance.WriteProperties(value, ctx)` direct | ~5-8 method-frames eliminated | +| Wrapper-slot lookup | `context._wrapperSlots[s_wrapperSlot]` (array load + bounds check) | direct static field on the generated writer class | array dispatch eliminated | +| Tracker dispatch | `wrapper.TryTrackInt32(id, visitIndex, ref ctx.NextCacheIndexRef, ...)` virtual on wrapper field | static method on the generated writer class | virtual dispatch eliminated | +| Cold-start | `BinarySerializeTypeMetadata` ctor — reflection + Expression.Compile + GlobalMetadataCache | **eliminated entirely** | first-call latency ~50-100× faster (only JIT Tier-0 remains) | +| Generator registry | `GeneratedWriterRegistry.TryGet(type, out writer)` dict lookup | compile-time bind → static field | dict-overhead eliminated | +| Collection element emit | `EmitWriteCollectionElement` 4 branches (no-ref/ref/meta/both) | only 1-2 branches (no runtime-meta needed) | smaller emit IL, smaller JIT code | + +**Realistic Ser hot-path gain: -10..-20%** (not 2-5%). Plus **NativeAOT publish size -30..-50%** (no reflection IL preserved, no `Expression.Compile`, no `IDictionary`/`IEnumerable` interface dispatch tables). + +**Cold-start dramatic change**: first SignalR message / first cache deserialization ~100× faster on Strict (no reflection + no Expression.Compile + no metadata-dict build). + +## Refactor prerequisite — physical separation + +Scattered `#if !SGEN_ONLY` directives mid-method are a maintenance liability. 
Phase 0 (refactor-only, 0 behavior change): **physical file separation** via `partial class`: + +``` +AcBinarySerializer.cs ← SGen + shared (always compiled) +AcBinarySerializer.RuntimeFallback.cs ← Hybrid-only (file-level `#if !SGEN_ONLY`) +AcBinaryDeserializer.cs ← SGen + shared +AcBinaryDeserializer.RuntimeFallback.cs ← Hybrid-only +AcBinaryDeserializer.TypeReaderTable.cs ← Hybrid-only (256-slot dispatch table) +BinarySerializeTypeMetadata.cs ← shared (reduced ctor) +BinarySerializeTypeMetadata.Reflection.cs ← Hybrid-only (reflection + Expression.Compile) +``` + +`partial class` keeps the public surface unchanged; the body splits across files. **One `#if !SGEN_ONLY` directive per file at the top**, not 50 scattered through the method body. Bridge-methods (e.g. `WriteValueGenerated`, `ReadValueGenerated`) become `partial method` declarations — one body in `.RuntimeFallback.cs` (Hybrid), another in a `SGenBridge.cs` (Strict throws `NotSupportedException` or compile-error). + +This refactor is **valuable on its own** — even if the Strict NuGet never ships, the codebase is clearer (Hybrid path explicit, SGen path explicit, no comingled logic). + +## Phased implementation + +| Phase | Scope | Risk | Behavior change | +|---|---|---|---| +| **0 (refactor-only)** | Physical separation: runtime-fallback code → `.RuntimeFallback.cs` partial files. Same build target, no `#if` directives yet. | very low | none | +| **1 (Strict build target)** | New MSBuild `Configuration: SGenOnly` with `$(DefineConstants);SGEN_ONLY`. File-level `#if !SGEN_ONLY` on the runtime-only partial files. Bridge-method bodies are `partial` — Hybrid bind in `.RuntimeFallback.cs`, Strict bind throws/errors. | low | Strict consumers cannot serialize undecorated types (build / runtime error) | +| **2 (SGen-emit simplification — the BIG perf gain)** | Generator emits direct-call pattern in Strict mode instead of bridges. `EmitDirectCollectionWrite` / `EmitReadComplex` / `EmitWriteProp` branches collapse. 
Slot-array dispatch replaced with static field references. | medium | hot-path perf jumps -10..-20% on Strict | +| **3 (metadata-dict elimination)** | `BinarySerializeTypeMetadata` / `BinaryDeserializeTypeMetadata` ctor reflection paths gone in Strict. `GlobalMetadataCache` dictionary unused in Strict. Cold-start dramatic improvement. | medium | first-use latency ~100× faster on Strict | +| **4 (Roslyn analyzer)** | Strict-mode analyzer: build-error if a property type lacks `[AcBinarySerializable]` (and isn't a known primitive), if `object`-typed property has no `[AcBinaryPolymorphic(types: typeof(A), typeof(B))]`, etc. Hybrid-mode same checks → warning only (perf hint). | low | better developer experience | +| **5 (NuGet packaging + ADR)** | Separate `AcBinary.Strict.nupkg` publish target. ADR-level documentation. Strict-consumer guide. | low | ship to consumers | + +Phases can be **partially shipped** — e.g. Phase 0 alone is valuable as a codebase tidy-up. Phase 1 ships the build target without performance changes. Phase 2 is the actual perf-gain. Phase 5 is the consumer-facing release. + +## Wire-format compatibility matrix + +| Producer | Consumer | Compatible? | +|---|---|---| +| AcBinary (Hybrid, fully decorated graph) | AcBinary.Strict | ✓ (no runtime-only markers emitted) | +| AcBinary (Hybrid, with undecorated children) | AcBinary.Strict | ✗ — may contain `ObjectWithTypeName` / polymorph markers Strict doesn't decode | +| AcBinary.Strict | AcBinary (Hybrid) | ✓ always (Strict wire ⊂ Hybrid wire) | +| AcBinary.Strict | AcBinary.Strict | ✓ | + +The analyzer in Phase 4 enforces the "fully decorated graph" property on Strict consumers — so the practical incompatibility (row 2) cannot occur in disciplined Strict deployments. 
+ +## Maintenance cost — ~10% overhead, ~2× CI time + +- Source repo: single +- Build targets: Release + SGenOnly (2× build) +- NuGet packages: 2 (`.nupkg`) +- Test suite: same source, conditional skip on runtime-fallback-only tests under `SGEN_ONLY` (e.g. polymorph-without-attribute tests) +- CI time: ~2× (acceptable) + +**Not a duplicated codebase** — same source tree, MSBuild constants differ. + +## Open questions / considerations + +- **Polymorph opt-in attribute**: Strict-mode requires `[AcBinaryPolymorphic(types: typeof(A), typeof(B))]` on `object`-typed properties (closed type set, compile-time bound). The Hybrid path uses runtime `value.GetType()` + `ObjectWithTypeName` — that flow doesn't exist in Strict. Spec out the attribute shape + emit pattern before Phase 4. +- **`IDictionary<,>` / `IEnumerable<>` runtime-typed**: Strict-mode demands closed generic type arguments. Generic `Dictionary` with polymorphic values may not be supported (or requires explicit poly-attribute on the value-type). +- **Existing reference-handling tracker**: `wrapper.TryTrackInt32(id, ...)` is a virtual call on the wrapper. Phase 2 needs a strategy — move tracker state to `[ThreadStatic]` on the context, or generate per-type static trackers (latter more JIT-friendly). +- **3rd-party type-coverage tooling**: Strict-consumers need a Roslyn diagnostic that lists every type in the project's type-graph that lacks `[AcBinarySerializable]` — guides the gradual migration from Hybrid to Strict. +- **Naming**: `AcBinary.Strict` is descriptive but verbose. Alternatives: `AcBinary.Sealed`, `AcBinary.AOT`, `AcBinary.Lean`. Decision deferred to Phase 5.
+ +## Cross-references + +- `BINARY_SGEN.md` — current SGen architecture (hybrid execution model, bridge methods, wrapper slot system) +- `BINARY_SGEN_OPTIMIZATION.md` — SGen per-property emit micro-optimization brainstorming +- `BINARY_BYTECODE_OPTIMIZATION.md` — wire-format marker layout reorganization (orthogonal — Strict NuGet doesn't change wire format) +- `BINARY_ISSUES.md#accore-bin-i-n6q3` — cold-start cost chain (Phase 3 addresses this directly) +- `BINARY_TODO.md#accore-bin-t-w9f1` — compile-time metadata generation (a precondition for Phase 3 — eliminates `Expression.Compile` cold-start) +- `BINARY_TODO.md#accore-bin-t-t5j8` — JIT Tier-1 warmup (residual cold-start after Phase 3) diff --git a/AyCode.Core/docs/BINARY/BINARY_TODO.md b/AyCode.Core/docs/BINARY/BINARY_TODO.md index bb70916..406b7e4 100644 --- a/AyCode.Core/docs/BINARY/BINARY_TODO.md +++ b/AyCode.Core/docs/BINARY/BINARY_TODO.md @@ -57,16 +57,17 @@ Result: bug-fix risk (three copies drift), ad-hoc divergence (the polymorph `Obj #### Phase C — Feature-conditional SGen-emit -`EmitReadProp` (and the symmetric emit paths) must consult the per-type `Enable*Feature` flags to **omit** case-branches for disabled features. Today the SGen reader handles every marker regardless of the type's feature opt-outs — wasteful, and worse, it silently accepts markers the writer would never emit (instead of fail-fast): +`EmitReadProp` (and the symmetric emit paths) may consult per-type `Enable*Feature` flags to **omit** case-branches for disabled features — but ONLY when the writer-side stripping is symmetric (the writer provably doesn't emit those markers either). Asymmetric stripping introduces a wire-misalignment bug; see `RefAwareEmitPredicate` in `AcBinarySourceGenerator.Models.cs` for the symmetric child-fact pattern. 
-| Disabled feature | Cases to skip in SGen reader emit | -|---|---| -| `EnableInternStringFeature = false` | `StringInterned`, `StringInternFirstSmall`, `StringInternFirstMedium` | -| `EnableRefHandlingFeature = false` | `ObjectRef`, `ObjectRefFirst`, `ObjectWithMetadataRefFirst` | -| `EnableMetadataFeature = false` | `ObjectWithMetadata`, `ObjectWithMetadataRefFirst` | -| `EnablePolymorphDetectFeature = false` | Already guarded by ACBIN002 (compile error if any `object` property remains on the type) — symmetric here. | +| Disabled feature | Cases to skip in SGen reader emit | Writer-side symmetric? | +|---|---|---| +| `EnableInternStringFeature = false` | `StringInterned`, `StringInternFirstSmall`, `StringInternFirstMedium` | Yes — writer's scan-pass skips string-property iteration; `StringInternEligible` forced false for this type | +| `EnableMetadataFeature = false` | `ObjectWithMetadata`, `ObjectWithMetadataRefFirst` | Yes — writer's `WriteObject` skips the metadata marker branch when the type's `EnableMetadataFeature=false` | +| `EnablePolymorphDetectFeature = false` | (no extra emit — ACBIN002 compile-error already guards `object` property declarations) | Yes — guarded at compile time | -After Phase C: leaner generated code per opt-out type AND wire-misuse (e.g. mixed writer/reader feature configurations) surfaces as **explicit fail-fast** in the `default` switch arm — same philosophy as ACBIN002. +**NOT applicable here**: `EnableRefHandlingFeature = false`. The runtime writer emits `ObjectRef` / `ObjectRefFirst` markers based on child-type metadata (`UseTypeReferenceHandling(childMetadata)`), NOT the parent type's per-type flag. The parent-flag governs ONLY the parent's self-tracking scan-emit. Reader-side `ObjectRef*` case-emit on child properties of this type MUST stay present — gated on the child compile-time fact (`ChildNeedsRefScan` / `ElementNeedsRefScan` / `DictValueNeedsRefScan`) via `RefAwareEmitPredicate`. 
+ +After Phase C (revised scope): leaner generated code per opt-out type for the symmetric flags above; wire-misuse on those (e.g. mixed writer/reader feature configurations) surfaces as **explicit fail-fast** in the `default` switch arm — same philosophy as ACBIN002. ### Perf guardrails (NON-NEGOTIABLE) @@ -1805,4 +1806,42 @@ Egyéb prerekvizit: `ACCORE-BIN-T-W9F1` (compile-time metadata) szinkronizálás - **`[AcBinarySerializable]`-on belül** vegyük fel a `IntEncoding` paramétert, vagy **külön `[AcBinaryEncoding]` attribute** legyen object-szinten is (és a `[AcBinarySerializable]` változatlan)? - **`AcBinaryOptions.WireMode` jövője**: a régi `Fast`/`Compact` enum migrálódjon az új `IntEncoding`/`StringEncoding`-ra (BC-break) vagy maradjon mint shortcut-default? +## ACCORE-BIN-T-J8R4: ASCII Ser path — combined store optimization in `WriteStringWithDispatch` +**Priority:** P2 · **Type:** Performance + +`AcBinarySerializer.BinarySerializationContext.WriteStringWithDispatch` ASCII override branch (`bytesWritten == charLength`, `bytesWritten > 31`, currently around context.cs:919-930) emits separate marker store + `WriteVarUIntUnsafe` method-call. The non-ASCII `case <= 255` branch uses a combined 4-byte unaligned `uint` store for marker+charLen+utf8Len header. + +Benchmark evidence: `AsciiShort` charset benchmark (Console.FullBenchmark_Release_2026-05-19_09-55-05) showed Ser median **+3.8%** (MemPack faster) — regression vs Latin1Long Ser median -2.0%. Repeated-strings cell +13%. The method-call frame to `WriteVarUIntUnsafe` is the suspected cost. + +**Quick fix sketch**: split the ASCII path on `bytesWritten < 128` (VarUInt fits in 1 byte → 2-byte combined ushort header store, no method call) vs `>= 128` (current separate-write path, rare on workload). May also benefit from extending to a 4-byte combined store for `bytesWritten < 16384` (VarUInt fits in 2 bytes). + +Validate on AsciiShort + AsciiLong + Latin1FixAscii (no regression on FixStrAscii path). 
Cross-check the Repeated +13% cell. + +## ACCORE-BIN-T-M2L7: Per-mode `_typeReaders` table — eliminate residual mode-aware ternaries +**Priority:** P3 · **Type:** Performance · **Related:** `ACCORE-BIN-T-T8F6` (may subsume this — switch refactor inlines mode-dispatch on the `StringSmall` case directly) + +After the `ReadStringSmall` → `ReadStringSmallCompact` / `ReadStringSmallFastWire` split, 3 call sites still inline `FastWire ? FW : Compact` ternary: + +- `AcBinaryDeserializer.cs` — `Readers[BinaryTypeCode.StringSmall]` lambda (TypeReaderTable static init) +- `AcBinaryDeserializer.cs` — cross-type populate `PropertyAccessorType.String` case (~line 1055) +- `AcBinarySourceGenerator.GenReader.cs` — `EmitReadString` switch case `StringSmall` emit (~line 243) + +**Sketch**: a `_typeReaders` instance array on `BinaryDeserializationContext` initialized at `Reset/Initialize` time from one of two static tables (`TypeReaderTable.Readers` vs lazy-built `FastWireReaders`). The FastWire variant differs from the default only in the `StringSmall` slot (clone of the default + 1 overwrite). Hot path: `context._typeReaders[typeCode]` — zero ternary. + +Sibling SGen-emit pattern: emit `context._typeReaders` reference if the call site is in a switch case where the runtime mode can vary (Dictionary key/value, runtime populate). Direct `ReadStringSmallCompact()` stays in the SGen property hot-path (caller already short-circuits FastWire on a separate branch). + +Mode-check cost: 1 reference assignment per `Initialize` call vs 1 branch per `StringSmall` marker dispatched. Net win on string-heavy workloads.
+ +## ACCORE-BIN-T-T8F6: Replace `TypeReaderTable` delegate-array with switch dispatch in `ReadValue` +**Priority:** P2 · **Type:** Performance · **Related:** `ACCORE-BIN-T-M2L7` (subsumes — mode-dispatch becomes a case-local branch), `BINARY_STRICT_SGEN.md` (Phase 2 SGen-emit simplification — same direct-call philosophy) + +`AcBinaryDeserializer.cs:74-144` builds a 256-slot `Func<BinaryDeserializationContext<TInput>, Type, object?>[]` dispatch table per `TInput` specialization. Every `ReadValue` marker dispatch is `Readers[typeCode](ctx, type)` — indirect call via delegate-instance, non-inlinable for JIT, opaque for NativeAOT (`IL_calli` lower). + +**No concrete reason for the table over a switch** (reviewed: generic-`TInput` specialization works either way; FixObj 0-63 / FixStrAscii 135-166 ranges become `case >= 0 and < 64: slot = typeCode` trivially; no consumer-extensibility pattern; no per-context override; no multi-version dispatch; cross-type populate doesn't share the table). The pattern is likely historical — first prototype that worked, never refactored. + +**Sketch**: `ReadValue` body becomes `switch (typeCode) { case BinaryTypeCode.Object: return ReadObject(ctx, type); ... }`. Range cases `case >= 0 and < BinaryTypeCode.SlotCount: ...` for FixObj, `case >= BinaryTypeCode.FixStrAsciiBase and <= BinaryTypeCode.FixStrAsciiMax: var length = ...; ...` for FixStrAscii. The `StringSmall` case inlines the mode-aware ternary directly (`context.FastWire ? ReadStringSmallFastWire() : ReadStringSmallCompact()`) — eliminates `M2L7`'s separate lazy-clone need. + +**Expected gain**: ~3-7 ns/dispatch (delegate-invocation overhead eliminated), branch-predictor-friendly (frequent markers as first cases), inline-friendly for small case-bodies (`Null` → `return null`). NativeAOT publish size benefit: ~128 lambda-class metadata entries removed per `TInput` (×4 = ~512 closure objects).
+ +**Measure before committing**: microbenchmark the `ReadValue` switch variant vs the delegate variant with a 100K-iteration loop over a mixed marker mix. If the gain is ≥2 ns/dispatch → worth doing; if there is no difference → leave it as is. diff --git a/AyCode.Core/docs/BINARY/README.md b/AyCode.Core/docs/BINARY/README.md index 26054eb..f6b0efc 100644 --- a/AyCode.Core/docs/BINARY/README.md +++ b/AyCode.Core/docs/BINARY/README.md @@ -13,6 +13,7 @@ AcBinary serialization system. Primary goal: **speed** (two-phase scan+serialize - [`BINARY_SGEN.md`](BINARY_SGEN.md) — Source generator (`AyCode.Core.Serializers.SourceGenerator`) - [`BINARY_SGEN_OPTIMIZATION.md`](BINARY_SGEN_OPTIMIZATION.md) — SGen per-property emit micro-optimization brainstorming / methodology notes (working doc, not a TODO) - [`BINARY_BYTECODE_OPTIMIZATION.md`](BINARY_BYTECODE_OPTIMIZATION.md) — Wire-format hot/cold marker layout reorganization (sibling working doc, feature-flag-aligned bytecode space partition) +- [`BINARY_STRICT_SGEN.md`](BINARY_STRICT_SGEN.md) — `AcBinary.Strict` NuGet package plan (sibling working doc, SGen-only build target, AOT-friendly opt-in next to the default Hybrid) - [`BINARY_ISSUES.md`](BINARY_ISSUES.md) — Known issues and limitations (binary serializer core) - [`BINARY_TODO.md`](BINARY_TODO.md) — Planned work / open tickets (binary serializer core) - [`BINARY_ASYNCPIPE_ISSUES.md`](BINARY_ASYNCPIPE_ISSUES.md) — Known issues and limitations (streaming I/O layer: `AsyncPipeReaderInput` + `AsyncPipeWriterOutput`)