[LOADED_DOCS: NONE]
Optimize string serialization: ASCII & UTF-8 fast paths - Refactored AcBinarySerializer/Deserializer to use single-pass UTF-8 encoding and a custom allocation-free UTF-8 decoder, improving performance for both ASCII and non-ASCII strings. - Expanded BinaryTypeCode with new ASCII string markers (FixStrAscii, StringAscii) and updated helpers for robust, branch-friendly string dispatch. - Updated settings.local.json with new diagnostic and plugin management commands.
This commit is contained in:
parent
2c73775389
commit
dc10315fc3
|
|
@ -65,7 +65,9 @@
|
|||
"PowerShell($appDataPaths = @\\(\"H:\\\\Applications\\\\Mango\\\\Source\\\\FruitBank\\\\Presentation\\\\Nop.Web\\\\App_Data\\\\plugins.json\", \"H:\\\\Applications\\\\Mango\\\\Source\\\\FruitBank\\\\Presentation\\\\Nop.Web\\\\App_Data\\\\plugins.installed.json\"\\); foreach \\($f in $appDataPaths\\) { if \\(Test-Path $f\\) { Write-Output \"=== $f ===\"; Get-Content $f -Raw } else { Write-Output \"NOT FOUND: $f\" } })",
|
||||
"Read(//h/Applications/Mango//**)",
|
||||
"Read(//h/Applications/Mango/LLM_PLAN//**)",
|
||||
"Bash(curl -s \"https://raw.githubusercontent.com/dotnet/runtime/main/src/libraries/System.IO.Pipelines/src/System/IO/Pipelines/StreamPipeWriter.cs\")"
|
||||
"Bash(curl -s \"https://raw.githubusercontent.com/dotnet/runtime/main/src/libraries/System.IO.Pipelines/src/System/IO/Pipelines/StreamPipeWriter.cs\")",
|
||||
"WebFetch(domain:lemire.me)",
|
||||
"Bash(gh pr *)"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -390,14 +390,131 @@ public static partial class AcBinaryDeserializer
|
|||
return string.Create(length, (Buffer: _buffer, Start: pos), static (chars, state) =>
|
||||
{
|
||||
var src = state.Buffer.AsSpan(state.Start, chars.Length);
|
||||
for (int i = 0; i < chars.Length; i++)
|
||||
for (var i = 0; i < chars.Length; i++)
|
||||
chars[i] = (char)src[i];
|
||||
});
|
||||
}
|
||||
|
||||
var value2 = Utf8NoBom.GetString(_buffer, _position, length);
|
||||
_position += length;
|
||||
return value2;
|
||||
// Non-ASCII path: custom UTF-8 decoder.
|
||||
// Intended to beat Encoding.UTF8.GetString by skipping the virtual-dispatch + decoder-fallback
// overhead the BCL adds for arbitrary inputs (NOTE(review): the BCL transcoder is SIMD-accelerated,
// so confirm the gain with a benchmark). Two passes (count + decode) over the
// bytes — both passes are tight scalar loops (RyuJIT does not auto-vectorize loops), with
// predictable branches for the 1-byte (ASCII) case and for 2/3-byte sequences
// (Latin extended, Cyrillic, Greek, CJK BMP). 4-byte sequences (supplementary plane:
// emoji, rare CJK ext) decode to a UTF-16 surrogate pair.
|
||||
//
|
||||
// The bytes are expected to be valid UTF-8 because the serializer wrote them via
// Encoding.UTF8.GetBytes. A payload that was truncated or corrupted after it was written does
// not satisfy that assumption; how such bytes are handled is defined by DecodeUtf8ToChars, and
// any exception raised there propagates through the calling deserializer as a deserialization
// failure. This is NOT the BCL behaviour: Encoding.UTF8.GetString either substitutes U+FFFD
// or throws DecoderFallbackException, depending on how the encoding instance was constructed.
|
||||
return DecodeUtf8(length);
|
||||
}
|
||||
|
||||
/// <summary>
/// Decodes the next <paramref name="byteLength"/> bytes of the buffer as UTF-8 and returns the
/// resulting string, moving the read position past those bytes.
/// </summary>
/// <remarks>
/// Two passes over the same bytes: <see cref="CountUtf8Chars"/> sizes the result, then
/// <see cref="DecodeUtf8ToChars"/> writes straight into the string handed out by
/// <c>string.Create</c>. The callback is a <c>static</c> lambda fed through a value tuple, so it
/// captures nothing. The read position is advanced before the bytes are inspected.
/// </remarks>
/// <param name="byteLength">Number of UTF-8 bytes to consume, starting at the current read position.</param>
/// <returns>The decoded string.</returns>
[MethodImpl(MethodImplOptions.NoInlining)] // cold path; kept out of line so the ReadStringUtf8 caller stays small
private string DecodeUtf8(int byteLength)
{
    var start = _position;
    _position += byteLength;

    // Pass 1: how many UTF-16 chars will the bytes expand to?
    var charTotal = CountUtf8Chars(_buffer.AsSpan(start, byteLength));

    // Pass 2: decode directly into the freshly created string's storage.
    var source = (Bytes: _buffer, Offset: start, Count: byteLength);
    return string.Create(
        charTotal,
        source,
        static (target, s) => DecodeUtf8ToChars(s.Bytes.AsSpan(s.Offset, s.Count), target));
}
|
||||
|
||||
/// <summary>
/// Returns how many UTF-16 chars the given UTF-8 bytes expand to.
/// </summary>
/// <remarks>
/// The count is derived purely from the bit pattern of each byte; no sequence validation is done:
/// <list type="bullet">
/// <item><description>Continuation bytes (10xxxxxx, 0x80–0xBF) contribute nothing.</description></item>
/// <item><description>4-byte lead bytes (11110xxx, 0xF0–0xF7) contribute 2 (a surrogate pair).</description></item>
/// <item><description>Every other byte contributes 1.</description></item>
/// </list>
/// </remarks>
/// <param name="bytes">UTF-8 encoded bytes to measure.</param>
/// <returns>The number of UTF-16 chars needed to hold the decoded text.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int CountUtf8Chars(ReadOnlySpan<byte> bytes)
{
    var total = 0;
    foreach (var current in bytes)
    {
        // Continuation byte: belongs to a sequence whose lead byte was already counted.
        if ((current & 0xC0) == 0x80)
        {
            continue;
        }

        // A 4-byte lead decodes to a surrogate pair (2 chars); any other lead is 1 char.
        total += (current & 0xF8) == 0xF0 ? 2 : 1;
    }

    return total;
}
|
||||
|
||||
/// <summary>
/// Decodes UTF-8 bytes from <paramref name="src"/> into UTF-16 chars written to the start of
/// <paramref name="dst"/>. Caller guarantees <paramref name="dst"/> has at least the char count
/// returned by <see cref="CountUtf8Chars"/> for the same bytes.
/// </summary>
/// <remarks>
/// Every multi-byte sequence is validated before anything is written for it, so malformed input
/// fails loudly instead of yielding garbage chars or a string padded with NULs. Rejected input:
/// a continuation byte or 0xF8–0xFF in lead position, a truncated sequence, a trail byte that is
/// not 10xxxxxx, an overlong encoding, an encoded surrogate (U+D800–U+DFFF) and any value above
/// U+10FFFF. Output for well-formed UTF-8 is unchanged: 1/2/3-byte sequences produce one char,
/// 4-byte sequences produce a surrogate pair.
/// </remarks>
/// <param name="src">UTF-8 encoded bytes to decode.</param>
/// <param name="dst">Destination for the decoded UTF-16 chars; chars past the decoded text are left untouched.</param>
/// <exception cref="FormatException"><paramref name="src"/> is not well-formed UTF-8.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void DecodeUtf8ToChars(ReadOnlySpan<byte> src, Span<char> dst)
{
    int srcIdx = 0, dstIdx = 0;
    while (srcIdx < src.Length)
    {
        int lead = src[srcIdx];

        // 1-byte ASCII (U+0000–U+007F): the overwhelmingly common case, handled first.
        if (lead < 0x80)
        {
            dst[dstIdx++] = (char)lead;
            srcIdx++;
            continue;
        }

        int trailing;   // number of continuation bytes that must follow the lead
        int codepoint;  // payload bits accumulated so far
        int smallest;   // smallest code point this length may encode (anything lower is overlong)
        if ((lead & 0xE0) == 0xC0)
        {
            // 110xxxxx 10xxxxxx → U+0080–U+07FF
            trailing = 1;
            codepoint = lead & 0x1F;
            smallest = 0x80;
        }
        else if ((lead & 0xF0) == 0xE0)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx → U+0800–U+FFFF
            trailing = 2;
            codepoint = lead & 0x0F;
            smallest = 0x800;
        }
        else if ((lead & 0xF8) == 0xF0)
        {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx → U+10000–U+10FFFF
            trailing = 3;
            codepoint = lead & 0x07;
            smallest = 0x10000;
        }
        else
        {
            // Continuation byte (0x80–0xBF) in lead position, or 0xF8–0xFF which never occurs in UTF-8.
            throw Malformed(srcIdx);
        }

        // The whole sequence must be present before any trail byte is read.
        if (src.Length - srcIdx <= trailing)
        {
            throw Malformed(srcIdx);
        }

        for (var k = 1; k <= trailing; k++)
        {
            int next = src[srcIdx + k];
            if ((next & 0xC0) != 0x80)
            {
                throw Malformed(srcIdx + k);
            }

            codepoint = (codepoint << 6) | (next & 0x3F);
        }

        // Reject overlong forms, encoded surrogates and values beyond the Unicode range.
        if (codepoint < smallest || codepoint > 0x10FFFF || codepoint is >= 0xD800 and <= 0xDFFF)
        {
            throw Malformed(srcIdx);
        }

        if (codepoint < 0x10000)
        {
            dst[dstIdx++] = (char)codepoint;
        }
        else
        {
            // Supplementary plane: split into a UTF-16 surrogate pair.
            var offset = codepoint - 0x10000;
            dst[dstIdx++] = (char)(0xD800 | (offset >> 10));
            dst[dstIdx++] = (char)(0xDC00 | (offset & 0x3FF));
        }

        srcIdx += trailing + 1;
    }

    // Builds the exception out of line so the decode loop itself stays compact.
    static FormatException Malformed(int offset) =>
        new FormatException($"Malformed UTF-8 sequence at byte offset {offset}.");
}
|
||||
|
||||
private string ReadStringUtf8Cached(int length)
|
||||
|
|
|
|||
|
|
@ -671,29 +671,41 @@ public static partial class AcBinarySerializer
|
|||
return;
|
||||
}
|
||||
|
||||
// D-2: single-pass UTF-8 encode with VarUInt backfill.
|
||||
// Replaces the prior try-ASCII-then-rewind-and-encode-UTF-8 pattern (1 scan ASCII / 3 scans
|
||||
// non-ASCII) with a single GetBytes call that works identically for both content classes.
|
||||
//
|
||||
// Layout: [reserved 5 bytes for max VarUInt][UTF-8 bytes...]
|
||||
// 1. EnsureCapacity for worst-case (5 + charLength*4)
|
||||
// 2. GetBytes directly into buffer at savedPos+5 → returns exact byteCount
|
||||
// 3. If actual VarUInt size < 5, memmove encoded bytes left to compact the gap
|
||||
// 4. WriteVarUInt at savedPos and advance
|
||||
//
|
||||
// Span<byte>.CopyTo is overlap-safe via Buffer.Memmove. For typical short strings
|
||||
// (≤127 bytes UTF-8 → 1-byte VarUInt), the shift is 4 bytes — a few ns memcopy cost
|
||||
// that's dwarfed by the saved ASCII-scan-then-rewind overhead on non-ASCII content,
|
||||
// and is essentially free on ASCII content (cache-resident write).
|
||||
var charLength = value.Length;
|
||||
const int maxVarUIntSize = 5;
|
||||
var maxBytes = charLength * 4;
|
||||
|
||||
// Pre-allocate VarUInt + ASCII body BEFORE savedPosition — if Grow happens,
|
||||
// it fires here, before the save. savedPosition is always in the current chunk.
|
||||
EnsureCapacity(VarUIntSize((uint)charLength) + charLength);
|
||||
var savedPosition = _position;
|
||||
EnsureCapacity(maxVarUIntSize + maxBytes);
|
||||
|
||||
WriteVarUIntUnsafe((uint)charLength);
|
||||
if (Ascii.FromUtf16(value.AsSpan(), _buffer.AsSpan(_position, charLength), out _) == OperationStatus.Done)
|
||||
var savedPos = _position;
|
||||
var encodeStart = savedPos + maxVarUIntSize;
|
||||
var bytesWritten = Utf8NoBom.GetBytes(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
|
||||
|
||||
var varUIntSize = VarUIntSize((uint)bytesWritten);
|
||||
if (varUIntSize < maxVarUIntSize)
|
||||
{
|
||||
_position += charLength;
|
||||
return;
|
||||
var shift = maxVarUIntSize - varUIntSize;
|
||||
_buffer.AsSpan(encodeStart, bytesWritten).CopyTo(_buffer.AsSpan(encodeStart - shift, bytesWritten));
|
||||
}
|
||||
|
||||
// Non-ASCII fallback: safe rewind (no Grow happened since pre-allocate)
|
||||
_position = savedPosition;
|
||||
_position = savedPos;
|
||||
|
||||
var byteCount = Utf8NoBom.GetByteCount(value);
|
||||
EnsureCapacity(VarUIntSize((uint)byteCount) + byteCount);
|
||||
WriteVarUIntUnsafe((uint)byteCount);
|
||||
|
||||
Utf8NoBom.GetBytes(value.AsSpan(), _buffer.AsSpan(_position, byteCount));
|
||||
_position += byteCount;
|
||||
WriteVarUIntUnsafe((uint)bytesWritten); // advances _position by varUIntSize
|
||||
_position += bytesWritten;
|
||||
}
|
||||
|
||||
public void WriteFixStr(string value)
|
||||
|
|
|
|||
|
|
@ -85,16 +85,39 @@ internal static class BinaryTypeCode
|
|||
// Property skip marker (SlotCount + 38)
|
||||
public const byte PropertySkip = SlotCount + 38; // 102 — Marks a property with default/null value (skipped during serialization)
|
||||
|
||||
// FixStr range: SlotCount + 39 .. SlotCount + 70 (32 values for strings 0-31 bytes)
|
||||
// FixStr encoding: FixStrBase + length (0-31)
|
||||
// This saves 1 byte for short strings by combining type + length in single byte
|
||||
// FixStr range (UTF-8 short strings): 103..134 (32 values for byte lengths 0-31)
|
||||
// FixStr encoding: FixStrBase + byteLength
|
||||
// Saves 1 byte for short strings by combining type + length in single byte.
|
||||
// Content semantics: UTF-8 bytes (may be ASCII or multi-byte). The reader-side decoder dispatches
|
||||
// on content via the new ASCII variant range below — this range is the "universal short" / UTF-8 lane.
|
||||
public const byte FixStrBase = SlotCount + 39; // 103
|
||||
public const byte FixStrMax = FixStrBase + 31; // 134
|
||||
public const int FixStrMaxLength = 31;
|
||||
|
||||
// Flag-based header markers (must be 16-aligned for flag bits in lower nibble)
|
||||
// Header byte structure: (marker & 0xF0) == HeaderFlagsBase, flags in (marker & 0x0F)
|
||||
public const byte HeaderFlagsBase = 144; // 0x90 — next 16-aligned value after FixStrMax
|
||||
// FixStrAscii range (ASCII-only short strings): 135..166 (32 values for byte lengths 0-31)
|
||||
// FixStrAscii encoding: FixStrAsciiBase + byteLength
|
||||
// Content semantics: pure ASCII bytes (every byte < 0x80). Reader can use byte→char widening
|
||||
// without UTF-8 decode or ASCII validation — the marker itself is the validation contract.
|
||||
// Writer emits this when it can prove the content is ASCII (e.g., GetBytes returns byteCount == charLength).
|
||||
public const byte FixStrAsciiBase = SlotCount + 71; // 135
|
||||
public const byte FixStrAsciiMax = FixStrAsciiBase + 31; // 166
|
||||
public const int FixStrAsciiMaxLength = 31;
|
||||
|
||||
// Long ASCII string marker: 167
|
||||
// Layout: [StringAscii] [VarUInt byteCount] [ASCII bytes]
|
||||
// Counterpart to String (91) which is the universal/UTF-8 long-string marker.
|
||||
// Reader fast-widens via byte→char without UTF-8 decode or IsValid scan.
|
||||
public const byte StringAscii = SlotCount + 103; // 167
|
||||
|
||||
// Reserved slot block: 168..175 (8 slots) for future string-related markers
|
||||
// (e.g., StringCompressed, StringEncoded, StringMixedAscii, etc.). Keeping the 135..167 range
|
||||
// dedicated to ASCII variants for clean range-checks (see IsAsciiString below).
|
||||
|
||||
// Flag-based header markers (must be 16-aligned for flag bits in lower nibble).
|
||||
// Header byte structure: (marker & 0xF0) == HeaderFlagsBase, flags in (marker & 0x0F).
|
||||
// Moved from 144 → 176 (next 16-aligned value after the new ASCII string range) to keep all
|
||||
// string-related markers in one contiguous block 91..167 / FixStrBase..StringAscii.
|
||||
public const byte HeaderFlagsBase = 176; // 0xB0 — next 16-aligned value after StringAscii reserved block
|
||||
public const byte HeaderFlag_Metadata = 0x01; // Bit 0: property metadata included
|
||||
// Reference handling uses 2 separate bits:
|
||||
// Bit 1 (0x02): OnlyId - reference handling for IId objects only
|
||||
|
|
@ -113,58 +136,103 @@ internal static class BinaryTypeCode
|
|||
/// Check if type code represents a reference (string or object).
|
||||
/// </summary>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static bool IsReference(byte code) => code is StringInterned or ObjectRef;
|
||||
public static bool IsReference(byte typeCode) => typeCode is StringInterned or ObjectRef;
|
||||
|
||||
/// <summary>
|
||||
/// Check if type code is a FixStr (short string with length encoded in type code).
|
||||
/// Check if type code is any string-related marker — long inline (String / StringAscii),
|
||||
/// interning markers (StringInterned, StringInternFirst), empty marker, or any FixStr variant
|
||||
/// (UTF-8 or ASCII). Centralized predicate so adding/removing string markers requires updating
|
||||
/// only this method, not every dispatch site.
|
||||
/// </summary>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static bool IsFixStr(byte code) => code is >= FixStrBase and <= FixStrMax;
|
||||
public static bool IsString(byte typeCode)
|
||||
=> (typeCode is >= String and <= StringInternFirst) // 91..94: String, StringInterned, StringEmpty, StringInternFirst
|
||||
|| (typeCode is >= FixStrBase and <= StringAscii); // 103..167: FixStr (UTF-8 short) + FixStrAscii (ASCII short) + StringAscii (ASCII long)
|
||||
|
||||
/// <summary>
|
||||
/// Decode FixStr length from type code.
|
||||
/// Check if type code is a FixStr (UTF-8 short string with byte length encoded in type code).
|
||||
/// Does NOT match FixStrAscii — use <see cref="IsFixStrAscii"/> for that, or <see cref="IsAsciiString"/>
|
||||
/// for the full ASCII-string range.
|
||||
/// </summary>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static int DecodeFixStrLength(byte code) => code - FixStrBase;
|
||||
public static bool IsFixStr(byte typeCode) => typeCode is >= FixStrBase and <= FixStrMax;
|
||||
|
||||
/// <summary>
|
||||
/// Encode FixStr type code for given byte length (0-31).
|
||||
/// Decode FixStr byte length from type code.
|
||||
/// </summary>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static int DecodeFixStrLength(byte typeCode) => typeCode - FixStrBase;
|
||||
|
||||
/// <summary>
/// Builds the FixStr type code for a UTF-8 short string of the given byte length (0-31):
/// <c>FixStrBase + byteLength</c>. Caller asserts UTF-8 content semantics.
/// </summary>
/// <remarks>
/// The length is not range-checked here. With FixStrBase = 103, a length of 32 would produce 135,
/// which is FixStrAsciiBase — callers are expected to gate on <see cref="CanEncodeAsFixStr"/> first.
/// </remarks>
/// <param name="byteLength">UTF-8 byte length of the string, 0..31.</param>
/// <returns>The single-byte marker combining type and length.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static byte EncodeFixStr(int byteLength)
{
    var marker = FixStrBase + byteLength;
    return (byte)marker;
}
|
||||
|
||||
/// <summary>
|
||||
/// Check if byte length can be encoded as FixStr.
|
||||
/// Check if byte length can be encoded as FixStr (UTF-8 short string, 0..31 bytes).
|
||||
/// </summary>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static bool CanEncodeAsFixStr(int byteLength) => byteLength is >= 0 and <= 31;
|
||||
|
||||
/// <summary>
/// Tells whether a type code is one of the ASCII string markers — a FixStrAscii (short) code or
/// StringAscii (long).
/// </summary>
/// <remarks>
/// Both kinds occupy one contiguous range, FixStrAsciiBase..StringAscii (135..167), so the test is
/// a single range comparison.
/// </remarks>
/// <param name="typeCode">Marker byte to classify.</param>
/// <returns><c>true</c> when the code lies in the ASCII string range.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsAsciiString(byte typeCode)
{
    return typeCode >= FixStrAsciiBase && typeCode <= StringAscii;
}
|
||||
|
||||
/// <summary>
/// Tells whether a type code is a FixStrAscii marker, i.e. an ASCII short string whose byte length
/// is folded into the type code (FixStrAsciiBase..FixStrAsciiMax).
/// </summary>
/// <param name="typeCode">Marker byte to classify.</param>
/// <returns><c>true</c> when the code lies in the FixStrAscii range.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsFixStrAscii(byte typeCode)
{
    return typeCode >= FixStrAsciiBase && typeCode <= FixStrAsciiMax;
}
|
||||
|
||||
/// <summary>
/// Extracts the byte length carried by a FixStrAscii type code (<c>typeCode - FixStrAsciiBase</c>).
/// For ASCII content the byte length is also the char count (1 byte = 1 char).
/// </summary>
/// <remarks>
/// The code is not checked to be inside the FixStrAscii range; see <see cref="IsFixStrAscii"/>.
/// </remarks>
/// <param name="typeCode">A FixStrAscii marker byte.</param>
/// <returns>The encoded byte length.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int DecodeFixStrAsciiLength(byte typeCode)
{
    var byteLength = typeCode - FixStrAsciiBase;
    return byteLength;
}
|
||||
|
||||
/// <summary>
/// Builds the FixStrAscii type code for an ASCII short string of the given byte length (0-31):
/// <c>FixStrAsciiBase + byteLength</c>. Caller asserts ASCII content semantics (every byte less
/// than 0x80). Misuse on non-ASCII content corrupts decode.
/// </summary>
/// <remarks>
/// The length is not range-checked here. With FixStrAsciiBase = 135, a length of 32 would produce
/// 167, which is StringAscii — callers are expected to gate on
/// <see cref="CanEncodeAsFixStrAscii"/> first.
/// </remarks>
/// <param name="byteLength">ASCII byte length of the string, 0..31.</param>
/// <returns>The single-byte marker combining type and length.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static byte EncodeFixStrAscii(int byteLength)
{
    var marker = FixStrAsciiBase + byteLength;
    return (byte)marker;
}
|
||||
|
||||
/// <summary>
/// Tells whether a byte length fits the FixStrAscii encoding (ASCII short string, 0..31 bytes).
/// </summary>
/// <param name="byteLength">Byte length of the ASCII content.</param>
/// <returns><c>true</c> for lengths 0 through 31 inclusive; <c>false</c> otherwise, including negatives.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool CanEncodeAsFixStrAscii(int byteLength)
{
    return byteLength >= 0 && byteLength <= 31;
}
|
||||
|
||||
/// <summary>
|
||||
/// Check if type code is a tiny int (single byte int32 encoding).
|
||||
/// </summary>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static bool IsTinyInt(byte code) => code >= Int32Tiny;
|
||||
public static bool IsTinyInt(byte typeCode) => typeCode >= Int32Tiny;
|
||||
|
||||
/// <summary>
|
||||
/// Decode tiny int value from type code.
|
||||
/// </summary>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static int DecodeTinyInt(byte code) => code - Int32Tiny - 16;
|
||||
public static int DecodeTinyInt(byte typeCode) => typeCode - Int32Tiny - 16;
|
||||
|
||||
/// <summary>
|
||||
/// Encode small int value (-16 to 47) as type code.
|
||||
/// Returns true if value fits in tiny encoding.
|
||||
/// </summary>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static bool TryEncodeTinyInt(int value, out byte code)
|
||||
public static bool TryEncodeTinyInt(int value, out byte typeCode)
|
||||
{
|
||||
// Range: -16 to 47 (64 values total, fitting in 192-255)
|
||||
if (value is >= -16 and <= 47)
|
||||
{
|
||||
code = (byte)(value + 16 + Int32Tiny);
|
||||
typeCode = (byte)(value + 16 + Int32Tiny);
|
||||
return true;
|
||||
}
|
||||
code = 0;
|
||||
typeCode = 0;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue