Unify AcBinary string marker; prefix-tier VarUInt encoding
Refactored AcBinary to use a single String marker (167) for long-form strings, replacing StringLen8/16/32. Implemented prefix-tier VarUInt encoding for string lengths, introduced FixStrCount constant, and removed legacy LEB128 code paths. Updated all serialization/deserialization logic and documentation to match the new format. Includes related micro-optimizations and code cleanup.
This commit is contained in:
parent
cf92370bea
commit
4a6e101410
|
|
@ -242,10 +242,8 @@ public partial class AcBinarySourceGenerator
|
||||||
sb.AppendLine($"{i} case BinaryTypeCode.StringUtf16:");
|
sb.AppendLine($"{i} case BinaryTypeCode.StringUtf16:");
|
||||||
sb.AppendLine($"{i} {a} = context.ReadStringUtf16Marker();");
|
sb.AppendLine($"{i} {a} = context.ReadStringUtf16Marker();");
|
||||||
sb.AppendLine($"{i} break;");
|
sb.AppendLine($"{i} break;");
|
||||||
sb.AppendLine($"{i} case BinaryTypeCode.StringLen8:");
|
sb.AppendLine($"{i} case BinaryTypeCode.String:");
|
||||||
sb.AppendLine($"{i} case BinaryTypeCode.StringLen16:");
|
sb.AppendLine($"{i} {a} = context.ReadUniversalLongString();");
|
||||||
sb.AppendLine($"{i} case BinaryTypeCode.StringLen32:");
|
|
||||||
sb.AppendLine($"{i} {a} = context.ReadUniversalLongStringByMarker({tc});");
|
|
||||||
sb.AppendLine($"{i} break;");
|
sb.AppendLine($"{i} break;");
|
||||||
// Interning first-occurrence cases — see comment above.
|
// Interning first-occurrence cases — see comment above.
|
||||||
if (enableInternString)
|
if (enableInternString)
|
||||||
|
|
|
||||||
|
|
@ -270,67 +270,59 @@ public static partial class AcBinaryDeserializer
|
||||||
//if (FastWire) { return ReadRaw<int>(); }
|
//if (FastWire) { return ReadRaw<int>(); }
|
||||||
|
|
||||||
var raw = ReadVarUInt();
|
var raw = ReadVarUInt();
|
||||||
var value = (int)(raw >> 1) ^ -(int)(raw & 1);
|
return (int)(raw >> 1) ^ -(int)(raw & 1);
|
||||||
return value;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Prefix-tier VarUInt decode (UTF-8-style). Wire-size identical to legacy LEB128 across all
|
||||||
|
/// 5 tiers (7/14/21/28/32 bit); decode is loop-less — the first-byte prefix determines total
|
||||||
|
/// size in O(1), and each subsequent byte is read incrementally (no continuation-loop, no
|
||||||
|
/// per-byte shift cascade).
|
||||||
|
/// <para>Tier table (first-byte pattern → total bytes → value range):</para>
|
||||||
|
/// <list type="bullet">
|
||||||
|
/// <item><c>0xxxxxxx</c> → 1 byte → <c>0..127</c></item>
|
||||||
|
/// <item><c>10xxxxxx</c> → 2 byte → <c>128..16 383</c></item>
|
||||||
|
/// <item><c>110xxxxx</c> → 3 byte → <c>16 384..2 097 151</c></item>
|
||||||
|
/// <item><c>1110xxxx</c> → 4 byte → <c>2 097 152..268 435 455</c></item>
|
||||||
|
/// <item><c>1111xxxx</c> → 5 byte → <c>268 435 456..4 294 967 295</c> (prefix nibble unused)</item>
|
||||||
|
/// </list>
|
||||||
|
/// </summary>
|
||||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||||
public uint ReadVarUInt()
|
public uint ReadVarUInt()
|
||||||
{
|
{
|
||||||
//if (FastWire) { return ReadRaw<uint>(); }
|
//if (FastWire) { return ReadRaw<uint>(); }
|
||||||
|
|
||||||
// Multi-segment safety: ensure at least 1 byte before direct buffer access.
|
// ReadByte() routes through EnsureAvailable(1) — ArrayBinaryInput JIT-eliminates the
|
||||||
// ArrayBinaryInput: TryAdvanceSegment => false (JIT eliminates this branch).
|
// bounds-check, multi-segment / AsyncPipeReaderInput advances cross-segment as needed.
|
||||||
if (_position >= _bufferLength)
|
// All tiers are cross-segment safe without explicit segment-handling here.
|
||||||
{
|
|
||||||
if (!Input.TryAdvanceSegment(ref _buffer, ref _position, ref _bufferLength, 1))
|
|
||||||
throw new AcBinaryDeserializationException("Unexpected end of binary payload.", _position);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fast path: single byte (0-127) - ~70% of cases
|
// 1-byte tier (0..127) — ~70% of cases (the most common one for small ids, counts, indices).
|
||||||
var b0 = _buffer[_position];
|
var b0 = ReadByte();
|
||||||
if ((b0 & 0x80) == 0)
|
if (b0 < 0x80) return b0;
|
||||||
{
|
|
||||||
_position++;
|
|
||||||
return b0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fast path: two bytes (128-16383) - ~25% of cases
|
// 2-byte tier (128..16 383) — 10xxxxxx + 1B raw.
|
||||||
if (_position + 1 < _bufferLength)
|
var b1 = ReadByte();
|
||||||
{
|
if (b0 < 0xC0) return ((uint)(b0 & 0x3F) << 8) | b1;
|
||||||
var b1 = _buffer[_position + 1];
|
|
||||||
if ((b1 & 0x80) == 0)
|
|
||||||
{
|
|
||||||
_position += 2;
|
|
||||||
return (uint)(b0 & 0x7F) | ((uint)b1 << 7);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Slow path: 3+ bytes or cross-segment boundary — uses ReadByte() per byte
|
// 3-byte tier (16 384..2 097 151) — 110xxxxx + 2B LE.
|
||||||
return ReadVarUIntSlow();
|
var b2 = ReadByte();
|
||||||
|
if (b0 < 0xE0) return ((uint)(b0 & 0x1F) << 16) | ((uint)b2 << 8) | b1;
|
||||||
|
|
||||||
|
// 4 / 5-byte tiers (rare — value ≥ 2 097 152) — handed off to non-inlined slow path
|
||||||
|
// with already-read bytes passed as args (no re-read).
|
||||||
|
return ReadVarUIntSlow(b0, b1, b2);
|
||||||
}
|
}
|
||||||
|
|
||||||
private uint ReadVarUIntSlow()
|
[MethodImpl(MethodImplOptions.NoInlining)]
|
||||||
|
private uint ReadVarUIntSlow(byte b0, byte b1, byte b2)
|
||||||
{
|
{
|
||||||
uint value = 0;
|
// 4-byte tier (2 097 152..268 435 455) — 1110xxxx + 3B LE.
|
||||||
var shift = 0;
|
var b3 = ReadByte();
|
||||||
while (true)
|
if (b0 < 0xF0) return ((uint)(b0 & 0x0F) << 24) | ((uint)b3 << 16) | ((uint)b2 << 8) | b1;
|
||||||
{
|
|
||||||
var b = ReadByte();
|
|
||||||
value |= (uint)(b & 0x7F) << shift;
|
|
||||||
if ((b & 0x80) == 0)
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
shift += 7;
|
// 5-byte tier (268 435 456..4 294 967 295) — 1111xxxx + 4B LE (prefix nibble unused).
|
||||||
if (shift > 35)
|
var b4 = ReadByte();
|
||||||
{
|
return ((uint)b4 << 24) | ((uint)b3 << 16) | ((uint)b2 << 8) | b1;
|
||||||
throw new AcBinaryDeserializationException("Invalid VarUInt encoding.", _position);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return value;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||||
|
|
@ -700,14 +692,7 @@ public static partial class AcBinaryDeserializer
|
||||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||||
internal string ReadUniversalLongString()
|
internal string ReadUniversalLongString()
|
||||||
{
|
{
|
||||||
ReadUniversalLongStringHeader(BinaryTypeCode.StringLen32, out var charLength, out var excess);
|
ReadUniversalLongStringHeader(out var charLength, out var excess);
|
||||||
return ReadStringByUnsignedExcess(charLength, excess);
|
|
||||||
}
|
|
||||||
|
|
||||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
|
||||||
internal string ReadUniversalLongStringByMarker(byte marker)
|
|
||||||
{
|
|
||||||
ReadUniversalLongStringHeader(marker, out var charLength, out var excess);
|
|
||||||
return ReadStringByUnsignedExcess(charLength, excess);
|
return ReadStringByUnsignedExcess(charLength, excess);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -718,24 +703,15 @@ public static partial class AcBinaryDeserializer
|
||||||
excess = ReadByte();
|
excess = ReadByte();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Reads the long-form string header — single <see cref="BinaryTypeCode.String"/> marker followed by
|
||||||
|
/// prefix-tier VarUInt charLength (offset by <see cref="BinaryTypeCode.FixStrCount"/>) and a 1/2/4-byte
|
||||||
|
/// unsigned excess slot whose width is derived from charLength (see <see cref="BinaryTypeCode.GetUniversalStringExcessSlotSize"/>).
|
||||||
|
/// </summary>
|
||||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||||
internal void ReadUniversalLongStringHeader(byte marker, out int charLength, out uint excess)
|
internal void ReadUniversalLongStringHeader(out int charLength, out uint excess)
|
||||||
{
|
{
|
||||||
if (marker == BinaryTypeCode.StringLen8)
|
charLength = (int)ReadVarUInt() + BinaryTypeCode.FixStrCount;
|
||||||
{
|
|
||||||
charLength = ReadByte();
|
|
||||||
}
|
|
||||||
else if (marker == BinaryTypeCode.StringLen16)
|
|
||||||
{
|
|
||||||
charLength = ReadUInt16Unsafe();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
charLength = ReadInt32Unsafe();
|
|
||||||
if (charLength < 0)
|
|
||||||
throw new AcBinaryDeserializationException($"Invalid string header: negative charLength ({charLength}).", _position);
|
|
||||||
}
|
|
||||||
|
|
||||||
var slotSize = BinaryTypeCode.GetUniversalStringExcessSlotSize(charLength);
|
var slotSize = BinaryTypeCode.GetUniversalStringExcessSlotSize(charLength);
|
||||||
|
|
||||||
if (slotSize == 1) excess = ReadByte();
|
if (slotSize == 1) excess = ReadByte();
|
||||||
|
|
@ -825,10 +801,8 @@ public static partial class AcBinaryDeserializer
|
||||||
case BinaryTypeCode.StringUtf16:
|
case BinaryTypeCode.StringUtf16:
|
||||||
value = ReadStringUtf16Marker();
|
value = ReadStringUtf16Marker();
|
||||||
return true;
|
return true;
|
||||||
case BinaryTypeCode.StringLen8:
|
case BinaryTypeCode.String:
|
||||||
case BinaryTypeCode.StringLen16:
|
ReadUniversalLongStringHeader(out charLength, out excess);
|
||||||
case BinaryTypeCode.StringLen32:
|
|
||||||
ReadUniversalLongStringHeader(tc, out charLength, out excess);
|
|
||||||
break;
|
break;
|
||||||
case BinaryTypeCode.Null:
|
case BinaryTypeCode.Null:
|
||||||
return true;
|
return true;
|
||||||
|
|
@ -929,14 +903,13 @@ public static partial class AcBinaryDeserializer
|
||||||
// ArrayBinaryInput → if (true) return; → method body entirely eliminated
|
// ArrayBinaryInput → if (true) return; → method body entirely eliminated
|
||||||
// SequenceBinaryInput → if (false) return; → guard eliminated, bounds-check kept
|
// SequenceBinaryInput → if (false) return; → guard eliminated, bounds-check kept
|
||||||
// AsyncPipeReaderInput → if (false) return; → guard eliminated, bounds-check kept
|
// AsyncPipeReaderInput → if (false) return; → guard eliminated, bounds-check kept
|
||||||
if (TInput.IsTrustedSingleSegment) return;
|
|
||||||
|
|
||||||
if (_position > _bufferLength - length)
|
if (TInput.IsTrustedSingleSegment || _position <= _bufferLength - length) return;
|
||||||
{
|
|
||||||
if (!Input.TryAdvanceSegment(ref _buffer, ref _position, ref _bufferLength, length))
|
if (!Input.TryAdvanceSegment(ref _buffer, ref _position, ref _bufferLength, length))
|
||||||
throw new AcBinaryDeserializationException("Unexpected end of binary payload.", _position);
|
throw new AcBinaryDeserializationException("Unexpected end of binary payload.", _position);
|
||||||
AssertGuarantee(length);
|
|
||||||
}
|
AssertGuarantee(length);
|
||||||
}
|
}
|
||||||
|
|
||||||
[Conditional("DEBUG")]
|
[Conditional("DEBUG")]
|
||||||
|
|
|
||||||
|
|
@ -105,9 +105,7 @@ public static partial class AcBinaryDeserializer
|
||||||
// H2Q6 interning tier readers (Compact mode only — Big tier never engages on interning path)
|
// H2Q6 interning tier readers (Compact mode only — Big tier never engages on interning path)
|
||||||
readers[BinaryTypeCode.StringInternFirstSmall] = static (ctx, _) => ctx.ReadAndRegisterInternedStringSmall();
|
readers[BinaryTypeCode.StringInternFirstSmall] = static (ctx, _) => ctx.ReadAndRegisterInternedStringSmall();
|
||||||
readers[BinaryTypeCode.StringInternFirstMedium] = static (ctx, _) => ctx.ReadAndRegisterInternedStringMedium();
|
readers[BinaryTypeCode.StringInternFirstMedium] = static (ctx, _) => ctx.ReadAndRegisterInternedStringMedium();
|
||||||
readers[BinaryTypeCode.StringLen8] = static (ctx, _) => ctx.ReadUniversalLongStringByMarker(BinaryTypeCode.StringLen8);
|
readers[BinaryTypeCode.String] = static (ctx, _) => ctx.ReadUniversalLongString();
|
||||||
readers[BinaryTypeCode.StringLen16] = static (ctx, _) => ctx.ReadUniversalLongStringByMarker(BinaryTypeCode.StringLen16);
|
|
||||||
readers[BinaryTypeCode.StringLen32] = static (ctx, _) => ctx.ReadUniversalLongStringByMarker(BinaryTypeCode.StringLen32);
|
|
||||||
readers[BinaryTypeCode.DateTime] = static (ctx, _) => ctx.ReadDateTimeUnsafe();
|
readers[BinaryTypeCode.DateTime] = static (ctx, _) => ctx.ReadDateTimeUnsafe();
|
||||||
readers[BinaryTypeCode.DateTimeOffset] = static (ctx, _) => ctx.ReadDateTimeOffsetUnsafe();
|
readers[BinaryTypeCode.DateTimeOffset] = static (ctx, _) => ctx.ReadDateTimeOffsetUnsafe();
|
||||||
readers[BinaryTypeCode.TimeSpan] = static (ctx, _) => ctx.ReadTimeSpanUnsafe();
|
readers[BinaryTypeCode.TimeSpan] = static (ctx, _) => ctx.ReadTimeSpanUnsafe();
|
||||||
|
|
@ -1050,10 +1048,8 @@ public static partial class AcBinaryDeserializer
|
||||||
case BinaryTypeCode.StringUtf16:
|
case BinaryTypeCode.StringUtf16:
|
||||||
propInfo.SetValue(target, context.ReadStringUtf16Marker());
|
propInfo.SetValue(target, context.ReadStringUtf16Marker());
|
||||||
return true;
|
return true;
|
||||||
case BinaryTypeCode.StringLen8:
|
case BinaryTypeCode.String:
|
||||||
case BinaryTypeCode.StringLen16:
|
propInfo.SetValue(target, context.ReadUniversalLongString());
|
||||||
case BinaryTypeCode.StringLen32:
|
|
||||||
propInfo.SetValue(target, context.ReadUniversalLongStringByMarker(typeCode));
|
|
||||||
return true;
|
return true;
|
||||||
case BinaryTypeCode.StringEmpty:
|
case BinaryTypeCode.StringEmpty:
|
||||||
propInfo.SetValue(target, string.Empty);
|
propInfo.SetValue(target, string.Empty);
|
||||||
|
|
@ -2045,27 +2041,10 @@ public static partial class AcBinaryDeserializer
|
||||||
case BinaryTypeCode.Decimal:
|
case BinaryTypeCode.Decimal:
|
||||||
context.Skip(16);
|
context.Skip(16);
|
||||||
return;
|
return;
|
||||||
case BinaryTypeCode.StringLen8:
|
case BinaryTypeCode.String:
|
||||||
case BinaryTypeCode.StringLen16:
|
// Skip layout: [VarUInt(charLength - FixStrCount)][unsigned excess slot:1|2|4][bytes]
|
||||||
case BinaryTypeCode.StringLen32:
|
|
||||||
// Skip layout: [charLength:1|2|4 by marker][unsigned excess slot][bytes]
|
|
||||||
{
|
{
|
||||||
int charLength;
|
var charLength = (int)context.ReadVarUInt() + BinaryTypeCode.FixStrCount;
|
||||||
if (typeCode == BinaryTypeCode.StringLen8)
|
|
||||||
{
|
|
||||||
charLength = context.ReadByte();
|
|
||||||
}
|
|
||||||
else if (typeCode == BinaryTypeCode.StringLen16)
|
|
||||||
{
|
|
||||||
charLength = context.ReadUInt16Unsafe();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
charLength = context.ReadInt32Unsafe();
|
|
||||||
if (charLength < 0)
|
|
||||||
throw new AcBinaryDeserializationException($"Invalid string header while skipping: negative charLength ({charLength}).", context.Position);
|
|
||||||
}
|
|
||||||
|
|
||||||
var slotSize = BinaryTypeCode.GetUniversalStringExcessSlotSize(charLength);
|
var slotSize = BinaryTypeCode.GetUniversalStringExcessSlotSize(charLength);
|
||||||
var excess = slotSize switch
|
var excess = slotSize switch
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -606,18 +606,47 @@ public static partial class AcBinarySerializer
|
||||||
BufferAt(_position++) = (byte)value;
|
BufferAt(_position++) = (byte)value;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
WriteVarUIntMultiByteUnsafe(value);
|
WriteVarUIntMultiByteUnsafe(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Prefix-tier VarUInt encoding (UTF-8-style: first byte's high bits encode total size).
|
||||||
|
/// Compact path: <see cref="BitOperations.Log2"/> picks the tier (2/3/4) in O(1), then a single
|
||||||
|
/// <see cref="Unsafe.WriteUnaligned{T}"/><<see cref="uint"/>> stores [prefix-byte | value-bytes LE]
|
||||||
|
/// in one machine instruction. 5-byte tier uses one byte + one uint32 store.
|
||||||
|
/// Tier table:
|
||||||
|
/// 0xxxxxxx → 1 byte (handled inline by caller)
|
||||||
|
/// 10xxxxxx + 1B → 2 byte, 128..16 383 (14 bit)
|
||||||
|
/// 110xxxxx + 2B LE → 3 byte, 16 384..2 097 151 (21 bit)
|
||||||
|
/// 1110xxxx + 3B LE → 4 byte, 2 097 152..268 435 455 (28 bit)
|
||||||
|
/// 1111xxxx + 4B LE → 5 byte, 268 435 456..uint.MaxValue (32 bit; prefix nibble unused)
|
||||||
|
/// Caller MUST ensure ≥5 bytes of buffer space (interface contract) — the uint32 store on the
|
||||||
|
/// 2/3/4-byte tiers writes 4 bytes even though only `tier` bytes are advanced; the trailing
|
||||||
|
/// 1-2 bytes get overwritten by the next encoded element. Little-endian host assumed (all
|
||||||
|
/// shipping .NET 9 platforms).
|
||||||
|
/// </summary>
|
||||||
[MethodImpl(MethodImplOptions.NoInlining)]
|
[MethodImpl(MethodImplOptions.NoInlining)]
|
||||||
private void WriteVarUIntMultiByteUnsafe(uint value)
|
private void WriteVarUIntMultiByteUnsafe(uint value)
|
||||||
{
|
{
|
||||||
while (value >= 0x80)
|
if (value < 0x10000000)
|
||||||
{
|
{
|
||||||
BufferAt(_position++) = (byte)(value | 0x80);
|
// 2/3/4-byte tier: tier ∈ {2,3,4}, shift ∈ {8,16,24}, prefix ∈ {0x80,0xC0,0xE0}.
|
||||||
value >>= 7;
|
// Packed uint32: byte0 = prefix | (value >> shift); bytes 1..3 = value LE (low 24 bits).
|
||||||
|
var tier = (BitOperations.Log2(value) / 7) + 1;
|
||||||
|
var shift = (tier - 1) << 3;
|
||||||
|
var prefix = (0xFF00u >> (tier - 1)) & 0xFFu;
|
||||||
|
var packed = (prefix | (value >> shift)) | (value << 8);
|
||||||
|
|
||||||
|
Unsafe.WriteUnaligned(ref BufferAt(_position), packed);
|
||||||
|
_position += tier;
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
BufferAt(_position++) = (byte)value;
|
|
||||||
|
// 5-byte tier: 0xF0 marker + full uint32 LE (prefix nibble unused).
|
||||||
|
BufferAt(_position) = 0xF0;
|
||||||
|
Unsafe.WriteUnaligned(ref BufferAt(_position + 1), value);
|
||||||
|
_position += 5;
|
||||||
}
|
}
|
||||||
|
|
||||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||||
|
|
@ -868,7 +897,7 @@ public static partial class AcBinarySerializer
|
||||||
/// Header is fully determined before encode:
|
/// Header is fully determined before encode:
|
||||||
/// <list type="bullet">
|
/// <list type="bullet">
|
||||||
/// <item><c>charLength <= 31</c>: <c>[FixStr(marker carries charLength)][unsigned excess:1]</c></item>
|
/// <item><c>charLength <= 31</c>: <c>[FixStr(marker carries charLength)][unsigned excess:1]</c></item>
|
||||||
/// <item><c>charLength > 31</c>: <c>[StringLen8|StringLen16|StringLen32][charLength:1|2|4][unsigned excess:1|2|4]</c></item>
|
/// <item><c>charLength > 31</c>: <c>[String][VarUInt(charLength - FixStrCount)][unsigned excess:1|2|4]</c> — single marker with prefix-tier VarUInt charLength</item>
|
||||||
/// </list>
|
/// </list>
|
||||||
/// Body is UTF-8-encoded exactly once to the final destination (<c>encodeStart</c>) — no post-encode
|
/// Body is UTF-8-encoded exactly once to the final destination (<c>encodeStart</c>) — no post-encode
|
||||||
/// body shift/copy. For the current path, <c>excess = bytesWritten - charLength</c> is expected to be
|
/// body shift/copy. For the current path, <c>excess = bytesWritten - charLength</c> is expected to be
|
||||||
|
|
@ -892,42 +921,46 @@ public static partial class AcBinarySerializer
|
||||||
|
|
||||||
// Tight UTF-8 upper bound for valid UTF-16 input: max 3 bytes per UTF-16 code unit.
|
// Tight UTF-8 upper bound for valid UTF-16 input: max 3 bytes per UTF-16 code unit.
|
||||||
var maxBytes = charLength * 3;
|
var maxBytes = charLength * 3;
|
||||||
var isFixStr = charLength <= BinaryTypeCode.FixStrMaxLength;
|
|
||||||
var charLengthSize = isFixStr ? 0 : charLength <= byte.MaxValue ? 1 : charLength <= ushort.MaxValue ? 2 : 4;
|
// Single branch on FixStr vs long-form — replaces the previous cascade of four isFixStr ternaries.
|
||||||
// IMPORTANT: the slot VALUE (excess) is not known before UTF-8 encode, but the slot SIZE is.
|
// IMPORTANT: the slot VALUE (excess) is not known before UTF-8 encode, but the slot SIZE is.
|
||||||
// We reserve the slot by width (1/2/4) from charLength, so encodeStart is final and no body shift is needed.
|
// We reserve the slot by width (1/2/4) from charLength, so encodeStart is final and no body shift is needed.
|
||||||
var slotSize = isFixStr ? 1 : BinaryTypeCode.GetUniversalStringExcessSlotSize(charLength);
|
int slotSize, headerSize, headerPos, slotPos, encodeStart;
|
||||||
var headerSize = isFixStr ? 2 : 1 + charLengthSize + slotSize;
|
if (charLength <= BinaryTypeCode.FixStrMaxLength)
|
||||||
|
|
||||||
EnsureCapacity(headerSize + maxBytes);
|
|
||||||
|
|
||||||
var headerPos = _position;
|
|
||||||
var slotPos = isFixStr ? headerPos + 1 : headerPos + 1 + charLengthSize;
|
|
||||||
var encodeStart = headerPos + headerSize;
|
|
||||||
|
|
||||||
if (isFixStr)
|
|
||||||
{
|
{
|
||||||
// Universal short-form string marker with unsigned excess slot.
|
// FixStr: header = [marker:1][slot:1]
|
||||||
|
slotSize = 1;
|
||||||
|
headerSize = 2;
|
||||||
|
|
||||||
|
EnsureCapacity(headerSize + maxBytes);
|
||||||
|
|
||||||
|
headerPos = _position;
|
||||||
|
slotPos = headerPos + 1;
|
||||||
|
encodeStart = headerPos + 2;
|
||||||
|
|
||||||
BufferAt(headerPos) = BinaryTypeCode.EncodeFixStr(charLength);
|
BufferAt(headerPos) = BinaryTypeCode.EncodeFixStr(charLength);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Universal long-form markers with marker-coded charLength width.
|
// Long-form: header = [marker:1][VarUInt(charLength - FixStrCount)][slot:1|2|4]
|
||||||
if (charLengthSize == 1)
|
// FixStr already covers 0..FixStrMaxLength, so wireLen = charLength - FixStrCount
|
||||||
{
|
// keeps the small-band 1-byte VarUInt populated.
|
||||||
BufferAt(headerPos) = BinaryTypeCode.StringLen8;
|
slotSize = BinaryTypeCode.GetUniversalStringExcessSlotSize(charLength);
|
||||||
BufferAt(headerPos + 1) = unchecked((byte)charLength);
|
var varUIntSize = VarUIntSize((uint)(charLength - BinaryTypeCode.FixStrCount));
|
||||||
}
|
headerSize = 1 + varUIntSize + slotSize;
|
||||||
else if (charLengthSize == 2)
|
|
||||||
{
|
EnsureCapacity(headerSize + maxBytes);
|
||||||
BufferAt(headerPos) = BinaryTypeCode.StringLen16;
|
|
||||||
Unsafe.WriteUnaligned(ref BufferAt(headerPos + 1), unchecked((ushort)charLength));
|
headerPos = _position;
|
||||||
}
|
slotPos = headerPos + 1 + varUIntSize;
|
||||||
else
|
encodeStart = headerPos + headerSize;
|
||||||
{
|
|
||||||
BufferAt(headerPos) = BinaryTypeCode.StringLen32;
|
BufferAt(headerPos) = BinaryTypeCode.String;
|
||||||
Unsafe.WriteUnaligned(ref BufferAt(headerPos + 1), charLength);
|
|
||||||
}
|
_position = headerPos + 1;
|
||||||
|
WriteVarUIntUnsafe((uint)(charLength - BinaryTypeCode.FixStrCount));
|
||||||
|
// _position now == slotPos. Slot write below uses Unsafe.WriteUnaligned at slotPos;
|
||||||
|
// _position is finalized at the end via `_position = encodeStart + bytesWritten`.
|
||||||
}
|
}
|
||||||
|
|
||||||
var status = System.Text.Unicode.Utf8.FromUtf16(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes), out _, out var bytesWritten, replaceInvalidSequences: false);
|
var status = System.Text.Unicode.Utf8.FromUtf16(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes), out _, out var bytesWritten, replaceInvalidSequences: false);
|
||||||
|
|
|
||||||
|
|
@ -70,8 +70,8 @@ internal static class BinaryTypeCode
|
||||||
// Marker 91 is reserved for FastWire UTF-16 payloads:
|
// Marker 91 is reserved for FastWire UTF-16 payloads:
|
||||||
// [StringUtf16][charLen:int32 LE][UTF-16 raw bytes]
|
// [StringUtf16][charLen:int32 LE][UTF-16 raw bytes]
|
||||||
//
|
//
|
||||||
// Universal compact-mode strings use FixStr (135..166) + StringLen8/16/32 (167..169):
|
// Universal compact-mode strings use FixStr (135..166) + String (167):
|
||||||
// [FixStr] / [StringLenN][charLen:N][unsigned excess slot][UTF-8 bytes]
|
// [FixStr] / [String][VarUInt(charLength - FixStrCount)][unsigned excess slot][UTF-8 bytes]
|
||||||
//
|
//
|
||||||
// Interning tiers keep dedicated markers.
|
// Interning tiers keep dedicated markers.
|
||||||
public const byte StringUtf16 = SlotCount + 27; // 91 — FastWire UTF-16 marker payload
|
public const byte StringUtf16 = SlotCount + 27; // 91 — FastWire UTF-16 marker payload
|
||||||
|
|
@ -123,23 +123,31 @@ internal static class BinaryTypeCode
|
||||||
public const byte FixStrBase = SlotCount + 71; // 135
|
public const byte FixStrBase = SlotCount + 71; // 135
|
||||||
public const byte FixStrMax = FixStrBase + 31; // 166
|
public const byte FixStrMax = FixStrBase + 31; // 166
|
||||||
public const int FixStrMaxLength = 31;
|
public const int FixStrMaxLength = 31;
|
||||||
|
/// <summary>
|
||||||
|
/// Number of FixStr marker slots (= <see cref="FixStrMaxLength"/> + 1 = 32). Also used as the
|
||||||
|
/// wire-format offset for the long-form <see cref="String"/> marker's VarUInt-encoded charLength:
|
||||||
|
/// <c>wireLen = charLength - FixStrCount</c>. Single source of truth — if the FixStr range
|
||||||
|
/// ever expands (e.g. 32 → 64 slots), this constant updates the offset everywhere consistently.
|
||||||
|
/// </summary>
|
||||||
|
public const int FixStrCount = FixStrMaxLength + 1; // 32
|
||||||
// Backward-compatible aliases (old naming)
|
// Backward-compatible aliases (old naming)
|
||||||
public const byte FixStrAsciiBase = FixStrBase;
|
public const byte FixStrAsciiBase = FixStrBase;
|
||||||
public const byte FixStrAsciiMax = FixStrMax;
|
public const byte FixStrAsciiMax = FixStrMax;
|
||||||
public const int FixStrAsciiMaxLength = FixStrMaxLength;
|
public const int FixStrAsciiMaxLength = FixStrMaxLength;
|
||||||
|
|
||||||
// Long universal string markers (marker encodes charLength field width)
|
// Single universal long-form string marker (formerly split into StringLen8/16/32 at 167/168/169 —
|
||||||
// Layout:
|
// unified via prefix-tier VarUInt charLength encoding; magnitude tiering is exactly what VarUInt
|
||||||
// StringLen8 (167): [marker][charLen:1][excess slot][bytes]
|
// does, so the 3-way marker discrimination became redundant).
|
||||||
// StringLen16 (168): [marker][charLen:2][excess slot][bytes]
|
//
|
||||||
// StringLen32 (169): [marker][charLen:4][excess slot][bytes]
|
// Wire layout: [String:1] [VarUInt(charLength - FixStrCount)] [excess slot:1|2|4] [UTF-8 bytes]
|
||||||
public const byte StringLen8 = SlotCount + 103; // 167
|
//
|
||||||
public const byte StringLen16 = SlotCount + 104; // 168
|
// charLength here is always > FixStrMaxLength (smaller values use FixStr).
|
||||||
public const byte StringLen32 = SlotCount + 105; // 169
|
public const byte String = SlotCount + 103; // 167
|
||||||
|
|
||||||
// Backward-compatible aliases
|
// 168..169 — Reserved (formerly StringLen16/32, freed by VarUInt-based length unification).
|
||||||
public const byte String = StringLen32;
|
// Available for future marker allocation per BINARY_TODO.md marker-tier reorganization plan.
|
||||||
public const byte StringAscii = StringLen32;
|
public const byte ReservedStringSlotMin = SlotCount + 104; // 168
|
||||||
|
public const byte ReservedStringSlotMax = SlotCount + 105; // 169
|
||||||
|
|
||||||
// Reserved slot block: 170..175 (6 slots) for future string-related markers
|
// Reserved slot block: 170..175 (6 slots) for future string-related markers
|
||||||
// (e.g., StringCompressed, StringEncoded, StringMixedAscii, etc.). Keeping the 135..169 range
|
// (e.g., StringCompressed, StringEncoded, StringMixedAscii, etc.). Keeping the 135..169 range
|
||||||
|
|
@ -193,7 +201,7 @@ internal static class BinaryTypeCode
|
||||||
|| typeCode == StringEmpty
|
|| typeCode == StringEmpty
|
||||||
|| typeCode == StringInternFirstSmall
|
|| typeCode == StringInternFirstSmall
|
||||||
|| typeCode == StringInternFirstMedium
|
|| typeCode == StringInternFirstMedium
|
||||||
|| (typeCode is >= FixStrBase and <= StringLen32); // 135..169: FixStr + StringLen8/16/32
|
|| (typeCode is >= FixStrBase and <= String); // 135..167: FixStr + String
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Check if type code is the FastWire UTF-16 string marker.
|
/// Check if type code is the FastWire UTF-16 string marker.
|
||||||
|
|
@ -210,11 +218,11 @@ internal static class BinaryTypeCode
|
||||||
=> typeCode == StringInternFirstSmall || typeCode == StringInternFirstMedium;
|
=> typeCode == StringInternFirstSmall || typeCode == StringInternFirstMedium;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Check if type code is any universal string marker — FixStr (short) or StringLen8/16/32 (long).
|
/// Check if type code is any universal string marker — FixStr (short, 135..166) or String (long, 167).
|
||||||
/// Single contiguous range (135..169) for branch-friendly dispatch on the reader hot path.
|
/// Single contiguous range (135..167) for branch-friendly dispatch on the reader hot path.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||||
public static bool IsStringUniversalMarker(byte typeCode) => typeCode is >= FixStrBase and <= StringLen32;
|
public static bool IsStringUniversalMarker(byte typeCode) => typeCode is >= FixStrBase and <= String;
|
||||||
|
|
||||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||||
public static bool IsAsciiString(byte typeCode) => IsStringUniversalMarker(typeCode);
|
public static bool IsAsciiString(byte typeCode) => IsStringUniversalMarker(typeCode);
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
using System;
|
using System;
|
||||||
using System.Buffers;
|
using System.Buffers;
|
||||||
|
using System.Numerics;
|
||||||
using System.Runtime.CompilerServices;
|
using System.Runtime.CompilerServices;
|
||||||
using System.Runtime.InteropServices;
|
using System.Runtime.InteropServices;
|
||||||
using System.Text;
|
using System.Text;
|
||||||
|
|
@ -221,15 +222,32 @@ public struct BufferWriterBinaryOutput : IBinaryOutputBase
|
||||||
WriteVarUIntMultiByteUnsafe(value);
|
WriteVarUIntMultiByteUnsafe(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Prefix-tier VarUInt encoding (UTF-8-style). Compact: 2/3/4-byte tiers packed into a single
|
||||||
|
/// <see cref="Unsafe.WriteUnaligned{T}"/><<see cref="uint"/>> store via
|
||||||
|
/// <see cref="BitOperations.Log2"/>-derived tier; 5-byte tier uses one byte + one uint32 store.
|
||||||
|
/// Caller must ensure ≥5 bytes of buffer space. See
|
||||||
|
/// <see cref="AcBinarySerializer.BinarySerializationContext{TOutput}.WriteVarUInt"/> for the
|
||||||
|
/// full tier table.
|
||||||
|
/// </summary>
|
||||||
[MethodImpl(MethodImplOptions.NoInlining)]
|
[MethodImpl(MethodImplOptions.NoInlining)]
|
||||||
private void WriteVarUIntMultiByteUnsafe(uint value)
|
private void WriteVarUIntMultiByteUnsafe(uint value)
|
||||||
{
|
{
|
||||||
while (value >= 0x80)
|
if (value < 0x10000000)
|
||||||
{
|
{
|
||||||
_buffer[_position++] = (byte)(value | 0x80);
|
// 2/3/4-byte tier: tier ∈ {2,3,4}, shift ∈ {8,16,24}, prefix ∈ {0x80,0xC0,0xE0}.
|
||||||
value >>= 7;
|
var tier = (BitOperations.Log2(value) / 7) + 1;
|
||||||
|
var shift = (tier - 1) << 3;
|
||||||
|
var prefix = (0xFF00u >> (tier - 1)) & 0xFFu;
|
||||||
|
var packed = (prefix | (value >> shift)) | (value << 8);
|
||||||
|
Unsafe.WriteUnaligned(ref _buffer[_position], packed);
|
||||||
|
_position += tier;
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
_buffer[_position++] = (byte)value;
|
// 5-byte tier: 0xF0 marker + full uint32 LE (prefix nibble unused).
|
||||||
|
_buffer[_position] = 0xF0;
|
||||||
|
Unsafe.WriteUnaligned(ref _buffer[_position + 1], value);
|
||||||
|
_position += 5;
|
||||||
}
|
}
|
||||||
|
|
||||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||||
|
|
|
||||||
|
|
@ -30,20 +30,31 @@ The flags byte uses `0xB0` (176) as base with bit flags in the lower nibble. (Mo
|
||||||
|
|
||||||
## Variable-Length Encoding
|
## Variable-Length Encoding
|
||||||
|
|
||||||
### VarUInt (unsigned)
|
### VarUInt (unsigned 32-bit)
|
||||||
|
|
||||||
LEB128: 7 data bits per byte, MSB = continuation flag.
|
Prefix-tier encoding (UTF-8-style): the first byte's high bits determine total size,
|
||||||
|
then the remaining bytes are read as a fixed-size little-endian value. No continuation-loop.
|
||||||
|
|
||||||
```
|
| First byte | Total | Bit-budget | Value range |
|
||||||
value < 128 → 1 byte [0xxxxxxx]
|
|---|---|---|---|
|
||||||
value < 16384 → 2 bytes [1xxxxxxx] [0xxxxxxx]
|
| `0xxxxxxx` | 1 byte | 7 | `0 .. 127` |
|
||||||
value < 2097152 → 3 bytes ...
|
| `10xxxxxx` + 1B | 2 byte | 14 | `128 .. 16 383` |
|
||||||
(max 5 bytes for uint32)
|
| `110xxxxx` + 2B LE | 3 byte | 21 | `16 384 .. 2 097 151` |
|
||||||
```
|
| `1110xxxx` + 3B LE | 4 byte | 28 | `2 097 152 .. 268 435 455` |
|
||||||
|
| `1111xxxx` + 4B LE | 5 byte | 32 | `268 435 456 .. 4 294 967 295` (prefix nibble unused) |
|
||||||
|
|
||||||
### VarInt (signed)
|
Wire-size is identical to the legacy LEB128 encoding across all 5 tier-boundaries (7/14/21/28/32 bit) —
|
||||||
|
auto-inc IDs and other large values pay the same byte count. Decode wins: 1 if-ladder (max 4 branches,
|
||||||
|
JIT jump-table-friendly) + 1 fix-sized load on 3+ byte tiers, instead of N×ReadByte + N×shift +
|
||||||
|
N×continuation-check.
|
||||||
|
|
||||||
ZigZag encoding maps signed to unsigned, then LEB128:
|
Encode (`WriteVarUIntMultiByteUnsafe`) uses `BitOperations.Log2` to pick the tier in O(1), then
|
||||||
|
packs the prefix byte + value bytes into a single `Unsafe.WriteUnaligned<uint>` store on the 2/3/4-byte
|
||||||
|
tiers (5-byte tier: one byte + one uint32 store).
|
||||||
|
|
||||||
|
### VarInt (signed 32-bit)
|
||||||
|
|
||||||
|
ZigZag encoding maps signed to unsigned, then VarUInt (prefix-tier above):
|
||||||
|
|
||||||
```
|
```
|
||||||
encode: (value << 1) ^ (value >> 31)
|
encode: (value << 1) ^ (value >> 31)
|
||||||
|
|
@ -52,9 +63,12 @@ decode: (raw >> 1) ^ -(raw & 1)
|
||||||
|
|
||||||
Maps: `0 → 0`, `-1 → 1`, `1 → 2`, `-2 → 3`, etc.
|
Maps: `0 → 0`, `-1 → 1`, `1 → 2`, `-2 → 3`, etc.
|
||||||
|
|
||||||
### VarULong (unsigned 64-bit)
|
### VarULong / VarLong (unsigned / signed 64-bit)
|
||||||
|
|
||||||
Same LEB128 encoding, max 10 bytes for uint64.
|
Legacy LEB128 encoding (7 data bits per byte, MSB = continuation flag). Max 10 bytes for uint64.
|
||||||
|
The 64-bit variants kept LEB128 — the prefix-tier benefit is concentrated in the 32-bit `VarUInt`
|
||||||
|
hot-path (auto-inc IDs, cache-indices, length prefixes, charLength offsets); 64-bit values are
|
||||||
|
rarely VarUInt-encoded.
|
||||||
|
|
||||||
## Type Markers
|
## Type Markers
|
||||||
|
|
||||||
|
|
@ -116,17 +130,35 @@ Second occurrence of a referenced polymorphic object uses plain `ObjectRef(65)`
|
||||||
| 89 | Decimal | `[89] [16 bytes]` |
|
| 89 | Decimal | `[89] [16 bytes]` |
|
||||||
| 90 | Char | `[90] [VarUInt]` |
|
| 90 | Char | `[90] [VarUInt]` |
|
||||||
|
|
||||||
### Strings (91–94, 167)
|
### Strings (91, 92, 93, 104, 105, 167)
|
||||||
|
|
||||||
| Code | Name | Wire format |
|
| Code | Name | Wire format |
|
||||||
|------|------|-------------|
|
|------|------|-------------|
|
||||||
| 91 | String | `[91] [VarUInt byteLength] [UTF-8 bytes]` — generic UTF-8 (any content) |
|
| 91 | StringUtf16 | `[91] [charLen:int32 LE] [UTF-16 raw bytes]` — FastWire mode UTF-16 payload (no UTF-8 transcoding; speed > size) |
|
||||||
| 92 | StringInterned | `[92] [VarUInt cacheIndex]` — 2nd+ occurrence |
|
| 92 | StringInterned | `[92] [VarUInt cacheIndex]` — 2nd+ occurrence of an interned string |
|
||||||
| 93 | StringEmpty | `[93]` — no payload |
|
| 93 | StringEmpty | `[93]` — empty string, no payload |
|
||||||
| 94 | StringInternFirst | `[94] [VarUInt cacheIndex] [VarUInt byteLength] [UTF-8 bytes]` — 1st occurrence |
|
| 104 | StringInternFirstSmall | `[104] [VarUInt cacheIdx] [charLen:8][utf8Len:8] [UTF-8 bytes]` — 1st occurrence interning tier, packed dual-length (utf8Len ≤ 255) |
|
||||||
| 167 | StringAscii | `[167] [VarUInt byteLength] [ASCII bytes]` — pure ASCII (every byte < 0x80); reader byte→char widens, no UTF-8 decode |
|
| 105 | StringInternFirstMedium | `[105] [VarUInt cacheIdx] [charLen:16 LE][utf8Len:16 LE] [UTF-8 bytes]` — 1st occurrence interning tier, packed dual-length (utf8Len ≤ 65535) |
|
||||||
|
| 167 | String | `[167] [VarUInt(charLength - FixStrCount)] [unsigned excess:1\|2\|4] [UTF-8 bytes]` — universal long-form (charLength > FixStrMaxLength) |
|
||||||
|
|
||||||
The writer detects ASCII via `bytesWritten == charLength` after a single-pass UTF-8 encode (every UTF-16 char < 0x80 produces exactly 1 UTF-8 byte; non-ASCII chars always produce 2-4 bytes), then emits `StringAscii` (167) or `String` (91) accordingly. The reader uses the marker as the ASCII-validity contract — `StringAscii` bypasses UTF-8 decode entirely.
|
The 167 `String` marker is the universal long-form string. It unifies the former `StringLen8/16/32`
|
||||||
|
(at 167/168/169) into a single marker with prefix-tier VarUInt charLength encoding (offset by
|
||||||
|
`FixStrCount = FixStrMaxLength + 1 = 32`, since FixStr already covers 0..31). The excess slot
|
||||||
|
width (1 / 2 / 4 bytes) is selected from charLength via `GetUniversalStringExcessSlotSize` and
|
||||||
|
encodes `bytesWritten - charLength` (= UTF-8 byte excess over UTF-16 char count) for the reader's
|
||||||
|
`string.Create(charLen) + Utf8.ToUtf16` fast path (avoids the pre-scan `Encoding.UTF8.GetCharCount` pass).
|
||||||
|
|
||||||
|
The interning tiers (`StringInternFirstSmall` / `Medium` at 104/105) keep their packed dual-length
|
||||||
|
(charLen + utf8Len) format — the post-encode tier choice exploits the typical interning workload
|
||||||
|
(short property names, enum strings) for fast deserialize via single packed-read.
|
||||||
|
|
||||||
|
ASCII strings flow through the same `String` / FixStr markers — the reader uses `excess == 0` as
|
||||||
|
the ASCII-validity discriminator (every UTF-16 char < 0x80 produces exactly 1 UTF-8 byte; non-ASCII
|
||||||
|
chars always produce 2-4 bytes). On excess=0 the reader takes the byte→char widen fast path
|
||||||
|
(`Encoding.Latin1.GetString` SIMD); on excess>0 the UTF-8 decode runs.
|
||||||
|
|
||||||
|
Slots 168, 169 are **Reserved** — freed by the StringLen8/16/32 unification, available for future
|
||||||
|
marker allocation (see Reserved Ranges below).
|
||||||
|
|
||||||
### Date/Time (95–98)
|
### Date/Time (95–98)
|
||||||
|
|
||||||
|
|
@ -146,33 +178,41 @@ The writer detects ASCII via `bytesWritten == charLength` after a single-pass UT
|
||||||
| 101 | NoMetadataHeader | Legacy: implies `RefHandling=true`, no metadata |
|
| 101 | NoMetadataHeader | Legacy: implies `RefHandling=true`, no metadata |
|
||||||
| 102 | PropertySkip | `[102]` — marks skipped property (default/null value) |
|
| 102 | PropertySkip | `[102]` — marks skipped property (default/null value) |
|
||||||
|
|
||||||
### FixStr (103–134) — short UTF-8 strings
|
### FixStr (135–166) — short universal string marker
|
||||||
|
|
||||||
Short strings (any UTF-8 content) encoded in a single marker byte + raw UTF-8 bytes (no length prefix):
|
Short strings encoded in a single marker byte + raw UTF-8 bytes (no length prefix — charLength
|
||||||
|
encoded in the marker offset). The H2Q6 reorganization merged the formerly-split FixStr (UTF-8 at
|
||||||
|
103-134) and FixStrAscii (135-166) into one universal FixStr block at 135-166; codepoints 103..134
|
||||||
|
are now part of the Reserved Range (see below).
|
||||||
|
|
||||||
```
|
```
|
||||||
[FixStrBase + byteLength] [UTF-8 bytes]
|
[FixStrBase + charLength] [UTF-8 bytes]
|
||||||
```
|
```
|
||||||
|
|
||||||
- Length range: 0–31 **bytes** (`FixStrBase=103`, `FixStrMax=134`)
|
- Length range: 0..31 chars (`FixStrBase = 135`, `FixStrMax = 166`, `FixStrMaxLength = 31`)
|
||||||
- Saves 1 byte vs `String` marker + VarUInt length
|
- `FixStrCount = FixStrMaxLength + 1 = 32` — single source of truth for the FixStr slot count,
|
||||||
|
also the wire-format offset for the long-form `String` marker's VarUInt charLength (`wireLen =
|
||||||
|
charLength - FixStrCount`). If the FixStr range ever expands (e.g. 32 → 64 slots), this constant
|
||||||
|
updates the offset everywhere consistently.
|
||||||
|
- Saves header bytes vs `String` marker + VarUInt length (1 marker byte total vs 3+ byte header)
|
||||||
- Content semantics: UTF-8 (may contain multi-byte sequences for non-ASCII chars)
|
- Content semantics: UTF-8 (may contain multi-byte sequences for non-ASCII chars)
|
||||||
- Reader dispatches via the (universal-)UTF-8 decode path
|
- ASCII discriminator: the reader uses the post-decode `excess == 0` check; ASCII-only strings
|
||||||
|
bypass the UTF-8 decode via `Encoding.Latin1.GetString` SIMD byte→char widen.
|
||||||
|
|
||||||
### FixStrAscii (135–166) — short ASCII strings
|
### Reserved ranges
|
||||||
|
|
||||||
Short ASCII-only strings encoded in a single marker byte + raw ASCII bytes:
|
- **103..134** (32 slots): freed by H2Q6 FixStr unification (formerly the non-ASCII FixStr range).
|
||||||
|
Active reservations per `BINARY_TODO.md` marker-tier reorganization plan:
|
||||||
```
|
- 104: `StringInternFirstSmall` (active)
|
||||||
[FixStrAsciiBase + byteLength] [ASCII bytes]
|
- 105: `StringInternFirstMedium` (active)
|
||||||
```
|
- 106..134: reserved for `ACCORE-BIN-T-L9Y3` FixArray short-list count (16 values), `S5L8`
|
||||||
|
sentinel-length tiers (5 values), `S2X9` markerless schema lane (4 values), `F3W6` dedicated
|
||||||
- Length range: 0–31 **bytes** = chars (1:1 for ASCII) (`FixStrAsciiBase=135`, `FixStrAsciiMax=166`)
|
FastWire string marker (1 value), general reserve (3 values)
|
||||||
- Same wire size as `FixStr` (1 marker byte + bytes), but the marker IS the ASCII-validity contract
|
- **168..169** (2 slots): freed by `String` marker unification (formerly StringLen16, StringLen32).
|
||||||
- Reader byte→char widens directly (`Encoding.Latin1.GetString` SIMD-accelerated path) — no UTF-8 decode, no run-time `Ascii.IsValid` scan
|
Available for the upcoming `FixStr` range expansion (32→64 chars) — would extend FixStr from
|
||||||
- Writer chooses between `FixStrAscii` and `FixStr` post-encode via `bytesWritten == charLength`
|
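`135..166` to `135..198`, absorbing 168/169 and nominally pushing the next free slot to 199 (NOTE: that span also overlaps `String` at 167, the 170..175 reserve, the flags-byte base at 176 and TinyInt 192–255, so those markers would have to be relocated first).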
|
||||||
|
- **170..175** (6 slots): pre-existing reserve for future string-related markers (e.g., compressed
|
||||||
Codepoints **168–175** are reserved for future string-related markers (e.g., compressed / base64 / mixed-ASCII variants), keeping the 91–167 range a single contiguous string-marker block.
|
/ base64 / mixed-ASCII variants), keeping the 91..167 range a single contiguous string-marker block.
|
||||||
|
|
||||||
### TinyInt (192–255)
|
### TinyInt (192–255)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,73 @@ This page covers planned work for the **binary serializer core** (format, SGen,
|
||||||
> **Archived entries**: see `BINARY_TODO_2026_04.md` and `BINARY_TODO_2026_05.md` (year-month bucket archives per LLMP-DEC retention policy).
|
> **Archived entries**: see `BINARY_TODO_2026_04.md` and `BINARY_TODO_2026_05.md` (year-month bucket archives per LLMP-DEC retention policy).
|
||||||
> Archive files are not auto-loaded — read on demand if relevant context is suspected (regression hint, supersession reference, ID lookup for archived entry).
|
> Archive files are not auto-loaded — read on demand if relevant context is suspected (regression hint, supersession reference, ID lookup for archived entry).
|
||||||
|
|
||||||
|
## ACCORE-BIN-T-V3P9: ~~Prefix-tier VarUInt + String marker unification + FixStrCount constant~~
|
||||||
|
**Status:** Closed (2026-05-26) · **Priority:** ~~P1~~ · **Type:** ~~Performance + Wire format~~
|
||||||
|
|
||||||
|
~~Three coordinated changes (single wire-format-breaking iteration, no FormatVersion bump per user
|
||||||
|
decision — old payloads not deserializable with the new code):~~
|
||||||
|
|
||||||
|
### Resolution (2026-05-26)
|
||||||
|
|
||||||
|
**1. Prefix-tier VarUInt encoding (replaces LEB128).** `WriteVarUIntMultiByteUnsafe` and `ReadVarUInt`
|
||||||
|
in `AcBinarySerializer.BinarySerializationContext` + `AcBinaryDeserializer.BinaryDeserializationContext`
|
||||||
|
+ `BufferWriterBinaryOutput` rewritten from LEB128 continuation-loop to UTF-8-style prefix-tier
|
||||||
|
encoding. First-byte prefix bits (`0xxxxxxx` / `10xxxxxx` / `110xxxxx` / `1110xxxx` / `1111xxxx`)
|
||||||
|
determine total size in O(1); remaining bytes are fixed-size little-endian. Wire-size identical to
|
||||||
|
LEB128 across all 5 tiers (7/14/21/28/32 bit) — auto-inc IDs pay the same byte count. Decode wins:
|
||||||
|
1 if-ladder + 1 fix-sized load on 3+ byte tiers instead of N×ReadByte + N×shift + N×continuation-check.
|
||||||
|
Encoder uses `BitOperations.Log2` + single `Unsafe.WriteUnaligned<uint>` store for the 2/3/4-byte tiers
|
||||||
|
(branch count reduced from 4 to 1). See `BINARY_FORMAT.md` Variable-Length Encoding section for the
|
||||||
|
tier table.
|
||||||
|
|
||||||
|
**2. String marker unification (`StringLen8/16/32` → single `String` marker).** The 3-marker
|
||||||
|
magnitude-tier dispatch (167 `StringLen8` + 168 `StringLen16` + 169 `StringLen32`) was redundant
|
||||||
|
once VarUInt itself became prefix-tier. Collapsed into a single `String` marker at 167 with VarUInt
|
||||||
|
charLength + excess slot. Marker slots 168, 169 freed for future marker allocation. Wire layout:
|
||||||
|
`[String:1] [VarUInt(charLength - FixStrCount)] [excess slot:1|2|4] [UTF-8 bytes]`. The slot-size
|
||||||
|
(1/2/4) is still derived from `charLength` via `GetUniversalStringExcessSlotSize`. `BinaryTypeCode.cs`,
|
||||||
|
`WriteStringWithDispatch`, `ReadUniversalLongString*`, `TryReadStringProperty`, `AcBinaryDeserializer`
|
||||||
|
reader registration / PopulateProperty switch / Skip path, and `AcBinarySourceGenerator.GenReader`
|
||||||
|
emit all updated. `StringAscii` alias deleted.
|
||||||
|
|
||||||
|
**3. `FixStrCount` constant.** New `public const int FixStrCount = FixStrMaxLength + 1` in
|
||||||
|
`BinaryTypeCode.cs`. Single source of truth for the FixStr slot count (= 32), also the wire-format
|
||||||
|
offset for the long-form `String` marker's VarUInt charLength (`wireLen = charLength - FixStrCount`).
|
||||||
|
If the FixStr range ever expands (e.g. 32 → 64 slots), this constant updates the offset everywhere
|
||||||
|
consistently — both the writer (`charLength - FixStrCount`) and reader (`wireLen + FixStrCount`).
|
||||||
|
|
||||||
|
**4. `EnsureAvailable` micro-optimization.** Combined `if (!TInput.IsTrustedSingleSegment && ...)`
|
||||||
|
short-circuit (was two separate ifs). Single branch, better Tier-0 / cold-path / AOT codegen.
|
||||||
|
Hot-path JIT (Tier-1) was already CSE-equivalent.
|
||||||
|
|
||||||
|
**5. `ReadVarUInt` incremental-byte slow path.** The 4/5-byte tier slow path now accepts `b0/b1/b2`
|
||||||
|
as parameters (no re-read of bytes the inline fast path already consumed). Inline fast path covers
|
||||||
|
1/2/3-byte tiers (cross-segment safe via `ReadByte()` → `EnsureAvailable(1)` JIT-eliminate on
|
||||||
|
ArrayBinaryInput).
|
||||||
|
|
||||||
|
**6. `WriteStringWithDispatch` if-cascade reorganization.** Replaced 4 `isFixStr ? ... : ...`
|
||||||
|
ternaries with a single explicit `if (isFixStr) { ... } else { ... }` block. Each branch holds
|
||||||
|
its own constants (no cross-branch ternary CSE pressure on the JIT). Tier-0 / cold-path codegen
|
||||||
|
slightly cleaner; Tier-1 hot-path was already CSE-equivalent (bench-confirmed neutral, but smaller
|
||||||
|
IL — the four `?:` ternaries compile to 4 separate `brfalse`/`brtrue` branches, the single `if` to 1).
|
||||||
|
|
||||||
|
### Acceptance criteria met
|
||||||
|
|
||||||
|
- ✅ Full solution build (`AyCode.Core.sln`) — 0 errors.
|
||||||
|
- ✅ Benchmark snapshot (Latin1Short, 2026-05-26 14:20): AcBinary vs MemoryPack Ser median **−3.1%**
|
||||||
|
(was −1.8% before), Ser geo **−3.2%** (was −2.5%). Per-cell improvements 0.3..2.2% on Ser,
|
||||||
|
2.2..3.7% on Deser. Bench file: `Test_Benchmark_Results/Benchmark/Console.FullBenchmark_Release_2026-05-26_14-20-29.LLM`.
|
||||||
|
- ✅ Doc-sync: `BINARY_FORMAT.md` Variable-Length Encoding + Strings + FixStr sections updated to
|
||||||
|
reflect the new wire layout. Reserved-range table added (103..134, 168..169, 170..175 buckets).
|
||||||
|
|
||||||
|
### Wire-format breaking note
|
||||||
|
|
||||||
|
This change set is **wire-format breaking** — payloads serialized by the pre-V3P9 code (LEB128 +
|
||||||
|
StringLen8/16/32) are NOT deserializable by the new code. Per user decision the `FormatVersion`
|
||||||
|
header byte was NOT bumped (silent breaking; AcBinary is consumer-private, no cross-deployment
|
||||||
|
compatibility surface). If future versioned compat is desired, a `FormatVersion 1 → 2` bump would
|
||||||
|
be the conventional approach.
|
||||||
|
|
||||||
## ACCORE-BIN-T-N4P8: ~~SGen reference-property null-check parity across all four emit branches~~
|
## ACCORE-BIN-T-N4P8: ~~SGen reference-property null-check parity across all four emit branches~~
|
||||||
**Status:** Closed (2026-05-23) · **Priority:** ~~P1~~ · **Type:** ~~Bug fix~~
|
**Status:** Closed (2026-05-23) · **Priority:** ~~P1~~ · **Type:** ~~Bug fix~~
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue