Refactor string interning to use position-based cache

Implement a new position-based string interning mechanism in AcBinarySerializer/AcBinaryDeserializer. This approach tracks stream positions for interned strings, ensuring 100% reliable cache matching during deserialization, even when strings are skipped or reordered. The serializer now writes (position, cacheIndex) pairs in the footer for all repeated strings, and the deserializer uses this mapping for robust cache population. Removes the old buffer-based interned string logic, updates all relevant code paths, and simplifies interned string handling for greater correctness and maintainability. Also updates benchmarks and test data construction to use the new interning mode.
This commit is contained in:
Loretta 2026-01-27 13:02:16 +01:00
parent 11ac2beb71
commit 466782007d
6 changed files with 311 additions and 229 deletions

View File

@ -16,7 +16,7 @@ public class QuickBenchmark
private static readonly MessagePackSerializerOptions MsgPackOptions = private static readonly MessagePackSerializerOptions MsgPackOptions =
ContractlessStandardResolver.Options.WithCompression(MessagePackCompression.None); ContractlessStandardResolver.Options.WithCompression(MessagePackCompression.None);
private const int DefaultIterations = 1000; private const int DefaultIterations = 10;
#region Helper Methods #region Helper Methods
@ -426,8 +426,8 @@ public class QuickBenchmark
sharedUser: sharedUser, sharedUser: sharedUser,
sharedMetadata: sharedMeta); sharedMetadata: sharedMeta);
var singleOptions = AcBinarySerializerOptions.FastMode; var singleOptions = AcBinarySerializerOptions.WithoutReferenceHandling;
singleOptions.UseStringInterning = StringInterningMode.None; singleOptions.UseStringInterning = StringInterningMode.All;
Console.WriteLine("=== MINIMAL WARMUP TEST ==="); Console.WriteLine("=== MINIMAL WARMUP TEST ===");
Console.WriteLine(); Console.WriteLine();

View File

@ -17,11 +17,23 @@ public static partial class AcBinaryDeserializer
{ {
private readonly ReadOnlySpan<byte> _buffer; private readonly ReadOnlySpan<byte> _buffer;
private int _position; private int _position;
private List<string>? _internedStrings;
private List<string>? _propertyNames; private List<string>? _propertyNames;
//private Dictionary<int, object>? _objectReferences;
private Dictionary<int, string>? _stringCache; private Dictionary<int, string>? _stringCache;
/// <summary>
/// One (position, cacheIndex) pair read from the string-intern footer.
/// While the body is read, a StringInternNew value whose stream position equals
/// <see cref="Position"/> is stored in the intern cache slot <see cref="CacheIndex"/>,
/// so that later StringInterned references can fetch it by that index.
/// </summary>
private struct DupEntry
{
public int Position; // Stream position of the string's first (inline) occurrence, i.e. the position of its type code byte
public int CacheIndex; // Slot in _internStringCache that receives the string once it has been read
}
// Position-based string interning state; populated from the footer while the header is read, before the body.
private DupEntry[]? _dupEntries; // Footer (position, cacheIndex) pairs in the order read; the serializer writes them sorted by position
private string[]? _internStringCache; // One slot per footer entry; holds only strings that are referenced again by cache index
private int _dupCheckIndex; // Index of the next _dupEntries element to compare against an incoming string position
/// <summary> /// <summary>
/// Heap-allocated context class for IId-based reference tracking. /// Heap-allocated context class for IId-based reference tracking.
/// Also holds Options - all options-derived properties delegate to ContextClass.Options. /// Also holds Options - all options-derived properties delegate to ContextClass.Options.
@ -68,10 +80,14 @@ public static partial class AcBinaryDeserializer
{ {
_buffer = data; _buffer = data;
_position = 0; _position = 0;
_internedStrings = null;
_propertyNames = null; _propertyNames = null;
//_objectReferences = null;
_stringCache = null; _stringCache = null;
// Position-based string interning fields
_dupEntries = null;
_internStringCache = null;
_dupCheckIndex = 0;
HasMetadata = false; HasMetadata = false;
IsMergeMode = false; IsMergeMode = false;
RemoveOrphanedItems = false; RemoveOrphanedItems = false;
@ -98,7 +114,6 @@ public static partial class AcBinaryDeserializer
var marker = ReadByteInternal(); var marker = ReadByteInternal();
var hasPropertyTable = false; var hasPropertyTable = false;
var hasInternTable = false;
var hasInternFooter = false; var hasInternFooter = false;
var footerPosition = 0; var footerPosition = 0;
@ -151,29 +166,18 @@ public static partial class AcBinaryDeserializer
} }
} }
// Legacy: interned strings in header // Footer-based: read string intern indices from footer
if (hasInternTable)
{
var internCount = (int)ReadVarUInt();
_internedStrings = new List<string>(internCount);
for (var i = 0; i < internCount; i++)
{
_internedStrings.Add(ReadHeaderString());
}
}
// Footer-based: read interned strings from footer, then return to data position
if (hasInternFooter && footerPosition > 0) if (hasInternFooter && footerPosition > 0)
{ {
ReadFooterStrings(footerPosition); ReadFooterStringIndices(footerPosition);
} }
} }
/// <summary> /// <summary>
/// Reads interned strings from footer position, then returns to data position. /// Reads string intern footer: [dupCount][(position, cacheIndex), ...]
/// Uses seek to footer, read strings, seek back to data. /// Position-based format for 100% reliable cache matching.
/// </summary> /// </summary>
private void ReadFooterStrings(int footerPosition) private void ReadFooterStringIndices(int footerPosition)
{ {
// Save current position (start of data) // Save current position (start of data)
var dataPosition = _position; var dataPosition = _position;
@ -181,12 +185,25 @@ public static partial class AcBinaryDeserializer
// Seek to footer // Seek to footer
_position = footerPosition; _position = footerPosition;
// Read interned strings // Read dup count and (position, cacheIndex) pairs
var internCount = (int)ReadVarUInt(); var dupCount = (int)ReadVarUInt();
_internedStrings = new List<string>(internCount); if (dupCount == 0)
for (var i = 0; i < internCount; i++)
{ {
_internedStrings.Add(ReadHeaderString()); _dupEntries = Array.Empty<DupEntry>();
_internStringCache = Array.Empty<string>();
}
else
{
_dupEntries = new DupEntry[dupCount];
for (var i = 0; i < dupCount; i++)
{
var position = (int)ReadVarUInt();
var cacheIndex = (int)ReadVarUInt();
_dupEntries[i] = new DupEntry { Position = position, CacheIndex = cacheIndex };
}
// Cache size: dupCount (cacheIndex is always 0, 1, 2, ..., dupCount-1)
_internStringCache = new string[dupCount];
} }
// Seek back to data position // Seek back to data position
@ -540,23 +557,49 @@ public static partial class AcBinaryDeserializer
_position += count; _position += count;
} }
/// <summary>
/// Registers an interned string during body read (StringInternNew).
/// Uses position-based check for 100% reliable cache matching.
/// </summary>
/// <param name="value">The string value read from stream</param>
/// <param name="streamPosition">Stream position BEFORE reading the string (type code position)</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
public int RegisterInternedString(string value) public void RegisterInternedString(string value, int streamPosition)
{ {
_internedStrings ??= new List<string>(); // Fast path: no duplicates or already processed all
_internedStrings.Add(value); var entries = _dupEntries;
return _internedStrings.Count - 1; if (entries == null || (uint)_dupCheckIndex >= (uint)entries.Length)
return;
// Check if this position matches the next expected duplicate
ref var entry = ref entries[_dupCheckIndex];
if (entry.Position == streamPosition)
{
_internStringCache![entry.CacheIndex] = value;
_dupCheckIndex++;
}
} }
/// <summary>
/// Gets an interned string by cache index (StringInterned type code).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
public string GetInternedString(int index) public string GetInternedString(int cacheIndex)
{ {
if (_internedStrings == null || (uint)index >= (uint)_internedStrings.Count) if (_internStringCache == null || (uint)cacheIndex >= (uint)_internStringCache.Length)
{ {
throw new AcBinaryDeserializationException($"Invalid interned string index '{index}'.", _position); throw new AcBinaryDeserializationException($"Invalid interned string cache index '{cacheIndex}'.", _position);
} }
return _internedStrings[index]; var result = _internStringCache[cacheIndex];
if (result == null)
{
throw new AcBinaryDeserializationException(
$"Interned string at cache index '{cacheIndex}' was not populated.",
_position);
}
return result;
} }
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]

View File

@ -60,7 +60,12 @@ public static partial class AcBinaryDeserializer
RegisterReader(BinaryTypeCode.String, static (ref BinaryDeserializationContext ctx, Type _, int _) => ReadPlainString(ref ctx)); RegisterReader(BinaryTypeCode.String, static (ref BinaryDeserializationContext ctx, Type _, int _) => ReadPlainString(ref ctx));
RegisterReader(BinaryTypeCode.StringInterned, static (ref BinaryDeserializationContext ctx, Type _, int _) => ctx.GetInternedString((int)ctx.ReadVarUInt())); RegisterReader(BinaryTypeCode.StringInterned, static (ref BinaryDeserializationContext ctx, Type _, int _) => ctx.GetInternedString((int)ctx.ReadVarUInt()));
RegisterReader(BinaryTypeCode.StringEmpty, static (ref BinaryDeserializationContext _, Type _, int _) => string.Empty); RegisterReader(BinaryTypeCode.StringEmpty, static (ref BinaryDeserializationContext _, Type _, int _) => string.Empty);
RegisterReader(BinaryTypeCode.StringInternNew, static (ref BinaryDeserializationContext ctx, Type _, int _) => ReadAndRegisterInternedString(ref ctx)); // StringInternNew: position is captured as Position-1 (after type code was read)
RegisterReader(BinaryTypeCode.StringInternNew, static (ref BinaryDeserializationContext ctx, Type _, int _) =>
{
var streamPosition = ctx.Position - 1; // Position before type code
return ReadAndRegisterInternedString(ref ctx, streamPosition);
});
RegisterReader(BinaryTypeCode.DateTime, static (ref BinaryDeserializationContext ctx, Type _, int _) => ctx.ReadDateTimeUnsafe()); RegisterReader(BinaryTypeCode.DateTime, static (ref BinaryDeserializationContext ctx, Type _, int _) => ctx.ReadDateTimeUnsafe());
RegisterReader(BinaryTypeCode.DateTimeOffset, static (ref BinaryDeserializationContext ctx, Type _, int _) => ctx.ReadDateTimeOffsetUnsafe()); RegisterReader(BinaryTypeCode.DateTimeOffset, static (ref BinaryDeserializationContext ctx, Type _, int _) => ctx.ReadDateTimeOffsetUnsafe());
RegisterReader(BinaryTypeCode.TimeSpan, static (ref BinaryDeserializationContext ctx, Type _, int _) => ctx.ReadTimeSpanUnsafe()); RegisterReader(BinaryTypeCode.TimeSpan, static (ref BinaryDeserializationContext ctx, Type _, int _) => ctx.ReadTimeSpanUnsafe());
@ -136,6 +141,7 @@ public static partial class AcBinaryDeserializer
{ {
context.ReadHeader(); context.ReadHeader();
var result = ReadValue(ref context, targetType, 0); var result = ReadValue(ref context, targetType, 0);
// Position-based string interning - no validation needed
return (T?)result; return (T?)result;
} }
catch (AcBinaryDeserializationException) catch (AcBinaryDeserializationException)
@ -175,7 +181,9 @@ public static partial class AcBinaryDeserializer
try try
{ {
context.ReadHeader(); context.ReadHeader();
return ReadValue(ref context, targetType, 0); var result = ReadValue(ref context, targetType, 0);
// Position-based string interning - no validation needed
return result;
} }
catch (AcBinaryDeserializationException) catch (AcBinaryDeserializationException)
{ {
@ -201,6 +209,7 @@ public static partial class AcBinaryDeserializer
{ {
context.ReadHeader(); context.ReadHeader();
var node = (AcExpressionNode?)ReadValue(ref context, typeof(AcExpressionNode), 0); var node = (AcExpressionNode?)ReadValue(ref context, typeof(AcExpressionNode), 0);
// Position-based string interning - no validation needed
if (node == null) return null; if (node == null) return null;
var entityType = AcSerializerCommon.GetExpressionEntityType(targetExpressionType); var entityType = AcSerializerCommon.GetExpressionEntityType(targetExpressionType);
@ -269,6 +278,8 @@ public static partial class AcBinaryDeserializer
$"Cannot populate type '{targetType.Name}' from binary type code {typeCode}", $"Cannot populate type '{targetType.Name}' from binary type code {typeCode}",
context.Position, targetType); context.Position, targetType);
} }
// Position-based string interning - no validation needed
} }
catch (AcBinaryDeserializationException) catch (AcBinaryDeserializationException)
{ {
@ -333,6 +344,7 @@ public static partial class AcBinaryDeserializer
if (elementMetadata.IsComplexType && elementMetadata.IsIId && elementMetadata.IdGetter != null) if (elementMetadata.IsComplexType && elementMetadata.IsIId && elementMetadata.IdGetter != null)
{ {
MergeIIdCollectionWithMetadata(ref context, targetList, elementType, wrapper, 0); MergeIIdCollectionWithMetadata(ref context, targetList, elementType, wrapper, 0);
// Position-based string interning - no validation needed
return; return;
} }
} }
@ -346,6 +358,8 @@ public static partial class AcBinaryDeserializer
$"Cannot populate type '{targetType.Name}' from binary type code {typeCode}", $"Cannot populate type '{targetType.Name}' from binary type code {typeCode}",
context.Position, targetType); context.Position, targetType);
} }
// Position-based string interning - no validation needed
} }
catch (AcBinaryDeserializationException) catch (AcBinaryDeserializationException)
{ {
@ -389,6 +403,7 @@ public static partial class AcBinaryDeserializer
{ {
context.ReadHeader(); context.ReadHeader();
var result = ReadValue(ref context, targetType, 0); var result = ReadValue(ref context, targetType, 0);
// Position-based string interning - no validation needed
return new BinaryDeserializeChain<T>(dataArray, options, chainTracker, (T?)result); return new BinaryDeserializeChain<T>(dataArray, options, chainTracker, (T?)result);
} }
catch catch
@ -433,6 +448,7 @@ public static partial class AcBinaryDeserializer
{ {
context.ReadHeader(); context.ReadHeader();
var result = ReadValue(ref context, targetType, 0); var result = ReadValue(ref context, targetType, 0);
// Position-based string interning - no validation needed
return (TResult?)result; return (TResult?)result;
} }
catch (AcBinaryDeserializationException) { throw; } catch (AcBinaryDeserializationException) { throw; }
@ -474,6 +490,7 @@ public static partial class AcBinaryDeserializer
context.Position, targetType); context.Position, targetType);
} }
// Position-based string interning - no validation needed
return this; return this;
} }
catch (AcBinaryDeserializationException) { throw; } catch (AcBinaryDeserializationException) { throw; }
@ -764,15 +781,16 @@ public static partial class AcBinaryDeserializer
} }
/// <summary> /// <summary>
/// Új internált string olvasása és regisztrálása az intern táblába. /// Read new interned string and register it in the intern cache.
/// Position is captured BEFORE the type code was read (by caller).
/// </summary> /// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
private static string ReadAndRegisterInternedString(ref BinaryDeserializationContext context) private static string ReadAndRegisterInternedString(ref BinaryDeserializationContext context, int streamPosition)
{ {
var length = (int)context.ReadVarUInt(); var length = (int)context.ReadVarUInt();
if (length == 0) return string.Empty; if (length == 0) return string.Empty;
var str = context.ReadStringUtf8(length); var str = context.ReadStringUtf8(length);
context.RegisterInternedString(str); context.RegisterInternedString(str, streamPosition);
return str; return str;
} }
@ -780,7 +798,7 @@ public static partial class AcBinaryDeserializer
/// Read a string and register it in the intern table for future references. /// Read a string and register it in the intern table for future references.
/// </summary> /// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
private static string ReadAndInternString(ref BinaryDeserializationContext context) private static string ReadAndInternString(ref BinaryDeserializationContext context, int streamPosition)
{ {
var length = (int)context.ReadVarUInt(); var length = (int)context.ReadVarUInt();
if (length == 0) return string.Empty; if (length == 0) return string.Empty;
@ -788,7 +806,7 @@ public static partial class AcBinaryDeserializer
// Always register strings that meet the minimum intern length threshold // Always register strings that meet the minimum intern length threshold
if (str.Length >= context.MinStringInternLength) if (str.Length >= context.MinStringInternLength)
{ {
context.RegisterInternedString(str); context.RegisterInternedString(str, streamPosition);
} }
return str; return str;
@ -1286,6 +1304,8 @@ public static partial class AcBinaryDeserializer
private static void SkipValue(ref BinaryDeserializationContext context, BinaryDeserializeTypeMetadata metaData) private static void SkipValue(ref BinaryDeserializationContext context, BinaryDeserializeTypeMetadata metaData)
{ {
// Capture position before reading type code (needed for string interning)
var streamPosition = context.Position;
var typeCode = context.ReadByte(); var typeCode = context.ReadByte();
if (typeCode == BinaryTypeCode.Null) return; if (typeCode == BinaryTypeCode.Null) return;
@ -1353,8 +1373,8 @@ public static partial class AcBinaryDeserializer
context.ReadVarUInt(); context.ReadVarUInt();
return; return;
case BinaryTypeCode.StringInternNew: case BinaryTypeCode.StringInternNew:
// Új internált string - regisztrálni kell még skip esetén is // New interned string - must register even when skipping
SkipAndRegisterInternedString(ref context); SkipAndRegisterInternedString(ref context, streamPosition);
return; return;
case BinaryTypeCode.ByteArray: case BinaryTypeCode.ByteArray:
var byteLen = (int)context.ReadVarUInt(); var byteLen = (int)context.ReadVarUInt();
@ -1394,22 +1414,26 @@ public static partial class AcBinaryDeserializer
} }
/// <summary> /// <summary>
/// Új internált string kihagyása - DE regisztrálni kell! /// Skip a new interned string - must still register in cache.
/// </summary> /// </summary>
/// <param name="context">Deserialization context</param>
/// <param name="streamPosition">Position before the type code was read</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void SkipAndRegisterInternedString(ref BinaryDeserializationContext context) private static void SkipAndRegisterInternedString(ref BinaryDeserializationContext context, int streamPosition)
{ {
var byteLen = (int)context.ReadVarUInt(); var byteLen = (int)context.ReadVarUInt();
if (byteLen == 0) return; if (byteLen == 0) return;
var str = context.ReadStringUtf8(byteLen); var str = context.ReadStringUtf8(byteLen);
context.RegisterInternedString(str); context.RegisterInternedString(str, streamPosition);
} }
/// <summary> /// <summary>
/// Skip a string but still register it in the intern table if it meets the length threshold. /// Skip a string but still register it in the intern table if it meets the length threshold.
/// </summary> /// </summary>
/// <param name="context">Deserialization context</param>
/// <param name="streamPosition">Position before the type code was read</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void SkipAndInternString(ref BinaryDeserializationContext context) private static void SkipAndInternString(ref BinaryDeserializationContext context, int streamPosition)
{ {
var byteLen = (int)context.ReadVarUInt(); var byteLen = (int)context.ReadVarUInt();
if (byteLen == 0) return; if (byteLen == 0) return;
@ -1417,7 +1441,7 @@ public static partial class AcBinaryDeserializer
var str = context.ReadStringUtf8(byteLen); var str = context.ReadStringUtf8(byteLen);
if (str.Length >= context.MinStringInternLength) if (str.Length >= context.MinStringInternLength)
{ {
context.RegisterInternedString(str); context.RegisterInternedString(str, streamPosition);
} }
} }

View File

@ -63,13 +63,18 @@ public static partial class AcBinarySerializer
// Use shared reference tracker from AcSerializerCommon // Use shared reference tracker from AcSerializerCommon
//private readonly AcSerializerCommon.SerializationReferenceTracker _refTracker = new(); //private readonly AcSerializerCommon.SerializationReferenceTracker _refTracker = new();
private Dictionary<string, int>? _internedStrings; /// <summary>
private List<string>? _internedStringList; /// String intern entry for tracking string occurrences.
/// StreamPosition-based approach for 100% reliable cache matching.
/// </summary>
private struct StringInternEntry
{
public int StreamPosition; // Position in stream where string was first written
public int CacheIndex; // Dense cache index (0, 1, 2, ...) - assigned at 2nd occurrence; -1 = first occurrence only
}
// Single contiguous buffer for all interned string UTF8 bytes (reused across serializations) private Dictionary<string, StringInternEntry>? _stringInternMap;
private byte[]? _internedStringBuffer; private int _nextCacheIndex; // Next dense cache index to assign
private int _internedStringBufferPos;
private List<int>? _internedStringLengths;
private Dictionary<string, int>? _propertyNames; private Dictionary<string, int>? _propertyNames;
private List<string>? _propertyNameList; private List<string>? _propertyNameList;
@ -139,15 +144,11 @@ public static partial class AcBinarySerializer
_position = 0; _position = 0;
//_refTracker.Reset(); //_refTracker.Reset();
ClearAndTrimIfNeeded(_internedStrings, InitialInternCapacity * 4); ClearAndTrimIfNeeded(_stringInternMap, InitialInternCapacity * 4);
ClearAndTrimIfNeeded(_propertyNames, InitialPropertyNameCapacity * 4); ClearAndTrimIfNeeded(_propertyNames, InitialPropertyNameCapacity * 4);
_propertyNameList?.Clear(); _propertyNameList?.Clear();
_internedStringList?.Clear(); _nextCacheIndex = 0;
_internedStringLengths?.Clear();
// Reset intern buffer position (no deallocation - buffer is reused!)
_internedStringBufferPos = 0;
if (_propertyIndexBuffer != null && _propertyIndexBuffer.Length > PropertyIndexBufferMaxCache) if (_propertyIndexBuffer != null && _propertyIndexBuffer.Length > PropertyIndexBufferMaxCache)
{ {
@ -183,81 +184,89 @@ public static partial class AcBinarySerializer
_propertyStateBuffer = null; _propertyStateBuffer = null;
} }
// _internedStringBuffer is a simple byte[] - no pool return needed, GC handles it
_internedStringBuffer = null;
} }
#region String Interning #region String Interning
/// <summary> /// <summary>
/// Registers a string for interning. Returns the index of the string. /// Tries to intern a string. Returns true if string was seen before (write index).
/// Uses single contiguous buffer for UTF8 bytes to minimize allocations. /// Returns false if first occurrence (write inline).
/// Uses stream position for 100% reliable deserializer cache matching.
/// </summary> /// </summary>
/// <param name="value">The string value to intern</param>
/// <param name="streamPosition">Current stream position (before writing the string)</param>
/// <param name="cacheIndex">Output: cache index for 2+ occurrence, -1 for 1st occurrence</param>
/// <returns>True if 2+ occurrence (write cacheIndex), false if 1st occurrence (write inline)</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
public int RegisterInternedString(string value) public bool TryGetInternedString(string value, int streamPosition, out int cacheIndex)
{ {
_internedStrings ??= new Dictionary<string, int>(InitialInternCapacity, StringComparer.Ordinal); _stringInternMap ??= new Dictionary<string, StringInternEntry>(InitialInternCapacity, StringComparer.Ordinal);
_internedStringList ??= new List<string>(InitialInternCapacity);
_internedStringLengths ??= new List<int>(InitialInternCapacity);
// Single operation: lookup + conditional add ref var entry = ref CollectionsMarshal.GetValueRefOrNullRef(_stringInternMap, value);
ref var index = ref CollectionsMarshal.GetValueRefOrAddDefault(_internedStrings, value, out var exists);
if (exists) if (!Unsafe.IsNullRef(ref entry))
{ {
return index; // 2+ occurrence: assign CacheIndex if first repeat
if (entry.CacheIndex < 0)
{
entry.CacheIndex = _nextCacheIndex++;
}
cacheIndex = entry.CacheIndex;
return true;
} }
// New string - add to list and write UTF8 to buffer // 1st occurrence: store stream position
index = _internedStringList.Count; _stringInternMap[value] = new StringInternEntry
_internedStringList.Add(value);
// Calculate UTF8 byte length
var utf8Length = Ascii.IsValid(value) ? value.Length : Utf8NoBom.GetByteCount(value);
// Ensure intern buffer has capacity
EnsureInternBufferCapacity(utf8Length);
// Write UTF8 bytes to contiguous buffer
if (Ascii.IsValid(value))
{ {
Ascii.FromUtf16(value.AsSpan(), _internedStringBuffer.AsSpan(_internedStringBufferPos, utf8Length), out _); StreamPosition = streamPosition,
} CacheIndex = -1 // Not assigned until 2nd occurrence
else };
{ cacheIndex = -1;
Utf8NoBom.GetBytes(value.AsSpan(), _internedStringBuffer.AsSpan(_internedStringBufferPos, utf8Length)); return false;
}
_internedStringLengths.Add(utf8Length);
_internedStringBufferPos += utf8Length;
return index;
} }
/// <summary> /// <summary>
/// Ensures the intern buffer has enough capacity for additional bytes. /// Returns true if there are any interned strings that occurred more than once.
/// Initial size is calculated from MaxStringInternLength * InitialInternCapacity.
/// </summary> /// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)] public bool HasInternedStrings => _stringInternMap is { Count: > 0 };
private void EnsureInternBufferCapacity(int additionalBytes)
/// <summary>
/// Gets the count of strings that occurred more than once (for footer).
/// </summary>
public int GetDupCount() => _nextCacheIndex;
/// <summary>
/// Writes the footer with (position, cacheIndex) pairs sorted by position.
/// Position-based approach ensures 100% reliable cache matching in deserializer.
/// </summary>
public void WriteInternedStringFooter()
{ {
var required = _internedStringBufferPos + additionalBytes; if (_stringInternMap == null || _nextCacheIndex == 0) return;
if (_internedStringBuffer == null) // Collect entries with CacheIndex >= 0 (occurred more than once)
// We need to sort by StreamPosition for deserializer sequential access
Span<(int Position, int CacheIndex)> entries = _nextCacheIndex <= 64
? stackalloc (int, int)[_nextCacheIndex]
: new (int, int)[_nextCacheIndex];
var idx = 0;
foreach (var entry in _stringInternMap.Values)
{ {
// Initial size: MaxStringInternLength * InitialInternCapacity (e.g., 64 * 32 = 2048) if (entry.CacheIndex >= 0)
var initialSize = MaxStringInternLength * InitialInternCapacity; {
_internedStringBuffer = new byte[Math.Max(initialSize, required)]; entries[idx++] = (entry.StreamPosition, entry.CacheIndex);
return; }
} }
if (required <= _internedStringBuffer.Length) // Sort by StreamPosition (ascending) for deserializer sequential check
{ entries.Sort((a, b) => a.Position.CompareTo(b.Position));
return;
}
// Grow buffer (double size) // Write pairs: (position, cacheIndex)
var newSize = Math.Max(_internedStringBuffer.Length * 2, required); for (var i = 0; i < _nextCacheIndex; i++)
Array.Resize(ref _internedStringBuffer, newSize); {
WriteVarUInt((uint)entries[i].Position);
WriteVarUInt((uint)entries[i].CacheIndex);
}
} }
#endregion #endregion
@ -948,7 +957,8 @@ public static partial class AcBinarySerializer
public void FinalizeHeaderSections() public void FinalizeHeaderSections()
{ {
var hasPropertyNames = UseMetadata && _propertyNameList is { Count: > 0 }; var hasPropertyNames = UseMetadata && _propertyNameList is { Count: > 0 };
var hasInternTable = UseStringInterning && _internedStringList is { Count: > 0 }; var dupCount = UseStringInterning ? GetDupCount() : 0;
var hasInternTable = dupCount > 0;
// Calculate property names header size (strings go to footer now) // Calculate property names header size (strings go to footer now)
var headerPayloadSize = 0; var headerPayloadSize = 0;
@ -976,12 +986,12 @@ public static partial class AcBinarySerializer
} }
} }
// Footer-based string interning: write strings at the end // Footer: write indices of strings that occurred more than once
var footerPosition = 0; var footerPosition = 0;
if (hasInternTable) if (hasInternTable)
{ {
footerPosition = _position; footerPosition = _position;
WriteFooterStrings(); WriteFooterStringIndices(dupCount);
} }
// Write header // Write header
@ -1009,23 +1019,19 @@ public static partial class AcBinarySerializer
} }
/// <summary> /// <summary>
/// Writes interned strings to the footer (end of stream). /// Writes the footer with total count (for verification) + dup count + indices.
/// Uses contiguous buffer - no re-encoding needed. /// Footer format: [totalStringCount][dupCount][dupIndex0][dupIndex1]...
/// </summary> /// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
private void WriteFooterStrings() /// <summary>
/// Writes footer: [dupCount][(position, cacheIndex), ...]
/// Position-based format for 100% reliable deserializer matching.
/// </summary>
private void WriteFooterStringIndices(int dupCount)
{ {
WriteVarUInt((uint)_internedStringList!.Count); // Dup count + (position, cacheIndex) pairs
WriteVarUInt((uint)dupCount);
// Write from contiguous buffer using stored lengths WriteInternedStringFooter();
var offset = 0;
for (var i = 0; i < _internedStringLengths!.Count; i++)
{
var length = _internedStringLengths[i];
WriteVarUInt((uint)length);
WriteBytes(_internedStringBuffer.AsSpan(offset, length));
offset += length;
}
} }
/// <summary> /// <summary>

View File

@ -759,7 +759,7 @@ public static partial class AcBinarySerializer
/// <summary> /// <summary>
/// Optimized string writer with FixStr for short strings. /// Optimized string writer with FixStr for short strings.
/// Uses stackalloc for small strings to avoid allocations. /// New interning strategy: inline on first occurrence, index on 2+.
/// </summary> /// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void WriteString(string value, BinarySerializationContext context) private static void WriteString(string value, BinarySerializationContext context)
@ -776,12 +776,21 @@ public static partial class AcBinarySerializer
&& value.Length >= context.MinStringInternLength && value.Length >= context.MinStringInternLength
&& (context.MaxStringInternLength == 0 || value.Length <= context.MaxStringInternLength)) && (context.MaxStringInternLength == 0 || value.Length <= context.MaxStringInternLength))
{ {
var index = context.RegisterInternedString(value); // Capture position BEFORE writing - this is where deserializer will be when reading
var streamPosition = context.Position;
if (context.TryGetInternedString(value, streamPosition, out var index))
{
// 2+ occurrence: write index reference
context.WriteByte(BinaryTypeCode.StringInterned);
context.WriteVarUInt((uint)index);
return;
}
#if DEBUG #if DEBUG
context.OnStringInterned?.Invoke(context.CurrentPropertyPath, value); context.OnStringInterned?.Invoke(context.CurrentPropertyPath, value);
#endif #endif
context.WriteByte(BinaryTypeCode.StringInterned); // 1st occurrence: write inline with StringInternNew type code
context.WriteVarUInt((uint)index); context.WriteByte(BinaryTypeCode.StringInternNew);
context.WriteStringUtf8(value);
return; return;
} }